
#	Simple Map - XML, text or HTML Site Map  Generator
#	A free open source Scriptol program,  by (c) Denis G. Sureau
#	http://www.scriptol.com/
     

include "path.sol"
include "libphp.sol"
include "tools.sol"
include "options.sol"
include "url.sol"
include "page.sol"

array siteList = []  	// source of the sitemap: every page as a full URL (site address + path + filename)
dict siteTime = {}		// last-modification date+time string for each URL, keyed by URL (filled by addLink)
array checkedList = []	// links that have already been checked (filled by addFileList)
array skipped			// a page skipped once must be ignored again
dict defaultList = {}   // internal directories and their home pages

text pageText           // text of the page being parsed: set by getLinks, read by extractLink
array siteMap = []      // lines of the generated map, appended by output()
int counter = 0         // number of regular files met by scanLocal


// Append one line of text to the global siteMap array.
// Nothing is written to disk here; SimpleMap writes siteMap to the map file.
void output(text t)
	siteMap.push(t)
return



# Extract the URL that follows an attribute name in the global pageText.
# Starting at offset off, skips '=' and space characters and, if present, one
# opening quote (single or double), then collects characters up to the next
# quote or '>' (or the end of pageText).
# off: offset in pageText where scanning starts (just after the attribute name).
# Returns the collected text, trimmed; "" when nothing was collected.

text extractLink(int off)
	text c
	text link = ""
	// first loop: find where the value starts
	// NOTE(review): this relies on the `let` clause still running after `continue`;
	// otherwise '=' and ' ' would never be skipped - confirm against the Scriptol reference.
	while off < pageText.length()
		c = pageText[off]
		if c = '='  continue
		if c = ' '  continue
		if c = "\""  : off + 1; break; /if
		if c = '\'' : off + 1; break; /if
		break
	let off + 1
	// second loop: collect the value up to a closing quote or the end of the tag
	while off < pageText.length()
		c = pageText[off]
		if c = '\'' break
		if c = "\""  break
		if c = '>'  break
		link + c             // anything else is part of the url
	let off + 1
return link.trim()


// Tell whether a page is already recorded in checkedList.
// A page given without protocol is first converted with createLinkFromRelative.
// Returns true when the resulting URL is in checkedList.

boolean inCheckedList(text page)
	boolean found = false
	if not hasProtocol(page) let page = createLinkFromRelative(page)
	if page in checkedList let found = true
return found


// Tell whether a page is already recorded in siteList.
// A page given without protocol is first converted with createLinkFromRelative.
// Returns true when the resulting URL is in siteList.

boolean inSiteMap(text page)
	boolean found = false
	if not hasProtocol(page) let page = createLinkFromRelative(page)
	if page in siteList let found = true
return found


// Record a page in checkedList, valid or not, so that it is not scanned again.
// A page given without protocol is first converted with createLinkFromRelative.
// A page already in checkedList is not added a second time.

void addFileList(text page)
	if not hasProtocol(page) let page = createLinkFromRelative(page)
	if page not in checkedList
		checkedList.push(page)
	/if
return


// Add a page to siteList, the list of all pages referenced by the site map.
// page: URL or site-relative path; it is passed through setURL, and siteURL
//       is prepended with Path.merge when it has no protocol.
// lmod: modification time, formatted with dateFormat into siteTime when
//       LASTMOD is set.
// Returns false when the URL is already in siteList, true once it is added.
// NOTE(review): siteTime is filled only under LASTMOD, while buildTheXmlFile
// reads it under LASTMOD or LASTMODSHORT - confirm how the options are set.

boolean addLink(text page, real lmod)

	// the local must stay named url: the verbose message interpolates $url
	text url = setURL(page)
	if not hasProtocol(url) let url = Path.merge(siteURL, url)

	if url in siteList return false

	siteList.push(url)
	if LASTMOD let siteTime[url] = date(dateFormat, lmod)

	if VERBOSE ? display("Added link $url")
return true



// Collect the links found in a page.
// The file is loaded into an array, right-trimmed and joined into the global
// pageText, which is then searched for "href" and "frame src" occurrences;
// the value after each one is read with extractLink.
// page: name of the file to load.
// Returns the array of retained links: absolute URLs whose host (getURL)
// equals siteURL, and local links whose target exists on disk.

array getLinks(text page)
	array x = []
	
	x.load(page)

	if DEBUG print "Getting links from ", getcwd(), page

	array pageLinks = []		// local links found on this page
  
	// removing ending special codes and building a text from the array
	scan x let x[] = x[].toText().rtrim()
	pageText = x.join("")            // merging lines into a text

	int offset = 0			// position of the next "href", -1 once none is left
	int srcoff = 0			// position of the next "frame src", -1 once none is left
	int shifting = 0		// position in pageText from which the next search starts
	
	while forever
		if offset <> -1 let offset = pageText.find("href", shifting)
		if srcoff <> -1 
			srcoff = pageText.find("frame src", shifting)
			// NOTE(review): this retry repeats the very same search and cannot change
			// the result; a different pattern was perhaps intended - confirm.
			if srcoff = -1
				srcoff = pageText.find("frame src", shifting)
			/if	
		/if	
        //print "DEBUG $fname offset=$offset frame=$srcoff"

		// pick the nearest match; a position below 1 is treated as "no match"
		if offset < 1
			if srcoff < 1	break
			shifting = srcoff + 5
		else
			if srcoff < 1
				shifting = offset
			else
				if offset < srcoff
					shifting = offset
				else
					shifting = srcoff + 5
				/if
			/if
		/if	
		
		// now just past "href" (offset + 4) or past "frame src" (srcoff + 5 + 4)
		shifting + 4
		
		text link = extractLink(shifting)
		
		// the next search resumes after the extracted link
		shifting + link.length()
	
		if link = "" 			continue
		if link[0] = '#' 		continue	// anchors are omitted
		if link in siteList 	continue	// page already scanned, don't add it to list
		if link in pageLinks 	continue	// page not scanned, but link already added
		
		if not validExtension(link) and not isDirectory(link) continue
		
		if "../" in link
			display(" ../ such path is not valid, use absolute URL instead (in " + link+").")
			continue
		/if
		
		if hasProtocol(link)
			text host = getURL(link)
			if host = siteURL	// this is the same current host, add the link
				pageLinks.push(link)
			/if
			continue	// same host or not, go to next line
		/if
    
		// local link, without web address, 
        // may be a simple html file or a subdirectory with a filename
		

        // realLink keeps the link as written in the page; link becomes the
        // file actually tested on disk (directory default page, Windows form)
        text realLink = link
        if isDirectory(link) 
            link = findDefault(link)
        /if                  
 		if WINDOWS let link = setWindows(link)		

		if file_exists(link)
			pageLinks.push(realLink)					
		else
			if DEBUG or VERBOSE	let display("Broken link: $link not in " + getcwd())
		/if
	
  /while
  
  //pageLinks.display()
  
return pageLinks



// Output one XML element <tagname>value</tagname>, indented by five spaces.
// Nothing is output when value is empty.
void buildTag(text tagname, text value)
	if value <> ""
		text opening = "     <" + tagname + ">"
		text closing = "</" + tagname + ">"
		output(opening + value + closing)
	/if
return



# Build the XML site map from siteList.
# Outputs the XML declaration and the <urlset> opening tag, then one <url>
# element per entry of siteList, then the closing tag. The child tags other
# than <loc> depend on the PRIORITY, LASTMOD / LASTMODSHORT and FREQUENCY options.

void buildTheXmlFile()

	// prolog and root element with its schema declarations
	output("<?xml version=\"1.0\" encoding=\"UTF-8\"?>") 
	output("<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\" ")
	output("xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" ")
	output("xsi:schemaLocation=\"http://www.google.com/schemas/sitemap/0.84 ")
	output("http://www.google.com/schemas/sitemap/0.84/sitemap.xsd\">")

	for text pageUrl in siteList
		output("  <url>")

		// the location is always present, converted with textToUTF8
		buildTag("loc", textToUTF8(pageUrl))

		if PRIORITY = true let buildTag("priority", DEFAULT_PRIORITY)

		if (LASTMOD = true) or (LASTMODSHORT = true)
			// no tag when siteTime has no entry for this URL
			text stamp = siteTime[pageUrl]
			if stamp <> nil	let buildTag("lastmod", stamp)
		/if

		if FREQUENCY = true let buildTag("changefreq", DEFAULT_FREQUENCY)

		output("  </url>")
	/for

	output("</urlset>")

return


// Length of siteURL, assigned by buildTheHtmlTree.
// NOTE(review): no read of this variable is visible in this part of the file.
int levelOffset



# Output an HTML list item holding a link to a page.
# link: URL of the page; it is converted with textToUTF8 before use.
# The anchor text is the file-name part returned by Path.splitFile.

void buildLink(text link)

	text linkpath, name	

	output("<li>")
	link = textToUTF8(link)
	linkpath, name = Path.splitFile(link)
	// the href value is quoted: unquoted, a URL holding a space or a '>'
	// would end the attribute (or the tag) too early
	output("<a href=\"" + link + "\">" + name + "</a>")	
	output("</li>")
	
return

// Output the heading of a sub-directory section: a line break, the directory
// name inside <h4>, and the opening <ul> of its list of pages.
// The list is closed by closeSubdir.

void openSubdir(text dirname)
	array opening = ["<br>", "<h4>", dirname, "</h4>", "<ul>"]
	for text line in opening
		output(line)
	/for
return

// Output the closing </ul> of the list opened by openSubdir.
void closeSubdir()
	output("</ul>")
return



// Remove a base URL and the "/" that follows it from the start of a link.
// linkpath: full URL or path of a page.
// base:     prefix to remove, without trailing slash.
// Returns the part of linkpath after base + "/", or linkpath unchanged when
// linkpath is not located under base.
text removeBase(text linkpath, text base)
	int l = base.length()
	
	// l < length guarantees that linkpath[l] exists
	if l <  linkpath.length()
		// the base must end on a path boundary: without the "/" test,
		// base ".../a" would also match ".../ab/x.html"
		if linkpath[0 -- l] = base and linkpath[l] = "/" return linkpath[l + 1 .. ]
	/if	

return linkpath

# Tell whether the local part of a URL holds a directory, that is a "/"
# found when searching from offset 1 (a "/" at offset 0 is not counted).
# A name shorter than two characters never holds a directory.

boolean hasDir(text name)
	boolean found = false
	if name.length() >= 2
		if name.find("/", 1) <> nil let found = true
	/if
return found

# Return the first directory of a path: the text before the first "/".
# Returns "" when name is empty or when no "/" is found at offset 1 or beyond.

text getMainDir(text name)
	text dir = ""
	if name.length() <> 0
		int slash = name.find("/")
		if slash >= 1 let dir = name[ -- slash]
	/if
return dir
	


// Output the pages of siteList located under base, then each sub-directory.
// First pass: every page directly under base is output with buildLink and its
// entry in siteList is blanked so that it is not output again.
// Second pass: for each remaining page under base, the first directory of its
// path opens a section (openSubdir) and the function calls itself on
// base + "/" + that directory.
// base: the site URL plus the current path, without file name nor trailing slash.
// Pages that are not under base are left untouched for the caller.

void processDirs(text base)

	text page
	text name
	boolean empty = true
	text currdir = ""
	text thisdir
	
	# get all the files, display them
	
	for int i in 0 -- siteList.size()	
		if siteList[i] = "" continue
		page = siteList[i]
		name = removeBase(page, base)
		
		// removeBase returns the page unchanged when it is not under base:
		// such a page belongs to another directory and must not be listed here
		if name = page continue
		
		if not Path.hasDir(name)
			buildLink(page)
			siteList[i] = ""
		else
			empty = false
		/if	
	/for	

	if empty = true return

	// now only paths with sub-directories remain under base
	// for each distinct directory this function is called again

	for int i in 0 -- siteList.size()	
		if siteList[i] = "" continue
		page = siteList[i]
		name = removeBase(page, base)
		
		// same test as above: without it the full URL would be taken for a
		// relative path, giving a bogus "http:" directory and a recursion
		// on base + "/http:"
		if name = page continue
	
		thisdir = getMainDir(name)
		if thisdir <> currdir
			openSubdir(thisdir)
			processDirs(base + "/" + thisdir)
			closeSubdir()	
			currdir = thisdir
		/if	
	/for

return


# Output the top of the HTML tree: the site URL as <h1>, then the pages whose
# local part holds no directory, then the sub-directories through processDirs.
# Each page output here has its entry in siteList blanked.

void processRoot()

	text prefix, localPart
	text pageUrl

	// heading with the site address
	output("<br>")
	output("<h1>")
	output(siteURL)
	output("</h1>")

	// pages at the root of the site
	scan siteList
		pageUrl = siteList[]
		prefix, localPart = splitURL(pageUrl)
		if not Path.hasDir(localPart)
			buildLink(pageUrl)
			siteList[] = ""
		/if
	/scan

	// everything that is left lives in a sub-directory
	processDirs(siteURL)

return

# Build the HTML site map from siteList.
# Outputs the page skeleton (<html>, empty <head>, <body>), lets processRoot
# output the tree of links, then closes the body and the document.
# Also stores the length of siteURL into the global levelOffset.

void buildTheHtmlTree()

	array opening = ["<html>", "<head>", "</head>", "<body>"]
	for text line in opening
		output(line)
	/for

	levelOffset = siteURL.length()
	processRoot()

	array closing = ["</body>", "</html>"]
	for text line in closing
		output(line)
	/for

return


# Check whether the robots meta tag allows the page to be indexed.
# A page without a robots meta tag is indexable.
# fullpath: path of the page, passed to get_meta_tags.
# Returns false when the robots content holds "noindex" or "none"
# (case-insensitive search with stripos), true otherwise.

boolean isIndexable(text fullpath)
    dict tags = get_meta_tags(fullpath)
    if not array_key_exists('robots', tags) ? return true
    text robots = tags['robots']
    // stripos gives offset 0 when the directive starts the content (the usual
    // case, as in "noindex, nofollow"), and 0 is not distinguished from false
    // by a loose comparison. With one leading space any match is at offset >= 1.
    robots = " " + robots
	if stripos(robots, "noindex") <> false return false
	if stripos(robots, "none") <> false return false   
return true

# Scan a local directory and its sub-directories, and add to the site map
# (through addLink) every file that qualifies.
# locpath: path of the directory on disk.
# locdir:  path of the same directory relative to the site root ("/" for the root).
# A file is added when it has a valid extension, is at least MINFILESIZE in
# size and is indexable according to isIndexable. Names starting with a dot
# and names listed in exclusions are ignored.

void scanLocal(text locpath, text locdir)

	array content = scandir(locpath)
	boolean returned		// NOTE(review): never used in this function
	text url
	text src
	
	if content.empty() return
	
	// processing files
	
	// the files of a directory are skipped when "<locdir>/*" is in excludedDirs;
	// its sub-directories are still visited by the second loop below
	text exdir = Path.merge(locdir, "*")
	if exdir not in excludedDirs
	
	for text name in content

        // .htaccess and such files must be uploaded manually and are ignored

        if name[0] = "."     
            if VERBOSE ? display("$locdir/$name skipped")
            continue 
        /if
        
        if name in exclusions ? continue

        // src is the file on disk, url its address on the site
        src = Path.merge(locpath, name)
        url = Path.merge(locdir, name)
        url = Path.merge(siteURL, url)
		if VERBOSE ? display("Processing $src to $url")

		if filetype(src) = "file"
		    // every regular file is counted, whether it is added to the map or not
		    counter + 1
		    //text ext = Path.getExtension(src)
		    //if ext[1 ..] in extensions
		    if validExtension(src)
		        if filesize(src) < MINFILESIZE ? continue
		        if not isIndexable(src) ? continue
                real lmod = filetime(src)	
	            addLink(url, lmod) 
	        /if     
		/if
	/for
	
	else
	    display(locdir + " content skipped.")
	/if

	// processing subdirs
	
	// excludedDirs is matched against the path with the localRoot prefix cut off
	int sulen = localRoot.length()
	
	for text name in content
	    if name[0] = '.' continue
		src = Path.merge(locpath, name)		    
		//print "EXCLUDE? ", src[sulen ..] 
		if src[sulen ..] in excludedDirs 
		    display(src + " skipped.")
		    continue
		/if    

		if filetype(src) = "dir"
			scanLocal(src, Path.merge(locdir, name))
		/if
	/for	

return



# Main Simple Map function.
# Reads the options, scans the local repository, sorts the list of pages and
# writes the map in the format selected by mapType (XML, text or HTML).
# num, args: passed unchanged to options().
# Always returns 0.

int SimpleMap(int num, array args)
	
	options(num, args)
	print
	print version
	
	display("Web adress: " + siteURL)
	display("Local repository: " + localRoot)

	// a backslash in the local path switches on the Windows path handling
	if  "\\"  in localRoot ? WINDOWS = true

	scanLocal(localRoot, "/")
	
	display("")
	siteList.sort()

	// localMap is the file written on disk, mapname the address of the map on the site
	if not hasProtocol(mapname)
	    localMap = Path.merge(localRoot, mapname)
	    mapname = Path.merge(siteURL, mapname)
	else
	    localMap = str_replace(siteURL, localRoot, mapname)
	/if	

	if VERBOSE = true ? display("Creating " + localMap)

	// the text map is stored directly from siteList;
	// the XML and HTML builders fill siteMap through output()
	if mapType
	= OUT_XML:		buildTheXmlFile()
	= OUT_TEXT:		siteList.store(localMap) 		// a text file
	= OUT_HTML:		buildTheHtmlTree()
	/if

	// write the lines accumulated in siteMap, one per line
	// NOTE(review): the result of fopen is not tested before writing
	if mapType <> OUT_TEXT
	    file f = fopen(localMap, "w")
	    for text line in siteMap
	        f.write(line + "\n")
	    /for
	    f.close()
		//if file_put_contents(localMap, siteMap) < 0 ? display("Map not created...")
	/if

	if GRAPHICAL ? logfile.store(logname)	
	
	display(text(counter) + " files found.")
	display(mapname + " will have  " + text(siteList.size()) + " links.")

return 0

