#!/net/u/1/f/fdc/kermit/wermit +
#
# ksitemap - sitemap.xml file constructor
#
.version = 1.03                         # Version of this script
.testingdest = ~/tmp/                   # Result directory for testing - change if necessary
#
# Builds a sitemap.xml file for a website, with Google image extensions.
# Requires: C-Kermit 9.0 Alpha.03 or later.
# This file should be stored with execute permission.
# Top line must give full pathname of C-Kermit 9.0 executable.
#
# Documentation: http://www.kermit.edu/ksitemap.html
#
# Optional command-line argument: path of filelist file. If the argument is
# given the web directory is assumed to be the same directory where the
# filelist is. If not given, a file named "filelist" in the current directory
# is assumed if it exists. For details see the documentation or read below.
#
# Data file summary... ON TOP:
#  encoding=value -- The character set in which the filelist is encoded
#  home=value -- URL of website home directory (required)
#  geo=value  -- String: Default location for images (optional)
#  lic=value  -- Filename of page containing copyright/license info (optional)
#  .xxx=value -- Define a macro named xxx (optional)
#
# REST OF FILE: Information for each page to be included in sitemap:
#  url=value[=value] -- Filename of web page (required, second value optional)
#  pri=value  -- Priority for indexing (0.0 to 1.0) (optional)
#
# AND FOR EACH IMAGE IN A URL THAT YOU WANT TO BE INDEXED:
#  img=value  -- Filename of an image used in this web page (optional)
#  cap=value  -- Caption for this image (optional)
#  title=value - Title for this image (optional)
#  (You can list as many images as you like for each URL, within reason)
#
# Version history:
#  1.00 Wed Dec  8 10:51:32 2010: Initial version
#  1.01 Sat Dec 11 19:02:20 2010: Fix bug in change frequency calculation
#  1.02 Tue Dec 28 15:18:15 2010: Allow for redirects
#  1.03 Thu Dec 30 09:51:46 2010: Add macro capability
#
# Author: Frank da Cruz, December 2010.
#
# Startup: check the C-Kermit version, define the fatal-error macro, and work
# out (a) the web directory, (b) the name of the file-list file and (c) the
# directory that receives the finished sitemap.xml.  The web directory comes
# from the command-line argument if given, else from $KSITEMAPDIR, else it is
# the current directory.  On exit from this section the current directory is
# the web directory and the file list is known to exist.
#
if llt \v(version) 900299 exit 1 "C-Kermit 9.0 or later required"
if def \$(DEBUG) set debug message on   # DEBUG env variable requests debugging

.unix = 0                               # For "if unix ..."
if equ "\v(system)" "UNIX" .unix = 1
.usedenv = 0                            # Set to 1 if $KSITEMAPDIR was used

def errexit {                           # Fatal error macro
    echo \v(timestamp) \v(dir) sitemap.ksc:
    echo Error: \%*
    exit 1
}
if def \%1 {                            # Command-line argument if any
    .webdirectory := \fdirname(\%1)     # is pathname of file list file.
    if def webdirectory {               # If it includes a directory part
        if not directory \m(webdirectory) { # Check it
            errexit NOT A DIRECTORY: \m(webdirectory)
        }
        cd \m(webdirectory)             # and CD to it
        if fail errexit CD FAILED: \m(webdirectory)
    }
    .filelist := \fbasename(\%1)        # And this is the name of the file
    if not def filelist .filelist = filelist
} else if def \$(KSITEMAPDIR) {         # Env variable KSITEMAPDIR exists
    .webdirectory := \$(KSITEMAPDIR)
    if not directory \m(webdirectory) { # Check it
        errexit "NOT A DIRECTORY: \m(webdirectory) [From $KSITEMAPDIR]"
    }
    cd \m(webdirectory)                 # and CD to it
    if fail errexit "CD \m(webdirectory) [From $KSITEMAPDIR]"
    .filelist = filelist                # And the file-list file is filelist
    .usedenv = 1
} else {                                # Otherwise
    .webdirectory := \v(dir)            # assume the current directory
    .filelist = filelist                # And default the filename to filelist
}
# The directory name is used below (and when installing sitemap.xml) as a
# prefix that is concatenated directly with a filename, so in Unix it must
# end with a slash.  An empty value (argument with no directory part) is
# left empty so that it keeps meaning "the current directory".
if def webdirectory {
    if unix if neq "\fright(\m(webdirectory),1)" "/" .webdirectory := \m(webdirectory)/
}
.resultdirectory := \m(webdirectory)    # Where to put sitemap.xml

if debug {                              # Debugging
    .resultdirectory := \m(testingdest) # Write to the testing directory instead
    echo DEBUGGING...
    echo \fbasename(\%0) V\m(version)
    if usedenv echo Parameters obtained from $KSITEMAPDIR environment variable:
    show mac webdirectory filelist
    echo Writing result to \m(resultdirectory)sitemap.xml
    echo current directory is \v(dir)
}
if not exist \m(filelist) {             # Check that the file list file exists
    errexit FILE LIST NOT FOUND: \m(webdirectory)\m(filelist)
}

# Define some macros...
# Output format: the sitemap follows the sitemaps.org 0.9 protocol, in which
# each <url> holds <loc>, <lastmod>, <changefreq>, <priority> in that order,
# followed by zero or more <image:image> elements from Google's image
# extension.  That ordering is why the priority is written either just
# before the first image of a URL or, if the URL has no images, in FINISHURL.

define FERREXIT {                       # Fatal error reading file list file
    # \%1 = message.  Shows the file-list line number and the line itself.
    exit 1 [\flpad(\m(lineno),3,0)] \%1 [\m(line)]
}
define FERRWARN {                       # Warning about a file list line
    # \%1 = message.  Same format as FERREXIT but execution continues.
    echo [\flpad(\m(lineno),3,0)] \%1 [\m(line)]
}
define FINISHIMAGE {                    # Macro to write Image epilog
    if inimg {                          # If we were doing an image...
        if def geo {                    # if location defined
            .\%9 := \m(geo)
            fwrite /line \%o "    <image:geo_location>\%9</image:geo_location>" # add it.
        }
        if def lic {                    # If license URL defined
            .\%9 := \m(home)\m(lic)
            fwrite /line \%o "    <image:license>\%9</image:license>" # Add it
        }
        fwrite /line \%o "  </image:image>" # Close image clause
        .inimg = 0                      # No longer doing an image
    }
}
def FINISHURL {                         # Macro to write URL epilog
    if not inurl end 0                  # Nothing to finish
    if > imginurl 0 {                   # Were there some images in this URL?
        finishimage                     # Finish current image
    } else {                            # No images - write priority now
        fwrite /line \%o "  <priority>\m(priority)</priority>"
    }
    fwrite /line \%o </url>             # End of this URL
    .inurl = 0                          # No longer doing a URL
}

# Begin execution...

fopen /read \%c \m(filelist)            # Open the file-list file
if fail errexit "\v(lastcommand)"       # Make sure it is open
fopen /write \%o sitemap.tmp            # Open the temporary sitemap file.
if fail errexit "OPEN /WRITE FAILED"    # Check

# Write XML prolog to sitemap file...

fwrite /line \%o <?xml version="1.0" encoding="UTF-8"?> # First line
if fail errexit "WRITE FAILED: sitemap.tmp" # Check that FWRITE succeeded

# If we get here all writes should succeed - continue the XML prolog...
# (Braces rather than doublequotes because the text contains doublequotes.)

fwrite /line \%o {<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">}

.tags = |url|pri|img|cap|title|home|geo|lic|encoding| # Valid tags
.urls = 0                               # Initialize URL counter
.imgs = 0                               # Image counter
.imginurl = 0                           # Image within URL counter
.inurl = 0                              # State flag: doing a URL
.inimg = 0                              # State flag: doing an image (in a URL)
.lineno = 0                             # File list file line number
.doutf8 = 0                             # Convert encoding to UTF-8
.mjd := \fmjd(today)                    # Today's date MJD format
.encoding = UTF-8                       # Default encoding for file-list file
.home =                                 # Web home directory (none yet)
.globalgeo =                            # Global geographic location (ditto)
.globallic =                            # Global license page (ditto)

while true {                            # Loop to read and process file list
    fread /line /trim \%c line          # Read a line and trim trailing blanks
    if fail break                       # Fail = end of file = all done
    increment lineno                    # Count this line
    if not defined line continue        # If empty line read the next one
    .line := \fltrim(\m(line))          # Trim leading blanks
    if equ "\s(line[1:1])" "#" continue # If it's a comment line keep reading

    if equ "\v(version)" "900299" {     # If C-Kermit is 9.0.299
        if not lgt "\v(test)" "Alpha.09" { # Alpha.09 or earlier...
            # Work around "Quoting Hell" bug in CSV splitting
            if \findex(\\,\m(line)) .line := \freplace(\m(line),\\,\\\\)
        }
    }
    .\%9 := \fsplit(\m(line),&x,=,CSV)  # Split line on '='
    if < \%9 2 { ferrwarn "TAG WITH NO VALUE", continue }
    undef s1 s2 s3
    .s1 := \&x[1]                       # Tag
    .s2 := \fcontents(\&x[2])           # Value
    if def \&x[3] .s3 := \fcontents(\&x[3]) # Optional second value (redirect)

    if doutf8 {                         # Converting character set?
        if not equ "7BIT" "\fstringtype(\m(s2))" { # Need to convert this one?
            .s2 := \fcvtcset(\m(s2),\m(encoding),utf-8) # Convert to UTF-8
        }
    }
    # A tag of the form ".name" defines a macro.  The length test uses the
    # tag's value, \m(s1), so that a lone "." is not taken as a definition
    # of a macro with an empty name.
    if ( > \flen(\m(s1)) 1 && equ "\s(s1[1:1])" "." ) { # Macro definition
        _asg \s(s1[2]) \m(s2)           # See Using C-Kermit p.457
        if debug {
            message MACRO DEFINITION
            show mac \s(s1[2])
        }
        continue
    }
    if not \findex(|\m(s1)|,\m(tags)) { # Preverify tag
        ferrwarn "UNKNOWN TAG '\m(s1)'- SKIPPING"
        continue
    }
    if match "\m(s2)" "*\\m(*)*" {      # Check for macro reference in s2
        .s2 := \frecurse(\m(s2))        # Replace macro with its expansion
        if debug { echo s2 MACRO EXPANSION, show mac s2 }
    }
    if match "\m(s3)" "*\\m(*)*" {      # Check for macro reference in s3
        .s3 := \frecurse(\m(s3))        # Replace macro with its expansion
        if debug { echo s3 MACRO EXPANSION, show mac s3 }
    }

    # Handle each kind of tag...

    if equ "\m(s1)" "encoding" {        # Encoding of filelist file
        .encoding := \m(s2)             # Save it here
        message Encoding=\m(encoding)
        if not equ "\m(s2)" "UTF-8" .doutf8 = 1 # Must convert to UTF-8
        continue
    }
    if equ "\m(s1)" "home" {            # Website home directory
        # In Unix supply trailing slash if necessary
        if unix if neq "\fright(\m(s2),1)" "/" .s2 := \m(s2)/
        .home := \m(s2)                 # to be used in building URLs
        continue
    }
    if equ "\m(s1)" "geo" {             # Image geographic location
        if == 0 urls {                  # If geo given at the head of filelist
            .globalgeo := \m(s2)        # make it the global default value
        } else {
            .geo := \m(s2)              # set the local value
        }
        continue
    }
    if equ "\m(s1)" "lic" {             # Website license page
        if == 0 urls {                  # Same as for geo
            .globallic := \m(s2)
        } else {
            .lic := \m(s2)
        }
        continue
    }
    if equ "\m(s1)" "url" {             # Web page URL
        if not def home errexit "URL BEFORE HOME DEFINED"
        if inurl do finishurl           # Finish previous URL if any
        .imginurl = 0                   # How many images in this URL
        .priority = 0.5                 # Default page priority (0.0-1.0)
        .name := \m(s2)                 # Filename of this web page
        .redirect := \m(s3)             # Name (if any) it is redirected to
        if exist \m(s3) {               # If redirect is indicated
            .redirect := \m(name)       # swap the names
            .name := \m(s3)
        }
        if not exist \m(name) { ferrwarn "NOT EXIST \m(name)", continue }
        if not readable \m(name) { ferrwarn "NOT READABLE \m(name)" }
        .inurl = 1                      # We are doing a URL now
        message \m(name)...             # List the name if debugging
        incr urls                       # Count the URL

        fwrite /line \%o <url>          # Start the XML URL section

        # Add URL of this file to sitemap...

        if equ "\m(name)" "index.html" { # Special for home page
            fwrite /line \%o "  <loc>\m(home)</loc>"
        } else if def redirect {        # Special for redirected names
            fwrite /line \%o "  <loc>\m(home)\m(redirect)</loc>"
        } else {                        # Normal case
            fwrite /line \%o "  <loc>\m(home)\m(name)</loc>"
        }
        .s := \fcvtd(\fdate(\m(name)),3) # Modification date of file
        .s := \s(s[1:4])-\s(s[5:2])-\s(s[7:2]) # Just the date is enough
        fwrite /line \%o "  <lastmod>\m(s)</lastmod>" # Add to sitemap

        .\%x := \fmjd(\fdate(\m(name))) # Modification date as MJD
        .\%y ::= \m(mjd) - \%x          # How many days ago (mjd = today)
        .c = yearly                     # Default change frequency is yearly
        if < \%y 8 .c = daily           # If modified in last 7 days daily
        else if < \%y 30 .c = weekly    # or in last 30 days say weekly
        else if < \%y 100 .c = monthly  # or in last 100 days say monthly
        fwrite /line \%o "  <changefreq>\m(c)</changefreq>" # Add to sitemap
        continue
    }
    if equ "\m(s1)" "pri" {             # Page priority
        if not inurl ferrexit "PRIORITY NOT IN URL"
        if not float \m(s2) ferrexit "PRIORITY NOT NUMERIC"
        if ( > \m(s2) 1.0 || < \m(s2) 0.0 ) ferrexit "PRIORITY OUT OF RANGE"
        .priority := \m(s2)             # Save it for epilog (see finishurl)
        continue
    }
    if equ "\m(s1)" "img" {             # Image
        if not inurl ferrexit "img not in url" # Check before writing anything
        finishimage                     # Finish previous image if any
        if not exist \m(s2) { ferrwarn "IMG NOT EXIST: \m(s2)", continue }
        if not readable \m(s2) { ferrwarn "IMG NOT READABLE: \m(s2)", continue }
        # The image is acceptable.  Only now write the page priority, so that
        # a rejected first image cannot cause it to be written a second time
        # (by the next image or by finishurl, which both test imginurl).
        if == 0 imginurl {              # First image for this URL?
            fwrite /line \%o "  <priority>\m(priority)</priority>"
        }
        .geo := \m(globalgeo)           # If a global one defined use it
        .lic := \m(globallic)           # wherever a local one is not given.
        increment imgs                  # Count this image
        incr imginurl                   # Count image for this URL
        fwrite /line \%o "  <image:image>" # Start image clause
        fwrite /line \%o "    <image:loc>\m(home)\m(s2)</image:loc>" # Put URL
        .inimg = 1                      # We are doing an image now
        continue
    }
    if equ "\m(s1)" "cap" {             # Image caption
        if not inimg ferrexit "CAP WITH NO IMG"
        if def s2 {                     # If the caption is not empty add it
            fwrite /line \%o "    <image:caption>\m(s2)</image:caption>"
        }
        continue
    }
    if equ "\m(s1)" "title" {           # Image title
        if not inimg ferrexit "TITLE WITH NO IMG"
        if def s2 {                     # If the title is not empty add it
            fwrite /line \%o "    <image:title>\m(s2)</image:title>"
        }
    }
}
fclose \%c                              # Done with the file-list file
do finishurl                            # End of file list - finish last URL
fwrite /line \%o </urlset>              # Finish the sitemap
fclose \%o                              # Close the temporary sitemap file

if exist sitemap.xml {                  # Rotate previous ones
    if exist sitemap.ayer copy /preserve sitemap.ayer sitemap.ante
    if fail message "FAILURE TO ROTATE OLD SITEMAP[1]"
    copy /preserve sitemap.xml sitemap.ayer
    if fail message "FAILURE TO ROTATE OLD SITEMAP[2]"
}
rename sitemap.tmp \m(resultdirectory)sitemap.xml # Install the new sitemap
if fail errexit "FAILURE TO INSTALL NEW SITEMAP"

if unix {                               # Unix...
    chmod 644 \m(resultdirectory)sitemap.xml # Make it world readable
    if fail errexit "CHMOD FAILURE - \m(resultdirectory)sitemap.xml"
}
# When run in a cron job this message arrives in email
exit 0 "[\v(timestamp)] sitemap.ksc: OK - URLs: \m(urls); IMGs: \m(imgs)"

; Local Variables:
; comment-column:40
; comment-start:"# "
; End: