#!/net/u/1/f/fdc/kermit/wermit +
#
# ksitemap - sitemap.xml file constructor
#
define version 1.03        # Version of this script
define testingdest ~/tmp/  # Result directory for testing (change if necessary)
#
# Builds a sitemap.xml file for a website, with Google image extensions.
# Requires: C-Kermit 9.0 Alpha.03 or later.
# This file should be stored with execute permission.
# Top line must give full pathname of C-Kermit 9.0 executable.
#
# Documentation: http://www.kermit.edu/ksitemap.html
#
# Optional command-line argument: path of filelist file. If the argument is
# given the web directory is assumed to be the same directory where the
# filelist is. If not given, a file named "filelist" in the current directory
# is assumed if it exists. For details see the documentation or read below.
#
# Data file summary... ON TOP:
# encoding=value -- The character set in which the filelist is encoded
# home=value -- URL of website home directory (required)
# geo=value -- String: Default location for images (optional)
# lic=value -- Filename of page containing copyright/license info (optional)
# .xxx=value -- Define a macro named xxx (optional)
#
# REST OF FILE: Information for each page to be included in sitemap:
# url=value[=value] -- Filename of web page (required, second value optional)
# pri=value -- Priority for indexing (0.0 to 1.0) (optional)
#
# AND FOR EACH IMAGE IN A URL THAT YOU WANT TO BE INDEXED:
# img=value -- Filename of an image used in this web page (optional)
# cap=value -- Caption for this image (optional)
# title=value - Title for this image (optional)
# (You can list as many images as you like for each URL, within reason)
#
# Version history:
# 1.00 Wed Dec 8 10:51:32 2010: Initial version
# 1.01 Sat Dec 11 19:02:20 2010: Fix bug in change frequency calculation
# 1.02 Tue Dec 28 15:18:15 2010: Allow for redirects
# 1.03 Thu Dec 30 09:51:46 2010: Add macro capability
#
# Author: Frank da Cruz, December 2010.
#
# Version gate: \v(version) is compared lexically against 900299 and the
# script exits with status 1 on anything lower.
if llt \v(version) 900299 exit 1 "C-Kermit 9.0 or later required"
# Defining the DEBUG environment variable switches on debug messages.
if defined \$(DEBUG) set debug message on
# Platform flag consulted by later "if unix" tests: 1 on UNIX, otherwise 0.
define unix 0
if equal "\v(system)" "UNIX" define unix 1
# Set to 1 further down when the web directory comes from KSITEMAPDIR.
define usedenv 0
# errexit: fatal error macro.
# Echoes a line holding the timestamp, the current directory and the script
# label, then echoes "Error:" followed by all of the macro arguments, and
# finally exits with status 1.
define errexit {
    echo \v(timestamp) \v(dir) sitemap.ksc:
    echo Error: \%*
    exit 1
}
# Work out the web directory and the name of the file-list file.
# Three sources are tried in order:
#   1. the command-line argument (pathname of the file-list file),
#   2. the KSITEMAPDIR environment variable (file list is named filelist),
#   3. the current directory (file list is named filelist).
if defined \%1 {
    # Case 1: split the argument into directory part and file name.
    assign webdirectory \fdirname(\%1)
    if defined webdirectory {
        # A directory part was given: validate it, then make it current.
        if not directory \m(webdirectory) {
            errexit NOT A DIRECTORY: \m(webdirectory)
        }
        cd \m(webdirectory)
        if failure errexit CD FAILED: \m(webdirectory)
    }
    assign filelist \fbasename(\%1)
    # Fall back to the default name if the argument had no file part.
    if not defined filelist define filelist filelist
} else if defined \$(KSITEMAPDIR) {
    # Case 2: directory taken from the environment.
    assign webdirectory \$(KSITEMAPDIR)
    if not directory \m(webdirectory) {
        errexit "NOT A DIRECTORY: \m(webdirectory) [From $KSITEMAPDIR]"
    }
    cd \m(webdirectory)
    if failure errexit "CD \m(webdirectory) [From $KSITEMAPDIR]"
    define filelist filelist
    define usedenv 1
} else {
    # Case 3: no argument and no environment variable.
    assign webdirectory \v(dir)
    define filelist filelist
}
# Decide where sitemap.xml will be installed and verify the input exists.
# Normally the result goes into the web directory; when debugging is on it
# is redirected to the testing directory and the chosen parameters are shown.
.resultdirectory := \m(webdirectory)        # Where to put sitemap.xml
if debug {                                  # Debugging
    .resultdirectory := \m(testingdest)
    echo DEBUGGING...
    echo \fbasename(\%0) V\m(version)
    if usedenv echo Parameters obtained from $KSITEMAPDIR environment variable:
    show mac webdirectory filelist
    # Report the file that is actually installed at the end of the script:
    # sitemap.xml in the result directory, not the file-list name.
    echo Writing result to \m(resultdirectory)sitemap.xml
    echo current directory is \v(dir)
}
if not exist \m(filelist) {                 # Check that the file list file exists
    errexit FILE LIST NOT FOUND: \m(webdirectory)\m(filelist)
}
# Define some macros...
# FERREXIT: fatal error while reading the file list.
# Exits with status 1; the exit text is the zero-padded three-digit line
# number in brackets, the first macro argument, and the offending line in
# brackets.
define FERREXIT exit 1 [\flpad(\m(lineno),3,0)] \%1 [\m(line)]
# FERRWARN: non-fatal warning about a file list line.
# Echoes the zero-padded three-digit line number in brackets, the first
# macro argument, and the offending line in brackets; processing continues.
define FERRWARN echo [\flpad(\m(lineno),3,0)] \%1 [\m(line)]
# FINISHIMAGE: write the epilog of the image clause currently in progress.
# Does nothing unless the inimg flag is set.  Writes the geo value and the
# license URL (home + lic) when those macros are defined, then the closing
# line of the image clause, and clears inimg.  Output goes to channel \%o.
define FINISHIMAGE {
    if inimg {
        if defined geo {
            # Location for this image
            assign \%9 \m(geo)
            fwrite /line \%o " \%9"
        }
        if defined lic {
            # License page, expressed relative to the site home
            assign \%9 \m(home)\m(lic)
            fwrite /line \%o " \%9"
        }
        # Close the image clause and leave image state
        fwrite /line \%o " "
        define inimg 0
    }
}
# FINISHURL: write the epilog of the URL entry currently in progress.
# Returns immediately with status 0 when no URL is open.  If the URL had no
# images its priority is written here; otherwise the priority was already
# written ahead of the first image, so only the pending image is finished.
# Then the end-of-URL line is written and the inurl flag is cleared.
define FINISHURL {
    if not inurl end 0
    if not > imginurl 0 {
        # No images were listed: the priority has not been written yet
        fwrite /line \%o " \m(priority)"
    } else {
        # At least one image: finish whichever image is still open
        finishimage
    }
    # End of this URL
    fwrite /line \%o
    define inurl 0
}
# Begin execution: open the channels, write the prolog, initialise state.
# \%c is the read channel for the file list and \%o is the write channel for
# the temporary sitemap file sitemap.tmp in the current directory.
fopen /read \%c \m(filelist)
if failure errexit "\v(lastcommand)"
fopen /write \%o sitemap.tmp
if failure errexit "OPEN /WRITE FAILED"
# First line of the prolog.  Only this first write is checked; later writes
# are expected to succeed once it has.
# NOTE(review): no text argument is present on this FWRITE, so it emits an
# empty line; the intended prolog text may be missing. TODO confirm.
fwrite /line \%o
if failure errexit "WRITE FAILED: sitemap.tmp"
# Continuation of the prolog.
# NOTE(review): as written this emits a lone closing brace; the intended
# text is not visible here. TODO confirm.
fwrite /line \%o }
# Tags accepted in the file list, each delimited by vertical bars.
define tags |url|pri|img|cap|title|home|geo|lic|encoding|
# Counters
define urls 0                               # URLs written so far
define imgs 0                               # Images written so far
define imginurl 0                           # Images within the current URL
define lineno 0                             # Current file list line number
# State flags
define inurl 0                              # 1 while a URL entry is open
define inimg 0                              # 1 while an image clause is open
define doutf8 0                             # 1 when values must be converted
# Today's date as a Modified Julian Date
assign mjd \fmjd(today)
# Defaults that the head of the file list may override
define encoding UTF-8                       # Encoding of the file list
define home                                 # Web home directory (none yet)
define globalgeo                            # Global image location (none yet)
define globallic                            # Global license page (none yet)
# Main loop: read the file list line by line and emit sitemap entries.
# Each non-comment line is split on '=' into a tag (s1), a value (s2) and an
# optional second value (s3).  A tag that begins with a period defines a
# macro; every other tag must appear in the tags list or the line is skipped
# with a warning.  The loop ends when FREAD fails, which is taken to mean
# end of file.  Output goes to channel \%o; input comes from channel \%c.
while true {                                # Loop to read and process file list
    fread /line /trim \%c line              # Read a line and trim trailing blanks
    if fail break                           # Fail = end of file = all done
    increment lineno                        # Count this line
    if not defined line continue            # If empty line read the next one
    .line := \fltrim(\m(line))              # Trim leading blanks
    if equ "\s(line[1:1])" "#" continue     # If it's a comment line keep reading
    if equ "\v(version)" "900299" {         # If C-Kermit is 9.0.299
        if not lgt "\v(test)" "Alpha.09" {  # Alpha.09 or earlier...
            # Work around "Quoting Hell" bug in CSV splitting
            if \findex(\\,\m(line)) .line := \freplace(\m(line),\\,\\\\)
        }
    }
    .\%9 := \fsplit(\m(line),&x,=,CSV)      # Split line on '='
    if < \%9 2 { ferrwarn "TAG WITH NO VALUE", continue }
    undef s1 s2 s3
    .s1 := \&x[1]                           # Tag
    .s2 := \fcontents(\&x[2])               # Value
    if def \&x[3] .s3 := \fcontents(\&x[3]) # Optional second value (redirect)
    if doutf8 {                             # Converting character set?
        if not equ "7BIT" "\fstringtype(\m(s2))" { # Need to convert this one?
            .s2 := \fcvtcset(\m(s2),\m(encoding),utf-8) # Convert to UTF-8
        }
    }
    # Macro definition: a period followed by at least one more character.
    # The length test must measure the contents of s1, not the literal
    # text "s1", otherwise a bare period would be taken as a macro name.
    if ( > \flen(\m(s1)) 1 && equ "\s(s1[1:1])" "." ) {
        _asg \s(s1[2]) \m(s2)               # See Using C-Kermit p.457
        if debug {
            message MACRO DEFINITION
            show mac \s(s1[2])
        }
        continue
    }
    if not \findex(|\m(s1)|,\m(tags)) {     # Preverify tag
        ferrwarn "UNKNOWN TAG '\m(s1)'- SKIPPING"
        continue
    }
    if match "\m(s2)" "*\\m(*)*" {          # Check for macro reference in s2
        .s2 := \frecurse(\m(s2))            # Replace macro with its expansion
        if debug { echo s2 MACRO EXPANSION, show mac s2 }
    }
    if match "\m(s3)" "*\\m(*)*" {          # Check for macro reference in s3
        .s3 := \frecurse(\m(s3))
        if debug { echo s3 MACRO EXPANSION, show mac s3 }
    }
    # Handle each kind of tag...
    if equ "\m(s1)" "encoding" {            # Encoding of filelist file
        .encoding := \m(s2)                 # Save it here
        message Encoding=\m(encoding)
        if not equ "\m(s2)" "UTF-8" .doutf8 = 1 # Must convert to UTF-8
        continue
    }
    if equ "\m(s1)" "home" {                # Website home directory
        # In Unix supply trailing slash if necessary
        if unix if neq "\fright(\m(s2),1)" "/" .s2 := \m(s2)/
        .home := \m(s2)                     # to be used in building URLs
        continue
    }
    if equ "\m(s1)" "geo" {                 # Image geographic location
        if == 0 urls {                      # If geo given at the head of filelist
            .globalgeo := \m(s2)            # make it the global default value
        } else {
            .geo := \m(s2)                  # set the local value
        }
        continue
    }
    if equ "\m(s1)" "lic" {                 # Website license page
        if == 0 urls {                      # Same as for geo
            .globallic := \m(s2)
        } else {
            .lic := \m(s2)
        }
        continue
    }
    if equ "\m(s1)" "url" {                 # Web page URL
        if not def home errexit "URL BEFORE HOME DEFINED"
        if inurl do finishurl               # Finish previous URL if any
        .imginurl = 0                       # How many images in this URL
        .priority = 0.5                     # Default page priority (0.0-1.0)
        .name := \m(s2)                     # Filename of this web page
        .redirect := \m(s3)                 # Name (if any) it is redirected to
        if exist \m(s3) {                   # If redirect is indicated
            .redirect := \m(name)           # swap the names
            .name := \m(s3)
        }
        if not exist \m(name) { ferrwarn "NOT EXIST \m(name)", continue }
        if not readable \m(name) { ferrwarn "NOT READABLE \m(name)" }
        .inurl = 1                          # We are doing a URL now
        message \m(name)...                 # List the name if debugging
        incr urls                           # Count the URL
        fwrite /line \%o                    # Start the XML URL section
        # Add URL of this file to sitemap...
        if eq "\m(name)" "index.html" {     # Special for home page
            fwrite /line \%o " \m(home)"
        } else if def redirect {            # Special for redirected names
            fwrite /line \%o " \m(home)\m(redirect)"
        } else {                            # Normal case
            fwrite /line \%o " \m(home)\m(name)"
        }
        .s := \fcvtd(\fdate(\m(name)),3)    # Modification date of file
        .s := \s(s[1:4])-\s(s[5:2])-\s(s[7:2]) # Just the date is enough
        fwrite /line \%o " \m(s)"           # Add to sitemap
        .\%x := \fmjd(\fdate(\m(name)))     # Modification date as MJD
        # Age in days, using today's MJD stored in mjd before the loop
        .\%y ::= \m(mjd) - \%x
        .c = yearly                         # Default change frequency is yearly
        if < \%y 8 .c = daily               # If modified in last 7 days daily
        else if < \%y 30 .c = weekly        # or in last 30 days say weekly
        else if < \%y 100 .c = monthly      # or in last 100 days say monthly
        fwrite /line \%o " \m(c)"           # Add to sitemap
        continue
    }
    if equ "\m(s1)" "pri" {                 # Page priority
        if not inurl ferrexit "PRIORITY NOT IN URL"
        if not float \m(s2) ferrexit "PRIORITY NOT NUMERIC"
        if ( > \m(s2) 1.0 || < \m(s2) 0.0 ) ferrexit "PRIORITY OUT OF RANGE"
        .priority := \m(s2)                 # Save it for epilog (see finishurl)
        continue
    }
    if equ "\m(s1)" "img" {                 # Image
        # Reject an image outside a URL before anything is written
        if not inurl ferrexit "img not in url"
        finishimage                         # Finish previous image if any
        .geo := \m(globalgeo)               # If a global one defined use it
        .lic := \m(globallic)               # wherever a local one is not given.
        # Validate the image file before writing the page priority.  The
        # priority is written once, ahead of the first image that is really
        # emitted; a skipped image must not cause it to be written, because
        # imginurl stays at 0 and it would then be written a second time.
        if not exist \m(s2) { ferrwarn "IMG NOT EXIST: \m(s2)", continue }
        if not readable \m(s2) { ferrwarn "IMG NOT READABLE: \m(s2)", continue }
        if == 0 imginurl {                  # First image for this URL?
            # Add page priority before listing any images
            fwrite /line \%o " \m(priority)"
        }
        increment imgs                      # Count this image
        incr imginurl                       # Count image for this URL
        fwrite /line \%o " "                # Start image clause
        fwrite /line \%o " \m(home)\m(s2)"  # Put URL
        .inimg = 1                          # We are doing an image now
        continue
    }
    if equ "\m(s1)" "cap" {                 # Image caption
        if not inimg ferrexit "CAP WITH NO IMG"
        if def s2 {                         # If the caption is not empty add it
            fwrite /line \%o " \m(s2)"
        }
        continue
    }
    if equ "\m(s1)" "title" {               # Image title
        if not inimg ferrexit "TITLE WITH NO IMG"
        if def s2 {                         # If the title is not empty add it
            fwrite /line \%o " \m(s2)"
        }
    }
}
# Finalisation: close the last URL, finish and close the temporary file,
# rotate any previous sitemaps, install the new one and report the totals.
do finishurl                                # End of file list - finish last URL
fwrite /line \%o                            # Finish the sitemap
fclose \%o                                  # Close the temporary sitemap file
fclose \%c                                  # Close the file-list file too
if exist sitemap.xml {                      # Rotate previous ones
    # Rotation failures are only reported as debug messages; they do not
    # stop the new sitemap from being installed.
    if exist sitemap.ayer {
        copy /preserve sitemap.ayer sitemap.ante
        # Checked inside this block so that the status tested is always
        # that of the COPY just above
        if fail message "FAILURE TO ROTATE OLD SITEMAP[1]"
    }
    copy /preserve sitemap.xml sitemap.ayer
    if fail message "FAILURE TO ROTATE OLD SITEMAP[2]"
}
rename sitemap.tmp \m(resultdirectory)sitemap.xml # Install the new sitemap
if fail errexit "FAILURE TO INSTALL NEW SITEMAP"
if unix {                                   # Unix...
    chmod 644 \m(resultdirectory)sitemap.xml # Make it world readable
    if fail errexit "CHMOD FAILURE - \m(resultdirectory)sitemap.xml"
}
# When run in a cron job this message arrives in email
exit 0 "[\v(timestamp)] sitemap.ksc: OK - URLs: \m(urls); IMGs: \m(imgs)"
; Local Variables:
; comment-column:40
; comment-start:"# "
; End: