#!/usr/local/bin/wermit +
#
# THE FIRST LINE SHOULD INDICATE THE ACTUAL PATH OF C-KERMIT 9.0
# and the script should be given execute permission.
#
# Usage: weblog filename
#
# Reads TSV-format web log for a bilingual English-Spanish website.
# Extracts all Google searches.
# Converts the coded search string into plain text.
# If charset of search string is declared, converts it to ISO-8859-1.
# If charset is not declared, it is tested for UTF-8 and converted.
# Normalized search strings are tabulated using associative arrays.
#
# Illustrates:
# . new \fsplit behavior - CK9.0
# . new \fsqueeze() function
# . message and if debug commands - New in CK9.0 Alpha.03
# . decodehex function - New in CK9.0 Alpha.05
# . stringtype function - New in CK9.0 Alpha.05
# . use of MIME charset names - New in CK9.0 Alpha.05
# . associative arrays (not new but little known)
#
# Frank da Cruz, Columbia University, April 2010

# The log file name is the one required argument.
if not defined \%1 {
    exit 1 "usage: weblog logfilename"
}

# Open the log for reading; the channel number is kept in \%i.
fopen /read \%i \fcontents(\%1)
if failure {
    exit 1
}

# Running totals: records read, and HTML records that came from Google.
define lines 0
define google 0

# Debugging messages are switched on when DEBUG is set in the environment.
if defined \$(DEBUG) {
    set debug message on
}

# Character tables for the accent normalization done at the end of each loop
# pass. Search strings are in ISO-8859-1 by then, so the letters are built
# from their Latin-1 character codes rather than typed literally; this keeps
# the script independent of the encoding it is stored or transferred in.
#
# NOTE(review): the literal accented characters that used to be in the
# normalization statements were lost from the file (both \fsubstitute sets
# were empty, and the two replacements produced "espaol" and "ingls").
# The uppercase->lowercase accent fold below is a reconstruction for Spanish
# text - confirm the exact letter set against the original script.
#
#   193 201 205 209 211 218 220  = uppercase A' E' I' N~ O' U' U"
#   225 233 237 241 243 250 252  = the matching lowercase letters
.uc_accents := \fcharacter(193)\fcharacter(201)\fcharacter(205)\fcharacter(209)
.uc_accents := \m(uc_accents)\fcharacter(211)\fcharacter(218)\fcharacter(220)
.lc_accents := \fcharacter(225)\fcharacter(233)\fcharacter(237)\fcharacter(241)
.lc_accents := \m(lc_accents)\fcharacter(243)\fcharacter(250)\fcharacter(252)
.w_espanol := espa\fcharacter(241)ol
.w_ingles := ingl\fcharacter(233)s

# Main loop: one pass per log record. Each Google search found is tallied
# in the associative array search<>, and the character set it was declared
# (or detected) in is tallied in cset<>.
while true {
    fread /line \%i line                # Read a record
    if fail break                       # End of file
    incr lines                          # Have record - count it
    void \fsplit(\m(line),&a,\9,ALL,,1) # Split it into tab-separated fields
    if not \findex(.html,\&a[5]) continue # Reject all non-HTML accesses
    .isgoogle := \findex(.google.,\&a[8]) # Position of ".google." in field 8
    if not isgoogle continue            # Reject all non-Google accesses

    increment google                    # Have a Google HTML record
    void \fsplit(\&a[8],&b,&?,ALL,,1)   # Split field 8 into 'clauses' at & and ?
    if debug show array b
    undef charset string                # Clear result variables
    for i 1 \fdim(&b) 1 {               # Loop through clauses
        void \fsplit(\&b[i],&c,=,ALL)   # Split clause into ID and value
        # "q" carries the query string, "ie" the declared character set.
        if equ "\&c[1]" "q" .string := \&c[2]
        else if equ "\&c[1]" "ie" .charset := \&c[2]
    }
    if not def string continue          # No string - skip this record
    if debug show mac charset string

    # Normalize the string: '+' becomes a space, hex escapes are decoded,
    # the result is lowercased and then passed through \fsqueeze().

    .string := \fsqueeze(\flower(\fdecodehex(\freplace(\m(string),+,\32))))
    if debug show mac string

    # Bring the string into ISO-8859-1 and count which charset it came in.
    if def charset {
        _increment cset<\fupper(\m(charset))>
        if debug echo "CONVERTING [\m(string)] \m(charset)->ISO-8859-1"
        .string := \fcvtcset(\m(string),\m(charset),iso-8859-1)
        if debug show mac string
    } else if equal "\fstringtype(\m(string))" "UTF8" {
        # No charset declared, but the bytes test as UTF-8.
        .string := \fcvtcset(\m(string),UTF-8,iso-8859-1)
        if debug show mac string
        _increment cset<UNDECLARED_UTF-8_DETECTED>
    } else {
        _increment cset<UNDECLARED>
    }

    # Fold uppercase accented letters to lowercase, then map the unaccented
    # spellings of the two language names onto the accented ones so that
    # both variants are tallied under one key.
    .string := \fsubstitute(\m(string),\m(uc_accents),\m(lc_accents))
    .string := \freplace(\m(string),espanol,\m(w_espanol))
    .string := \freplace(\m(string),ingles,\m(w_ingles))
    _increment search<\m(string)>
}
fclose \%i                              # Input consumed - release the channel
# End of input - print the summary.

echo
echo "Records:  \flpad(\m(lines),8)"
echo "Google:   \flpad(\m(google),8)"

# Flatten the search tallies into a pair of ordinary arrays
# (&a = search strings, &b = their counts); the result is the entry count.
assign nsearches \faaconvert(search,&a,&b)
echo "Unique:   \flpad(\m(nsearches),8)"

# Same for the character-set tallies (&c = charset names, &d = counts).
assign ncharsets \faaconvert(cset,&c,&d)
echo "Charsets: \flpad(\m(ncharsets),8)"

# Character-set table, highest count first; &c is reordered along with &d.
echo
echo Charsets by frequency...
array sort /reverse /numeric &d &c
for k 1 \m(ncharsets) 1 {
    echo \flpad(\m(k),3). \flpad([\&d[k]],8) \&c[k]
}

# Search table, highest count first, limited to at most 20 rows.
if > \m(nsearches) 20 {
    define nsearches 20
}
echo
echo Top \m(nsearches) searches by frequency...
array sort /reverse /numeric &b &a
for k 1 \m(nsearches) 1 {
    echo \flpad(\m(k),3). \flpad([\&b[k]],8) \&a[k]
}
exit 0

; Local Variables:
; comment-column:40
; comment-start:"# "
; End:
