#!/usr/local/bin/wermit +
#
# THE FIRST LINE SHOULD INDICATE THE ACTUAL PATH OF C-KERMIT 9.0
# and the script should be given execute permission.
#
# Usage: weblog filename
#
# Reads TSV-format web log for a bilingual English-Spanish website.
# Extracts all Google searches.
# Converts the coded search string into plain text.
# If charset of search string is declared, converts it to ISO-8859-1.
# If charset is not declared, it is tested for UTF-8 and converted.
# Normalized search strings are tabulated using associative arrays.
#
# Two associative arrays are built while reading the log:
#   cset<NAME>    - number of searches per character set.  NAME is the
#                   declared charset (uppercased), UTF-8-DETECTED when the
#                   charset was not declared but the string tested as
#                   UTF-8, or UNKNOWN otherwise.
#   search<TEXT>  - number of occurrences of each normalized search string.
#
# Illustrates:
#  . new \fsplit behavior - CK9.0
#  . new \fsqueeze() function
#  . message and if debug commands - New in CK9.0 Alpha.03
#  . decodehex function - New in CK9.0 Alpha.05
#  . stringtype function - New in CK9.0 Alpha.05
#  . use of MIME charset names - New in CK9.0 Alpha.05
#  . associative arrays (not new but little known)
#
# Frank da Cruz, Columbia University, April 2010

if not def \%1 exit 1 "usage: weblog logfilename"

# \fcontents() keeps any backslashes in the filename from being evaluated.
fopen /read \%i \fcontents(\%1)
if fail exit 1 "weblog: cannot open \fcontents(\%1)"

.lines = 0                              # Records read
.google = 0                             # Google HTML records seen
if def \$(DEBUG) set debug message on   # To print debugging messages

while true {
    fread /line \%i line                # Read a record
    if fail break                       # End of file
    increment lines                     # Have record - count it
    void \fsplit(\m(line),&a,\9,ALL,,1) # Split it into fields (Tab = \9)
    if not \findex(.html,\&a[5]) continue # Reject all non-HTML accesses
    .isgoogle := \findex(.google.,\&a[8]) # Reject all non-Google accesses
    if not isgoogle continue
    increment google                    # Have a Google HTML record
    void \fsplit(\&a[8],&b,&?,ALL,,1)   # Split it into 'clauses'
    if debug show array b
    undef charset string                # Clear result variables
    for i 1 \fdim(&b) 1 {               # Loop through clauses
        void \fsplit(\&b[i],&c,=,ALL)   # Split clause into ID and value
        if equ "\&c[1]" "q" .string := \&c[2] # Query string
        else if equ "\&c[1]" "ie" .charset := \&c[2] # Character set
    }
    if not def string continue          # No string - skip this record
    if debug show mac charset string

    # Normalize the string: "+" becomes a space (\32), hex escapes are
    # decoded, the result is lowercased and its whitespace is squeezed.

    .string := \fsqueeze(\flower(\fdecodehex(\freplace(\m(string),+,\32))))
    if debug show mac string

    # Convert to ISO-8859-1 and count the character set.  Every branch must
    # increment an element of the cset<> associative array (not a plain
    # variable named "cset"), because only array elements are picked up by
    # \faaconvert(cset,...) when the statistics are printed.

    if def charset {
        _increment cset<\fupper(\m(charset))>
        if debug echo "CONVERTING [\m(string)] \m(charset)->ISO-8859-1"
        .string := \fcvtcset(\m(string),\m(charset),iso-8859-1)
        if debug show mac string
    } else if equal "\fstringtype(\m(string))" "UTF8" {
        .string := \fcvtcset(\m(string),UTF-8,iso-8859-1)
        if debug show mac string
        _increment cset<UTF-8-DETECTED>
    } else {
        _increment cset<UNKNOWN>
    }

    # Fold accented capitals to lowercase and repair common unaccented
    # spellings so equivalent searches are tabulated together.
    # NOTE(review): the accented literals below are compared against strings
    # already converted to ISO-8859-1, so this script file presumably has to
    # be stored in ISO-8859-1 (one byte per character) too - confirm.

    .string := \fsubstitute(\m(string),ÁÉÍÓÚÜÑ,áéíóúüñ)
    .string := \freplace(\m(string),espanol,español)
    .string := \freplace(\m(string),ingles,inglés)
    _increment search<\m(string)>
}
fclose \%i                              # Done with the log file

# Finished - Display statistics

echo
echo "Records:  \flpad(\m(lines),8)"
echo "Google:   \flpad(\m(google),8)"

# \faaconvert() copies an associative array into a pair of ordinary arrays
# (indices in the first, values in the second) and returns the element count.

.n := \faaconvert(search,&a,&b)
echo "Unique:   \flpad(\m(n),8)"
.m := \faaconvert(cset,&c,&d)
echo "Charsets: \flpad(\m(m),8)"

echo
echo Charsets by frequency...
array sort /reverse /numeric &d &c      # Sort counts, keep names in step
for i 1 m 1 {
    echo \flpad(\m(i),3). \flpad([\&d[i]],8) \&c[i]
}

if > n 20 .n = 20                       # Show at most 20 searches
echo
echo Top \m(n) searches by frequency...
array sort /reverse /numeric &b &a      # Sort counts, keep strings in step
for i 1 n 1 {
    echo \flpad(\m(i),3). \flpad([\&b[i]],8) \&a[i]
}
exit 0

; Local Variables:
; comment-column:40
; comment-start:"# "
; End: