#!/usr/bin/perl
# $Id: accesslog-queryterms,v 1.2 2002/07/03 08:21:18 friedman Exp $
#
# Scan web access log lines (from the files named on the command line, or
# from standard input) for URLs that look like search-engine result pages,
# and print the decoded search terms found in their query strings, one per
# line.

use strict;
use warnings;

# Script base names that commonly implement a search page.  Each of these
# must be followed by one of the extensions in @script_exts to count.
my @script_names = (
    'default',
    'find',
    'q',
    'query(?:|_..)',
    'redirect',
    '[^/]*results?',
    '[^/]*search',
    'buscador',
);

# File extensions accepted after one of @script_names.
my @script_exts = (
    '\.a[ds]p',
    '\.cgi',
    '\.dll',
    '\.fcg',
    '\.p?html?',
    '\.gw',
    '\.jsp',
    '\.php3?',
    '\.pl',
    '\.psp',
    '\.tmpl',
);

# Regex fragments matching the part of a URL that immediately precedes the
# "?" of the query string.  All fragments are single-quoted on purpose: in a
# double-quoted string Perl reduces "\." to a bare ".", which would then
# match any character instead of a literal dot.
my @engine_paths = (
    '/(?:' . join('|', @script_names) . ')'
        . '(?:' . join('|', @script_exts) . ')',

    '/spbasic\.htm',                        # MSN
    '/cat\.adp',                            # aolsearch.aol.com; others?
    '/crawler',                             # metacrawler
    '/setup\.asp',                          # lycos
    '/google\.tmpl',                        # netscape
    '/pursuit',                             # lycos
    '/search/web',                          # altavista
    '/AskJeeves\.asp',                      # ask jeeves
    '/DirectHitWeb\.fcg',                   # directhit.com
    '/altavista\.php3',                     # many domains
    '/director\.asp',                       # hotbot
    '/gt\.dll',                             # altavista
    '/search',                              # google.ca, others
    '/(?:metaAnswer|moreResults)\.asp',     # ask.com
    '/search/fast/meta',                    # brightgate.com
    '/search/index\.cgi',                   # cadshack.com, others

    '\.lycos\.com/.*',
    '\.mamma\.com/.*',
    '\.overture\.com/.*',
    '\.searchalot\.com/.*',
    '\.google\.com/(?:groups|images)',
    '\.netscape\.com/netscape',
    '/goto\.com/d/search/.*',
    '/goto\.earthlink\.net/d/search/p/earthlink/.*',
    'naver\.com/search\.naver',
    'supereva\.it/cgi-bin/gsearch\.chm',
    'c4\.com/return\.html',
    'dotzup\.com/h\.asp',
    'ilor\.com/searchilor\.lor',
    'teoma\.com/gs',
);

# Query-string parameter names that carry the search terms.
# Do not sort this: regex alternation is ordered, so the position of each
# entry decides which alternative is tried first.
my @param_names = (
    'query(?:|terms?)',
    'search(?:|for|text)',
    'qr?',
    'p',
    'as_q',
    'mt',
    'key(?:|words?)',
    'general',
    'terms?',
    'ask',
    'palabra',      # what language is this?  used by buscador.com
    'words',
    's',
    'pa',
    'x',
);

# Complete pattern: <engine path> "?" [anything "&"] <param name> "=" <value>.
# The value (capture group 1) runs up to the next "&" or double quote.
# Compiled once, case-insensitively.
my $re = do {
    my $pattern = '(?:' . join('|', @engine_paths) . ')'
                . '\?(?:|.*&)'
                . '(?:' . join('|', @param_names) . ')'
                . '=([^&"]+)';
    qr/$pattern/i;
};

# qpdecode TEXT
#
# Decode a URL-encoded query-string value: every "+" becomes a space and
# every "%XX" (two hex digits, either case) becomes the character with that
# code.  Returns the decoded copy; the argument is not modified.
sub qpdecode {
    my ($text) = @_;
    $text =~ tr/+/ /;
    $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
    return $text;
}

# extract_query_terms LINE
#
# Return the decoded search terms found in LINE, or nothing (undef in scalar
# context) when LINE does not contain a recognised search URL.
sub extract_query_terms {
    my ($line) = @_;
    return unless $line =~ $re;
    return qpdecode($1);
}

# Run the filter only when executed as a script, so the file can also be
# loaded with require/do without consuming input.
unless (caller) {
    while (my $line = <>) {
        my $terms = extract_query_terms($line);
        print $terms, "\n" if defined $terms;
    }
}