#!/usr/bin/perl
#
# [1997-11-21] look_http_access.pl
# (C) René Scholz <http://www.thur.de/~Voland/>
#
# Time-stamp: <1999-10-09, 14:47:59, mrz@isun34>
#
# USE: follows the web server log file with "tail -f" and prints the
#      accesses to certain pages of your homepage, colorized.
#      You better have a colored terminal for this! (like rxvt)
#
# For the http logfile, the following format is used (defined in httpd.conf):
#
# LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
#
###################################################################################

use strict;
use warnings;
use Encode qw(from_to);

# Every record is printed as "\n<record>", i.e. the newline comes first.
# Without autoflush the newest record could sit in the output buffer until
# the next matching record arrives, which defeats a live log viewer.
$| = 1;

# Set the rxvt/xterm window title (OSC 0: ESC ] 0 ; text BEL).
print "\e]0;Showing http access for voland\a";

my $TLINES = 500;   # number of old lines tail shows first; smaller starts faster

# Terminal attributes used to emphasize failed requests.
my $bold  = `tput smso`;
my $noatt = `tput sgr0`;
# $alt = `tput smacs`; $noalt = `tput rmacs`; $enacs = `tput enacs`;

# Color codes (ANSI SGR), written as background then foreground:
#   FG: 30=black 31=red 32=green 33=yellow 34=blue 35=magenta 36=cyan 37=white
#   BG: 40=black 41=red 42=green 43=yellow 44=blue 45=magenta 46=cyan 47=gray
# (An earlier note in this file listed 30=white and 37=black - presumably how
#  they looked on the author's terminal.)
my $Fmy       = "\e[41m\e[37m";
my $Frobot    = "\e[44m\e[33m";
my $Flocal    = "\e[45m\e[37m";
my $Fpost     = "\e[41m\e[33m";
my $Fprot     = "\e[41m\e[30m";
my $Fprojects = "\e[41m\e[34m";
my $Fthurnet  = "\e[35m";
my $Fbabel    = "\e[43m\e[34m";
my $Fpeople   = "\e[31m\e[47m";
my $Ffavicon  = "\e[42m\e[35m";
my $Fauth     = "\e[42m\e[35m";
my $Fsearch   = "\e[34m";
my $Flsearch1 = "\e[42m\e[35m";
my $Flsearch2 = "\e[31m";
my $off       = "\e[0m";

my $LOGFILE    = "/var/log/apache2/access.log";
my $NO_ROBOTS  = 1;   # if 1: don't show accesses from webrobots
my $SHORT_DATE = 1;   # shorten the date to show only the time

# Lines matching this expression are not shown.
my $NOTEXPR = '^isun|htdig.+\(www@thur.de\)|"HEAD|wwwcount.cgi|Count.cgi|forum\/css|styles.+css |\.(gif|jpg|png|bmp).+(200|206|304|301)|POST \/mos88\/base\/ping\/index';
#$NOTEXPR = '^isun|htdig.+\(www@thur.de\)|"HEAD|wwwcount.cgi|Count.cgi|forum\/css|styles.+css |\.(gif|jpg|png|bmp).+(200|206|304|301)'; # don't show this lines
# NOTE: this part is double-quoted, so the backslashes are consumed by the
# string literal and "\." reaches the regex engine as a plain "." (any char).
$NOTEXPR .= "|members30\/sabele1|GET \/thurnet\/.phprojekt\/chat|phprojekt.*\.css|\/mos88\.* 304 ";

# Only lines matching this expression are looked at.
my $LOOKEXPR = '^.*fan-mjh|^.*[vV]oland|\/md90|\.prot|thurnet\/\.phprojekt|\/thurnet|\/mos88\/';

my $ME      = '(/~[Vv]oland[^"]*)';   # my personal homepage
my $RZPROXY = '(141.35.3.10)';        # Proxy of the university jena

# Host names of known web robots.
my $ROBOTS = "(suchen.eule.de|mserv.rrzn.uni-hannover.de|.+.fireball.de|crawler.tivra.com|" .
    "inktomi.+?planet.net.uk|.+.inktomi.*com|scooter.+?pa-x.dec.com|" .
    "idefix.sda.t-online.de|lycosidae.lycos.com|webrobot.ndv.de|almaden.ibm.com|" .
    "crawl.+?atext.com|crawlit.crawler.de|webrobot.ndv.de|\\S+?bos.lycos.com|search.pta.at|" .
    "\\S+.infoseek.com|robot.+?cab.infoweb.ne.jp|www.acoon.de|ix.+?alltheweb.com|" .
    "news2.oberland.net|c?.googlebot.com|209.67.247.155|\\S+.excite.com|.+.nikoma.de|" .
    "infra.euroseek.net|we-24-130-20-109.we.mediaone.net|green.alexa.com|crawl.+?alexa.com|" .
    "sarah.alexa.com|scooter.*alta-vista.net|ix.+?.fast-search.net|scooter.+?co.uk|" .
    "scooter.sv.av.com|.+.openfind.com|.+.sv.av.com|.+.fastsearch.net|.+.directhit.com|" .
    ".+.picsearch.com|petula.laurion.net|.+.entireweb.com|.+.overture.com|.*search.scd.yahoo.com|.+.crawl.yahoo.net|.+.wotbox.com|" .
    "search.msn.com/msnbot.htm|www.turnitin.com/robot/crawlerinfo.html|.+.scoutjet.com|crawl.*.dotnetdotcom.org|.+.linguee.com|" .
    "spider.*.yandex.*|crawler.*.ask.com|.+.search.msn.com|crawl.*.exabot.com|.+.static.twtelecom.net|.+.pixray.com|crawler.sistrix.net|" .
    "clients.your-server.de|a.ahrefs.com|startdedicated.de|mj12bot.com" .
    ")";

my $PROJECTS = "(\.phprojekt|\/mos88\/)";
my $THURNET  = "(\/thurnet\/)";
my $FAVICON  = "(favicon.ico|impressum|pr_dino_2004-09.phtml)";

# User-agent names of known web robots.
my $ROBNAMES = "(ArchitextSpider|Scooter\\S+|Slurp\\S+|Eule Robot v3.00.+?\\)|KIT-Fireball\\S+|" .
    "InfoSeek Sidewinder|InfoNaviRobot|Ultraseek|Acoon Robot|Crawler|Wget\\S+|" .
    "Lycos_Spider\\w+|ramBot|Googlebot\\S*|FAST-WebCrawler\\S*|Arachnoidea|ia_archiver|" .
    "DIIbot\\S+|WebFountain|SearchTone\\S+|Spinne\\S+|ScourHTTP\\S+|GentleSpider\\S+|" .
    "Openfind Robot\\S+|Mercator\\S+|WebZIP\\S+|marvin\\S+|gigabaz\\S+|psbot\\S+|" .
    "Speedy Spider|msnbot|Ocelli|linguatools-bot)";

my $LOCAL = "(beutenberg.tip-jena.de|tantalus.jena.thur.de)";   # access from the localhosts

# Some known persons.
# Single-quoted on purpose: inside a double-quoted string the "$@" in the
# character classes would be interpolated as the (empty) eval-error variable
# and '$' and '@' would silently vanish from the class.
my $PEOPLE = '([^ ]+.spektracom.de|[^ ]+.magnet.ch|[A-Za-z0-9._%$@-]+jena.de|' .
    'netconsult.netconx.de|[^ ]+.intershop.de |[^ ]+thur.de |' .
    '193.141.171.254|jpiros|jvoland |[A-Za-z0-9._%$@-]+.iks-jena.de|[^ ]+.pppool.de)';

# Someone using babelfish.
my $BABELFISH = "(babelfish.a.av.pa-x.dec.com|babel2.pa.alta-vista.net|snat-babel.sv.alta-vista.net)";

# Follow the log through tail. The list form of the pipe open runs tail
# directly (no shell involved), and "-n N -f" is the portable spelling of
# the obsolete "-Nf" option.
open(my $log, '-|', '/usr/bin/tail', '-n', $TLINES, '-f', $LOGFILE)
    or die "Can't read $LOGFILE: $!\n";

LINE:
while (<$log>) {
    # Keep only the interesting lines (alternative: test /$LOOKEXPR/ alone).
    next LINE unless /$LOOKEXPR/ && $_ !~ m%$NOTEXPR%;
    next LINE if $NO_ROBOTS && /$ROBOTS/;

    chomp;

    ### remove some stuff
    s|\[.+?:(\d\d:\d\d:\d\d).+?\]|[$1]| if ($SHORT_DATE);
    s/(^.+") "(Mozilla\S+).*/$1 $2/;
    s/(^.+") "(Lynx\S+).*/$1 $2/;
    s/(^.+") "(Go!Zilla \S+).*/$1 $2/;
    s/(^\S+ )- - /$1/;   # remove " - - "
    s| HTTP/1..||;

    ### shorten referers from the same host (www.thur.de/fan-mjh/)
    s|( [\d-]+ )"http(s?)://www.thur.de/fan-mjh(/?)|$1"$2FAN-MJH$3|g;
    s|( \d+ )"http(s?)://www.thur.de(/?)|$1"$2THUR$3|g;

    s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg;   # unpack MIME-coded url's

    # Handle UTF-8: convert to Latin-1 for the terminal, but only when the
    # line really is valid UTF-8. Converting bytes that are already Latin-1
    # would replace every non-ASCII character with '?'.
    # decode() with a CHECK value modifies its source, hence the copy.
    my $probe = $_;
    if (eval { Encode::decode('UTF-8', $probe, Encode::FB_CROAK()); 1 }) {
        from_to($_, "utf8", "iso-8859-1");
    }

    ### colorize some expressions
    s/(POST)/$Fpost$1$off/;
    s/(\.prot\/)/$Fprot$1$off/;
    s/$PROJECTS/$Fprojects$1$off/;
    s/$THURNET/$Fthurnet$1$off/;
    s/$ROBOTS/$Frobot$1$off/gi;
    s/$ROBNAMES/$Frobot$1$off/g;
    s/$LOCAL/$Flocal$1$off/;
    s/$BABELFISH/$Fbabel$1$off/;
    s/$ME/$Fmy$1$off/i;
    s/$RZPROXY/$Fpeople$1$off/;
    s/$PEOPLE/$Fpeople$1$off/;
    s/$FAVICON/$Ffavicon$1$off/i;
    s/(\.prot.+\.pl)/$Fbabel$1$off/;
    s/" ([45]\d\d|301)/" $bold$1$noatt/;              # show failed accesses (404 etc.) in bold
    s/(^\S+ - )([^ ]+?)( .+)/$1$Fauth$2$off$3/;       # show user names

    ### highlight the search expressions used with the search engines
    s/(https?:\/\/\S+)([\?&][qp]=|&ask=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
    # s/(https?:\/\/\S+)(\??&?[qp]=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
    s/(https?:\/\/\S+)(&query=)(.+?)(&|")/$1$2$Fsearch$3$off$4/i;
    s/(hotbot.*?com|www.radaruol.com|search.*?.msn|www.swiss-?search.ch)(.+?MT=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
    s/(lycos|lotse.de|google.*.com|netfind.*?.aol.\w+|scour.net|www.alltheweb.com|crawler.de|buscaweb.starmedia.com|www.anzwers.com.au|search.filequest.com|aolsearch.aol.\w+)(.+?query=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
    # s/(www.dogpile.com|www.intersearch.de|search.msn.\w+|search.aon.at|sear.ch|evreka.com|av4000.belbone.be)(.+?&q=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
    s/(google.*.\w+|miner.uol.com.br|www.suchen.com|fireball.de|seek.t-online.de|www.go.co|directhit.com|busqueda.infosel.com|de.intersearch.net)(.+?[&?o]qt?=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
    s/(excite|netfind.*?aol.com|www.electronsearch.com)(.+?)(search=|&s=)(\S+?)(&|")/$1$2$3$Fsearch$4$off$5/;
    s/(alta-?vista.+?&?[q]=)(\S+?)(&|")/$1$Fsearch$2$off$3/i;
    s/(goto.com|snap.com)(.+?[kK]eywords?=)(\S+?)(&|")/$1$2$Fsearch$3$off$4/;
    # The pattern consumes the closing quote of the referer, so put a quote
    # back (this rule used to emit '&' there, unlike all sibling rules).
    s/(imdb.com.+?\?)(\S+?)\"/$1$Fsearch$2$off"/;
    # s/(infoseek.+?)(qt=|oq=|&query=)(\S+?)(&|")/$1$2$Fsearch$3$off$4/;
    s/(suche2?.web.de.+?&su=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    # s/(search.cnet.com|www.apollo7.de)(.+?&QUERY=)(\S+?)(&|")/$1$2$Fsearch$3$off$4/;
    s/(go2net.com|metacrawler.com)(.+?general=)(\S+?)(&|")/$1$2$Fsearch$3$off$4/;
    s/(looksmart.com.+?[&?]key=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    s/(euroferret.com.+?&P=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    # s/(yahoo.+\\?p=[+\"]?)(\S+?"?)(&|")/$1$Fsearch$2$off$3/;
    s/(suchen.eule.de.+?begriff=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    s/(www.searchopolis.com.+?request=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    s/(www.askjeeves.com.+?ask=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    s/(www.kolibri.de.+?&Suchwort=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    s/(www.austronaut.at.+?\?suche=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    s/(megaspider.com\/search.*html\?)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    #s/(http:\S+search.cgi\?)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    s/(http:\S+results.asp\?q=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
    #s/(cgi-bin.+?\.pl)\?(\S+)/$bold$1$noatt?$Fpost$2$off/;

    ### highlight parameters of the local cgi scripts
    s|(/fan-mjh/cgi-bin.search.pl.+?EXP=)(\S*?)&|$1$Flsearch1$2$off&|;
    s!(/fan-mjh/cgi-bin/)(pack.pl\?|save-archiv.pl\?)([^"]+)!$1$2$Fpost$3$off!g;
    s!(FAN-MJH/cgi-bin/search.pl\?EXP=)(\S+?)(&|")!$1$Flsearch2$2$off$3!;
    # s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg; # unpack MIME-coded url's

    # NOTE(review): as written these three substitutions replace each match
    # with identical text; presumably they once adjusted the spacing around
    # quotes and the date bracket - confirm against an unmangled copy.
    s|" |" |g;
    s| "| "|g;
    s| \[| [|;

    print "\n$_";   # print the (colored) line
}

close $log;