#!/usr/bin/perl
#
# [1997-11-21]  look_http_access.pl
#               (C) René Scholz  <http://www.thur.de/~Voland/>
#
# Time-stamp:   <1999-10-09, 14:47:59, mrz@isun34>
#
# USE:  follows the web server log with tail -f and shows selected accesses
#       to your homepage.
#       You had better use a color-capable terminal for this!  (like rxvt)
#
# For the http logfile, the following format is used (defined in httpd.conf):
#
# LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
#
###################################################################################

use strict;
use vars qw($bold $noatt $Fmy $Frobot $Flocal $Fpost $Fbabel $Fpeople $Fauth $Fsearch
            $Fprot
	    $Ffavicon $Flsearch1 $Flsearch2 $off $TLINES $LOGFILE $NO_ROBOTS $SHORT_DATE
	    $NOTEXPR $LOOKEXPR $ME $ROBOTS $LOCAL $PEOPLE $BABELFISH $ROBNAMES $FAVICON
	    $RZPROXY $Fprojects $PROJECTS $THURNET $Fthurnet
	   );

use Encode qw(from_to);

# Set the rxvt/xterm window title.  The control sequence is
# ESC ] 0 ; <text> BEL; the escapes are spelled out so that the control
# characters cannot get lost when the file is edited or copied.
print "\e]0;Showing http access for voland\a";


# Number of trailing log lines tail starts with.  A smaller value makes the
# initial output appear sooner.
$TLINES    = 500;

# Terminal attributes taken from terminfo: standout mode on / everything off.
# They are used to emphasise the HTTP status of failed or redirected requests.
$noatt     = `tput sgr0`;
$bold      = `tput smso`;
# $alt      = `tput smacs`;  $noalt = `tput rmacs`; $enacs    = `tput enacs`;

# ANSI colour escapes, written as background followed by foreground.
# Standard numbering (the real colours depend on the terminal's palette,
# e.g. black and white look swapped on a reverse-video terminal):
#   foreground: 30=black 31=red 32=green 33=yellow 34=blue 35=magenta 36=cyan 37=white
#   background: 40=black 41=red 42=green 43=yellow 44=blue 45=magenta 46=cyan 47=white/gray
$off       = "\e[0m";           # back to the default rendition

# red background
$Fmy       = "\e[41m\e[37m";    # own homepage ($ME)
$Fpost     = "\e[41m\e[33m";    # POST requests, cgi arguments
$Fprot     = "\e[41m\e[30m";    # ".prot/" paths
$Fprojects = "\e[41m\e[34m";    # project pages ($PROJECTS)

# green background
$Ffavicon  = "\e[42m\e[35m";    # $FAVICON matches
$Fauth     = "\e[42m\e[35m";    # authenticated user name
$Flsearch1 = "\e[42m\e[35m";    # local search expression (search.pl ... EXP=)

# other backgrounds
$Frobot    = "\e[44m\e[33m";    # robots ($ROBOTS, $ROBNAMES)
$Flocal    = "\e[45m\e[37m";    # local hosts ($LOCAL)
$Fbabel    = "\e[43m\e[34m";    # babelfish hosts, ".prot...pl" scripts
$Fpeople   = "\e[31m\e[47m";    # known people / proxy ($PEOPLE, $RZPROXY)

# foreground only
$Fthurnet  = "\e[35m";          # $THURNET paths
$Fsearch   = "\e[34m";          # search terms found in referers
$Flsearch2 = "\e[31m";          # local search expression in FAN-MJH referers

# Behaviour switches.
$SHORT_DATE = 1;  # if 1: shorten the bracketed timestamp to the time of day
$NO_ROBOTS  = 1;  # if 1: don't show accesses from web robots

# The access log that is followed.
$LOGFILE   = "/var/log/apache2/access.log";

# Only log lines matching this expression are looked at ...
$LOOKEXPR  = '^.*fan-mjh|^.*[vV]oland|\/md90|\.prot|thurnet\/\.phprojekt|\/thurnet|\/mos88\/';

# ... and of those, lines matching this expression are not shown.  The
# expression consists of two parts which are simply concatenated.
$NOTEXPR   = join('',
    '^isun|htdig.+\(www@thur.de\)|"HEAD|wwwcount.cgi|Count.cgi|forum\/css|styles.+css |\.(gif|jpg|png|bmp).+(200|206|304|301)|POST \/mos88\/base\/ping\/index',
    "|members30\/sabele1|GET \/thurnet\/.phprojekt\/chat|phprojekt.*\.css|\/mos88\.* 304 ",
);
# Earlier first part, without the mos88 ping rule:
#$NOTEXPR   = '^isun|htdig.+\(www@thur.de\)|"HEAD|wwwcount.cgi|Count.cgi|forum\/css|styles.+css |\.(gif|jpg|png|bmp).+(200|206|304|301)';

$RZPROXY   = '(141.35.3.10)';      # proxy of the university of Jena
$ME        = '(/~[Vv]oland[^"]*)'; # my personal homepage

# Host patterns of web robots, as one parenthesised alternation.  A log line
# matching this expression is skipped when $NO_ROBOTS is set; otherwise the
# matching part is shown in the robot colour.
# The dots are not escaped, so each "." matches any character.  The order of
# the alternatives matters (the leftmost matching alternative wins), so new
# entries are best appended at the end.
$ROBOTS    = "(suchen.eule.de|mserv.rrzn.uni-hannover.de|.+.fireball.de|crawler.tivra.com|"
           . "inktomi.+?planet.net.uk|.+.inktomi.*com|scooter.+?pa-x.dec.com|"
           . "idefix.sda.t-online.de|lycosidae.lycos.com|webrobot.ndv.de|almaden.ibm.com|"
           . "crawl.+?atext.com|crawlit.crawler.de|webrobot.ndv.de|\\S+?bos.lycos.com|search.pta.at|"
           . "\\S+.infoseek.com|robot.+?cab.infoweb.ne.jp|www.acoon.de|ix.+?alltheweb.com|"
           . "news2.oberland.net|c?.googlebot.com|209.67.247.155|\\S+.excite.com|.+.nikoma.de|"
           . "infra.euroseek.net|we-24-130-20-109.we.mediaone.net|green.alexa.com|crawl.+?alexa.com|"
           . "sarah.alexa.com|scooter.*alta-vista.net|ix.+?.fast-search.net|scooter.+?co.uk|"
           . "scooter.sv.av.com|.+.openfind.com|.+.sv.av.com|.+.fastsearch.net|.+.directhit.com|"
	   . ".+.picsearch.com|petula.laurion.net|.+.entireweb.com|.+.overture.com|.*search.scd.yahoo.com|.+.crawl.yahoo.net|.+.wotbox.com|"
	   . "search.msn.com/msnbot.htm|www.turnitin.com/robot/crawlerinfo.html|.+.scoutjet.com|crawl.*.dotnetdotcom.org|.+.linguee.com|"
	   . "spider.*.yandex.*|crawler.*.ask.com|.+.search.msn.com|crawl.*.exabot.com|.+.static.twtelecom.net|.+.pixray.com|crawler.sistrix.net|"
	   . "clients.your-server.de|a.ahrefs.com|startdedicated.de|mj12bot.com"
           . ")";

# Path fragments that get their own colour.
# NOTE: these are double-quoted strings, so "\." and "\/" reach the regex
# engine as plain "." and "/"; the leading dot of ".phprojekt" therefore
# matches any character.
$PROJECTS  = "(\.phprojekt|\/mos88\/)";
$THURNET   = "(\/thurnet\/)";
$FAVICON   = "(favicon.ico|impressum|pr_dino_2004-09.phtml)";

# Robot names (presumably as they appear in the User-Agent field -- the
# expression is applied to the whole line).  Used for colouring only, not
# for filtering.
$ROBNAMES  = "(ArchitextSpider|Scooter\\S+|Slurp\\S+|Eule Robot v3.00.+?\\)|KIT-Fireball\\S+|"
           . "InfoSeek Sidewinder|InfoNaviRobot|Ultraseek|Acoon Robot|Crawler|Wget\\S+|"
           . "Lycos_Spider\\w+|ramBot|Googlebot\\S*|FAST-WebCrawler\\S*|Arachnoidea|ia_archiver|"
           . "DIIbot\\S+|WebFountain|SearchTone\\S+|Spinne\\S+|ScourHTTP\\S+|GentleSpider\\S+|"
           . "Openfind Robot\\S+|Mercator\\S+|WebZIP\\S+|marvin\\S+|gigabaz\\S+|psbot\\S+|"
           . "Speedy Spider|msnbot|Ocelli|linguatools-bot)";

# Accesses from the local hosts.
$LOCAL     = "(beutenberg.tip-jena.de|tantalus.jena.thur.de)";

# Some known persons (hosts, addresses and user names).
# The parts are single-quoted on purpose: inside a double-quoted string the
# "$@" in the character class [A-Za-z0-9._%$@-] would be interpolated as the
# eval error variable, silently dropping "$" and "@" from the class.
$PEOPLE    = '([^ ]+.spektracom.de|[^ ]+.magnet.ch|[A-Za-z0-9._%$@-]+jena.de|'
           . 'netconsult.netconx.de|[^ ]+.intershop.de |[^ ]+thur.de |'
           . '193.141.171.254|jpiros|jvoland |[A-Za-z0-9._%$@-]+.iks-jena.de|[^ ]+.pppool.de)';

# Someone using babelfish.
$BABELFISH = "(babelfish.a.av.pa-x.dec.com|babel2.pa.alta-vista.net|snat-babel.sv.alta-vista.net)";


# Main loop: follow the access log, keep the interesting lines, shorten them,
# colourise hosts, paths and search terms, and print the result.

# Flush STDOUT after every print.  Each record is written as "\n$_" (newline
# first), so with ordinary line buffering the newest line would stay in the
# buffer until the following access arrives.
$| = 1;

# Start tail on the log file.  The list form of the pipe open runs tail
# directly, without a shell, and "-n N -f" replaces the obsolete "-Nf" option.
open(my $log, '-|', '/usr/bin/tail', '-n', $TLINES, '-f', $LOGFILE)
  or die "Can't read $LOGFILE: $!\n";

while(<$log>) {
  # only lines that match $LOOKEXPR and do not match $NOTEXPR are shown
  if( /$LOOKEXPR/  &&  $_!~m%$NOTEXPR% ) {
#  if( /$LOOKEXPR/ ) {
     next if($NO_ROBOTS && /$ROBOTS/);
     chomp;

     ### remove some stuff;
     s|\[.+?:(\d\d:\d\d:\d\d).+?\]|[$1]| if($SHORT_DATE);   # keep only hh:mm:ss
     s/(^.+") "(Mozilla\S+).*/$1 $2/;       # cut the user agent down to its first word
     s/(^.+") "(Lynx\S+).*/$1 $2/;
     s/(^.+") "(Go!Zilla \S+).*/$1 $2/;
     s/(^\S+ )- - /$1/; # remove " - - "
     s| HTTP/1..||;

     ### short referers from same host (www.thur.de/fan-mjh/)
     s|( [\d-]+ )"http(s?)://www.thur.de/fan-mjh(/?)|$1"$2FAN-MJH$3|g;
     s|( \d+ )"http(s?)://www.thur.de(/?)|$1"$2THUR$3|g;

     s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg;   # unpack MIME-coded url's

     # Convert UTF-8 to Latin-1 only when the decoded line really is
     # well-formed UTF-8.  Bytes that were escaped from Latin-1 (e.g. %E4)
     # would otherwise be treated as malformed input and come out as "?".
     my $probe = $_;   # decode() with a check value modifies its argument
     if( eval { Encode::decode("utf8", $probe, Encode::FB_CROAK()); 1 } ) {
        from_to($_, "utf8", "iso-8859-1");
     }

     # colorize some expressions:
     s/(POST)/$Fpost$1$off/;
     s/(\.prot\/)/$Fprot$1$off/;
     s/$PROJECTS/$Fprojects$1$off/;
     s/$THURNET/$Fthurnet$1$off/;
     s/$ROBOTS/$Frobot$1$off/gi;
     s/$ROBNAMES/$Frobot$1$off/g;
     s/$LOCAL/$Flocal$1$off/;
     s/$BABELFISH/$Fbabel$1$off/;
     s/$ME/$Fmy$1$off/i;
     s/$RZPROXY/$Fpeople$1$off/;
     s/$PEOPLE/$Fpeople$1$off/;
     s/$FAVICON/$Ffavicon$1$off/i;
     s/(\.prot.+\.pl)/$Fbabel$1$off/;
     s/" ([45]\d\d|301)/" $bold$1$noatt/;  # show failed accesses (404 etc.) in bold

     s/(^\S+ - )([^ ]+?)( .+)/$1$Fauth$2$off$3/; # show user names

     ### highlight the search expressions used with the search engines:

     s/(https?:\/\/\S+)([\?&][qp]=|&ask=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
     # s/(https?:\/\/\S+)(\??&?[qp]=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
     s/(https?:\/\/\S+)(&query=)(.+?)(&|")/$1$2$Fsearch$3$off$4/i;

     s/(hotbot.*?com|www.radaruol.com|search.*?.msn|www.swiss-?search.ch)(.+?MT=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
     s/(lycos|lotse.de|google.*.com|netfind.*?.aol.\w+|scour.net|www.alltheweb.com|crawler.de|buscaweb.starmedia.com|www.anzwers.com.au|search.filequest.com|aolsearch.aol.\w+)(.+?query=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
     # s/(www.dogpile.com|www.intersearch.de|search.msn.\w+|search.aon.at|sear.ch|evreka.com|av4000.belbone.be)(.+?&q=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;
     s/(google.*.\w+|miner.uol.com.br|www.suchen.com|fireball.de|seek.t-online.de|www.go.co|directhit.com|busqueda.infosel.com|de.intersearch.net)(.+?[&?o]qt?=)(.+?)(&|")/$1$2$Fsearch$3$off$4/;

     s/(excite|netfind.*?aol.com|www.electronsearch.com)(.+?)(search=|&s=)(\S+?)(&|")/$1$2$3$Fsearch$4$off$5/;
     s/(alta-?vista.+?&?[q]=)(\S+?)(&|")/$1$Fsearch$2$off$3/i;
     s/(goto.com|snap.com)(.+?[kK]eywords?=)(\S+?)(&|")/$1$2$Fsearch$3$off$4/;
     # The pattern consumes the closing quote of the referer, so the
     # replacement has to put a quote back (a "&" here is a literal ampersand).
     s/(imdb.com.+?\?)(\S+?)\"/$1$Fsearch$2$off"/;
     # s/(infoseek.+?)(qt=|oq=|&query=)(\S+?)(&|")/$1$2$Fsearch$3$off$4/;
     s/(suche2?.web.de.+?&su=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     # s/(search.cnet.com|www.apollo7.de)(.+?&QUERY=)(\S+?)(&|")/$1$2$Fsearch$3$off$4/;
     s/(go2net.com|metacrawler.com)(.+?general=)(\S+?)(&|")/$1$2$Fsearch$3$off$4/;
     s/(looksmart.com.+?[&?]key=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     s/(euroferret.com.+?&P=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     # s/(yahoo.+\\?p=[+\"]?)(\S+?"?)(&|")/$1$Fsearch$2$off$3/;
     s/(suchen.eule.de.+?begriff=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     s/(www.searchopolis.com.+?request=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     s/(www.askjeeves.com.+?ask=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     s/(www.kolibri.de.+?&Suchwort=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     s/(www.austronaut.at.+?\?suche=)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     s/(megaspider.com\/search.*html\?)(\S+?)(&|")/$1$Fsearch$2$off$3/;

     #s/(http:\S+search.cgi\?)(\S+?)(&|")/$1$Fsearch$2$off$3/;
     s/(http:\S+results.asp\?q=)(\S+?)(&|")/$1$Fsearch$2$off$3/;

     ### local cgi scripts: search expression and script arguments
     #s/(cgi-bin.+?\.pl)\?(\S+)/$bold$1$noatt?$Fpost$2$off/;
     s|(/fan-mjh/cgi-bin.search.pl.+?EXP=)(\S*?)&|$1$Flsearch1$2$off&|;
     s!(/fan-mjh/cgi-bin/)(pack.pl\?|save-archiv.pl\?)([^"]+)!$1$2$Fpost$3$off!g;
     s!(FAN-MJH/cgi-bin/search.pl\?EXP=)(\S+?)(&|")!$1$Flsearch2$2$off$3!;

#     s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg;   # unpack MIME-coded url's

     # widen the gaps around quoted fields and before the timestamp
     s|" |"  |g;
     s| "|  "|g;
     s| \[|  [|;
     print "\n$_";  # print the (colored) line
   }
}

close($log);