#!/usr/bin/perl
# spiderBytes.pl
#
# alotlikeyesterday@gmail.org
#
# Reads both apache and squid log files and creates a simple report
# detailing the usage of non-standard user agents -
# example: googlebot, slurp (yahoo), baiduspider (asian search engine), etc.
#
# You must:
#  - change the file name: currently set up to use the format
#    access_logYYYYMMDD.log, where YYYYMMDD is the current day.
#    This will depend on how you name your logs and rotate them.
#    You can just change $file to whatever you like below for ease.
#  - have the Mail::Sendmail module installed and specify the
#    sender/recipient.
#

use strict;
use warnings;

use POSIX qw(strftime);
use Mail::Sendmail;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Current day as YYYYMMDD, used to construct the log file name.
my $date_info = strftime('%Y%m%d', localtime);

# Specify the input log file.
my $file = '/var/log/httpd/access_log' . $date_info . '.log';
print "Input File: $file\n";

# Sort by pages, name or bytes.
my $sort_by = 'bytes';

# User agents to omit (ordinary browsers), matched case-insensitively as
# substrings of the normalized agent string.
my @ua_list = ("msie", "firefox", "opera", "safari", "lynx", "netscape",
               "konqueror", "mozilla");

# User agents to pass (i.e. yahoo slurp uses the mozilla agent, so it needs to
# be explicitly allowed to prevent it from being discarded with the browsers).
my @ua_allow = ("slurp");

# Agent names longer than this many characters are truncated in the report.
my $AGENT_MAX_LENGTH = 30;

# One access-log entry: host, identity, user, [timestamp], "request", status,
# bytes, "referrer" (quotes optional) and "user agent".
my $LOG_LINE_RE = qr{
    \A
    (\S+) [ ]                # remote host
    (\S+) [ ]                # rfc931 identity
    (\S+) [ ]                # authenticated user
    \[ ([^\]\[]+) \] [ ]     # timestamp
    " ([^"]*) " [ ]          # request line
    (\S+) [ ]                # status code
    (\S+) [ ]                # bytes sent
    "? ([^"]*) "? [ ]        # referrer
    " ([^"]*) "              # user agent
}x;

# ---------------------------------------------------------------------------
# Scan the log
# ---------------------------------------------------------------------------

my $total_bytes    = 0;     # bytes served over all parsed entries
my $entry_count    = 0;     # every line read from the log
my $unparsed_count = 0;     # lines that did not match $LOG_LINE_RE
my $start_time     = "";    # timestamp of the first non-standard-agent entry
my $stop_time      = "";    # timestamp of the last parsed entry in the log
my $spider_count   = 0;
my $spider_bytes   = 0;
my %spider_data    = ();    # agent => { n_elements => N, total_bytes => N }

open my $in, '<', $file or die "Cannot open $file: $!\n";

LINE:
while (my $line = <$in>) {
    ++$entry_count;

    # Fields 3, 6 and 8 of the match are timestamp, bytes and user agent.
    my ($time_date, $bytes, $agent) = ($line =~ $LOG_LINE_RE)[3, 6, 8];
    if (!defined $agent) {
        # Not a recognizable log entry; do not count it as a spider hit.
        ++$unparsed_count;
        next LINE;
    }

    # A non-numeric bytes field (such as "-") contributes nothing.
    $bytes = 0 if $bytes !~ /\A\d+\z/;

    $total_bytes += $bytes;
    $stop_time = $time_date;

    $agent = normalize_agent($agent);
    next LINE if is_standard_browser($agent);

    analyze($agent, $bytes);
    ++$spider_count;
    $start_time = $time_date if $spider_count == 1;
    $spider_bytes += $bytes;
}

close $in;

if ($unparsed_count) {
    warn "$unparsed_count line(s) in $file did not match the expected"
       . " log format and were skipped\n";
}

# ---------------------------------------------------------------------------
# Build the report
# ---------------------------------------------------------------------------

my $total_mb  = sprintf("%.2f", $total_bytes / 1000000);     # megabytes
my $spider_mb = sprintf("%.2f", $spider_bytes / 1000000);    # megabytes

# Percentages are taken from the raw counters rather than the rounded MB
# strings, and percent() copes with an empty log (no division by zero).
my $percent_bits  = sprintf("%.2f", percent($spider_bytes, $total_bytes));
my $percent_pages = sprintf("%.2f", percent($spider_count, $entry_count));

my $message = <<"EOF";
Input File: $file
--------------------------------
Non-Standard User Agent Overview
--------------------------------
Start time: $start_time
Stop time: $stop_time
Total log entrys searched is $entry_count
Total MB's in the logfile: $total_mb
Potential spiders consumed $spider_mb MB of data in $spider_count pages
This amounts to $percent_bits% of the total traffic in the log file
This also amounts to $percent_pages% of the total pages served
------------------------------------------------------------------
Non-Standard User Agent Details
------------------------------------------------------------------
Traffic (MB,%) | # of pages (%) | Agent
------------------------------------------------------------------
EOF

# Each sort mode supplies the ordering of the detail rows and the number
# format used for the MB column (the formats differ per mode, as before).
# Numeric orderings are descending; ties are broken by agent name so the
# report is reproducible.
my %sort_mode = (
    bytes => {
        format  => "%6.1f",
        compare => sub {
            $spider_data{ $_[1] }{total_bytes} <=> $spider_data{ $_[0] }{total_bytes}
                or $_[0] cmp $_[1];
        },
    },
    pages => {
        format  => "%5.2f",
        compare => sub {
            $spider_data{ $_[1] }{n_elements} <=> $spider_data{ $_[0] }{n_elements}
                or $_[0] cmp $_[1];
        },
    },
    name => {
        format  => "%5.2f",
        compare => sub { $_[0] cmp $_[1] },
    },
);

if (!exists $sort_mode{$sort_by}) {
    warn "Unknown sort key '$sort_by'; sorting by bytes instead\n";
    $sort_by = 'bytes';
}
my $mode = $sort_mode{$sort_by};

for my $agent (sort { $mode->{compare}->($a, $b) } keys %spider_data) {
    my $stats     = $spider_data{$agent};
    my $pages     = $stats->{n_elements};
    my $tot_mb    = sprintf($mode->{format}, $stats->{total_bytes} / 1000000);
    my $per_bytes = sprintf("%4.1f", percent($stats->{total_bytes}, $total_bytes));
    my $per_pages = sprintf("%4.1f", percent($pages, $entry_count));

    $message .= "$tot_mb (${per_bytes}%) \t\t $pages (${per_pages}%) \t\t $agent\n";
}

$message .= "------------------------------------------------------------------\n";
$message .= " $spider_mb (${percent_bits}%) \t $spider_count (${percent_pages}%)\n";

# ---------------------------------------------------------------------------
# Mail the report
# ---------------------------------------------------------------------------

my $esubject = 'Daily Spider Report ' . $date_info;

# NOTE(review): the recipient below differs from the address in the header
# comment of this file - confirm which one is intended.
my %mail = (
    'To'       => 'alotlikesterday@gmail.com',
    'From'     => 'mrt@localhost',
    'Reply-to' => 'whomever@localhost',
    'Subject'  => $esubject,
    'Message'  => $message,
);

sendmail(%mail)
    or die "Could not send report: $Mail::Sendmail::error\n";

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Record one hit for $agent in %spider_data, adding $bytes to its running
# byte total and incrementing its page count.
sub analyze {
    my ($agent, $bytes) = @_;

    my $stats = ($spider_data{$agent} ||= { 'n_elements' => 0, 'total_bytes' => 0 });
    $stats->{'n_elements'}++;
    $stats->{'total_bytes'} += $bytes;

    return;
}

# Reduce a raw User-Agent header to the short name used as the report key:
# trim surrounding whitespace, collapse anything mentioning "slurp" to
# 'Yahoo Slurp' (it hides inside a Mozilla/5.0 agent string), otherwise keep
# only the text before the first parenthesized section, and finally truncate
# to $AGENT_MAX_LENGTH characters.
sub normalize_agent {
    my ($agent) = @_;

    $agent =~ s/\A\s+//;
    $agent =~ s/\s+\z//;

    if ($agent =~ /slurp/i) {
        $agent = 'Yahoo Slurp';
    }
    elsif ($agent =~ /\A([^(]*)\(.*\)/) {
        $agent = $1;
    }

    return substr($agent, 0, $AGENT_MAX_LENGTH);
}

# True when $agent matches an entry of @ua_list (an ordinary browser) and is
# not rescued by an entry of @ua_allow. List entries are matched literally.
sub is_standard_browser {
    my ($agent) = @_;

    return 0 unless grep { $agent =~ /\Q$_\E/i } @ua_list;
    return 0 if grep { $agent =~ /\Q$_\E/i } @ua_allow;
    return 1;
}

# Return $part as a percentage of $whole, or 0 when $whole is zero.
sub percent {
    my ($part, $whole) = @_;

    return 0 unless $whole;
    return 100 * $part / $whole;
}