#!/usr/bin/perl -w
use strict;
use warnings;

# $Id: summarizelog,v 2.11 2009/09/29 18:43:26 dean2 Exp $
#
# summarizelog - summarize web server access logs for the tide pages and
# plot the results with gnuplot.
#
# Usage: summarizelog [-d] [-v] [-p]
#   -d  debug output (implies -v)
#   -v  verbose progress output
#   -p  plot only: skip reading the logfiles and updating the summary file
#
# Unless -p is given, every logfile whose last entry is not older than the
# end of the summary file is read, and one line per complete day is appended
# to the summary file.  Each summary line has the form
#
#   YYYY:MM:DD:DOW|pattern c0 c1 ... c143 |pattern c0 c1 ... c143
#
# where DOW is the day of week (0 = Sunday) and c0..c143 are the counts of
# matching log lines in each 10-minute slot of that day.
#
# The summary file is then re-read to produce the raw data files (daily
# totals, daily totals for the past year, per-day-of-week averages and
# per-time-of-day averages), the "fill" files used to fake filled bars, and
# the gnuplot command file, which is finally handed to gnuplot.

use Time::Local;
use Time::JulianDay;
use Getopt::Std;
use POSIX;
use Carp;

# keep as a list first, to preserve order for output
my(@patterns) = ('hits'  => ' /tide/',
                 'calcs' => ' /tide/(nph-|)tideshow.cgi',
                 'barfs' => ' /tide/.*HTTP/[\d\.]+" (403|503)',
                 );
# most of the time, use as a hashlist
my(%patterns) = @patterns;
my(@colors) = qw(3 2 1);  # gnuplot png colors corresponding to those patterns

# DOW and TOD statistics only use days at or after this julian day
my($recentthresh) = local_julian_day(time()) - 28;

my($rootdir)      = '/var/www/html/tide';
my($outdaily)     = "$rootdir/usage/daily";
my($outdailyyear) = "$rootdir/usage/dailyyear";
my($outdow)       = "$rootdir/usage/dow";
my($outtod)       = "$rootdir/usage/tod";
my($plotfile)     = "$rootdir/usage/plot.gnu";
my($sumfile)      = "$rootdir/usage/sumlog";
my($logfile)      = "/var/log/httpd/access_log*";
my($gnuplot)      = '/usr/bin/gnuplot';
my($raw)          = 'raw';
my($bars)         = 'dat';
my($fill)         = 'fil';
my($png)          = 'png';
my($bitmaplines)  = 300;
my($oldlogthresh) = 30 * 24 * 60 * 60;  # seconds before now
my($debug)        = 0;  # Use cmdline args for testing, don't change these
my($verbose)      = 0;
my($plotonly)     = 0;

# do some fussy things
my(@logfiles) = glob($logfile);
my(%logend);
my(%months) = qw(Jan 0 Feb 1 Mar 2 Apr 3 May 4 Jun 5
                 Jul 6 Aug 7 Sep 8 Oct 9 Nov 10 Dec 11);
my(@mdays) = qw(31 29 31 30 31 30 31 31 30 31 30 31);
# preserve only the even members (pattern names) of @patterns
@patterns = @patterns[grep {$_ % 2 == 0} (0 .. $#patterns)];

{
    my(%opts);
    getopts('dvp', \%opts);
    $debug    = 1 if ($opts{d});
    $verbose  = 1 if ($opts{v});
    $plotonly = 1 if ($opts{p});
}
$verbose = 1 if ($debug);
$| = 1 if ($verbose || $debug);

unless ($plotonly) {
    # find time of the very end of the summary file
    my($summaryend) = get_summary_end($sumfile);
    print "$sumfile\t$summaryend ", ctime($summaryend) if ($verbose);

    # find times of ends of all log files
    my(@usefiles) = ();
    print "Checking logfiles:\n\t", join("\n\t", @logfiles), "\n"
        if ($verbose);
    foreach (@logfiles) {
        $logend{$_} = get_log_end($_);
        print "$_\t$logend{$_} ", ctime($logend{$_}) if ($verbose);
        # remove any logfile names for which we couldn't get a logend time
        push(@usefiles, $_) unless $logend{$_} == 0;
    }

    # get the list of which ones enclose or follow the summary file end
    @usefiles = sort {$logend{$a} <=> $logend{$b}} @usefiles;
    shift(@usefiles)
        while (@usefiles && $logend{$usefiles[0]} < $summaryend);
    print "Logfiles to summarize: ", join(' ', @usefiles), "\n"
        if ($verbose);

    # get an array with summary lines for all days in those logfiles
    my(@summs) = summarize_logfiles(\@usefiles, \%patterns);

    # now we know we're done with them, unlink old log files
    #my($threshold) = time() - $oldlogthresh;
    #foreach (sort keys %logend) {
    #    if ($logend{$_} < $threshold) {
    #        print "Deleting logfile '$_'\n" if ($verbose);
    #        unlink($_) if (!$debug);
    #    }
    #}

    # discard redundant summaries, then tack the rest onto the summary file
    # (summary lines start with a zero-padded date tag, so a string
    # comparison against the tag of the last summarized day is enough)
    my($d, $m, $y) = (localtime($summaryend))[3,4,5];
    my($endtag) = sprintf "%4d:%02d:%02d", $y+1900, $m+1, $d;
    my($len) = length($endtag);
    shift(@summs) while (@summs && substr($summs[0], 0, $len) le $endtag);
    print scalar(@summs), " days to append to '$sumfile'\n" if ($verbose);
    if (@summs) {
        open(my $sf, '>>', $sumfile)
            || die("Failed to update '$sumfile': $!");
        print {$sf} join("\n", @summs), "\n";
        close($sf) || die("Failed to update '$sumfile': $!");
    }
}

# run through all the summary entries accumulating summaries
open(my $sumfh, '<', $sumfile)
    || die("Suddenly failed to open '$sumfile': $!");
my($lastmonth) = -1;
my($cumulativemonth) = 0;
my(%dow, %ndow, %tod, @daily, @dailyyear);
my($ntod) = 0;
my($maxdaily) = 0;
my($maxtod) = 0;
my($maxdow) = 0;
my($lastdatetime) = undef;
my($juliantoday) = local_julian_day(time());
while (my $line = <$sumfh>) {
    chomp($line);
    next unless ($line =~ /\S/);                # ignore blank lines

    my($tag, @datalines) = split(/\|/, $line);  # split leading tag from data
    my($y, $m, $d, $dow) = split(/:/, $tag);    # get parts of tag
    if (grep { !defined($_) || $_ !~ /^\s*\d+$/ } ($y, $m, $d, $dow)) {
        warn("Skipping malformed line $. of '$sumfile'\n");
        next;
    }
    my($datetime) = julian_day($y, $m, $d);
    my(%counts) = ();
    for (@datalines) {
        my($pat, $data) = /^(\S*)\s*(.*)/;    # split pattern tag from data
        $counts{$pat} = [split(' ', $data)];  # split each pattern's data
    }
    print "$y/$m/$d/$dow\[$datetime\] " if ($verbose);
    if ($debug && $lastmonth < 0) {
        print "flavors: ", join(' ', sort keys %counts), "\n";
        for (sort keys %counts) {
            print "$_: (", scalar(@{$counts{$_}}), ") ",
                  join(' ', @{$counts{$_}}), "\n";
        }
    }

    if ($m != $lastmonth) {
        print "New month: $m " if ($debug);
        if ($lastmonth < 0) {
            print "First one!\n" if ($debug);
            $lastmonth = $cumulativemonth = $m;
        }
        my($diff) = $m - $lastmonth;
        $diff += 12 while ($diff < 0);
        $cumulativemonth += $diff;
        $lastmonth = $m;
        print "Cumulative month set to: $cumulativemonth " if ($debug);
    }

    print "Accumulating totals:" if ($debug);
    # start every known pattern at zero so that a summary line lacking a
    # pattern still yields a numeric column in the output
    my(%totals) = map { $_ => 0 } @patterns;
    for my $pat (keys %counts) {
        print " $pat" if ($debug);
        $totals{$pat} = 0 if (!exists($totals{$pat}));
        $totals{$pat} += $_ for (@{$counts{$pat}});
    }

    #### get the day as a fraction of the month-length
    ###my($daily) = sprintf "%.3f", $cumulativemonth + ($d-1)/$mdays[$m-1];
    my($daily) = sprintf "%04d/%02d/%02d", $y, $m, $d;

    # tack on the total accesses for each of the patterns totalled
    for (@patterns) {
        $daily .= " $totals{$_}";
        $maxdaily = $totals{$_} if ($totals{$_} > $maxdaily);
    }

    # look for a gap and insert a blank line if so
    my($gap) = (defined $lastdatetime && $datetime > ($lastdatetime + 5));
    push(@daily, "") if ($gap);
    push(@daily, $daily);

    # pick up just this year's daily activity
    if ($datetime >= ($juliantoday - 365)) {
        push(@dailyyear, "") if ($gap);
        push(@dailyyear, $daily);
    }
    $lastdatetime = $datetime;

    # only pick up DOW and TOD data for a limited recent time
    if ($datetime >= $recentthresh) {
        print "$datetime/$recentthresh: DOW and TOD numbers:" if ($debug);
        $ndow{$dow} = 0 if (!exists($ndow{$dow}));
        ++$ntod;        # track how many days contribute to time-of-day stats
        ++$ndow{$dow};  # track how many days contribute to each day of week
        # get the sums
        for my $pat (@patterns) {
            print " $pat" if ($debug);
            $dow{$dow}{$pat} = 0 if (!exists($dow{$dow}{$pat}));
            $dow{$dow}{$pat} += $totals{$pat};
            my(@bins) = @{ $counts{$pat} || [] };
            my($n) = scalar(@bins);
            for my $i (0 .. $n-1) {
                my($t) = 24 * $i / $n;   # start of this bin, in hours
                $tod{$t}{$pat} = 0 if (!exists($tod{$t}{$pat}));
                $tod{$t}{$pat} += $bins[$i];
            }
        }
    }
    print "\n" if ($verbose);
}
close($sumfh);

print "Writing out datafiles and getting maxima\n" if ($verbose);

# daily totals - punch out to datafile
open(my $dailyfh, '>', "$outdaily.$raw")
    || die("Cannot open '$outdaily.$raw': $!");
print {$dailyfh} join("\n", @daily), "\n";
close($dailyfh) || die("Cannot write '$outdaily.$raw': $!");

# just this year's
open(my $dailyyearfh, '>', "$outdailyyear.$raw")
    || die("Cannot open '$outdailyyear.$raw': $!");
print {$dailyyearfh} join("\n", @dailyyear), "\n";
close($dailyyearfh) || die("Cannot write '$outdailyyear.$raw': $!");

# day-of-week totals - punch out to datafile, normalized to per-day
open(my $dowfh, '>', "$outdow.$raw")
    || die("Cannot open '$outdow.$raw': $!");
print "Day of week counts: " if ($debug);
for my $dow (sort {$a <=> $b} keys %dow) {
    print "$dow=$ndow{$dow} " if ($debug);
    print {$dowfh} $dow;
    for (@patterns) {
        my($val) = $dow{$dow}{$_} / $ndow{$dow};  # normalize to per-day
        printf {$dowfh} " %.3f", $val;
        $maxdow = $val if ($val > $maxdow);
    }
    print {$dowfh} "\n";
}
print "\n" if ($debug);
close($dowfh) || die("Cannot write '$outdow.$raw': $!");

# time-of-day totals - punch out to datafile, normalized to per-10-min
open(my $todfh, '>', "$outtod.$raw")
    || die("Cannot open '$outtod.$raw': $!");
for my $tod (sort {$a <=> $b} keys %tod) {
    printf {$todfh} "%.3f", $tod;
    for (@patterns) {
        my($val) = $tod{$tod}{$_} / $ntod;  # normalize to per-10-min
        printf {$todfh} " %.3f", $val;
        $maxtod = $val if ($val > $maxtod);
    }
    print {$todfh} "\n";
}
close($todfh) || die("Cannot write '$outtod.$raw': $!");

print "Writing out ancillary plotfiles\n" if ($verbose);

# Make some ancillary plot files for filled-in bars.  Gnuplot has no
# filled-bar plots, so we fake it by drawing a whole buncha lines going
# up and down sufficiently frequently to fill the bitmap representation
# with a solid color.  Puke, gag, barf, retch.
write_fill_file("$outdow.$raw", "$outdow.$fill");
write_fill_file("$outtod.$raw", "$outtod.$fill");

# write out the plotfile itself (leave 10% headroom above each maximum)
$maxdaily += $maxdaily * 0.1;
$maxdow   += $maxdow * 0.1;
$maxtod   += $maxtod * 0.1;

# Figure out an xmax and the monthly date tics for the last year
my($mday, $mon, $year) = (localtime(time() + 3 * 24 * 60 * 60))[3,4,5];
my($xmax) = sprintf "%04d/%02d/%02d", $year + 1900, $mon + 1, $mday;
($mon, $year) = (localtime(time()))[4,5];
my($ticmon)  = $mon;
my($ticyear) = $year - 1;
my(@monthnames) = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
my(@tics) = ();
# set last+1 month to next month
if (++$mon >= 12) {
    $mon = 0;
    ++$year;
}
# one tic on the first of each month, from this month last year up to and
# including the current month
while (!($ticmon == $mon && $ticyear == $year)) {
    push(@tics, sprintf(qq|" %s" "%04d/%02d/01"|,
                        $monthnames[$ticmon], $ticyear + 1900, $ticmon + 1));
    if (++$ticmon >= 12) {
        $ticmon = 0;
        ++$ticyear;
    }
}
my($ticstr) = join(', ', @tics);

print "Creating plotfile\n" if ($verbose);
open(my $gnu, '>', $plotfile) || die("Cannot open $plotfile: $!");
print {$gnu} <<"EOGnuplot1";
# set term png small color
set term png small
set tics out
# set size 0.75,0.5
set size 1.0,0.5

# Daily Tide Plot (historical)
set title ""
set ylabel "Accesses / Day"
set xlabel "Date"
set xdata time
set timefmt "%Y/%m/%d"
set format x "%b\\n%Y"
set nokey
set yrange [0:$maxdaily]
set output "$outdaily.$png"
EOGnuplot1
print {$gnu} plot_command("$outdaily.$raw");
print {$gnu} <<"EOClearplot1";
set size 0.75,0.5
set xdata
set format x
EOClearplot1

print {$gnu} <<"EOGnuplot1b";

# Daily Tide Plot (Past Year)
set size 1.0,0.5
set title ""
set ylabel "Accesses / Day"
set xlabel "Date"
set xdata time
set timefmt "%Y/%m/%d"
# set format x "%b\\n%Y"
set xtics ($ticstr)
set xrange [:"$xmax"]
set nokey
set yrange [0:$maxdaily]
set output "$outdailyyear.$png"
EOGnuplot1b
print {$gnu} plot_command("$outdailyyear.$raw");
print {$gnu} <<"EOClearplot1b";
set size 0.75,0.5
set xdata
set format x
EOClearplot1b

print {$gnu} <<"EOGnuplot2";

# Tide Accesses by Day of Week
set title ""
set ylabel "Average Accesses / Day"
set xlabel "Day of Week"
set xdtics
set nokey
set noyzeroaxis
set yrange [0:$maxdow]
set xrange [-.5:6.5]
set output "$outdow.$png"
EOGnuplot2
print {$gnu} plot_command("$outdow.$fill");

print {$gnu} <<"EOGnuplot3";

# Tide Accesses by Time of Day
set title ""
set ylabel "Average Accesses / 10 Minutes"
set xlabel "Time of Day (Local Time)"
set nokey
set noyzeroaxis
set yrange [0:$maxtod]
set xrange [-.5:24.5]
set xrange [0:24]
set xtics 0, 1, 24
set output "$outtod.$png"
EOGnuplot3
print {$gnu} plot_command("$outtod.$fill");
close($gnu) || die("Cannot write $plotfile: $!");

print "Calling gnuplot\n" if ($verbose);
# list form: no shell is involved in running gnuplot
system($gnuplot, $plotfile) == 0
    || warn("'$gnuplot $plotfile' failed (status $?)\n");

print "\nDone.\n" if ($verbose);
exit;


# Build one gnuplot "plot" command line that draws column 1 of $datafile
# against each pattern's column (2, 3, ...) using that pattern's color.
sub plot_command {
    my($datafile) = @_;
    my(@curves) = map {
        qq|"$datafile" using 1:| . ($_ + 2) . ' with lines lt ' . $colors[$_]
    } (0 .. $#patterns);
    return 'plot ' . join(',', @curves) . "\n";
}

# Read the raw data file $rawfile (one row per x value: "x v1 v2 ...") and
# write $fillfile, in which every row is expanded into a dense series of
# up/down strokes centered on x, so that drawing it "with lines" looks like
# a filled bar.  The stroke spacing is chosen so that about $bitmaplines
# strokes cover the whole x range; each bar is half the average x spacing
# wide on either side of its center.
sub write_fill_file {
    my($rawfile, $fillfile) = @_;

    open(my $in, '<', $rawfile) || die("Cannot open '$rawfile': $!");
    my(@rows) = grep { /\S/ } <$in>;
    close($in);

    open(my $out, '>', $fillfile) || die("Cannot open '$fillfile': $!");
    if (@rows) {
        my($min)  = (split(' ', $rows[0]))[0];
        my($max)  = (split(' ', $rows[$#rows]))[0];
        my($span) = $max - $min;
        my($gaps) = $#rows;
        # With a single row (or no spread in x) the spacing cannot be
        # derived from the data: fall back to a bar one x-unit wide rather
        # than dividing by zero or looping forever on a zero step.
        if ($gaps < 1 || $span <= 0) {
            $span = 1;
            $gaps = 1;
        }
        my($del) = $span / $bitmaplines;
        my($dy)  = $span / (2 * $gaps);
        for my $row (@rows) {
            my($center, @sums) = split(' ', $row);
            my($zeros) = " 0" x scalar(@sums);
            my($tops)  = join(' ', @sums);
            for (my $x = $center - $dy; $x < $center + $dy; $x += $del) {
                my($xmd4) = sprintf "%.3f", $x - $del/4;
                my($xpd4) = sprintf "%.3f", $x + $del/4;
                print {$out} $xmd4, $zeros, "\n",
                             $xmd4, " ", $tops, "\n",
                             $xpd4, " ", $tops, "\n",
                             $xpd4, $zeros, "\n";
            }
        }
    }
    close($out) || die("Cannot write '$fillfile': $!");
    return;
}

# Accumulate a summary line for all days in the remaining logfiles
# (note that first day will be partial, depending on when logfile begins).
# Do not assume that logfiles are perfectly linear in time.
# Discard the last day, presuming it is partial.
#
# Arguments: a reference to the list of logfile names, and a reference to
# the hash mapping pattern names to regular expressions.
# Returns: the list of summary lines, sorted by date.
sub summarize_logfiles {
    my($logfiles, $patterns) = @_;
    my(@logfiles) = @$logfiles;  # make life easier...
    my(%patterns) = %$patterns;
    my(%count);                  # $count{date}{pattern}{10-min slot}

    if ($debug) {
        print "summarize_logfiles\npatterns: \n";
        for (sort keys %patterns) {
            print "\t$_: '$patterns{$_}'\n";
        }
    }

    for my $file (@logfiles) {
        print "reading $file\n" if ($verbose);
        open(my $lf, '<', $file)
            || die("Suddenly, failed to open $file: $!");
        while (my $line = <$lf>) {
            # skip bad lines
            next unless ($line =~ /\[(.*?):(\d\d):(\d)/);
            my($dmy) = $1;
            my($hour10min) = $2 * 6 + $3;  # 10-minute slot of the day, 0..143
            my($d, $m, $y) = split("/", $dmy);
            # skip lines without a numerical day and year or a known month
            next unless (defined($y) && $d =~ /^\d+$/ && $y =~ /^\d+$/
                         && exists($months{$m}));
            $m = $months{$m};  # numerical month (0=Jan)
            my($date) = sprintf "%4d:%02d:%02d", $y, $m+1, $d;
            # accumulate a hit from the logfile for each matching pattern
            for (keys %patterns) {
                ++$count{$date}{$_}{$hour10min}
                    if ($line =~ /$patterns{$_}/);
            }
        }
        close($lf);
    }

    # we've read them all, now assemble a line for each day
    my(@summaries) = ();
    for my $date (sort keys %count) {
        # do timelocal/localtime just to get day of week
        my($y, $m, $d) = split(/:/, $date);
        my($dowtime) = timelocal(0, 0, 12, $d, $m-1, $y-1900);
        my($dow) = (localtime($dowtime))[6];
        # create the line header for this day
        my($summary) = sprintf "%s:%1d", $date, $dow;
        for my $pat (sort keys %patterns) {
            $summary .= "|$pat ";
            for my $slot (0 .. 24 * 6 - 1) {
                my($n) = exists($count{$date}{$pat}{$slot})
                             ? $count{$date}{$pat}{$slot} : 0;
                $summary .= "$n ";
            }
        }
        # we're done with the day
        push(@summaries, $summary);
        print "." if ($verbose);
    }

    # dump last day's-worth, presuming it's partial
    pop(@summaries) if scalar @summaries;
    print scalar(@summaries), " days of summary info snagged\n"
        if ($verbose);
    return @summaries;
}

# Return the last line of the file open on filehandle $fh, or '' if the
# file is empty or cannot be read.  If the file does not end in a newline,
# the trailing partial line is returned.
sub lastline {
    my($fh) = shift(@_);

    seek($fh, 0, SEEK_END) || return '';  # seek to the end
    my($flength) = tell($fh);
    return '' if ($flength <= 0);

    # back up (starting before the terminal newline) until we hit a
    # newline, leaving the file positioned just after it
    my($offset) = 2;
    my($c) = '';
    while ($offset <= $flength) {
        seek($fh, -$offset, SEEK_END) || return '';
        read($fh, $c, 1) || return '';
        last if ($c eq "\n");
        ++$offset;
    }
    # no earlier newline at all: the file is a single line, so that line
    # starts at the beginning of the file
    if ($c ne "\n") {
        seek($fh, 0, SEEK_SET) || return '';
    }

    my($lastline) = scalar(<$fh>);
    return defined($lastline) ? $lastline : '';
}

# Return the time (seconds since the epoch, local time zone) of the last
# entry in logfile $lf, or 0 if the file is empty or its last line carries
# no usable "[dd/Mon/yyyy:hh:mm:ss" timestamp.  Croaks if the file cannot
# be opened.
sub get_log_end {
    my($lf) = @_;

    open(my $fh, '<', $lf) || croak("Failed to open logfile '$lf': $!");
    my($last) = lastline($fh);
    close($fh);

    # only trust the captures if the match itself succeeded
    if ($last =~ m|\[(\d+)/(\S\S\S)/(\d\d\d\d):(\d\d):(\d\d):(\d\d)|
        && exists($months{$2})) {
        # timelocal croaks on out-of-range fields; treat that as a bad line
        my($logend) = eval {
            timelocal($6, $5, $4, $1, $months{$2}, $3-1900);
        };
        return $logend if (defined($logend));
    }
    return 0;
}

# Return the time (seconds since the epoch) at noon local time on the day
# of the last line of summary file $sf.  Returns 0 if the file cannot be
# opened (after announcing that a new one will be created) or if it has no
# last line.  Croaks if the last line does not start with a plausible
# "YYYY:MM:DD" tag.
sub get_summary_end {
    my($sf) = @_;

    my($fh);
    if (!open($fh, '<', $sf)) {
        print "Creating new summary file '$sf'.\n";
        return 0;
    }
    my($last) = lastline($fh);
    close($fh);
    return 0 unless ($last =~ /\S/);

    # now get the time at noon on that day and return it in standard form
    my($yyyy, $mm, $dd) = ($last =~ /^\s*(\d+):(\d+):(\d+)/);
    croak("Corrupt summary file '$sf'")
        if (!defined($dd) || $yyyy < 1900 || $mm < 1 || $mm > 12
            || $dd < 1 || $dd > 31);
    return timelocal(0, 0, 12, $dd, $mm-1, $yyyy-1900);
}

# $Log: summarizelog,v $
# Revision 2.11 2009/09/29 18:43:26 dean2
# Put license & log at the end.
# Rewrite logfile summarizing so that it does not depend
# on the time being linearly increasing.
#
# Revision 2.9 2005/06/15 18:12:09 dean2
# Added "Forbidden" (code 403) to barf pattern matching. This matches
# the rewrite for forbidden spiders.
#
# Revision 2.8 2002/07/30 00:20:57 dean2
# Fixed a couple of paths for the new server setup.
#
# Revision 2.7 2002/06/10 04:43:01 dean2
# Added Daily Usage for past year plot.
#
# Revision 2.6 2002/06/09 19:13:12 dean2
# Inserted check for gap in data in the daily plot (to insert a blank
# line into the gnuplot data, so there isn't a connecting line across
# the gap).
# Widened the daily plot graph since it now goes back to 1995.
#
# Revision 2.5 2001/07/23 20:44:33 dean2
# Modified lastline() and get_log_end() to deal gracefully with an
# empty or bad logfile. That now allows the mainline to exclude
# such logfiles from processing.
#
# Revision 2.4 2001/05/16 20:35:04 dean2
# Changed daily usage plot axis to real dates.
# Changed output to go directly to PNG format.
#
# Revision 2.3 2001/02/03 07:26:20 dean2
# Switched to use the Apache server logs instead of the custom-output
# logs previously generated by the tide site. Hence, also restored
# the display of barfs.
#
# Revision 2.2 2001/01/08 20:52:13 dean2
# Added a hack to stop plotting the barf/red data.
#
# Revision 2.1 2000/01/27 01:09:52 dean2
# *** empty log message ***
#
# Revision 1.4 1999/08/27 21:23:09 dean
# Yup - fixed things (prior version has a syntax typo). We had trouble
# with the "log..." name being used by the arglogging function from the
# tide script itself.
#
# Revision 1.3 1999/08/27 21:12:55 dean
# Try some minor changes for debugging...
#
# Revision 1.2 1999/07/14 20:51:55 dean
# Added check for numerical day and year to skip bad lines.
#
# Revision 1.1 1998/04/07 14:18:25 dean
# Initial revision
#
# N. Dean Pentcheff dean@tbone.biol.sc.edu
#
# This program is Copyright 1996 by N. Dean Pentcheff. All rights
# reserved. This program is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# This program may be redistributed under the terms of the GNU General
# Public Licence (any version) or Larry Wall's Artistic Licence (your
# choice), with the following exceptions: portions of the Artistic
# Licence that deal exclusively with the Perl language as an
# interpreter are not applicable. This includes the entirety of item
# 6 (which begins "The scripts and library files supplied as input to
# or produced as output from the programs of this Package..."); the
# entirety of item 7 (which begins "C subroutines (or comparably
# compiled subroutines in other languages) supplied by you and linked
# into this Package..."); and the last sentence of item 5, which
# refers to embedding the Perl interpreter into other works.
#
# Textual copies of these licenses should have accompanied this
# program (files "Copying" and "Artistic"). If not, you can get the
# GNU Licence from https://www.gnu.org/licenses/ (or
# with any GNU software, such as emacs or gcc), and the Artistic
# Licence from https://dev.perl.org/licenses/artistic.html (or
# accompanying any distribution of the Perl language).