#!/bin/bash
##############################################################################
# This script can be used to process the access-logfiles which are generated
# by the Apache httpd-daemon for every virtual host. In an access-logfile,
# one line is stored per http-request.
# This script counts the number of http-requests per virtual host. This
# information is stored in the httpstat-file /var/log/atsar/httpstat,
# which contains one line per virtual host with the following format:
#
#       field  1: cumulative_requests HEAD
#       field  2: cumulative_requests GET
#       field  3: cumulative_requests PUT
#       field  4: cumulative_requests POST
#       field  5: cumulative_requests DELETE
#       field  6: date of last processed http-request
#       field  7: time of last processed http-request
#       field  8: last processed offset in the access-logfile (see field 9)
#       field  9: path-name of access-logfile
#       field 10: symbolic name (shown in output of atsar)
#
# Every time this script is started, it reads the lines from the
# httpstat-file. After that it continues reading the access-logfile
# at the point that it stopped the previous time. At the end of a run,
# the information in the httpstat-file will be updated with the new
# cumulative figures.
#
# Note that this script should be activated with regular intervals.
# Obviously the information in the httpstat-file is not accurate any more
# if between two runs *all* generations of an access-logfile have been rotated.
#
# This script must be started without command-line parameters.
# ============================================================================
# Author: Gerlof Langeveld - AT Computing Nijmegen, Holland
# Date:   August 1999 - Initial
#         March  2001 - Configurable names for access-logfiles
##############################################################################
CONFIGF="/etc/atsar.conf"

##############################################################################
# If no statistics-file exists yet, create one and seed it with one line per
# access-logfile configured in $CONFIGF (config-lines have the form:
#    HTTP  <path-of-access-logfile>  <symbolic-name>).
# The initial offset is the current size of the logfile, so only requests
# arriving after this moment will be counted.
##############################################################################
HTTPSTAT=/var/log/atsar/httpstat

if [ ! -f "$HTTPSTAT" ]
then
	> "$HTTPSTAT"	# file exists anyhow even if no logfile present

	if [ ! -f "$CONFIGF" ]
	then
		exit 0
	fi

	CURDATE=$(date +%Y%m%d)
	CURTIME=$(date +%H%M%S)

	# generate a line for every logfile
	# (read -r: do not mangle backslashes in configured path-names)
	while read -r CURTYPE CURLOGF CURNAME REST
	do
		# skip lines which are not complete HTTP-definitions
		if [ "$CURTYPE" != "HTTP" ] || [ -z "$CURLOGF" ] || [ -z "$CURNAME" ]
		then
			continue
		fi

		if [ -f "$CURLOGF" ]
		then
			# current size of the logfile = offset to start from
			# (wc -c instead of parsing 'ls -l' output)
			CUROFFSET=$(wc -c < "$CURLOGF")
			CUROFFSET=$((CUROFFSET))   # strip padding printed by some wc's

			echo "0 0 0 0 0 $CURDATE $CURTIME $CUROFFSET $CURLOGF $CURNAME" >> "$HTTPSTAT"
		fi
	done < "$CONFIGF"
	exit 0
else
	if [ ! -s "$HTTPSTAT" ]	# file exists and is empty ?
	then
		exit 0
	fi
fi

##############################################################################
# function: awk-session counts the number of HTTP-requests per type
##############################################################################
function http_process
{
   $NEXTCMD 2> /dev/null | awk '
        BEGIN {
        months["Jan"] =  1
        months["Feb"] =  2
        months["Mar"] =  3
        months["Apr"] =  4
        months["May"] =  5
        months["Jun"] =  6
        months["Jul"] =  7
        months["Aug"] =  8
        months["Sep"] =  9
        months["Oct"] = 10
        months["Nov"] = 11
        months["Dec"] = 12

        needcheck="y"
        cnt=0

        # gather shell-parameters
        prevdate="'"$CURDATE"'"
        prevtime="'"$CURTIME"'"
        prevstamp=prevdate prevtime

        newoffset="'"$NEWOFFSET"'"
        httplogf="'"$CURLOGF"'"
        httpname="'"$CURNAME"'"

        cumhead="'"$CUMHEAD"'"
        cumgets="'"$CUMGETS"'"
        cumputs="'"$CUMPUTS"'"
        cumpost="'"$CUMPOST"'"
        cumdels="'"$CUMDELS"'"
   }

   needcheck=="y" {
        gsub(/[\[/]/, ":", $4)
        split($4, dt, ":")

        # get format YYYYMMDDHHMMSS
        linestamp = sprintf("%04d%02d%02d%02d%02d%02d",
                        dt[4], months[dt[3]], dt[2], dt[5], dt[6], dt[7])

        if (linestamp < prevstamp)        # check for proper date
                next

        needcheck="n"   # from now on no checks any more because log-lines
                        # are delivered in chronological order!
   }

   {
        lastdt=$4                       # remember info of last processed line
   }

   $6 == "\"HEAD" { # for every HEAD-request ...
        cnt++
        cumhead++
        next
   }

   $6 == "\"GET" { # for every GET-request ...
        cnt++
        cumgets++
        next
   }

   $6 == "\"PUT" { # for every PUT-request ...
        cnt++
        cumputs++
        next
   }

   $6 == "\"POST" { # for every POST-request ...
        cnt++
        cumpost++
        next
   }

   $6 == "\"DELETE" { # for every DELETE-request ...
        cnt++
        cumdels++
        next
   }

   END {
        if (cnt > 0)
        {
                gsub(/[\[/]/, ":", lastdt)
                split(lastdt, dt, ":")

                # get format YYYYMMDD
                lastdate = sprintf("%04d%02d%02d", dt[4], months[dt[3]], dt[2])
                lasttime = sprintf("%02d%02d%02d", dt[5], dt[6], dt[7])
        }
        else
        {
                lastdate=prevdate
                lasttime=prevtime
        }

        printf("%lf %lf %lf %lf %lf %08ld %06ld %ld %s %s\n",
                cumhead, cumgets, cumputs, cumpost, cumdels,
                lastdate, lasttime, newoffset, httplogf, httpname)
   }' >> $HTTPSTAT.new
}

##############################################################################
# statistics-file is found; read every line and cut into pieces
##############################################################################
#
> $HTTPSTAT.new

# read -r: keep possible backslashes in path-names intact
while read -r CUMHEAD CUMGETS CUMPUTS   CUMPOST CUMDELS  \
              CURDATE CURTIME CUROFFSET CURLOGF CURNAME REST
do
        #
        # check which access-logfiles for this virtual host are modified
        # since the previous scan; there may have been zero or more
        # logfile-rotations
        #
        ACCNEW=
        ACCNEWGZ=
        ACCCNT=0

        # deliberately unquoted: glob expands to all generations
        # of this access-logfile (access_log, access_log.1, ...)
        for ACCFILE in ${CURLOGF}*
        do
                # if this access-file is not modified since
                # last write of statistics-file, then this file AND
                # all subsequent files are older and therefore not modified
                if [ "$ACCFILE" -ot "$HTTPSTAT" ]
                then
                        break
                fi

                # remember file to be processed

                case $ACCFILE in
                  *.gz) ACCNEWGZ="$ACCFILE $ACCNEWGZ"
                        ;;
                     *) ACCNEW="$ACCFILE $ACCNEW"
                        ;;
                esac

                ACCCNT=$((ACCCNT + 1))     # count files to be processed
        done

        #
        # check which access-files contain new information
        #
        case "$ACCCNT" in
                ############################################################
                # no http-transfers since previous check?

             0) echo $CUMHEAD $CUMGETS $CUMPUTS   $CUMPOST $CUMDELS     \
                     $CURDATE $CURTIME $CUROFFSET $CURLOGF $CURNAME >> $HTTPSTAT.new
                continue
                ;;
                ############################################################
                # only access_log itself is modified (no logfile-rotation);
                # an efficient scan can be done by skipping the part which has
                # been processed before

             1) if [ "$CUROFFSET" -eq 0 ]
		then
			NEXTCMD="cat $CURLOGF"
		else
			NEXTCMD="dd skip=1 ibs=$CUROFFSET if=$CURLOGF"
		fi
                ;;
                ############################################################
                # apparently access_logs have been rotated since the previous
                # run; the log-lines from all modified access_log-files are
                # concatenated in chronological order (so filenames are in
                # reversed order) which allows the awk-selection to skip
                # lines which have been processed before; note that the
                # remembered offset does not necessarily belong to the
                # oldest file, because a complete cycle might have
                # occurred since the previous run

             *) if [ "$ACCNEWGZ" != "" ]
                then
                        NEXTCMD="zcat $ACCNEWGZ | cat - $ACCNEW"
                else
                        NEXTCMD="cat $ACCNEW"
                fi
                ;;
        esac

        #
        # Process new http-transfers for this virtual host
        #
        # current size of the logfile (wc -c instead of parsing 'ls -l');
        # to be stored in the new statistics-line by the awk-session
        NEWOFFSET=$(wc -c < "$CURLOGF")
        NEWOFFSET=$((NEWOFFSET))        # strip padding printed by some wc's

        http_process    # activate awk-session

done < "$HTTPSTAT"

mv "$HTTPSTAT.new" "$HTTPSTAT"

exit 0
