#!/usr/bin/perl
#
# mon - service availability test and alert scheduler
#
# configuration file is mon.cf
#
# Jim Trocki, trockij@transmeta.com
#
# $Id: mon,v 1.38 1998/04/15 02:17:35 trockij Exp $
#
# Copyright (C) 1998 Jim Trocki
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
# RCS identification string; printed by -v and at the end of the usage text
$RCSID='$Id: mon,v 1.38 1998/04/15 02:17:35 trockij Exp $';
# contact address printed in the usage text ("Report bugs to ...")
$AUTHOR='trockij@transmeta.com';
use Getopt::Std;
use POSIX;
use Fcntl;
use Socket;
use Time::Period;
use Sys::Hostname;
use Sys::Syslog;

#
# parse the command line.  Every switch that is consulted later through
# an $opt_X variable must be listed here; a trailing colon means the
# switch takes a value.  "k:" used to be missing, so -k (history length,
# read below as $opt_k) was rejected as an unknown option and never set.
#
getopts ("fdhva:c:k:m:r:s:i:p:P:");

#
# definitions
#
# built-in defaults; the configuration file and then the command line
# may override most of them further down
#
$CF = $opt_c || "/etc/mon.cf";
$SCRIPTDIR = "/usr/lib/mon/mon.d";
$ALERTDIR  = "/usr/lib/mon/alert.d";
# first existing directory of /var/run/mon, /var/run, else /etc
$PIDFILE   = (-d "/var/run/mon" ? "/var/run/mon"
		: -d "/var/run" ? "/var/run"
		: "/etc") . "/mon.pid";
$MAX_KEEP  = 100;		# history entries kept
$SLEEPINT  = 1;			# seconds slept per main loop pass
$SERVPORT  = 32777;		# TCP port of the client interface
$MAXPROCS  = 0;			# 0 means no limit on running monitors
$HOSTNAME  = hostname;
# packed addresses, compared against the peer address of clients
$HOSTIP    = gethostbyname ($HOSTNAME);
$LOCALHOST = gethostbyname ("localhost");
$OS = `uname -s 2>/dev/null` || "Unknown";
chomp $OS;

$opt_d && ($SERVPORT = 32768); #DEBUG

#
# -v: print the version and leave
#
if ($opt_v) {
    print "$RCSID\n";
    exit;
}

#
# -h: print the usage text and leave
#
if ($opt_h) {
    &usage();
    exit;
}

#
# load the configuration file
#
unless (&read_cf ($CF)) {
    &die_die ("crit", "could not open cf file: $CF: $!");
}

#
# turn relative paths into absolute ones, anchored at the directory
# mon was started from
#
$PWD = getcwd;
foreach my $path (\$CF, \$SCRIPTDIR, \$ALERTDIR) {
    $$path = "$PWD/$$path" unless ($$path =~ /^\//);
}

#
# values given on the command line win over the config file
#
$ALERTDIR  = $opt_a if $opt_a;
$SCRIPTDIR = $opt_s if $opt_s;
$MAX_KEEP  = $opt_k if $opt_k;
$SERVPORT  = $opt_p if $opt_p;
$PIDFILE   = $opt_P if defined ($opt_P);	# an empty pidfile is allowed
$SLEEPINT  = $opt_i if $opt_i;
$MAXPROCS  = $opt_m if $opt_m;
if ($opt_r) {
    my $delay = &dhmstos ($opt_r);
    die "bad randstart value\n"
	unless defined ($delay);
    $RANDSTART = $delay;
}

#
# open syslog, detach if asked to, and record our PID
#
Sys::Syslog::setlogsock ('unix') if ($OS eq "Linux");
openlog ("mon", "cons,pid", "daemon");
if ($opt_f) {
    &daemon();
}
if ($PIDFILE ne '') {
    # a pidfile that cannot be opened is skipped without complaint
    if (open PID, ">$PIDFILE") {
	print PID "$$\n";
	close PID;
    }
}

#
# bind and listen
#
&setup_server();

# load every service's countdown timer with its configured interval
&set_last_test ();

#
# randomize startup checks if asked to
#
&randomize_startdelay()
    if ($RANDSTART);

@last_alerts = ();			# alert history, appended to by &do_alert
@last_failures = ();			# failure history, appended to by &proc_cleanup
$procs = 0;				# number of outstanding procs

$SIG{HUP} = \&reset;			# re-read the configuration
$SIG{INT} = \&handle_sigterm;		# for interactive debugging
$SIG{TERM} = \&handle_sigterm;
$SIG{PIPE} = 'IGNORE';

#
# main monitoring loop
#
# each pass: serve one pending client, start monitors whose timers have
# expired, gather monitor output, reap finished monitors, then sleep
#
$i=0;
$lasttm=time; # the last time(2) the loop started
$fdset_rbits = $fdset_ebits = '';

for (;;) {
&debug ("$i\n");
    $i++;
    $tm = time;

    #
    # handle client connections
    #
    if (&client_pending()) {
    	&handle_client();
    }

    #
    # step through the watch groups, decrementing and
    # handing expired timers
    #
    # $group, $service and $sref are package globals on purpose:
    # &run_monitor reads them instead of taking arguments
    #
    foreach $group (keys %watch) {
	#
	# skip over disabled watch
	#
	next if ($watch_disabled{$group} == 1);

	for ($service=0;$service<@{$watch{$group}};$service++) {

	    $sref = \%{$watch{$group}[$service]};

	    #
	    # if the timer expires for a service, fork and
	    # run the monitor
	    #
	    if ($$sref{"_timer"} <= 0 && !$running{"$group.$service"}) {

		# start it unless the process limit (0 = unlimited) is reached
		if (($MAXPROCS && $procs < $MAXPROCS) || !$MAXPROCS) {
		    &run_monitor();
		} else {
		    syslog ('info', "throttled at $procs processes");
		}

	    } else {
		# count down by the seconds elapsed since the last pass,
		# but by at least one second
		$t = $tm - $lasttm;
		$t = 1 if ($t <= 0);
		$$sref{"_timer"} -= $t if ($$sref{"_timer"} > 0);
	    }
	}
    }

    #
    # collect any output from subprocs
    #
    &collect_output();

    #
    # clean up after exited processes, and trigger alerts
    # $sref is redefined after this point
    #
    &proc_cleanup();

    $lasttm = time;
    sleep $SLEEPINT;
}

# not reached: the loop above has no exit; the process ends through
# &handle_sigterm or the "term" client command
&clean_up();
exit;



######################################################################
#
# handle alert event
#
# alert program is called with the following arguments:
#   alert_prog \
#         -s 'service name' \
#         -g 'hostgroup' \
#         -h 'hostgroup expansion' \
#         -t 'time(2) when condition was detected' \
#         -l 'seconds left until next alarm is sent' \
#         $args
#
# $args are everything after the script name in the config file
# path to alert_prog is $ALERTDIR (config "alertdir", overridden by -a)
#
sub do_alert {
    #
    # arguments:
    #   $group    watch group name (key into %watch and %groups)
    #   $service  service number within the group
    #   $output   complete output of the failed monitor
    #   $retval   exit value of the monitor, matched against "exit=" ranges
    #
    # side effects: runs the alert programs, updates the per-period
    # bookkeeping, bumps "_alert_count" and appends to @last_alerts
    #
    my ($group, $service, $output, $retval) = @_;
    my ($tm, $fac, $args, $service_n, $i, $summary);
    my (@groupargs, $period, $alerts_sent);
    my ($sref, $pref);

    $sref = \%{$watch{$group}[$service]};

    $tm = time;
    $service_n = $watch{$group}[$service]{"service"};

    # the summary is the first line of the monitor output
    $summary = (split("\n", $output))[0];
    $summary = "(NO SUMMARY)" if ($summary =~ /^\s*$/m);

    $alerts_sent = 0;

    #
    # check each time period for pending alerts
    #
    foreach $period (keys %{$$sref{"periods"}}) {
	#
	# only send alerts that are in the proper period
	#
	next if (!inPeriod ($tm, $period));

	$pref = \%{$$sref{"periods"}{$period}};

	#
	# only alert once every "alertevery" seconds, unless
	# output from monitor is different
	#
	if ($$pref{"alertevery"} != 0 &&
	    ($tm - $$pref{"_last_alert"} < $$pref{"alertevery"}) &&
	    $$sref{"_failure_output"} eq $output) {
	    next;
	}

	#
	# handle alertafter conditions
	#
	if (defined ($$pref{"alertafter"})) {
	    $$pref{"_failcount"}++;

	    # still inside the interval and below the threshold
	    if ($tm - $$pref{'_1stfailtime'} <= $$pref{'alertafterival'}
		&& $$pref{"_failcount"} < $$pref{"alertafter"}) {
		next;
	    }

	    #
	    # start a new time interval
	    #
	    if ($tm - $$pref{'_1stfailtime'} > $$pref{'alertafterival'}) {
		$$pref{"_failcount"} = 1;
	    }
	    if ($$pref{"_failcount"} == 1) {
		$$pref{"_1stfailtime"} = $tm;
	    }

	    if ($$pref{"_failcount"} < $$pref{"alertafter"}) {
		next;
	    }
	}

	#
	# at this point, no alerts are blocked,
	# so send the alerts
	#

	#
	# trigger multiple alerts in this period
	#
	for ($i=0;$i<@{$$pref{"alerts"}};$i++) {

	    my $range;

	    #
	    # "exit=N" or "exit=N-M" in front of the alert restricts it
	    # to monitors that exited with a value in that range
	    #
	    if ($$pref{"alerts"}[$i] =~ /^exit=((\d+|\d+-\d+))\s/i) {
		$range=$1;
		next if (!&inRange($retval, $range));
		($fac, $args) = (split (/\s+/, $$pref{"alerts"}[$i], 3))[1,2];
	    } else {
		($fac, $args) = split (/\s+/, $$pref{"alerts"}[$i], 2);
	    }

	    # hosts marked disabled (leading "*") are not handed to the alert
	    @groupargs = grep (!/^\*/, @{$groups{$group}});

	    #
	    # the summary comes from monitor output, so it is passed as
	    # data to a fixed "%s" format instead of being the format
	    #
	    syslog ("alert", "%s", "calling alert $fac for $group.$service_n" .
		" ($fac,$args) $summary");

	    # the monitor output is fed to the alert program on its stdin
	    if (!open (OUTF, "| $ALERTDIR/$fac -s '$service_n' " .
		    "-g '$group' -h '@groupargs' -t '$tm' " .
		    "-l '$$pref{alertevery}' $args")) {
		syslog ('err', "%s", "could not open pipe to $ALERTDIR/$fac: $!");
	    } else {
		print OUTF $output;
		close (OUTF);
	    }
	    $alerts_sent++;
	}

	$$pref{"_last_alert"} = $tm;
    }

    return if (!$alerts_sent);

    #
    # tally this alert
    #
    $$sref{"_alert_count"}++;

    #
    # store this in the log, keeping at most $MAX_KEEP entries (but
    # always the newest one).  The old test, "shift if @last_alerts >
    # $MAX_KEEP", let the list settle at $MAX_KEEP + 1 entries and
    # dropped only one entry after the limit had been lowered.
    #
    shift @last_alerts
	while (@last_alerts && @last_alerts >= $MAX_KEEP);
    push @last_alerts, "$group " . &getservbynum ($group, $service) .
	" $tm $fac ($args) $summary";
}



######################################################################
#
# walk through the watch list and reset the time
# the service was last called
#
sub set_last_test {
    #
    # for every service of every watch group, load the countdown
    # timer with the configured interval and clear "_last_alert"
    #
    # the loop variables are lexical now; the old index loop ran on
    # the package global $servnum and declared locals it never used
    #
    my ($group, $sref);

    foreach $group (keys %watch) {
	foreach $sref (@{$watch{$group}}) {
	    $$sref{"_timer"} = $$sref{"interval"};
	    $$sref{"_last_alert"} = 0;
	}
    }
}


######################################################################
#
# parse configuration file
#
# build the following data structures:
#
# %groups
#       each element of %groups is an array of hostnames
#       group records are terminated by a blank line in the
#       configuration file
# %watch{"group"}[service-number]{"variable"} = value
# %watch is a hash of arrays of hashes
#
# variables for internal use are:
#   $watch{"group"}[service-number]{"_timer"}, countdown timer between checks
#   $watch{"group"}[service-number]{"_last_alert"}, used by &do_alert
# 
#
sub read_cf {
    #
    # argument: path of the configuration file
    # returns 1 after the whole file has been read; every error,
    # including a file that cannot be opened, ends in die
    #
    # fills the globals %groups and %watch and may set $ALERTDIR,
    # $SCRIPTDIR, $MAX_KEEP, $SERVPORT, $PIDFILE, $RANDSTART, $MAXPROCS
    #
    # NOTE(review): $watchgroup is listed twice in the declaration
    # below, and $period / $periodstr are not declared at all, so they
    # are package globals
    #
    my ($CF) = @_;
    my ($l, $var, $watchgroup, $ingroup, $curgroup, $inwatch,
	$watchgroup, $servnum, $args, $hosts, %disabled, $h, $i);

    #
    # parse configuration file
    #
    open (CFG, $CF) ||
	die "could not open cf file: $CF: $!\n";

    # number of "service" lines seen in the current watch record
    $servnum = 0;

    for (;;) {
    	last if (!defined ($l = <CFG>));
	# comment lines must have the "#" in the first column
	next if $l =~ /^#/;
	chomp $l;
	$l =~ s/^\s*//;
	$l =~ s/\s*$//;

	#
	# variables that can be overridden by the command line
	#
	if ($l =~ /^alertdir\s*=\s*(\S+)/) {
	    $ALERTDIR = $1;
	    next;
	} elsif ($l =~ /^mondir\s*=\s*(\S+)/) {
	    $SCRIPTDIR = $1;
	    next;
	} elsif ($l =~ /^histlength\s*=\s*(\d+)/) {
	    $MAX_KEEP = $1;
	    next;
	} elsif ($l =~ /^serverport\s*=\s*(\d+)/) {
	    $SERVPORT = $1;
	    next;
	} elsif ($l =~ /^pidfile\s*=\s*(\S+)/) {
	    $PIDFILE = $1;
	    next;
	} elsif ($l =~ /^randstart\s*=\s*(\S+)/) {
	    $RANDSTART = &dhmstos($1);
	    die "cf error: bad syntax, line $.\n"
		if (!defined ($RANDSTART));
	    next;
	} elsif ($l =~ /^maxprocs\s*=\s*(\d+)/) {
	    $MAXPROCS = $1;
	    next;
	}
 

	#
	# end of record
	#
	# a blank line closes the current hostgroup or watch record
	#
	if ($l eq "") {
	    $ingroup    = 0;
	    $curgroup   = "";
	    $inwatch    = 0;
	    $watchgroup = "";
	    $servnum = 0;
	    $period = 0;
	    next;
	}

	#
	# group record
	#
	if ($l =~ /^hostgroup\s+(\w+)\s*(.*)/) {
	    $curgroup = $1;
	    $hosts = $2;
	    # remember which hosts of an already known group carry the
	    # "*" (disabled) marker
	    %disabled = ();
	    foreach $h (grep (/^\*/, @{$groups{$curgroup}})) {
		$h =~ s/^\*//;
		$disabled{$h} = 1;
	    }
	    @{$groups{$curgroup}} = split(/\s+/, $hosts);
	    #
	    # keep hosts which were previously disabled
	    #
	    for ($i=0;$i<@{$groups{$curgroup}};$i++) {
		$groups{$curgroup}[$i] = "*$groups{$curgroup}[$i]"
		    if ($disabled{$groups{$curgroup}[$i]});
	    }
	    $ingroup = 1;
	    next;
	} elsif ($ingroup) {
	    # continuation line: more hosts for the current hostgroup
	    push (@{$groups{$curgroup}}, split(/\s+/, $l));
	    for ($i=0;$i<@{$groups{$curgroup}};$i++) {
		$groups{$curgroup}[$i] = "*$groups{$curgroup}[$i]"
		    if ($disabled{$groups{$curgroup}[$i]});
	    }
	    next;
	}

	#
	# watch record
	#
	if ($l =~ /^watch\s+(\w+)\s*/) {
	    $watchgroup = $1;
	    die "cf error: watch already defined, line $.\n"
	    	if ($watch{$watchgroup});
	    $ingroup    = 0;
	    $curgroup   = "";
	    $servnum = 0;
	    $period = 0;
	    $inwatch = 1;
	    # a watch without a hostgroup of the same name gets a group
	    # whose only member is the watch name itself
	    if (!defined ($groups{$watchgroup})) {
	    	@{$groups{$watchgroup}} = ($watchgroup);
	    }
	    next;
	} elsif ($inwatch) {
	    # inside a watch record every line is "variable arguments"
	    $l =~ /^(\w+)\s*(.*)$/;
	    $var = $1;
	    $args = $2;

	    #
	    # service entry
	    #
	    # starts a new service with default interval (1800 seconds)
	    # and randskew (0); counters are initialised only when they
	    # are not defined yet
	    #
	    if ($var eq "service") {
	    	$servnum++;
		$watch{$watchgroup}[$servnum-1]{"service"} = $args;
		$watch{$watchgroup}[$servnum-1]{"interval"} = 1800;
		$watch{$watchgroup}[$servnum-1]{"randskew"} = 0;
		$watch{$watchgroup}[$servnum-1]{"_failure_count"} = 0
		    if (!defined($watch{$watchgroup}[$servnum-1]{"_failure_count"}));
		$watch{$watchgroup}[$servnum-1]{"_start_of_monitor"} = time
		    if (!defined($watch{$watchgroup}[$servnum-1]{"_start_of_monitor"}));
		$watch{$watchgroup}[$servnum-1]{"_alert_count"} = 0
		    if (!defined($watch{$watchgroup}[$servnum-1]{"_alert_count"}));
		$watch{$watchgroup}[$servnum-1]{"_last_failure"} = 0
		    if (!defined($watch{$watchgroup}[$servnum-1]{"_last_failure"}));
		$watch{$watchgroup}[$servnum-1]{"_last_success"} = 0
		    if (!defined($watch{$watchgroup}[$servnum-1]{"_last_success"}));
		next;
	    }

	    # every other variable needs a preceding "service" line
	    if (!$servnum) {
	    	die "cf error: need to specify service in watch record, line $.\n";
	    }


	    #
	    # period definition
	    #
	    # the period string is validated with inPeriod and then used
	    # as the key of the service's "periods" hash
	    #
	    if ($var eq "period") {
		$period = 1;
	    	if (inPeriod (time, $args) == -1) {
		    die "cf error: malformed period, line $.\n";
		}
		$periodstr = $args;
		$watch{$watchgroup}[$servnum-1]{"periods"}{$periodstr}{"alertevery"} = 0;
		@{$watch{$watchgroup}[$servnum-1]{"periods"}{$periodstr}{"alerts"}} = ();
		next;
	    }

	    #
	    # alert
	    #
	    # alert, alertevery and alertafter are only valid after a
	    # "period" line
	    #
	    if ($var eq "alert" && !$period) {
	    	die "cf error: need to specify a period for alert, line $.\n";
	    } elsif ($var eq "alertevery" && !$period) {
	    	die "cf error: need to specify a period for alertevery, line $.\n";
	    } elsif ($var eq "alertafter" && !$period) {
	    	die "cf error: need to specify a period for alertafter, line $.\n";
	    }

	    #
	    # for each service there can be one or more alert periods
	    # this is stored as a hash of hashes named
	    #     %{$watch{$watchgroup}[$servnum-1]{"periods"}}
	    # each index for this hash is something like "wd {Mon-Fri} hr {7am-11pm}"
	    # the "alerts" entry of each period is an array containing the
	    # list of alert commands and arguments
	    #
	    if ($var eq "alert") {
	    	push @{$watch{$watchgroup}[$servnum-1]{"periods"}{$periodstr}{"alerts"}},
		    $args;

	    #
	    # non-alert variables
	    #
	    } else {
		# time values are converted to seconds; a value that
		# converts to 0 is rejected like a malformed one
		if ($var eq "interval") {
		    $args = &dhmstos ($args) ||
			die "cf error: invalid time interval, line $.\n";

		} elsif ($var eq "randskew") {
		    $args = &dhmstos ($args) ||
			die "cf error: invalid random skew time, line $.\n";

		} elsif ($var eq "alertevery") {
		    $args = &dhmstos ($args) ||
			die "cf error: invalid time interval, line $.\n";
		    $watch{$watchgroup}[$servnum-1]{"periods"}{$periodstr}{"alertevery"} =
		    	$args;
		    next;
		} elsif ($var eq "alertafter") {
		    # "alertafter COUNT INTERVAL", interval in h, m or s
		    if ($args !~ /(\d+)\s+(\d+[hms])$/) {
		    	die "cf error: invalid interval specification, line $.\n";
		    }
		    $watch{$watchgroup}[$servnum-1]{"periods"}{$periodstr}{"alertafter"} =
		    	$1;
		    $watch{$watchgroup}[$servnum-1]{"periods"}{$periodstr}{"alertafterival"} =
		    	&dhmstos ($2);
		    $watch{$watchgroup}[$servnum-1]{"periods"}{$periodstr}{"_1stfailtime"} = 0;
		    $watch{$watchgroup}[$servnum-1]{"periods"}{$periodstr}{"_failcount"} = 0;
		}

		# stored per service under the variable's own name (this
		# is also reached for "alertafter", with the raw arguments)
		$watch{$watchgroup}[$servnum-1]{$var} = $args;
	    }

	    next;
	}
    }
    close (CFG);
    1;
}


######################################################################
#
# convert a string like "20m" into seconds
#
sub dhmstos {
    #
    # argument: a number (decimals allowed) followed by one unit
    #           letter: d(ays), h(ours), m(inutes) or s(econds)
    # returns:  the number of seconds, or undef for anything else
    #
    # the unit is folded to lower case before it is looked up.  The
    # pattern has always accepted upper case letters (/i), but the old
    # comparisons were case-sensitive, so "5H" came back as 5 seconds.
    #
    my ($str) = @_;
    my (%mult, $num, $unit);

    %mult = ("s" => 1, "m" => 60, "h" => 60 * 60, "d" => 60 * 60 * 24);

    if (defined ($str) && $str =~ /^\s*(\d+(?:\.\d+)?)([dhms])\s*$/i) {
	($num, $unit) = ($1, lc ($2));
	return $num * $mult{$unit};
    }

    # explicit undef (not an empty list), as before
    return undef;
}


######################################################################
#
# reset the state of the server on SIGHUP, and reread config
# file.
#
sub reset {
    #
    # installed as the HUP handler and called for the "reset" client
    # command: kill the running monitors, forget the configuration
    # and read it again
    #
    my @children = keys %runningpid;
    my $child;

    #
    # kill and reap every monitor that is still running
    #
    foreach $child (@children) {
	kill 9, $child;
	waitpid ($child, 0);
	syslog ('info', "reset killed child $child, exit status $?");
	&remove_proc ($child);
    }

    #
    # drop the old configuration and load the new one
    #
    %watch = ();
    %groups = ();
    $procs = 0;
    syslog ('info', "resetting, and re-reading configuration $CF");
    &read_cf ($CF);

    #
    # restart the timers as at program start
    #
    $lasttm = time;
    $fdset_rbits = $fdset_ebits = '';
    &set_last_test ();
    &randomize_startdelay() if ($RANDSTART);
}

######################################################################
#
# remove a process from our state
#
sub remove_proc {
    #
    # argument: PID of a monitor process; PIDs that are not in
    # %runningpid are ignored
    #
    my ($pid) = @_;
    my $key = $runningpid{$pid};	# "group.servicenumber"

    return if (!defined $key);

    #
    # stop selecting on the monitor's output and close it
    #
    my $fh = $fhandles{$key};
    vec ($fdset_rbits, fileno($fh), 1) = 0;
    close ($fh);

    #
    # forget the process
    #
    delete $fhandles{$key};
    delete $running{$key};
    delete $runningpid{$pid};
    $procs--;
}


######################################################################
#
# clean up before exiting
#
sub clean_up {
    # remove the pidfile; an empty $PIDFILE means there is none
    if ($PIDFILE ne '') {
	unlink $PIDFILE;
    }
}


######################################################################
#
# exit on SIGTERM
#
sub handle_sigterm {
    # installed for TERM and INT: log, remove the pidfile, and exit
    # with status 1
    syslog ('info', 'caught TERM signal, exiting');
    clean_up ();
    exit 1;
}


######################################################################
#
# setup server
#
sub setup_server {
    #
    # create the TCP socket SERVER, bind it to $SERVPORT on all
    # addresses and start listening; any failing step is fatal
    #
    my ($proto);

    $proto = getprotobyname ('tcp');
    socket (SERVER, PF_INET, SOCK_STREAM, $proto) ||
    	&die_die ("crit", "could not create socket: $!");
    # SO_REUSEADDR, so a restart can bind the port again right away
    setsockopt (SERVER, SOL_SOCKET, SO_REUSEADDR, pack ("l", 1)) ||
    	&die_die ("crit", "could not setsockopt: $!");
    bind (SERVER, sockaddr_in ($SERVPORT, INADDR_ANY)) ||
    	&die_die ("crit", "could not bind: $!");
    # the result of listen used to be ignored, unlike the calls above
    listen (SERVER, SOMAXCONN) ||
    	&die_die ("crit", "could not listen: $!");
}


######################################################################
#
# Check if there is a pending client to be accepted.
# Returns 1 if there is, 0 otherwise.
#
sub client_pending {
    #
    # poll (zero timeout) the listening socket SERVER for readability
    # returns 1 if a connection is waiting, 0 otherwise
    #
    my ($rin, $rout, $n);

    $rin = '';
    vec ($rin, fileno (SERVER), 1) = 1;
    $n = select ($rout = $rin, undef, undef, 0);

    #
    # select returns -1 on error, which is a true value; passing it
    # through made the caller enter a blocking accept with nobody
    # waiting.  Only a positive count means "pending".
    #
    return ($n > 0) ? 1 : 0;
}


######################################################################
#
# Handle a connection from a client
#
sub handle_client {
    #
    # accept one connection on SERVER and serve its commands, one per
    # line, until "quit", end of input or an invalid command.  Replies
    # end with a status line: "220 ..." on success, "520 ..." on error.
    #
    # changes against the previous version:
    #   - accept and getpeername are checked
    #   - client text is never used as a syslog format or as a regex
    #     (an invalid pattern in "disable host" killed the server)
    #   - "set"/"get" with an unknown group or service answer 520;
    #     they used to fall back to service number 0 and to create an
    #     empty entry for the group in %watch
    #   - "get maxkeep" answers "get completed"
    #   - the reset reply contains the host name ("@$HOSTNAME" was
    #     interpolated as an array)
    #   - "list failures" skips untested services and "list successes"
    #     lists the succeeded services (the test was inverted)
    #   - reset/term from a non-local address and unknown "disable"
    #     targets are answered with 520 instead of silence
    #
    my ($l, $cmd, $args, $group, $service, $s);
    my ($var, $value, @l, $sock, $port, $addr, $sref);
    my ($k, $h, $local);

    if (!accept (CLIENT, SERVER)) {
	syslog ('err', "%s", "could not accept client connection: $!");
	return;
    }
    $sock = getpeername (CLIENT);
    if (!defined ($sock)) {
	syslog ('err', "%s", "could not get client address: $!");
	close (CLIENT);
	return;
    }
    ($port, $addr) = unpack_sockaddr_in ($sock);
    syslog ('info', "client connection from " . inet_ntoa ($addr) .
		":" . $port);

    # reset and term are honoured only from this host's own addresses
    $local = ($addr eq $HOSTIP || $addr eq $LOCALHOST);

    # unbuffered replies
    select (CLIENT);
    $|=1;
    select (STDOUT);

    while (defined ($l = <CLIENT>)) {
	chomp $l;

	syslog ('info', "%s", "client command \"$l\"");

	if ($l !~ /^(disable|enable|dump|quit|list|set|get|reset|term)\s*(.*)?$/i) {
	    print CLIENT "520 invalid command\n";
	    close (CLIENT);
	    return;
	}
	($cmd, $args) = ("\L$1", $2);

	#
	# quit command
	#
	if ($cmd eq "quit") {
	    last;

	#
	# reset and term are refused for remote clients
	#
	} elsif (($cmd eq "reset" || $cmd eq "term") && !$local) {
	    print CLIENT "520 $cmd not permitted from this host\n";

	#
	# reset
	#
	} elsif ($cmd eq "reset") {
	    &reset();
	    print CLIENT "220 reset PID $$\@$HOSTNAME\n";

	#
	# term
	#
	} elsif ($cmd eq "term") {
	    print CLIENT "220 terminating server\n";
	    close (CLIENT);
	    syslog ("info", "terminating by user command");
	    kill 15, $$;
	    exit;

	#
	# set
	#
	} elsif ($cmd eq "set") {
	    if ($args =~ /^maxkeep\s+(\d+)/) {
		$MAX_KEEP = $1;
		print CLIENT "220 set completed\n";
	    } else {
		($group, $service, $var, $value) = split (/\s+/, $args);
		# look the service up only in groups that exist
		$s = undef;
		$s = &getservbyname ($group, $service)
		    if (defined ($group) && defined ($watch{$group}));
		if (!defined ($s)) {
		    print CLIENT "520 $group,$service not defined\n";
		} else {
		    $watch{$group}[$s]{$var} = $value;
		    print CLIENT "$group $service $var = $value\n";
		    print CLIENT "220 set completed\n";
		}
	    }

	#
	# get
	#
	} elsif ($cmd eq "get") {
	    if ($args =~ /^maxkeep\s*$/) {
		print CLIENT "maxkeep = $MAX_KEEP\n";
		print CLIENT "220 get completed\n";
	    } else {
		($group, $service, $var) = split (/\s+/, $args);
		$s = undef;
		$s = &getservbyname ($group, $service)
		    if (defined ($group) && defined ($watch{$group}));
		if (!defined ($s)) {
		    print CLIENT "520 $group,$service not defined\n";
		} else {
		    print CLIENT "$group $service $var = $watch{$group}[$s]{$var}\n";
		    print CLIENT "220 get completed\n";
		}
	    }

	#
	# list
	#
	} elsif ($cmd eq "list") {
	    ($cmd, $args) = split (/\s+/, $args);

	    #
	    # list group members
	    #
	    if ($cmd eq "group") {
		if ($groups{$args}) {
		    print CLIENT "hostgroup $args @{$groups{$args}}\n";
		    print CLIENT "220 list group completed\n";
		} else {
		    print CLIENT "520 list group error, undefined group\n";
		}

	    #
	    # list status of all services
	    #
	    # NOTE(review): the "succeeded" line reads "_current_output";
	    # nothing in this function sets it -- confirm where it is
	    # meant to be filled in
	    #
	    } elsif ($cmd eq "opstatus") {
		foreach $group (keys %watch) {
		    for ($service=0;$service<@{$watch{$group}};$service++) {
			$sref = \%{$watch{$group}[$service]};
			if (!defined($$sref{"_op_status"})) {
			    print CLIENT "$group " . &getservbynum($group, $service) .
				" 0 $$sref{_timer} untested\n";
			} elsif ($$sref{"_op_status"}) {
			    $$sref{"_current_output"} =~ /^(.*)$/m;
			    print CLIENT "$group " . &getservbynum($group, $service) .
				" $$sref{_last_success} $$sref{_timer} succeeded $1\n";
			} else {
			    $$sref{"_failure_output"} =~ /^(.*)$/m;
			    print CLIENT "$group " . &getservbynum($group, $service) .
				" $$sref{_last_failure} $$sref{_timer} failed $1\n";
			}
		    }
		}
		print CLIENT "220 list opstatus completed\n";

	    #
	    # list disabled hosts and services
	    #
	    } elsif ($cmd eq "disabled") {
		foreach $group (keys %groups) {
		    @l = grep (/^\*/, @{$groups{$group}});
		    if (@l) {
			# strip the "*" marker from the copies
			foreach $h (@l) {
			    $h =~ s/^\*//;
			}
			print CLIENT "group $group: @l\n";
		    }
		}
		foreach $group (keys %watch) {
		    if ($watch_disabled{$group} == 1) {
			print CLIENT "watch $group\n";
		    }
		    for ($service=0;$service<@{$watch{$group}};$service++) {
			if ($watch{$group}[$service]{'disable'} == 1) {
			    print CLIENT "watch $group service " .
				&getservbynum($group, $service) . "\n";
			}
		    }
		}
		print CLIENT "220 list disabled completed\n";

	    #
	    # list last alert history
	    #
	    } elsif ($cmd eq "alerthist") {
		print CLIENT join ("\n", @last_alerts), "\n" if @last_alerts;
		print CLIENT "220 list alerthist completed\n";

	    #
	    # list time of last failures for each service
	    #
	    } elsif ($cmd eq "failures") {
		foreach $group (keys %watch) {
		    for ($s=0;$s<@{$watch{$group}};$s++) {
			$sref = \%{$watch{$group}[$s]};
			# only services whose last run failed; an
			# undefined status means "not tested yet"
			next if (!defined ($$sref{"_op_status"}) ||
				 $$sref{"_op_status"} != 0);
			$$sref{"_failure_output"} =~ /^(.*)$/m;
			print CLIENT "$group ", &getservbynum($group, $s),
			    " $$sref{_last_failure} $$sref{_timer} failed $1\n";
		    }
		}
		print CLIENT "220 list failures completed\n";

	    #
	    # list the failure history
	    #
	    } elsif ($cmd eq "failurehist") {
		print CLIENT join ("\n", @last_failures), "\n"
		    if @last_failures;
		print CLIENT "220 list failurehist completed\n";

	    #
	    # list the time of last successes for each service
	    #
	    } elsif ($cmd eq "successes") {
		foreach $group (keys %watch) {
		    for ($s=0;$s<@{$watch{$group}};$s++) {
			$sref = \%{$watch{$group}[$s]};
			# only services whose last run succeeded
			next if (!$$sref{"_op_status"});
			$$sref{"_current_output"} =~ /^(.*)$/m;
			print CLIENT "$group ", &getservbynum($group, $s),
			    " $$sref{_last_success} $$sref{_timer} succeeded $1\n";
		    }
		}
		print CLIENT "220 list successes completed\n";

	    #
	    # list process IDs
	    #
	    } elsif ($cmd eq "pids") {
		print CLIENT "$$ server\n";
		foreach $value (keys %runningpid) {
		    ($group, $s) = split (/\./, $runningpid{$value});
		    print CLIENT "$value $group $watch{$group}[$s]{service}\n";
		}
		print CLIENT "220 list pids completed\n";

	    #
	    # list watch groups and services
	    #
	    } elsif ($cmd eq "watch") {
		foreach $group (keys %watch) {
		    for ($s=0;$s<@{$watch{$group}};$s++) {
			if (!defined ($service=&getservbynum($group, $s))) {
			    print CLIENT "$group (undefined service)\n";
			} else {
			    print CLIENT "$group $service\n";
			}
		    }
		}
		print CLIENT "220 list watch completed\n";

	    } else {
		print CLIENT "520 unknown list command\n";
	    }


	#
	# dump, for debugging
	#
	} elsif ($cmd eq "dump") {
	    ($group, $service) = split (/\s+/, $args);

	    if (!defined ($watch{$group})) {
		print CLIENT "520 status error, unknown group\n";
	    } elsif (!defined ($s=&getservbyname($group, $service))) {
		print CLIENT "520 status error, unknown service\n";
	    } else {
		foreach $k (keys %{$watch{$group}[$s]}) {
		    print CLIENT "$k = ", $watch{$group}[$s]{$k}, "\n";
		}
		print CLIENT "220 status completed\n";
	    }

	#
	# disable watch, service or host
	#
	} elsif ($cmd eq "disable") {
	    ($cmd, $args) = split (/\s+/, $args, 2);

	    #
	    # disable watch
	    #
	    if ($cmd eq "watch") {
		if (!defined ($watch{$args})) {
		    print CLIENT "520 disable error, unknown watch\n";
		} else {
		    $watch_disabled{$args} = 1;
		    print CLIENT "220 disable watch completed\n";
		}

	    #
	    # disable service
	    #
	    } elsif ($cmd eq "service") {
		($group, $service) = split (/\s+/, $args, 2);

		if (!defined ($watch{$group})) {
		    print CLIENT "520 disable error, unknown group\n";
		} elsif (!defined ($s=&getservbyname ($group, $service))) {
		    print CLIENT "520 disable error, unknown service\n";

		} else {
		    $watch{$group}[$s]{"disable"} = 1;
		    print CLIENT "$group,$service disabled\n";
		    print CLIENT "220 disable service completed\n";
		}

	    #
	    # disable host: prefix the host with "*" in every group
	    # (exact string comparison, no pattern built from input)
	    #
	    } elsif ($cmd eq "host") {
		foreach $var (split (/\s+/, $args)) {
		    foreach $group (keys %groups) {
			foreach $h (@{$groups{$group}}) {
			    $h = "*$var" if ($h eq $var);
			}
		    }
		}
		print CLIENT "220 disable host completed\n";

	    } else {
		print CLIENT "520 invalid command\n";
	    }

	#
	# enable watch, service or host
	#
	} elsif ($cmd eq "enable") {
	    ($cmd, $args) = split (/\s+/, $args, 2);

	    #
	    # enable watch
	    #
	    if ($cmd eq "watch") {
		if (!defined($watch{$args})) {
		    print CLIENT "520 enable error, unknown watch\n";
		} else {
		    $watch_disabled{$args} = 0;
		    print CLIENT "220 enable watch completed\n";
		}


	    #
	    # enable service
	    #
	    } elsif ($cmd eq "service") {
		($group, $service) = split (/\s+/, $args, 2);

		if (!defined ($watch{$group})) {
		    print CLIENT "520 enable error, unknown group\n";
		} elsif (!defined ($s=&getservbyname ($group, $service))) {
		    print CLIENT "520 enable error, unknown service\n";
		} else {
		    $watch{$group}[$s]{"disable"} = 0;
		    print CLIENT "$group,$service enabled\n";
		    print CLIENT "220 enable completed\n";
		}

	    #
	    # enable host: remove the "*" prefix in every group
	    #
	    } elsif ($cmd eq "host") {
		foreach $var (split (/\s+/, $args)) {
		    foreach $group (keys %groups) {
			foreach $h (@{$groups{$group}}) {
			    $h = $var if ($h eq "*$var");
			}
		    }
		}
		print CLIENT "220 enable completed\n";

	    } else {
		print CLIENT "520 invalid command\n";
	    }
	}
    }
    close (CLIENT);
    syslog ('info', "client finished");
    return;
}


######################################################################
#
# show usage
#
sub usage {
    #
    # print the command line synopsis on the currently selected output
    # handle.  The text now covers every switch handed to getopts
    # (-h, -m, -P and -r were missing) and names the real default
    # configuration file.
    #
    print <<"EOF";
usage: mon [-f] [-d] [-c config] [-a dir] [-s dir] [-i secs] [-k num]
           [-m num] [-p num] [-P pidfile] [-r delay]
       mon -h
       mon -v

  -a dir	alert script dir
  -c config	config file, defaults to "/etc/mon.cf"
  -d		debug
  -f		fork and become a daemon
  -h		print this help
  -i secs	sleep interval (seconds), defaults to 1
  -k num	keep history of last num events
  -m num	run at most num monitor processes at once
  -p num	server listens on port num
  -P pidfile	write the process ID to pidfile
  -r delay	randomize the first checks over delay (e.g. 30s, 5m)
  -s dir	monitor script dir
  -v		print version

Report bugs to $AUTHOR
$RCSID
EOF
}


######################################################################
#
# look up a service by number, returning the service name
#
sub getservbynum {
    # arguments: watch group name, service number
    # returns the "service" entry stored for that slot (undef if none)
    my ($group, $s) = @_;

    return $watch{$group}->[$s]->{'service'};
}

######################################################################
#
# look up a service by name, returning the service number
#
sub getservbyname {
    # arguments: watch group name, service name
    # returns the number of the first service in the group with that
    # name, or undef when there is none
    my ($group, $service) = @_;
    my $idx = 0;

    while ($idx < @{$watch{$group}}) {
	return $idx
	    if ($watch{$group}[$idx]{"service"} eq $service);
	$idx++;
    }

    return undef;
}


######################################################################
#
# become a daemon
#
sub daemon {
    #
    # fork; the parent exits, the child carries on in a new session
    # with "/" as working directory and stdin, stdout and stderr
    # attached to /dev/null
    #
    $pid = fork();
    die "could not fork: $!\n"
	if (!defined $pid);

    # the parent goes away all happy and stuff
    exit (0) if ($pid);

    setsid();
    chdir ('/');
    umask (022);

    unless (open (N, "+</dev/null")) {
	syslog ("crit", "could not open /dev/null: %m");
	exit(1);
    }

    # stdout first, then stdin, then stderr; the first failure is fatal
    unless (open(STDOUT, ">&N") &&
	    open (STDIN, "<&N") &&
	    open (STDERR, ">&N")) {
	syslog ("crit", "could not redirect: %m");
	exit(1);
    }

    syslog ('info', "running as daemon");
}


######################################################################
#
# debug
#
sub debug {
    # silent unless -d was given; with -d the message goes to syslog
    # when running as a daemon (-f) and to stderr otherwise
    return unless ($opt_d);

    if ($opt_f) {
	syslog ('debug', join ('', @_));
    } else {
	print STDERR @_;
    }
}


######################################################################
#
# die_die
#
sub die_die {
    # arguments: syslog level, message
    # with -d: die with "[level] message"; otherwise log the message
    # as fatal, close the log and exit with status 1
    my ($level, $msg) = @_;

    if ($opt_d) {
	die "[$level] $msg\n";
    }

    syslog ($level, "fatal, $msg");
    closelog();
    exit (1);
}


######################################################################
#
# handle cleanup of exited processes
# trigger alerts on failures (or send no alert if disabled)
# do some accounting
#
sub proc_cleanup {
    #
    # reap every monitor that has exited (non-blocking) and record the
    # result.  For a non-zero exit status: read the remaining output,
    # update the failure accounting, append to @last_failures and call
    # &do_alert unless the service is disabled.  For a zero status:
    # record the success.  Finally save the output and drop the
    # process from the tables.
    #
    # $group, $service and $sref are package globals, as in the main
    # loop
    #
    # changes against the previous version:
    #   - the exit status is saved right after waitpid
    #   - PIDs that are not in %runningpid are skipped; they used to
    #     create an empty entry in %watch
    #   - monitor output is logged through a "%s" format
    #   - the "ignoring alert" message contains the service name (it
    #     was passed as an unused format argument and got lost)
    #   - @last_failures is limited to $MAX_KEEP entries, not
    #     $MAX_KEEP + 1
    #
    my ($summary, $status, $key);
    my ($p, $fh, $z, $buf);

    return if (!keys %running);

    while (($p = waitpid (-1, &WNOHANG)) > 0) {
	$status = $?;
	$key = $runningpid{$p};		# "group.servicenumber"

	if (!defined ($key)) {
&debug ("PID $p (not a monitor) exited\n");
	    next;
	}

&debug ("PID $p ($key) exited\n");
	($group, $service) = split (/\./, $key);
	$sref = \%{$watch{$group}[$service]};

	#
	# error exit value, handle alert
	#
	if ($status) {

	    #
	    # suck in any extra data
	    #
	    $fh = $fhandles{$key};
	    while ($z = sysread ($fh, $buf, 8192)) {
		$ibufs{$key} .= $buf;
	    }

	    #
	    # accounting
	    #
	    $$sref{"_failure_count"}++;
	    $$sref{"_last_failure"} = time;
	    $$sref{"_op_status"} = 0;
	    # the summary is the first line of the monitor output
	    $summary = (split("\n", $ibufs{$key}))[0];
	    $summary = "(NO SUMMARY)" if ($summary =~ /^\s*$/m);
	    shift @last_failures
		while (@last_failures && @last_failures >= $MAX_KEEP);
	    push @last_failures, "$group " . &getservbynum ($group, $service) .
		" $tm $summary";
	    &syslog ('crit', "%s", "failure for $last_failures[-1]");

	    #
	    # if the alarm is disabled, ignore it
	    #
	    if ($$sref{"disable"} == 1) {
		syslog ("notice", "%s", "ignoring alert for $group." .
			&getservbynum ($group, $service));
	    #
	    # otherwise, trigger it
	    #
	    } else {
		&do_alert ($group, $service, $ibufs{$key}, $status>>8);
	    }

	    # stored after &do_alert, which compares the new output
	    # with the previous failure output
	    $$sref{"_failure_output"} = $ibufs{$key};

	#
	# record the time of the last success
	#
	} else {
	    $$sref{"_last_success"} = time;
	    $$sref{"_op_status"} = 1;
	}

	#
	# save the output
	#
	$$sref{"_last_output"} = $ibufs{$key};
#	$$sref{"_last_output"} = $$sref{"_current_output"};
#	$$sref{"_current_output"} = $ibufs{$key};

	&remove_proc ($p);
    }
}


######################################################################
#
# collect output from running processes
#
sub collect_output {
    #
    # poll the output handles of the running monitors (zero timeout)
    # and append whatever can be read to %ibufs, keyed like %fhandles
    # ("group.servicenumber").  Handles that hit end of file or an
    # error are taken out of the select set $fdset_rbits.
    #
    # $nfound, $rout, $k, $fh, $z and $buf are package globals
    #
    return if (!keys %running);
&debug ("things are running, checking for output\n");

    $nfound = select($rout=$fdset_rbits, undef, undef, 0);
&debug ("select returned $nfound file handles\n");
    if ($nfound) {
	#
	# look for the file descriptors that are readable,
	# and try to read as much as possible from them
	#
	foreach $k (keys %fhandles) {
	    $fh = $fhandles{$k};
	    if (vec ($rout, fileno($fh), 1) == 1) {
		# the loop ends with $z == 0 (sysread read nothing)
		# or $z undefined (sysread failed)
		$z = 0;
		while ($z = sysread ($fh, $buf, 8192)) {
		    $ibufs{$k} .= $buf;
&debug ("[$buf] from $fh\n");
		}

		#
		# ignore if EAGAIN, since we're nonblocking
		#
		if (!defined($z) && $! == &EAGAIN) {

		#
		# error on this descriptor
		#
		} elsif (!defined($z)) {
&debug ("error on $fh: $!\n");
		    &syslog ('crit', "error on $fh: $!");
		    vec($fdset_rbits, fileno($fh), 1) = 0;
		# a zero-length read while errno is EAGAIN is only
		# reported under -d; the handle stays in the select set
		} elsif ($z == 0 && $! == &EAGAIN) {
&debug ("EAGAIN on $fh\n");

		#
		# if EOF encountered, stop trying to
		# get input from this file descriptor
		#
		} elsif ($z == 0) {
&debug ("EOF on $fh\n");
		    vec($fdset_rbits, fileno($fh), 1) = 0;

		}
	    }
	}
    }
}




######################################################################
#
# handle forking a monitor process, and set up variables
#
sub run_monitor {
    #
    # start the monitor of the service the main loop is looking at and
    # register it for output collection.  Takes no arguments: $group,
    # $service and $sref are the package globals set by the main loop.
    #
    # returns 0 when the fork fails
    #
    # fix: a child whose exec failed used to fall through and go on
    # running the parent's monitoring loop; it now reports the problem
    # and exits
    #
    my (@args, $pid, $fl, $monitor);

    #
    # the output handle is a symbolic file handle named after the
    # group and the service number
    # NOTE(review): group "a1" service 0 and group "a" service 10 get
    # the same name, as do groups that differ only in case -- confirm
    # whether such configurations need to be supported
    #
    $fhandles{"$group.$service"} = "\U$group$service";

    $monitor = "$SCRIPTDIR/$$sref{monitor}";

    #
    # if monitor ends with ";;", do not append groups
    # to command line
    #
    if ($monitor =~ s/;;\s*$//) {
	@args = split (/\s+/, $monitor);

    } else {
	# hosts marked disabled (leading "*") are left out
	@args = (split (/\s+/, $monitor),
		grep (!/^\*/, @{$groups{$group}}));
    }

    #
    # fork with the child's stdout connected to our handle
    #
    $pid = open($fhandles{"$group.$service"}, '-|');
    if (!defined $pid) {
    	syslog ('err', "Could not fork\n");
	delete $fhandles{"$group.$service"};
	return 0;
    } elsif ($pid == 0) {
    	open(STDERR, '>&STDOUT')
	    or syslog ('err', "Could not dup stderr\n");
	close(STDIN);
	exec @args or syslog ('err', "%s", "could not exec '@args': $!");

	#
	# only reached when exec failed.  Leave a line for the failure
	# summary on the pipe and exit with a failure status; _exit is
	# used so that nothing inherited from the parent (buffered
	# output, END blocks) runs in the child.
	#
	syswrite (STDOUT, "could not exec '@args'\n");
	POSIX::_exit (255);
    }

&debug ("watching file handle ", fileno ($fhandles{"$group.$service"}),
    " for $group.$service\n");

    #
    # set nonblocking I/O and setup bit vector for select(2)
    #
    $fl = fcntl $fhandles{"$group.$service"}, F_GETFL, 0;
    $fl |= O_NONBLOCK;
    fcntl $fhandles{"$group.$service"}, F_SETFL, $fl;

    vec ($fdset_rbits,
	fileno($fhandles{"$group.$service"}), 1) = 1;
    $fdset_ebits |= $fdset_rbits;

    #
    # note that this is running
    #
    $running{"$group.$service"} = 1;
    $runningpid{$pid} = "$group.$service";
    $ibufs{"$group.$service"} = "";
    $procs++;

    #
    # set the countdown timer for this service: the interval, moved
    # by a random 1..randskew seconds in either direction when
    # randskew is set
    #
    if ($$sref{"randskew"} != 0) {
    	$$sref{"_timer"} = $$sref{"interval"} +
	     (int (rand (2)) == 0 ? -int(rand($$sref{"randskew"}) + 1) :
	     	int(rand($$sref{"randskew"})+1));
    } else {
	$$sref{"_timer"} =
	    $$sref{"interval"};
    }
}


######################################################################
#
# randomize the delay before each test
# $RANDSTART is the upper bound of the delay, in seconds
#
sub randomize_startdelay {
    # give every service a first-run timer drawn at random from
    # 0 .. $RANDSTART-1 seconds
    my ($group, $sref);

    foreach $group (keys %watch) {
	foreach $sref (@{$watch{$group}}) {
	    $$sref{"_timer"} = int (rand ($RANDSTART));
	}
    }
}


#
# return 1 if $val is within $range,
# where $range = "number" or "number-number"
#
sub inRange {
    # arguments: a number and a range, "N" or "N-M" (bounds inclusive)
    # returns 1 when the number lies in the range, else 0 (also for a
    # range in any other form)
    my ($val, $range) = @_;

    # single number
    if ($range =~ /^(\d+)$/) {
	return ($val == $1) ? 1 : 0;
    }

    # lower-upper
    if ($range =~ /^(\d+)\s*-\s*(\d+)$/) {
	return ($val >= $1 && $val <= $2) ? 1 : 0;
    }

    return 0;
}
