#!/usr/bin/env perl
# $Id: vcsstats.pl,v 1.18 2008/12/03 16:13:17 ksb Exp $
# $Source: /usr/msrc/usr/local/libexec/vcsstats/RCS/vcsstats.pl,v $
#
# Report on VCS instances that run on this host. --jad
#
# Some assumptions are made about the VCS configuration:
#   1) The name of each system in the VCS cluster matches the
#      unqualified hostname of each host.  If this is not the
#      case, either the "-a" or "-s" options must be used.
#   2) All system names share the same alphabetical base and
#      are differentiated only by their unique numerical suffix.
#      This numerical suffix is used to identify the system in the
#      update to PEG. If this is not the case, "-l" must be specified
#      to read a numerical identifier for each system out of
#      /etc/llthosts.
#
# A number of op rules are required for this to work:
#	op vcs-status
#	op vcs-display
# Also, this tool depends on the tool "since".
#
# Maybe it would be better to run this per-service out of VCS rather
# than for an entire system out of crontab.		--jad

use lib '/usr/local/lib/sac/perl'.join('.', unpack('c*', $^V)),
	'/usr/local/lib/sac';

use strict;
use warnings;
use vars qw($progname $sysname $pegname $target $trace $dgrp_c $dres_c $since_c
	$rrdup_c $opt_a $opt_l $opt_N $opt_s $opt_t $opt_x $opt_h $opt_V
	$vcs_log $llthosts $statedir $sincedb $sfile
	$S_ONLINE $S_STARTING $S_STOPPING $S_PARTIAL $S_FAULTED $S_OFFLINE);
use Getopt::Std qw(getopts);
use POSIX qw(uname mktime);
use Data::Dumper;

# Global set-up: command search path, program name, state codes, command
# line options and every tunable default.  Each VS_* environment variable
# overrides the built-in default that follows it.

# Append /usr/local/bin to the search path used for the commands run below.
$ENV{'PATH'} = "$ENV{'PATH'}:/usr/local/bin";
# Program name is the last path component of $0.
($progname = $0) =~ s,.*/,,;
# Numeric codes reported for group states: offline is 0, online is 5.
($S_OFFLINE, $S_FAULTED, $S_PARTIAL, $S_STOPPING, $S_STARTING, $S_ONLINE) = 0..5;
# getopts() fills in the $opt_* package variables.  It returns false when it
# meets an option it does not know; stop there rather than silently running
# with an option missing.
getopts('alN:s:t:xhV') or do {
	print STDERR "$progname: bad option, try \"$progname -h\" for usage\n";
	exit 64;
};
# System name defaults to the unqualified node name of this host.
(undef, $sysname) = uname();
$sysname =~ s/\..*//;
$pegname = $sysname;
# The name was already cut at its first dot above, so the two substitutions
# below normally have nothing left to remove; they are kept as they were.
$pegname =~ s/\.sac\.fedex\.com$//; # historical PEG reporting behavior
$pegname =~ s/\.fedex\.com$//;
$target = 'peg.sac.fedex.com:31415';
# External commands
$dgrp_c = $ENV{'VS_DGRPS_COMM'} || 'op vcs-display grp';
$dres_c = $ENV{'VS_DRES_COMM'} || 'op vcs-display res';
$since_c = $ENV{'VS_SINCE_COMM'} || 'since';
$rrdup_c = $ENV{'VS_RRDUP_COMM'} || 'rrdup';
# Input files
$vcs_log = $ENV{'VS_ENGINE_LOG'} || '/var/VRTSvcs/log/engine_A.log';
$llthosts = $ENV{'VS_LLTHOSTS'} || '/etc/llthosts';
# Persistent state kept between runs; by default under ~/.<progname>
$statedir = $ENV{'VS_STATE_DIR'} || "$ENV{'HOME'}/.${progname}";
$sincedb = $ENV{'VS_SINCE_DB'} || "$statedir/since";
$sfile = $ENV{'VS_STATE_FILE'} || "$statedir/state";

# Print the usage summary and terminate the program.
#   $ret  exit status; with 0 the text goes to stdout, with any other
#         value it goes to stderr
# Never returns.
sub usage($) {
	my($status) = @_;
	my $out = $status == 0 ? *STDOUT : *STDERR;
	# One line per accepted invocation form ...
	my @synopsis = (
		"$progname: usage [-lx] [-N host] [-t target] [-a | -s system] [group] ...\n",
		"$progname: usage -h\n",
		"$progname: usage -V\n",
	);
	# ... followed by one line per option or argument.
	my @details = (
		"a         report on service groups on all systems, not just this one\n",
		"l         base the numerical name from llthosts, not the system name\n",
		"N host    send reports to peg as a particular hostname\n",
		"s system  report on groups on a particular system; defaults to current system\n",
		"t target  rrd host to update\n",
		"x         trace rrd updates to stderr\n",
		"h         print only this help message\n",
		"V         show only version info\n",
		"group     report only on a specific service group\n",
	);
	print {$out} @synopsis, @details;
	exit $status;
}

# Read the output of the group display command ($dgrp_c) and file every
# attribute in the stats structure as $stats->{system}{group}{attribute}.
#   $stats  hash reference to fill in; it is modified in place
# Returns $stats, or undef when the command cannot be started or exits
# with a failure status.
# Lines are "group attribute system value", split on white space; comment
# lines, blank lines and lines with fewer than three fields are skipped.
# Every "State" line also sets a numeric "_state": one of the $S_* codes,
# or -1 when the value contains none of the known state words.
sub parseGroups {
	my($stats) = @_;
	my($dispin, $group, $attr, $sys, $val);
	# State words in the order they are tested; the first one found wins.
	# NOTE(review): should the command ever report a combined value such as
	#	|OFFLINE|FAULTED|, it maps to OFFLINE because OFFLINE is tested
	#	first -- confirm that this priority is intended.
	my @states = (
		['OFFLINE', $S_OFFLINE],
		['FAULTED', $S_FAULTED],
		['ONLINE', $S_ONLINE],
		['STARTING', $S_STARTING],
		['STOPPING', $S_STOPPING],
		['PARTIAL', $S_PARTIAL],
	);
	open($dispin, '-|', $dgrp_c) or
		return undef;
	while(my $line = <$dispin>) {
		$line =~ m/^(#|\s*$)/
			and next;
		($group, $attr, $sys, $val) = split(' ', $line);
		# Without a system field there is nothing to file the line under
		defined $sys or
			next;
		# Remap the reserved system word 'global' to '_global' to make later filtering easier
		'global' eq $sys and
			$sys = '_global';
		$stats->{$sys}{$group}{$attr} = $val;
		# Convert the "State" attribute to a numerical "_state"
		'State' eq $attr or
			next;
		$stats->{$sys}{$group}{'_state'} = -1;
		defined $val or
			next;
		foreach my $known (@states) {
			my($word, $code) = @{$known};
			index($val, "|$word|") < 0 and
				next;
			$stats->{$sys}{$group}{'_state'} = $code;
			last;
		}
	}
	# For a pipe, close is where the exit status of the command shows up
	close($dispin) or
		return undef;
	return $stats;
}

# Read the output of the resource display command ($dres_c).
#   $stats   hash reference; for each system and group it gains
#            "_resonline", the number of that group's resources seen
#            with State ONLINE on that system
#   $resmap  hash reference; filled with resource name => owning group
#            name, for use by other parts of the program
# Returns $stats, or undef when the command cannot be started or exits
# with a failure status.
# Lines are "resource attribute system value", split on white space;
# comment lines, blank lines and lines without a value are skipped.
sub parseResources {
	my($stats, $resmap) = @_;
	my($dispin, $res, $attr, $sys, $val, %online);
	open($dispin, '-|', $dres_c) or
		return undef;
	while(my $line = <$dispin>) {
		$line =~ m/^(#|\s*$)/
			and next;
		($res, $attr, $sys, $val) = split(' ', $line);
		# Both attributes used below need a value
		defined $val or
			next;
		# Map resources to the group that owns them
		if('Group' eq $attr) {
			$resmap->{$res} = $val;
		}
		# If we find a resource that is online, remember what system
		#	we saw it on for the next step
		if('State' eq $attr && 'ONLINE' eq $val) {
			$online{$res}{$sys} = 1;
		}
	}
	# For a pipe, close is where the exit status of the command shows up
	close($dispin) or
		return undef;
	# The resource map has to be complete before the online resources
	#	can be counted per group and system
	foreach my $name (keys(%online)) {
		my $owner = $resmap->{$name};
		# A resource with no Group line cannot be charged to any group
		defined $owner or
			next;
		foreach my $host (keys(%{$online{$name}})) {
			$stats->{$host}{$owner}{'_resonline'}++;
		}
	}
	return $stats;
}

# Read any updates to the VCS log and place them in the stats structure.
# The log is read through "$since_c -F$sincedb $vcs_log"; we rely on
# "since" to hand over only the lines added since the last run.
#   $stats   hash reference; gains per system and group a message count
#            by message type (through msgincr) and, for "is online"
#            messages, "_onlinetime" taken from the time stamp of the line
#   $resmap  resource name => group name map from parseResources, used to
#            file resource messages under the owning group
# Returns $stats, or undef when the command cannot be started.
sub parseLog {
	my($stats, $resmap) = @_;
	my($vcslog, $logpre);
	open($vcslog, '-|', "$since_c -F$sincedb $vcs_log") or
		return undef;
	# Common start of a log line: date, time, "VCS", message type ($1)
	#	and message id
	$logpre = qr!^[\d/]+\s+[\d:]+\s+VCS\s+(\w+)\s+V-[\d\-]+\s+!;
	while(my $entry = <$vcslog>) {
		# The first pattern that matches decides how the line is filed
		if($entry =~ m/${logpre}.*\(Owner:\s+\w+,\s+Group:\s+(\w+)\)\s+is\s+\w+\s+on\s+(\w+)/) {
			msgincr($stats, $3, $2, $1);
			next;
		}
		if($entry =~ m/${logpre}Group\s+(\w+)\s+is\s+online\s+on\s+system\s+(\w+)/) {
			my($type, $group, $sys) = ($1, $2, $3);
			msgincr($stats, $sys, $group, $type);
			# Time stamp is year/month/day hour:minute:second.  mktime
			#	counts months from 0 and years from 1900, but the day
			#	of the month from 1, so the day is passed as it stands.
			if($entry =~ m!^(\d+)/(\d+)/(\d+)\s+(\d+):(\d+):(\d+)!) {
				$stats->{$sys}{$group}{'_onlinetime'} =
					mktime($6, $5, $4, $3, $2 - 1, $1 - 1900);
			}
			next;
		}
		if($entry =~ m/${logpre}Group\s+(\w+)\s+.*\s+on\s+system\s+(\w+)/) {
			msgincr($stats, $3, $2, $1);
			next;
		}
		if($entry =~ m/${logpre}.*for\s+Group\s+(\w+)\s+on\s+node\s+(\w+)/) {
			msgincr($stats, $3, $2, $1);
			next;
		}
		# Attribute messages name no system; file them under '_global'
		if($entry =~ m/${logpre}attribute\s+for\s+group\s+(\w+)\s+on/) {
			msgincr($stats, '_global', $2, $1);
			next;
		}
		# Resource messages: "(system) word:resource:"; only resources
		#	known to the map can be charged to a group
		if($entry =~ m/${logpre}\((\w+)\)\s+\w+:(\w+):/) {
			exists $resmap->{$3} and
				msgincr($stats, $2, $resmap->{$3}, $1);
			next;
		}
	}
	close($vcslog);
	return $stats;
}

# Count one more log message of a particular type against a specific
#	group on a specific system
#   $stats    hash reference holding all collected data
#   $sys      system the message belongs to
#   $group    group the message belongs to
#   $msgtype  message type, used as the key of the counter
# The counters live in $stats->{$sys}{$group}{'_msgcnt'}{$msgtype}; every
# missing level is created on the way.  Returns the count as it was
# before this call.
sub msgincr {
	my($stats, $sys, $group, $msgtype) = @_;
	my $record = ($stats->{$sys}{$group} ||= {});
	my $counts = ($record->{'_msgcnt'} ||= {});
	my $before = $counts->{$msgtype} || 0;
	$counts->{$msgtype} = $before + 1;
	return $before;
}

# Copy all attributes filed under '_global' to each system
# This makes the final processing logic much simpler
#   $stats  hash reference holding all collected data; modified in place
# For every group a system already knows, each attribute of the same group
# under '_global' is copied to the system, replacing the system's value.
# The one exception is "_msgcnt": the global message counts are added to
# the system's own counts, so messages counted for the system are not lost
# and no counter hash ends up shared between systems.
# '_global' itself is left untouched.
sub normalize {
	my($stats) = @_;
	my $global = $stats->{'_global'};
	# Nothing was filed globally, so there is nothing to hand down
	'HASH' eq ref $global or
		return;
	foreach my $sys (keys(%{$stats})) {
		'_global' eq $sys and
			next;
		foreach my $group (keys(%{$stats->{$sys}})) {
			my $shared = $global->{$group};
			'HASH' eq ref $shared or
				next;
			foreach my $attr (keys(%{$shared})) {
				if('_msgcnt' eq $attr && 'HASH' eq ref $shared->{$attr}) {
					my $own = ($stats->{$sys}{$group}{'_msgcnt'} ||= {});
					foreach my $type (keys(%{$shared->{$attr}})) {
						$own->{$type} = ($own->{$type} || 0) +
							$shared->{$attr}{$type};
					}
					next;
				}
				$stats->{$sys}{$group}{$attr} = $shared->{$attr};
			}
		}
	}
	return;
}

# Format one rrd update line for a group and write it to rrdup
# The RRDUP handle must be open to an rrdup process before calling this
#   $group       group name; becomes part of the rrd file name
#   $member      numerical identifier of the system; must be defined
#   $state       numerical state of the group
#   $onlinetime  time the group came online; 'N' is sent when undefined
#   $resonline, $linfo, $lnotice, $lwarning, $lerror
#                counters; 0 is sent for each one that is undefined
# Exits with status 70 when $member is undefined.
sub sendUpdate {
	my($group, $member, $state, $onlinetime, @counters) = @_;

	unless(defined $member) {
		print STDERR "$progname: internal error mapping systems to integers\n";
		exit 70;
	}
	$onlinetime = 'N'
		unless defined $onlinetime;
	# The five trailing arguments all fall back to 0
	my($resonline, $linfo, $lnotice, $lwarning, $lerror) =
		map { defined $_ ? $_ : 0 } @counters[0..4];

	print RRDUP ("update host/$pegname/vcs-$group.rrd ",
		"-t member:state:onlinetime:resonline:linfo:lnotice:lwarning:lerror ",
		"N:$member:$state:$onlinetime:$resonline:$linfo:$lnotice:$lwarning:$lerror\n");
}

################
# Main program

# Flow of the main program:
#   1) collect group and resource data from the display commands
#   2) load the state remembered from the previous run
#   3) open every output and input we will need
#   4) read the new log lines and spread global data to the systems
#   5) work out the member index of each system
#   6) send one update per reported group, then save the state
# Exit status: 0 success, 64 usage error, 69 a command or file could not
# be used, 70 (from sendUpdate) a system has no member index.
my($stats, $memory, $resmap, @systems, @groups, $sys, $group);
my($statefh, $lltfh, $rrdup_ok);
$stats = {};
$resmap = {};
$memory = {};

$opt_h and
	usage(0);
if($opt_a && $opt_s) {
	print STDERR "$progname: -s incompatible with -a\n";
	usage(64);
}
# Parse the group data a bit early so we can use its data in -V
#	If it fails, we won't report the error until after we check for -V
$stats = parseGroups($stats) and
	@systems = sort grep { ! m/^_/o } keys(%{$stats});
$trace = "";
$opt_x and
	$trace = "-x";
$opt_N and
	$pegname = $opt_N;
$opt_t and
	$target = $opt_t;
$opt_s and
	$sysname = $opt_s;
if($opt_V) {
	print "$progname: ", '$Id: vcsstats.pl,v 1.18 2008/12/03 16:13:17 ksb Exp $',"\n";
	print "$progname: send updates as: $pegname\n";
	print "$progname: send updates to: $target\n";
	print "$progname: report on system: $sysname\n";
	print "$progname: display groups command: $dgrp_c\n";
	print "$progname: display resources command: $dres_c\n";
	print "$progname: vcs log: $vcs_log\n";
	print "$progname: llthosts: $llthosts\n";
	print "$progname: since database: $sincedb\n";
	print "$progname: state file: $sfile\n";
	defined $stats and
		print "$progname: all systems: @systems\n";
	exit 0;
}
# Without -a only one system is reported: this host, or the one given to -s
!$opt_a and
	@systems = ($sysname);
if(!defined $stats) {
	print STDERR "$progname: failed to run: $dgrp_c\n";
	exit 69;
}
parseResources($stats, $resmap) or do {
	print STDERR ("$progname: unable to run: $dres_c\n");
	exit 69;
};

# Read in any state from a previous run
if(-f $sfile) {
	my($rawstate);
	open($statefh, '<', $sfile) or do {
		print STDERR "$progname: failed to open: $sfile\n";
		exit 69;
	};
	$rawstate = do { local $/; <$statefh> }; # slurp mode
	close($statefh);
	defined $rawstate or
		$rawstate = '';
	# The state file is the Data::Dumper text written at the end of a run
	#	and assigns to $memory when evaluated.
	# NOTE(review): this runs whatever Perl the file contains, so the
	#	state file must only be writable by the user running this.
	eval $rawstate;
	if($@ || 'HASH' ne ref $memory) {
		print STDERR "$progname: ignoring unusable state in: $sfile\n";
		$memory = {};
	}
}

# Make sure we open _before_ we call since all the descriptors we'll need later
# If any opens fail, we won't be holding data we can't put anywhere
open(RRDUP, '|-', "$rrdup_c $trace $target") or do {
	print STDERR "$progname: failed to run: $rrdup_c\n";
	exit 69;
};
# The mode has to be an octal number; the string "755" would be taken
#	as decimal 755
-d $statedir or mkdir($statedir, 0755) or do {
	print STDERR "$progname: failed to create: $statedir: $!\n";
	exit 69;
};
open($statefh, '>', $sfile) or do {
	print STDERR "$progname: failed to open for write: $sfile\n";
	exit 69;
};
if($opt_l) {
	open($lltfh, '<', $llthosts) or do {
		print STDERR "$progname: failed to open $llthosts\n";
		exit 69;
	};
}

# We want to parse data from the log as late as possible so we don't run
#	since then fail without an update
parseLog($stats, $resmap) or do {
	print STDERR ("$progname: unable to since: $vcs_log\n");
	exit 69;
};
# Copy straggling data out of _global to each system
normalize($stats);

# Determine the member index of each system based on the hostname or the
#	mapping in llthosts
if($opt_l) {
	# Only lines that start with a number carry an "index system" pair
	while(my $line = <$lltfh>) {
		$line =~ m/^\s*[0-9]/ or
			next;
		my($index, $host) = split(' ', $line);
		defined $host or
			next;
		$stats->{$host}{'_index'} = $index;
	}
	close($lltfh);
} else {
	# The index is the first run of digits in the system name
	# NOTE(review): a name without any digit is kept whole as its index,
	#	so the undefined check in sendUpdate never fires on this path
	foreach $sys (keys(%{$stats})) {
		($stats->{$sys}{'_index'} = $sys) =~ s/^[^\d]*(\d+).*$/$1/;
	}
}

# Process each system we want to send updates for, making any final decisions and
#	then sending the update
foreach $sys (@systems) {
	$memory->{$sys} ||= {};
	# Get group list from the command line if specified, else output all groups
	#	for a system
	@groups = @ARGV or
		@groups = grep { ! m/^_/o } keys(%{$stats->{$sys}});
	foreach $group (@groups) {
		# All groups may not be available on all systems
		!defined $stats->{$sys}{$group} and
			next;
		$memory->{$sys}{$group} ||= {};
		# Do not report groups that are offline; a group whose state was
		#	never seen is treated the same way
		my $state = $stats->{$sys}{$group}{'_state'};
		if(!defined $state || $state == $S_OFFLINE) {
			# If a group is offline on the monitored system,
			#     forget state information.
			delete $memory->{$sys}{$group};
			next;
		}
		# If we read a group online message from the log, update
		#      our persistent state
		$stats->{$sys}{$group}{'_onlinetime'} and
			$memory->{$sys}{$group}{'_onlinetime'} =
				$stats->{$sys}{$group}{'_onlinetime'};
		# Clear the online time from our persistent memory if the
		#	group isn't online
		$S_ONLINE == $state or
			$memory->{$sys}{$group}{'_onlinetime'} = undef;
		# Do not report on parallel groups unless explicitly listed on
		#	the command line
		'0' ne $stats->{$sys}{$group}{'Parallel'} && ! @ARGV and
			next;

		sendUpdate($group,
			$stats->{$sys}{'_index'},
			$state,
			$memory->{$sys}{$group}{'_onlinetime'},
			$stats->{$sys}{$group}{'_resonline'},
			$stats->{$sys}{$group}{'_msgcnt'}{'INFO'},
			$stats->{$sys}{$group}{'_msgcnt'}{'NOTICE'},
			$stats->{$sys}{$group}{'_msgcnt'}{'WARNING'},
			$stats->{$sys}{$group}{'_msgcnt'}{'ERROR'}
		);
	}
}
# Closing the pipe waits for rrdup and tells us how it ended; the state is
#	still saved below whatever the answer
$rrdup_ok = close(RRDUP);
$rrdup_ok or
	print STDERR "$progname: failed to send updates with: $rrdup_c\n";

# Dump our persistent state out to a file for the next run
print {$statefh} Data::Dumper->Dump([$memory], ["memory"]);
close($statefh) or do {
	print STDERR "$progname: failed to write: $sfile\n";
	exit 69;
};

exit($rrdup_ok ? 0 : 69);
