#!/usr/bin/perl
#
# Filename: /usr/local/etc/ping/pingsweep.pl
#
# Purpose: This Perl script is designed to ping a list of assets and alert
# if any asset doesn't respond to an ICMP ping. The alert will be
# in the form of an email message but if a major outage occurs the
# script will also send pages (via email) to the appropriate staff.
#
# Author: Michael McNamara (http://blog.michaelfmcnamara.com)
#
# Credits: Stuart Kendrick (http://www.skendric.com/)
# I've taken a lot of ideas (and some code) from Stuart's many scripts.
# Without Stuart's help it would have taken me much longer to develop
# some of the scripts I've come to rely on today.
#
# Date: May 6, 2003
#
# License:
# Copyright (C) 2002 Michael McNamara (mfm@michaelfmcnamara.com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Changes:
#
# Jul 18, 2011: added logic to skip assets which regex to IGNORE_PAGERALERTS
# from being counted against the 15 assets to trigger a page
# Dec 10, 2009: added subroutine for SMTP mail delivery
# Dec 9, 2009: change to MIME::Lite for email delivery, attaching GIF
# files within HTML message body
# Dec 30, 2005: cleaned up DEBUG and logging statements for troubleshooting
# Mar 18, 2005: added file logging for troubleshooting and monitoring
# Feb 19, 2005: added threshold logic to prevent flapping notifications
# Feb 17, 2005: fixed logic regarding alert notification and ping retry
# Jan 17, 2005: added code that will retry ping if # devices alarming is < 10 devices
# Jul 14, 2004: script will now report when devices are once again reachable
# Jan 5, 2004: record downtime for monthly computation of downtime %
# Jan 2, 2004: changed alert notifications so that device must be
# down for at least 60 seconds before email notification
# Also added 15 minute alert email notification
# Notifications now: > 1 min and < 5 min
# at 15 min
# at 60 min
# Dec 30, 2003: changed email notifications to HTML based messages
#
# Notes:
# Over the years I've built up this Perl script from a simple shell
# that would just call FPING to an all-encompassing tool which would
# still perform its basic function yet go well beyond its original
# design. What does it have now that it didn't have then, you ask?
# Well, let's name a few: debug output, comments are now allowed in
# the input files, ping retry, flapping notification thresholds, etc.
# And it currently has timing logic to initiate both email
# notification and pager notification. It now has threshold logic
# to prevent those few occasions where I would find a few hundred
# alert messages in my inbox because of some flapping condition. It
# also has the ability to record the outages so that a downtime
# report can be generated from that data (reportdowntime.pl).
#
#
#
# Load Modules
use strict;
use warnings;
use Fcntl qw(:DEFAULT :flock);
use DBI;
use MIME::Lite;
use Net::SMTP;
###########################################################################
# Configuration and global state
###########################################################################

# Host this script is expected to run on (reported in the status email)
our $server = "roo.mdc.acme.org";

# Database connection settings
my $dbuser   = "dbuser";
my $dbpasswd = "dbpass";
my $database = "technology";
my $table    = "devices";
my $dsn      = "DBI:mysql:technology:localhost";

# Compile-time switches
use constant {
    DEBUG   => 1,    # DEBUG settings
    CONSOLE => 0,    # CONSOLE DEBUG settings
};

# Assets matching any of these patterns (regex matching) are not counted
# towards the pager threshold
our @IGNORE_PAGERALERTS = qw(vpn.acme.org);

# Run-time state shared between the subroutines
my $result;         # Raw output of the first FPING run
my $sdate;          # Start time as a human readable string
my $date;           # Start date as returned by get_time
my $time;           # Start time of day as returned by get_time
my $currentTime;    # Start time in epoch seconds
my @devices;        # Devices reported as not responding
my $retry = 5;      # Number of seconds to wait before retrying ping
my $pause = 0;      # Number of seconds to pause at the startup of the script
my %disk;           # Devices that were previously DOWN
my %data;           # Devices that are currently DOWN (after processing)
my %live;           # Devices that were recorded as DOWN from FPING
my %notify;         # Devices that will be included in notifications
my %thres;          # Devices for threshold checking
our @list;

# Notification intervals (seconds) and thresholds
my $firstalert  = 15*60;    # Time value for first alert notification
my $secondalert = 60*60;    # Time value for second alert notification
# NOTE(review): 5*50 is 250 seconds; other comments in this file talk about
# five minutes, so 5*60 may have been intended - confirm before changing.
my $minalert    = 5*50;     # Time value for minimum alert notification
my $largealert  = 10;       # Number of alert devices to send immediate
my $pagealert   = 15;       # Number of alarms to trigger page/text message
#my $thresholdTime = 120*60; # Time (seconds) value for threshold checking
my $thresholdTime   = 1*60; # Time (seconds) value for threshold checking
my $thresholdEvents = 3;    # Threshold value for number of events

# Location of FPING exec
my $FPING = "/usr/local/etc/fping";

# Email Addresses and Subject Lines
#########################################################
# Verizon SMS Text Messaging 123456789@vtext.com
# AT&T SMS Text Messaging 123456789@txt.att.net
#########################################################
my $MAILTO       = 'NetworkAlert@acme.org';          # Who should receive the email alerts
my $MAILFROM     = 'NetworkAlert@acme.org';          # Who they should appear as coming from
my $MAILSUBJECT  = 'PING: Network Status Report';    # The subject of the email alerts
my $MAILHOST     = "smtp.acme.org";                  # The DNS/IP address of the SMTP gateway
my $PAGERTO      = '123456789@txt.att.net';          # Email pager addresses of those to be paged
my $PAGERSUBJECT = "NETWORK ALERT";                  # The subject of the pager alerts

# Flags and counters driving the alert logic
my $NOTIFY;                # Flag/count variable for email notification
my $SKIP;                  # Flag/count variable for skipping pager notification
my $PAGERNOTIFY   = 0;     # Flag/count variable for pager notification
my $SOMETHINGTODO = 0;     # Flag/count variable for alert logic
my $RECOVER       = 0;     # Flag/count variable for recovery notification

# Program identification
my $program = "pingsweep.pl";
my $version = "v1.3";
my $author  = "Michael McNamara";
my $purpose = "This Perl script is designed to poll the network electronics and report any failures.";
my $usage   = "Usage: pingsweep.pl \[debug\]\n";

# Data File Paths and Locations
my $electronics   = "/usr/local/etc/ping/pingsweep.txt";
my $datafile      = "/usr/local/etc/ping/pingsweep.dat";
my $flagfile      = "/tmp/pingsweep.flg";
my $recordfile    = "/usr/local/etc/ping/pingsweeprecord.dat";
# The threshold check reads the same file the downtime records are written to
my $thresholdfile = "/usr/local/etc/ping/pingsweeprecord.dat";
my $logfile       = "/usr/local/etc/ping/logs/pingsweep.log";
my $lockfile      = "/tmp/pingsweep.tmp";
my $templock      = "/tmp/pingsweep.tmp.$$";

our $device_count;      # Number of devices the script is polling
my $NOW = localtime;    # Start-up timestamp used as a prefix in error messages
###########################################################################
# Signal Handlers
###########################################################################
# INT and TERM share one handler: unlink whichever cleanup files have been
# registered in $main::Cleanfile, $main::Cleanfile2 and $main::Cleanfile3,
# warn with the name of the signal (passed as the first argument) and leave
# with exit status 1.
my $bailout_and_exit = sub {
    for my $leftover ($main::Cleanfile, $main::Cleanfile2, $main::Cleanfile3) {
        unlink $leftover if defined $leftover;
    }
    warn "$NOW: ERROR: Bailout after SIG $_[0]\n";
    exit 1;
};
$SIG{INT}  = $bailout_and_exit;
$SIG{TERM} = $bailout_and_exit;

# HUP performs the same cleanup but terminates through die.
$SIG{HUP} = sub {
    for my $leftover ($main::Cleanfile, $main::Cleanfile2, $main::Cleanfile3) {
        unlink $leftover if defined $leftover;
    }
    die "$NOW: ERROR: Bailout after SIG $_[0]\n";
};

# On any exit remove the first two cleanup files. $? and $! are localized so
# that the unlink calls cannot disturb the exit status of the script.
END {
    local($?, $!);
    for my $leftover ($main::Cleanfile, $main::Cleanfile2) {
        unlink $leftover if defined $leftover;
    }
}
###########################################################################
# B E G I N M A I N
###########################################################################
# Take the run lock, open the logfile and capture the start time
initialize();

# Load threshold data and identify problem devices
load_thres();

# Count the devices listed in the input file (used for logging)
get_count();

# Ping devices and get list of devices not responding
get_data();

# Load devices that failed to respond previously
load_data();

# Compare the list of devices responding against the old list
comp_data();

# Check to see if email alerts should be sent
check_alert();

# Store list of devices not responding for later use
write_data();

# Issue email for list of devices not responding or now responding
alert_notify();

# Issue pager email alert when the number of devices counted by
# check_alert reaches $pagealert
alert_pager();

# Store list of devices that are now responding with their total
# accumulated downtime for later reporting with reportdowntime.pl
record_downtime();

# Release the lock files and close the logfile once everything has completed
finishup();

exit 0;
###########################################################################
# E N D M A I N
###########################################################################
###########################################################################
# Subroutine initialize
#
# Purpose: perform all the initialization steps and procedures:
#          record the start time ($sdate, $date, $time, $currentTime),
#          take the run lock via lockit(), open and flock the logfile,
#          then sleep $pause seconds.
#
# Dies:    when the logfile cannot be opened or locked (lockit() has its
#          own failure handling).
###########################################################################
sub initialize {
    # Initialize time and date
    $sdate = localtime;
    ($date, $time) = get_time();
    $currentTime = time;

    # The old "flagfile" test was replaced by the link()-based lock below.
    lockit($lockfile, $templock);

    # Open the logfile for appending and place a lock on it. The open is
    # checked explicitly; otherwise a failure would only show up as a
    # misleading "unable to lock" error from flock on an unopened handle.
    open(LOGFILE, '>>', $logfile)
        or die "$program script unable to open logfile $logfile: $!\n";
    flock(LOGFILE, LOCK_EX)
        or die "$program script unable to lock logfile $logfile\n";

    logit("DEBUG: #################################################################", 1);
    logit("DEBUG: (initialize) $program $version starting up...", 1);
    logit("DEBUG: (initialize) logfile $logfile is open and flock complete", 1);
    logit("DEBUG: (initialize) creating flagfile $flagfile", 1);

    logit("DEBUG: (initialize) sleeping $pause seconds",1) if (DEBUG);

    # Optional start-up delay to offset other test scripts
    sleep $pause;

    return;
} #end sub
###########################################################################
# Subroutine finishup
#
# Purpose: wrap up the program: release the lock files, close the logfile
#          and publish a copy of the down-device list for the webserver.
###########################################################################
sub finishup {
    logit("DEBUG: (finishup) all finished let's remove the flagfile.",1) if (DEBUG);

    # Release the run lock taken in initialize
    close LOCK;
    unlink($templock, $lockfile);

    logit("DEBUG: (finishup) closing logfile $logfile",1) if (DEBUG);

    # Close the log data file
    close(LOGFILE);

    # Copy the list of down devices for the webserver to access. The list
    # form of system() avoids the shell, and the exit status is checked so a
    # failed copy is reported (on STDERR, since the logfile is closed by now)
    # instead of passing unnoticed.
    my $webcopy = '/var/www/html/pingsweep.dat';
    system('cp', '-f', $datafile, $webcopy) == 0
        or warn "$program: unable to copy $datafile to $webcopy (status $?)\n";

    return;
}
###########################################################################
# Subroutine get_count
#
# Purpose: count the number of devices that will be pinged and record it
#          in $device_count. Blank lines and lines starting with '#' in
#          $electronics are not counted.
#
# Dies:    when $electronics cannot be opened.
###########################################################################
sub get_count {
    # Open data file
    open(my $devices_fh, '<', $electronics)
        or die "Can't open $electronics: $!\n";

    # Start from zero so the count is defined even for an empty file
    $device_count = 0;

    # Walk through data file one line at a time
    while (my $line = <$devices_fh>) {
        # Skip blank (or whitespace-only) lines
        next if $line =~ /^\s*$/;
        # Skip comments
        next if $line =~ /^#/;
        $device_count++;
    }
    close $devices_fh;

    return;
}
###########################################################################
# Subroutine get_data
#
# Purpose: ping a list of devices and record those that don't respond.
#          FPING is run against every device in $electronics. When fewer
#          than 15 devices fail to answer they are pinged a second time
#          after $retry seconds; only devices that fail both rounds are
#          recorded. With 15 or more failures the retry is skipped so the
#          run does not overrun its time window.
#
# Sets:    $result, @devices, %live (device => $currentTime) and
#          $SOMETHINGTODO (1 when at least one device is considered down).
#
# Returns: 1
###########################################################################
sub get_data {
    # At or above this many unresponsive devices no confirmation ping is sent
    my $retry_ceiling = 15;

    logit("DEBUG: (get_data) we're about to poll $device_count devices",1) if (DEBUG);
    logit("DEBUG: (get_data) shelling out to exec system call to FPING",1) if (DEBUG);

    # Use FPING to ping all the electronics and store the list of devices
    # that don't respond into the string $result
    $result = `$FPING -b 56 -u -f $electronics`;

    # A child status of -1 means the command could not be started at all;
    # without this check a missing FPING binary looks like "everything is up".
    logit("ERROR: (get_data) unable to execute $FPING: $!",1) if ($? == -1);

    # Nothing failed to respond the FIRST time around: nothing to do
    if (!$result) {
        $SOMETHINGTODO = 0;
        logit("DEBUG: (get_data) there were no devices that failed to respond...",1) if (DEBUG);
        return 1;
    }

    $SOMETHINGTODO = 1;    # Set this flag for later processing
    chomp ($result);       # remove the trailing newline
    logit("DEBUG: (get_data) some devices failed to respond to our ping",1) if (DEBUG);

    # One unresponsive device per line
    @devices = split (/\n/, $result);

    # Too many failures: treat it as a larger problem and process the list
    # as it is instead of re-pinging every device.
    if (@devices >= $retry_ceiling) {
        logit("DEBUG: (get_data) number of devices > 15 processing without retrying PING",1) if (DEBUG);
        _get_data_record_live();
        return 1;
    }

    # Ping the devices that failed once more to confirm they are really down.
    # The names come from FPING's own output and are passed on the command line.
    my $execstr = "$FPING -b 56 -u " . join(' ', @devices) . ' ';
    logit("DEBUG: (get_data) some devices failed to responsd, retrying...",1) if (DEBUG);
    logit("DEBUG: (get_data) here's the execstr $execstr",1) if (DEBUG);
    logit("DEBUG: (get_data) sleeping the retry interval of $retry seconds",1) if (DEBUG);

    # Lets stop and catch our breath for a few seconds before trying again
    sleep $retry;

    my $result2 = `$execstr`;
    logit("ERROR: (get_data) unable to execute $FPING: $!",1) if ($? == -1);

    if ($result2) {
        $SOMETHINGTODO = 1;    # Set this flag for later processing
        chomp ($result2);      # remove the trailing newline
        @devices = split(/\n/, $result2);
        logit("DEBUG: (get_data) some devices failed to respond a SECOND time",1) if (DEBUG);
        _get_data_record_live();
    } else {
        # Every device answered the SECOND time around
        $SOMETHINGTODO = 0;
        logit("DEBUG: (get_data) all devices responded the SECOND time around.",1) if (DEBUG);
    }

    return 1;
}

# Private helper for get_data: store $currentTime in %live for every entry
# of @devices, stripping stray CR/LF characters from the names in place.
sub _get_data_record_live {
    foreach my $name (@devices) {
        $name =~ s/[\r\n]//g;           # remove the CR/LF
        $live{$name} = $currentTime;    # store the current time for each device
        logit("DEBUG: (get_data) live{$name} = $live{$name}",1) if (DEBUG);
    }
    return;
}
##########################################################################
# Subroutine alert_notify
#
# Purpose: compose an HTML based email message which details the devices
#          that failed to respond (%notify) and also details those devices
#          that are now responding again (entries of %disk that are no
#          longer present in %data), then hand it to send_mail().
#          Nothing is sent unless $NOTIFY or $RECOVER is set.
#
# Note:    the message is delivered as text/html by send_mail(), so the
#          body is built as a small, self-contained HTML document with one
#          two-column table per section.
#
# Returns: 1
##########################################################################
sub alert_notify {
    # Nothing failed and nothing recovered: no message
    if (!($NOTIFY || $RECOVER)) {
        logit("DEBUG: (alert_notify) there is nothing to notify",1) if (DEBUG);
        return 1;
    }

    logit("DEBUG: (alert_notify) there is something todo",1) if (DEBUG);

    my $message_body = "<html>\n<body>\n";
    $message_body .= "<h2>ACME Corporation Network Infrastructure Status Report</h2>\n";
    $message_body .= "<p>Date: $sdate<br>\n";
    $message_body .= "Server: $server</p>\n";

    # There was a device that failed to respond so we'll be alerting on it
    if ($NOTIFY) {
        logit("DEBUG: (alert_notify) within the down host section",1) if (DEBUG);
        $message_body .= "<p>The following devices failed to respond to an ICMP ping(s);</p>\n";
        $message_body .= _alert_notify_table_open();

        for my $index (sort keys %notify) {
            my ($name, $oTime) = split(' ', $notify{$index});
            # Calculate down time from the original failure until now
            my $dTime = calc_down_time($oTime, $currentTime);
            $message_body .= _alert_notify_table_row($name, $dTime);
            logit("DEBUG: (alert_notify) Down Hostname = $name \tDownTime = $dTime",1) if (DEBUG);
        }

        $message_body .= "</table>\n";
    } #end if $NOTIFY

    # There was some device that recovered so we'll be alerting on it
    if ($RECOVER) {
        logit("DEBUG: (alert_notify) within the recovered host section",1) if (DEBUG);
        $message_body .= "<p>The following devices are now responding to ICMP ping(s);</p>\n";
        $message_body .= _alert_notify_table_open();

        for my $triple (sort keys %disk) {
            # If device is still down we shouldn't report it as recovered
            next if ( exists ($data{$triple}) );

            my ($name, $oTime) = split(' ', $disk{$triple});

            # Threshold checking: flapping devices get no recovery email
            if ( exists( $thres{$name} ) && ($thres{$name} > $thresholdEvents)) {
                logit("DEBUG: (alert_notify) threshold reached $name skipping recovery email.",1) if (DEBUG);
                next;
            }

            # Calculate down time
            my $dTime = calc_down_time($oTime, $currentTime);
            $message_body .= _alert_notify_table_row($name, $dTime);
            logit("DEBUG: (alert_noitfy) Up Hostname = $name \tDownTime = $dTime\n",1) if (DEBUG);
        } #end for my $triple

        $message_body .= "</table>\n";
    } #end if $RECOVER

    $message_body .= "<p>Notes: this message is now being sent in HTML format.</p>\n";
    $message_body .= "</body>\n</html>\n";

    send_mail($MAILTO, $MAILFROM, $MAILSUBJECT, $message_body);

    return 1;
} #end sub alert_notify

# Private helper for alert_notify: opening tag and heading row of a table.
sub _alert_notify_table_open {
    return "<table border=\"1\" cellpadding=\"3\">\n"
         . "<tr><th>Device Hostname or IP Address</th>"
         . "<th>Time Down DD:HH:MM:SS</th></tr>\n";
}

# Private helper for alert_notify: one table row for a device and its downtime.
sub _alert_notify_table_row {
    my ($name, $dTime) = @_;
    return "<tr><td>$name</td><td>$dTime</td></tr>\n";
}
########################################################################
# Subroutine get_time
#
# Purpose: return the current local date and time of day as two strings
#
# Returns: a two element list: the date as "M-D-YYYY" (month and day are
#          not zero padded) and the time as "HH:MM:SS" (zero padded)
########################################################################
sub get_time {
    # localtime fields: 0 = seconds, 1 = minutes, 2 = hours,
    # 3 = day of month, 4 = month (0 based), 5 = years since 1900
    my @parts = localtime;

    my $clock    = sprintf("%02d:%02d:%02d", @parts[2, 1, 0]);
    my $calendar = join("-", $parts[4] + 1, $parts[3], $parts[5] + 1900);

    return ($calendar, $clock);
} #end sub get_time
########################################################################
# Subroutine load_data
#
# Purpose: load from $datafile the list of devices that were previously
#          down into %disk. Each record has five whitespace separated
#          fields (name, original timestamp, last-check timestamp,
#          downtime, alert level); the record is stored under the device
#          name as a single space separated string.
#
# Dies:    when $datafile cannot be opened.
########################################################################
sub load_data {
    # A lexical handle is used on purpose: DATA is the name of Perl's
    # special handle for text after __DATA__/__END__.
    open(my $data_fh, '<', $datafile)
        or die "Can't open $datafile: $!\n";

    logit("DEBUG: (load_data) starting to load hash \%disk",1) if (DEBUG);

    # Walk through data file one line at a time
    while (my $line = <$data_fh>) {
        # Skip blank (or whitespace-only) lines
        next if $line =~ /^\s*$/;
        # Skip comments
        next if $line =~ /^#/;

        # name, original time, last-check time, downtime, alert level
        my ($name, $oTime, $lTime, $dTime, $alert) = split(' ', $line);

        # Build data structure
        $disk{$name} = "$name $oTime $lTime $dTime $alert";
        logit("DEBUG: (load_data) reading disk{$name} = $name $oTime $lTime $dTime $alert",1) if (DEBUG);
    } #end while
    close $data_fh;

    return;
} #end sub load_data
########################################################################
# Subroutine load_thres
#
# Purpose: load threshold data from $thresholdfile and count, per device,
#          the recorded outages whose recovery time falls within the last
#          $thresholdTime seconds. The counts are kept in %thres and are
#          compared against $thresholdEvents by the alerting code to
#          suppress notifications for flapping devices.
#
# The format of the file appears below:
#
# Device Hostname Original Recover Total Down Date Time
# Time Time Time Time
#---------------------------------------------------------------------------------------
#
#sw-ccr-8600-b.core.acme.org 1107277623 1107277801 238 0:00:03:58 2-1-2005 12:11:01
#
# Returns: 1
# Dies:    when $thresholdfile cannot be opened.
########################################################################
sub load_thres {
    # Recoveries later than this epoch time count as threshold events
    my $tmwindow = $currentTime - $thresholdTime;

    # Open data file
    open(my $thres_fh, '<', $thresholdfile)
        or die "Can't open $thresholdfile: $!\n";

    logit("DEBUG: (load_thres) starting to load hash \%thres",1) if (DEBUG);
    logit("DEBUG: (load_thres) threshold events = $thresholdEvents and time window = $tmwindow",1) if (DEBUG);

    # Walk through data file one line at a time
    while (my $line = <$thres_fh>) {
        # Skip blank (or whitespace-only) lines
        next if $line =~ /^\s*$/;
        # Skip comments
        next if $line =~ /^#/;

        # name, original time, recovery time; the remaining fields are unused
        my ($name, $oTime, $rTime) = split(' ', $line);

        # Ignore lines without a numeric recovery time (for example column
        # headings) rather than comparing text numerically
        next unless defined $rTime && $rTime =~ /^\d+$/;

        # If the recovery happened inside the threshold window count this event
        if ($rTime > $tmwindow) {
            logit("DEBUG: (load_thres) threshold event for $name detected",1) if (DEBUG);
            $thres{$name}++;
        }
    } #end while
    close $thres_fh;

    # Lets dump all the devices that have reached the threshold
    if (DEBUG) {
        logit("DEBUG: (load_thres) checking threshold events for each device", 1);
        for my $idx (sort keys %thres) {
            if ($thres{$idx} > $thresholdEvents) {
                logit("DEBUG: (load_thres) threshold event limit reached for $idx events = $thres{$idx}",1);
            }
        }
    }

    return 1;
} #end sub load_thres
########################################################################
# Subroutine write_data
#
# Purpose: write the list of devices that are not responding to
#          $datafile (the file load_data reads back into %disk)
#
# NOTE(review): this region is damaged. The text between the "<" of the
# print statement below and the ">" of a later threshold comparison is
# missing, so what remains is the head of write_data joined directly to
# the tail of comp_data. Presumably the lost text held the rest of
# write_data, the start of comp_data and possibly other subroutines
# (calc_down_time and logit are called in this file but are not defined
# in the visible text) - restore it from a clean copy of the script.
########################################################################
sub write_data {
# Declare Local variables
my $name;
my $oTime;
my $key;
my $now;
my $lTime;
my $dTime;
my $alert;
# Find time
$now = time;
# Open data file for writing (truncates it); the result of open is not checked
open DATA, ">$datafile";
# NOTE(review): the next line is not valid Perl as it stands. It looks like
# the start of a here-document ("print DATA <<...") fused with the end of a
# threshold test from comp_data ("... > $thresholdEvents)) {").
print DATA < $thresholdEvents)) {
logit("DEBUG: (comp_data) threshold reached $name skipping recovery notification.",1) if (DEBUG);
next;
}
# $dTime = &calc_down_time($oTime, $currentTime);
#if ($currentTime - $oTime >= 60*2*1) { # SECS*MINUTES*HOURS
#CHANGED TIMING BECAUSE RECOVERY REPORTS WERE MISSING FOR DOWN REPORTS
# Flag a recovery only when the original failure is at least 30 seconds old
if ($currentTime - $oTime >= 30*1*1) { # SECS*MINUTES*HOURS
$RECOVER = 1;
# $test indexes %disk; the loop header that defines it is part of the missing text
($name, $oTime, $lTime, $dTime) = split(' ', $disk{$test});
logit("DEBUG: (comp_data) the following host has recovered $name",1) if (DEBUG);
} #end if $currentTime
} #end for my $test
return 1;
} #end sub comp_data
############################################################################
# Subroutine alert_pager
#
# Purpose: send a short email alert to the pager/SMS address(es) in
#          $PAGERTO when the number of devices counted in $PAGERNOTIFY
#          has reached $pagealert. Below that limit only a debug line
#          is logged.
#
# Dies:    when sendmail cannot be started or reports a failure on close.
############################################################################
sub alert_pager {
    # Not enough devices down: no page
    if ($PAGERNOTIFY < $pagealert) {
        logit("DEBUG: (alert_pager) not enough devices down - no pager alert ($PAGERNOTIFY devices down)",1) if (DEBUG);
        return;
    }

    logit("DEBUG: (alert_pager) issuing pager alert to $PAGERTO ($PAGERNOTIFY devices down)",1) if (DEBUG);

    # List form of the pipe open: no shell is involved. $PAGERTO may hold
    # several whitespace separated addresses, each becomes its own argument.
    open(my $sendmail, '|-', '/usr/lib/sendmail', split(' ', $PAGERTO))
        or die "$program: unable to start /usr/lib/sendmail: $!\n";

    print {$sendmail} "From: $MAILFROM\nTo: $PAGERTO\nSubject: $PAGERSUBJECT\n\n";

    # Every line of the body ends with a newline; without it the sentences
    # run together into one unbroken string on the pager.
    print {$sendmail} "Date: $sdate\n";
    #print {$sendmail} "Server: $server\n";
    print {$sendmail} "There is a farily large problem going on at work.\n";
    print {$sendmail} "Approximately $PAGERNOTIFY assets have been down for more than 5 minutes.\n";
    print {$sendmail} "You should check your email for a complete list of the affected assets.\n";
    print {$sendmail} "You will not recieve a page when this problem clears.\n";

    close($sendmail)
        or die "$program: /usr/lib/sendmail failed (status $?): $!\n";

    return;
} #end sub alert_pager
#############################################################################
# Subroutine check_alert
#
# Purpose: check to see what alerts are necessary and set flags. Walks
#          every device in %data, advances its alert level and
#            - copies the device into %notify and sets $NOTIFY when an
#              email is due,
#            - increments $PAGERNOTIFY when the device has been down for
#              five minutes and is not listed in @IGNORE_PAGERALERTS.
#          Devices that exceeded $thresholdEvents in %thres are skipped.
#
# The alert level stored with each device names the NEXT step to take:
#
# ALERT = 0 No further notifications for this device
# ALERT = 1 first email notification (0 minute)
# ALERT = 2 single pager notification (5 minute)
# ALERT = 3 second email notification (15 minute)
# ALERT = 4 third email notification (1 hour)
#
# History: originally a notification was sent immediately upon a failure,
# which produced many false positives. Waiting 30 seconds instead led to
# RECOVERY notifications for devices that were never reported as down. The
# ping retry in get_data now filters out the occasional lost PING, so the
# first email is sent as soon as a device shows up here.
#############################################################################
sub check_alert {
    my $DownTimeToPage = 5*60;    # Seconds a device must be down to count for paging

    logit("DEBUG: (check_alert) checking for email notifications",1) if (DEBUG);

    $NOTIFY = 0;
    $SKIP = 0;

    for my $index (sort keys %data) {
        my ($name, $oTime, $lTime, $dTime, $alert) = split(' ', $data{$index});

        # Threshold checking
        if ( exists( $thres{$name} ) && ($thres{$name} > $thresholdEvents)) {
            logit("DEBUG: (check_alert) threshold reached on device $name skipping alert notification.",1) if (DEBUG);
            next;
        }

        my $downFor = $currentTime - $oTime;    # Seconds since the first failure

        if ( $alert == 1 ) {
            # First email, sent on the first run the device is seen down
            logit("DEBUG: (check_alert) alerting on $name for 5 minutes",1) if (DEBUG);
            $data{$index} = "$name $oTime $lTime $dTime 2";
            $notify{$index} = "$name $oTime $lTime $dTime $alert";
            $NOTIFY = 1;
        } elsif (( $alert == 2) and ($downFor >= $DownTimeToPage )) {
            logit("DEBUG: (check_alert) resetting alert for $name to 3",1) if (DEBUG);
            $data{$index} = "$name $oTime $lTime $dTime 3";

            # The skip flag has to be decided for every device on its own;
            # if it were left set, one ignored asset would keep all devices
            # processed after it from being counted towards the pager.
            $SKIP = 0;
            foreach my $asset (@IGNORE_PAGERALERTS) {
                if ($name =~ /$asset/) {
                    $SKIP = 1;
                    last;
                }
            }

            if ($SKIP) {
                logit("DEBUG: (check_alert) pager alert skipping ignore asset",1) if (DEBUG);
            } else {
                $PAGERNOTIFY++;
                logit("DEBUG: (check_alert) pager alert now at $PAGERNOTIFY",1) if (DEBUG);
            }
        } elsif (( $alert == 3) and ($downFor >= 60*15*1)) { # SECS*MINUTES*HOURS
            logit("DEBUG: (check_alert) 15 minute alert for $name and clearing marker",1) if (DEBUG);
            $data{$index} = "$name $oTime $lTime $dTime 4";
            $notify{$index} = "$name $oTime $lTime $dTime $alert";
            $NOTIFY = 1;
        } elsif (( $alert == 4) and ($downFor >= 60*60*1)) { # SECS*MINUTES*HOURS
            logit("DEBUG: (check_alert) 60 minute alert for $name and clearing marker",1) if (DEBUG);
            $data{$index} = "$name $oTime $lTime $dTime 0";
            $notify{$index} = "$name $oTime $lTime $dTime $alert";
            $NOTIFY = 1;
        }
    } #endfor $index

    return;
} #end sub check_alert
#############################################################################
# Subroutine record_downtime
#
# Purpose: when a recovery was detected ($RECOVER), append to $recordfile
#          so the accumulated downtime can be reported later
#
# NOTE(review): this region is damaged. The line that starts with
# "# print DATA <" below is the start of a (commented) here-document fused
# with the end of an open() call on $templock. The text that belonged in
# between is missing - presumably the rest of record_downtime and the start
# of the lockit subroutine that initialize calls, possibly with other
# subroutines in between. As it stands the lock handling below sits inside
# record_downtime and the sub is never closed. Restore the missing text
# from a clean copy of the script.
#############################################################################
sub record_downtime {
# Declare Local variables
my $name;
my $oTime;
my $key;
my $now;
my $lTime;
my $dTime;
my $alert;
my $tdtime;
# Find time
$now = time;
if ($RECOVER) {
logit("DEBUG: (record_downtime) there is something todo",1) if (DEBUG);
# Open data file for appending; the result of open is not checked
open DATA, ">>$recordfile";
# NOTE(review): damaged line - see the note in the header above.
# print DATA <$templock") or die "$NOW: ERROR: Creating templock $templock: $!";
# Register the temporary lock file so the signal handlers and the END block remove it
$main::Cleanfile = $templock;
# The lock is taken by hard-linking the temporary file to $lockfile;
# link() fails when $lockfile already exists.
if (!link($templock,$lockfile)) { # Lock file exists - deal with it.
# stat fields: 3 = number of hard links, 9 = last modification time
my($nlink,$lockage) = (stat($lockfile))[3,9];
# Age of the existing lockfile in seconds
$lockage = time() - $lockage;
# A lockfile with fewer than two links, or one older than 30 minutes, is
# treated as stale: remove it and take the lock.
if ($nlink < 2 or $lockage > 30*60) { #lockfile is alone and old
unlink($lockfile)
|| do{ unlink $templock;
die "$NOW: ERROR: Can't unlink stale lockfile ($lockfile). Permissions?\n"};
link($templock,$lockfile)
|| do{ unlink $templock;
die "$NOW: ERROR: Can't create lockfile ($lockfile).\n".
"Permission problem or another mrtg locking succesfully?\n"};
} else {
# Somebody else holds a live lock: give up. The first message is used when
# the lockfile is less than 4 seconds old, the second one otherwise.
unlink $templock;
die "$NOW: ERROR: It looks as if you are running two copies of mrtg in parallel on\n".
" the same config file. There is a lockfile ($lockfile) and it is\n".
" is only $lockage seconds old ... Check your crontab.\n".
" (/etc/crontab and /var/spool/cron/root) \n"
if $lockage < 4;
die "$NOW: ERROR: I guess another mrtg is running. A lockfile ($lockfile) aged\n".
"$lockage seconds is hanging around. If you are sure that no other mrtg\n".
"is running you can remove the lockfile\n";
}
}
}
#############################################################################
# Subroutine load_database
#
# Purpose: query the devices table for every hostname whose "ping" column
#          is 'Y' and print one hostname per line on STDOUT.
#
# Returns: 1
# Dies:    when the database cannot be opened, the query cannot be
#          prepared or executed, or the disconnect fails.
#############################################################################
sub load_database {
    # Let's open a DBI connection to the MySQL database
    my $dbh = DBI->connect($dsn,$dbuser,$dbpasswd) or
        die "ERROR: cannot open database: $DBI::errstr\n";

    my $query_sth = $dbh->prepare("SELECT hostname FROM $table WHERE ping = ?")
        or die "ERROR: cannot prepare query: " . $dbh->errstr . "\n";

    $query_sth->execute('Y')
        or die "ERROR: cannot execute query: " . $query_sth->errstr . "\n";

    # The query selects a single column, so every row holds just the hostname
    while (my ($hostname) = $query_sth->fetchrow_array()) {
        next unless defined $hostname;    # NULL hostname, nothing to print
        print "$hostname\n";
        #push(@list, $hostname);
    }

    $query_sth->finish();
    $dbh->disconnect or die "Failed to disconnect\n";

    return 1;
} #end sub load_database
#############################################################################
# Subroutine send_mail
#
# Purpose: send an HTML message through the SMTP gateway $MAILHOST
#
# Arguments (positional):
#   MAILTO       recipient address(es)
#   MAILFROM     sender address
#   MAILSUBJECT  subject line
#   MAILMESSAGE  HTML text used as the message body
#   ATTACHMENTS  accepted for compatibility, currently not used
#
# Returns: 1 when the message was handed to the SMTP gateway, 0 when the
#          delivery failed (the failure is written to the log).
# Dies:    when the message object itself cannot be built.
#############################################################################
sub send_mail {
    my ($mailto, $mailfrom, $mailsubject, $mailmessage, $attachments) = @_;

    ### Create the multipart container
    my $msg = MIME::Lite->new (
        From    => $mailfrom,
        To      => $mailto,
        Subject => $mailsubject,
        Type    => 'multipart/mixed'
    ) or die "Error creating multipart container: $!\n";

    ### Add the text message part
    $msg->attach (
        Type => 'text/html',
        Data => $mailmessage
    ) or die "Error adding the text message part: $!\n";

    # GIF attachments are currently disabled. To bring them back attach
    # them here, e.g.
    #   $msg->attach(Type => 'image/gif', Id => 'technology_logo',
    #                Path => '/var/www/html/image6.gif',
    #                Encoding => 'base64', Disposition => 'attachment');

    ### Send the Message
    MIME::Lite->send('smtp', $MAILHOST, Timeout=>60);
    logit("DEBUG: (sendmail) sending SMTP message to $mailto with $mailsubject", 1);

    # A delivery failure may either raise an exception or come back as a
    # false value. It is logged and reported to the caller instead of
    # ending the run, so the remaining steps (pager alert, downtime record,
    # lock release) still happen.
    my $sent = eval { $msg->send };
    if (!$sent) {
        my $reason = $@ || "send returned a false value";
        $reason =~ s/\s+\z//;
        logit("ERROR: (sendmail) unable to send SMTP message to $mailto via $MAILHOST: $reason", 1);
        return 0;
    }

    return 1;
}