#!/usr/bin/perl
#
# Filename: /usr/local/etc/ping/pingsweep.pl
#
# Purpose:  This Perl script is designed to ping a list of assets and alert
#           if any asset doesn't respond to an ICMP ping. The alert will be
#           in the form of an email message but if a major outage occurs the
#           script will also send pages (via email) to the appropriate staff.
#
# Author:   Michael McNamara (http://blog.michaelfmcnamara.com)
#
# Credits:  Stewart Kendric (http://www.skendric.com/)
#           I've taken a lot of ideas (and some code) from Stuart's many scripts.
#           Without Stuart's help it would have taken me much longer to develop
#           some of the scripts I've come to rely on today.
#
# Date:     May 6, 2003
#
# License:
#           Copyright (C) 2002 Michael McNamara (mfm@michaelfmcnamara.com)
#
#           This program is free software: you can redistribute it and/or modify
#           it under the terms of the GNU General Public License as published by
#           the Free Software Foundation, either version 3 of the License, or
#           (at your option) any later version.
#
#           This program is distributed in the hope that it will be useful,
#           but WITHOUT ANY WARRANTY; without even the implied warranty of
#           MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#           GNU General Public License for more details.
#
#           You should have received a copy of the GNU General Public License
#           along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Changes:
#
#   Jul 18, 2011:   added logic to skip assets which regex-match IGNORE_PAGERALERTS
#                   from being counted against the 15 assets needed to trigger a page
#   Dec 10, 2009:   added subroutine for SMTP mail delivery
#   Dec  9, 2009:   changed to MIME::Lite for email delivery, attaching GIF
#                   files within the HTML message body
#   Dec 30, 2005:   cleaned up DEBUG and logging statements for troubleshooting
#   Mar 18, 2005:   added file logging for troubleshooting and monitoring
#   Feb 19, 2005:   added threshold logic to prevent flapping notifications
#   Feb 17, 2005:   fixed logic regarding alert notification and ping retry
#   Jan 17, 2005:   added code that will retry ping if
#                   devices alarming is < 10 devices
#   Jul 14, 2004:   script will now report when devices are once again reachable
#   Jan  5, 2004:   record downtime for monthly computation of downtime %
#   Jan  2, 2004:   changed alert notifications so that a device must be
#                   down for at least 60 seconds before email notification
#                   Also added 15 minute alert email notification
#                   Notifications now:  > 1 min and < 5 min
#                                       at 15 min
#                                       at 60 min
#   Dec 30, 2003:   changed email notifications to HTML based messages
#
# Notes:
#           Over the years I've built up this Perl script from a simple shell
#           that would just call FPING to an all-encompassing tool which would
#           still perform its basic function yet go well beyond its original
#           design. What does it have now that it didn't have then, you ask?
#           Well, let's name a few: debug output, comments are now allowed in
#           the input files, ping retry, flapping notification thresholds, etc.
#           And it currently has timing logic to initiate both email
#           notification and pager notification. It now has threshold logic
#           to prevent those few occasions where I would find a few hundred
#           alert messages in my inbox because of some flapping condition. It
#           also has the ability to record the outages so that a downtime
#           report can be generated from that data (reportdowntime.pl).
#
#
#
# Load Modules
use strict;
use warnings;
use Fcntl qw(:DEFAULT :flock);
use DBI;
use MIME::Lite;
use Net::SMTP;

# Server we're going to run this script on
our $server = "roo.mdc.acme.org";

# Database Variables
my $dbuser   = "dbuser";
my $dbpasswd = "dbpass";
my $database = "technology";
my $table    = "devices";
my $dsn="DBI:mysql:technology:localhost";

# Declare Constants
use constant DEBUG   => 1;      # DEBUG settings
use constant CONSOLE => 0;      # CONSOLE DEBUG settings

# Skip pager notification for the following assets (using regex matching)
our @IGNORE_PAGERALERTS = qw/ vpn.acme.org /;

# Declare Global Variables
my $result;
my $sdate;
my $date;
my $time;
my $currentTime;
my @devices;
my $retry = 5;          # Number of seconds to wait before retrying ping
my $pause = 0;          # Number of seconds to pause at the startup of the script
my %disk;               # Hash of devices that were previously DOWN
my %data;               # Hash of devices that are currently DOWN (after processing)
my %live;               # Hash of devices recorded as DOWN by FPING
my %notify;             # Hash of devices that will be included in notifications
my %thres;              # Hash of devices for threshold checking
our @list;
my $firstalert;
my $secondalert;
my $minalert;
my $largealert;
my $pagealert;
my $thresholdTime;
my $thresholdEvents;
my $FPING = "/usr/local/etc/fping";     # Location of FPING exec
my $MAILTO;             # Who should receive the email alerts
my $MAILFROM;           # Who they should appear as coming from
my $MAILSUBJECT;        # The subject of the email alerts
my $MAILHOST;           # The DNS/IP address of the SMTP gateway
my $PAGERTO;            # The email pager addresses of those that should be paged
my $PAGERSUBJECT;       # The subject of the pager alerts
my $NOTIFY;             # Flag/count variable for email notification
my $SKIP;               # Flag/count variable for skipping pager notification
my $PAGERNOTIFY = 0;    # Flag/count variable for pager notification
my $SOMETHINGTODO = 0;  # Flag/count variable for alert logic
my $RECOVER = 0;        # Flag/count variable for recovery notification

my $program = "pingsweep.pl";
my $version = "v1.3";
my $author  = "Michael McNamara";
my $purpose = "This Perl script is designed to poll the network electronics and report any failures.";
my $usage   = "Usage: pingsweep.pl \[debug\]\n";

# Email Addresses and Subject Lines
#########################################################
# Verizon SMS Text Messaging     123456789@vtext.com
# AT&T SMS Text Messaging        123456789@txt.att.net
#########################################################
$MAILTO       = 'NetworkAlert@acme.org';
$MAILFROM     = 'NetworkAlert@acme.org';
$MAILSUBJECT  = 'PING: Network Status Report';
$MAILHOST     = "smtp.acme.org";
$PAGERTO      = '123456789@txt.att.net';
$PAGERSUBJECT = "NETWORK ALERT";

# Data File Paths and Locations
my $electronics   = "/usr/local/etc/ping/pingsweep.txt";
my $datafile      = "/usr/local/etc/ping/pingsweep.dat";
my $flagfile      = "/tmp/pingsweep.flg";
my $recordfile    = "/usr/local/etc/ping/pingsweeprecord.dat";
# Same file as $recordfile: the recorded outages double as the input for
# threshold (flapping) checking.
my $thresholdfile = "/usr/local/etc/ping/pingsweeprecord.dat";
my $logfile       = "/usr/local/etc/ping/logs/pingsweep.log";
my $lockfile      = "/tmp/pingsweep.tmp";
my $templock      = "/tmp/pingsweep.tmp.$$";

# Notification Intervals in seconds
$firstalert  = 15*60;   # Time value for first alert notification
$secondalert = 60*60;   # Time value for second alert notification
# NOTE(review): 5*50 is 250 seconds; this looks like a typo for 5*60
# (5 minutes) -- confirm before changing.
$minalert    = 5*50;    # Time value for minimum alert notification
$largealert  = 10;      # Number of alert devices to send immediate
$pagealert   = 15;      # Number of alarms to trigger page/text message
#$thresholdTime = 120*60;       # Time (seconds) value for threshold checking
$thresholdTime   = 1*60;        # Time (seconds) value for threshold checking
$thresholdEvents = 3;           # Threshold value for number of events

our $device_count;      # Number of devices the script is polling

my $NOW = localtime;

###########################################################################
# Signal Handlers
###########################################################################

# Unlink every path in the argument list that is defined.  Shared by the
# signal handlers and the END block so the cleanup logic lives in one place.
sub _unlink_cleanfiles {
    my @registered = grep { defined } @_;
    unlink @registered if @registered;
    return;
}

# INT/TERM: remove temp files, report the signal and exit non-zero.
$SIG{INT} = $SIG{TERM} = sub {
    _unlink_cleanfiles($main::Cleanfile, $main::Cleanfile2, $main::Cleanfile3);
    warn "$NOW: ERROR: Bailout after SIG $_[0]\n";
    exit 1;
};

# HUP: remove temp files and die with the signal name.
$SIG{HUP} = sub {
    _unlink_cleanfiles($main::Cleanfile, $main::Cleanfile2, $main::Cleanfile3);
    die "$NOW: ERROR: Bailout after SIG $_[0]\n";
};

# Normal/abnormal exit: remove temp files without disturbing the exit
# status ($?) or the last OS error ($!).
END {
    local($?, $!);
    _unlink_cleanfiles($main::Cleanfile, $main::Cleanfile2);
}

###########################################################################
# B E G I N   M A I N
###########################################################################

# Initialize program environment
initialize();

# Load threshold data and identify problem devices
load_thres();

get_count();

# Ping devices and get list of devices not responding
get_data();

# Load devices that failed to respond previously
load_data();

# Compare the list of devices responding against the old list
comp_data();

# Check to see if email alerts should be sent
check_alert();

# Store list of devices not responding for later use
write_data();

# Issue email for list of devices not responding or now responding
alert_notify();

# Issue pager email alert once $pagealert or more devices have been
# counted toward paging
alert_pager();

# Store list of devices that are now responding with their total
# accumulated downtime for later reporting with reportdowntime.pl
record_downtime();

# Release the lock and close the log after everything has completed
finishup();

exit 0;

###########################################################################
# E N D   M A I N
###########################################################################

###########################################################################
# Subroutine initialize
#
# Purpose: perform all the initialization steps and procedures:
#          record the start time, take the run lock, open and lock the
#          logfile, then pause $pause seconds.
#
# Dies if the logfile cannot be opened or locked.
###########################################################################
sub initialize {

    # Initialize time and date
    $sdate = localtime;
    ($date, $time) = get_time();
    $currentTime = time;

    # Take the run lock (replaces the old flagfile test)
    lockit($lockfile, $templock);

    # Open the logfile for append and place an exclusive lock on it.  The
    # open is checked so a missing directory or a permission problem is
    # reported as such rather than as a failed flock.
    open(LOGFILE, '>>', $logfile)
        or die "$program script unable to open logfile $logfile: $!\n";
    flock(LOGFILE, LOCK_EX)
        or die "$program script unable to lock logfile $logfile\n";

    logit("DEBUG: #################################################################", 1);
    logit("DEBUG: (initialize) $program $version starting up...", 1);
    logit("DEBUG: (initialize) logfile $logfile is open and flock complete", 1);
    logit("DEBUG: (initialize) creating flagfile $flagfile", 1);

    logit("DEBUG: (initialize) sleeping $pause seconds",1) if (DEBUG);

    # Sleep to offset other test scripts
    sleep $pause;

    return;
} #end sub

###########################################################################
# Subroutine finishup
#
# Purpose: wrap up the program and close any open files: release the run
#          lock, close the logfile and publish the list of down devices
#          for the webserver.
###########################################################################
sub finishup {

    logit("DEBUG: (finishup) all finished let's remove the flagfile.",1) if (DEBUG);

    # Release the run lock
    close LOCK;
    unlink ($templock, $lockfile);

    logit("DEBUG: (finishup) closing logfile $logfile",1) if (DEBUG);

    # Close the log data file
    close(LOGFILE)
        or warn "$program: error closing logfile $logfile: $!\n";

    # Copy the list of down devices for the webserver to access.  List-form
    # system avoids the shell; the logfile is already closed, so a failure
    # is reported on STDERR.
    system('cp', '-f', $datafile, '/var/www/html/pingsweep.dat') == 0
        or warn "$program: unable to copy $datafile to /var/www/html/pingsweep.dat (status $?)\n";

    return;
}
########################################################################### # Subroutine get_count # # Purpose: count the number of devices that will be pings, record it ########################################################################### sub get_count { # Open data file open DEVICES, "$electronics" or die "Can't open $electronics: $!\n"; # Walk through data file while () { # Skip blank lines next if (/^\n$/); # Skip comments next if (/^#/); $device_count++; } close DEVICES; return; } ########################################################################### # Subroutine get_data # # Purpose: ping a list of devices and record those that don't respond ########################################################################### sub get_data { # Declare Local Variables my $name; my $execstr; my $result2; my $numdevices; # Number of elements in @devices array logit("DEBUG: (get_data) we're about to poll $device_count devices",1) if (DEBUG); logit("DEBUG: (get_data) shelling out to exec system call to FPING",1) if (DEBUG); # Use FPING to ping all the electronics and store the list of devices # that doesn't respond into the string $result $result = `$FPING -b 56 -u -f $electronics`; # If there were some devices that didn't respond let's go to work if ($result) { $SOMETHINGTODO = 1; # Set this flag for later processing chomp ($result); # remove the CR/LF logit("DEBUG: (get_data) some devices failed to respond to our ping",1) if (DEBUG); # Lets take the results and load them into an array @devices = split (/\n/, $result); # Lets count the number of elements in the array $numdevices = @devices; # If there are less than 10 devices not responding lets go ahead and # try to PING them again just to make sure they are really down. If # there are more than 10 devices we can assume that this is a larger # problem and that we shouldn't overload the system trying to re-PING # too many devices in too short a timeframe else we'll go outside our # 60 second window. 
if ($numdevices < 15) { # Lets take that list of devices and ping them again to confirm $execstr = "$FPING -b 56 -u "; foreach $name (@devices) { $execstr = $execstr . "$name "; } logit("DEBUG: (get_data) some devices failed to responsd, retrying...",1) if (DEBUG); logit("DEBUG: (get_data) here's the execstr $execstr",1) if (DEBUG); logit("DEBUG: (get_data) sleeping the retry interval of $retry seconds",1) if (DEBUG); # Lets stop and catch our breath for a few seconds before trying again sleep $retry; # Lets ping those devices that originally failed again $result2 = `$execstr`; # If there were any results lets store and evaluate them if ($result2) { $SOMETHINGTODO = 1; # Set this flag for later processing chomp ($result2); # remove CR/LF # Lets take the results and load them into an array @devices = split(/\n/, $result2); logit("DEBUG: (get_data) some devices failed to respond a SECOND time",1) if (DEBUG); foreach $name (@devices) { $name =~ s/\/n//g; # remove the CR/LF $live{$name} = $currentTime; # store the current time for each device logit("DEBUG: (get_data) live{$name} = $live{$name}",1) if (DEBUG); } #end foreach } else { # if there were no devices down the SECOND time around $SOMETHINGTODO = 0; # Set this flag for later processing logit("DEBUG: (get_data) all devices responded the SECOND time around.",1) if (DEBUG); } #end if else($result2) } else { # if number of devices down > 15 just process logit("DEBUG: (get_data) number of devices > 15 processing without retrying PING",1) if (DEBUG); foreach $name (@devices) { $name =~ s/\/n//g; # remove the CR/LF $live{$name} = $currentTime; # store the current time for each device logit("DEBUG: (get_data) live{$name} = $live{$name}",1) if (DEBUG); } #end foreach } # endif numdevices > 10 } else { # if there were no devices down the FIRST time around # There were no devices that failed to respond so there's nothing to-do $SOMETHINGTODO = 0; logit("DEBUG: (get_data) there were no devices that failed to respond...",1) 
if (DEBUG); } #end if($result) return 1; } ########################################################################## # Subroutine alert_notify # # Purpose: compose an HTML based email message which details the devices # that failed to response and also details those devices that # are now reponding (recovered). ########################################################################## sub alert_notify { # Declare Local Varaibles my $name; my $oTime; my $lTime; my $dTime; my $alert; my $flag = 0; my $message_body; # If there was some device that either failed to ping or has recovered if ($NOTIFY | $RECOVER) { logit("DEBUG: (alert_notify) there is something todo",1) if (DEBUG); $message_body .= "

ACME Corporation Network Infrastructure Status Report

\n"; #### $message_body .= "

"; $message_body .= "Date: $sdate
"; $message_body .= "Server: $server
"; # There was a device that failed to respond so we'll be alerting on it if ($NOTIFY) { logit("DEBUG: (alert_notify) within the down host section",1) if (DEBUG); $message_body .= "
\n"; $message_body .= "The following devices failed to respond to an ICMP ping(s);
\n"; $message_body .= "\n"; $message_body .= ""; $message_body .= ""; $message_body .= ""; $message_body .= ""; # processing in here for devices that failed for my $index (sort keys %notify) { ($name, $oTime, $lTime, $dTime, $alert) = split(' ', $notify{$index}); # Calculate down time $dTime = &calc_down_time($oTime, $currentTime); $message_body .= "\n"; $message_body .= "\n"; $message_body .= "\n"; $message_body .= "\n"; logit("DEBUG: (alert_notify) Down Hostname = $name \tDownTime = $dTime",1) if (DEBUG); } $message_body .= "
Device Hostname or IP AddressTime Down
DD:HH:MM:SS
$name$dTime
\n"; } #end if $NOTIFY # There was some device that recovered so we'll be alerting on it if ($RECOVER) { logit("DEBUG: (alert_notify) within the recovered host section",1) if (DEBUG); $message_body .= "
\n"; $message_body .= "The following devices are now responding to ICMP ping(s);
\n"; $message_body .= "\n"; $message_body .= "\n"; $message_body .= "\n"; $message_body .= "\n"; $message_body .= "\n"; # processing in here for devices that recovered for my $triple (sort keys %disk) { # If device is still down we shouldn't report it as recovered next if ( exists ($data{$triple}) ); ($name, $oTime, $lTime, $dTime) = split(' ', $disk{$triple}); # Threshold checking if ( exists( $thres{$name} ) && ($thres{$name} > $thresholdEvents)) { logit("DEBUG: (alert_notify) threshold reached $name skipping recovery email.",1) if (DEBUG); next; } # Calculate down time $dTime = &calc_down_time($oTime, $currentTime); $message_body .= "\n"; $message_body .= "\n"; $message_body .= "\n"; $message_body .= "\n"; logit("DEBUG: (alert_noitfy) Up Hostname = $name \tDownTime = $dTime\n",1) if (DEBUG); } #end for my $triple $message_body .= "
Device Hostname or IP AddressTime Down
DD:HH:MM:SS
$name$dTime
\n"; } #end if $RECOVER $message_body .= "
"; $message_body .= "Notes: this message is now being sent in HTML format.

"; #$message_body .= "
\n"; $message_body .= "\n"; ### Adjust the filenames my $powered_by_perl = '/var/www/html/images/circle_power_perl.gif'; my $technology_logo = '/var/www/html/image6.gif'; send_mail($MAILTO, $MAILFROM, $MAILSUBJECT, $message_body); } #end if ($NOTIFY | $RECOVER) else { logit("DEBUG: (alert_notify) there is nothing to notify",1) if (DEBUG); } return 1; } #end sub alert_notify ######################################################################## # Subroutine get_time # # Purpose: calculate the time ######################################################################## sub get_time { # Declare Local Variables my ($sec, $min, $hour, $day, $mon, $year, $date, $time, $now); ($sec, $min, $hour, $day, $mon, $year) = (localtime)[0,1,2,3,4,5]; if ($sec < 10) { $sec = "0" . $sec } if ($min < 10) { $min = "0" . $min } if ($hour < 10) { $hour = "0" . $hour } $mon = $mon + 1; $year = $year + 1900; $date = $mon . "-" . $day . "-" . $year; $time = $hour . ":" . $min . ":" . $sec; $now = $date . " at " . 
$time; return ($date, $time); } #end sub get_time ######################################################################## # Subroutine load_data # # Purpose: load from file the list of devices that were previously down ######################################################################## sub load_data { # Declare Local Variables my $oTime; # Original timestamp when device went down my $lTime; # Last timestamp when device was checked my $rTime; # Recovery timestap when device recovered my $dTime; # Amount of time the device has been down my $name; # FQDN of the device being checked my $alert; # Number of Pager alerts sent # Open data file open DATA, "$datafile" or die "Can't open $datafile: $!\n"; logit("DEBUG: (load_data) starting to load hash \%disk",1) if (DEBUG); # Walk through data file while () { # Skip blank lines next if (/^\n$/); # Skip comments next if (/^#/); # Read a line of data, throw away iTime ($name, $oTime, $lTime, $dTime, $alert) = split(' '); # Build data structure $disk{$name} = "$name $oTime $lTime $dTime $alert"; logit("DEBUG: (load_data) reading disk{$name} = $name $oTime $lTime $dTime $alert",1) if (DEBUG); } #end while close DATA; return; } #end sub load_data ######################################################################## # Subroutine load_thres # # Purpose: load threshold data from file of devices that were previously down ######################################################################## sub load_thres { # Declare Local Variables my $oTime; # Original timestamp when device went down my $lTime; # Last timestamp when device was checked my $rTime; # Recovery timestap when device recovered my $dTime; # Amount of time the device has been down my $name; # FQDN of the device being checked my $alert; # Number of Pager alerts sent my $index; # Index variable for hash array my $tmwindow = $currentTime - $thresholdTime; # # Load threshold data from \$thresholdfile for threshold checking # # We can use the pingsweeprecord.dat file 
to check for thresholds # The format of that file appears below. We can load all the data and then # count the number of events within the threshold window, perhaps 60 minutes. # # Device Hostname Original Recover Total Down Date Time # Time Time Time Time #--------------------------------------------------------------------------------------- # #sw-ccr-8600-b.core.acme.org 1107277623 1107277801 238 0:00:03:58 2-1-2005 12:11:01 # # Open data file open THRESDATA, "$thresholdfile" or die "Can't open $thresholdfile: $!\n"; logit("DEBUG: (load_thres) starting to load hash \%thres",1) if (DEBUG); logit("DEBUG: (load_thres) threshold events = $thresholdEvents and time window = $tmwindow",1) if (DEBUG); # Walk through data file while () { # Skip blank lines next if (/^\n$/); # Skip comments next if (/^#/); ($name, $oTime, $rTime, $dTime) = split(' '); # If the recovery time was before the \$thresholdTime lets count this event if ($rTime > ($currentTime - $thresholdTime) ) { logit("DEBUG: (load_thres) threshold event for $name detected",1) if (DEBUG); $thres{$name}++; } } #end while close THRESDATA; # Lets dump all the devices that have reached the threshold if (DEBUG) { logit("DEBUG: (load_thres) checking threshold events for each device", 1); for my $idx (sort keys %thres) { if ($thres{$idx} > $thresholdEvents) { logit("DEBUG: (load_thres) threshold event limit reached for $idx events = $thres{$idx}",1); } } } return 1; } #end sub load_thres ######################################################################## # Subroutine write_data # # Purpose: write list of devices not responding to file for future ######################################################################## sub write_data { # Declare Local variables my $name; my $oTime; my $key; my $now; my $lTime; my $dTime; my $alert; # Find time $now = time; # Open data file open DATA, ">$datafile"; print DATA < $thresholdEvents)) { logit("DEBUG: (comp_data) threshold reached $name skipping recovery 
notification.",1) if (DEBUG); next; } # $dTime = &calc_down_time($oTime, $currentTime); #if ($currentTime - $oTime >= 60*2*1) { # SECS*MINUTES*HOURS #CHANGED TIMING BECAUSE RECOVERY REPORTS WERE MISSING FOR DOWN REPORTS if ($currentTime - $oTime >= 30*1*1) { # SECS*MINUTES*HOURS $RECOVER = 1; ($name, $oTime, $lTime, $dTime) = split(' ', $disk{$test}); logit("DEBUG: (comp_data) the following host has recovered $name",1) if (DEBUG); } #end if $currentTime } #end for my $test return 1; } #end sub comp_data ############################################################################ # Subroutine alert_pager # # Purpose: send email alert to pager smtp address if applicable ############################################################################ sub alert_pager { # Declare Local Variables my $name; my $oTime; my $lTime; my $dTime; my $alert; # if number of devices alerting for more than 5 minutes is greater # than 10 then please page someone to alert them if ($PAGERNOTIFY >= $pagealert) { logit("DEBUG: (alert_pager) issuing pager alert to $PAGERTO ($PAGERNOTIFY devices down)",1) if (DEBUG); open(SENDMAIL, "| /usr/lib/sendmail $PAGERTO") || die; print SENDMAIL "From: $MAILFROM\nTo: $PAGERTO\nSubject: $PAGERSUBJECT\n\n"; print SENDMAIL "Date: $sdate"; #print SENDMAIL "Server: $server"; print SENDMAIL "There is a farily large problem going on at work."; print SENDMAIL "Approximately $PAGERNOTIFY assets have been down for more than 5 minutes."; print SENDMAIL "You should check your email for a complete list of the affected assets."; print SENDMAIL "You will not recieve a page when this problem clearsassets."; close(SENDMAIL) || die; } else { logit("DEBUG: (alert_pager) not enough devices down - no pager alert ($PAGERNOTIFY devices down)",1) if (DEBUG); } #endif ($NOTIFY) return; } #end sub alert_pager ############################################################################# # Subroutine check_alert # # Purpose: check to see what alerts are necessary and set flags 
#############################################################################
sub check_alert {

    # Walks %data (devices currently down) and decides, per device, whether
    # an email or pager notification is due on this run.
    #
    # Reads:   %data, %thres, $thresholdEvents, $currentTime,
    #          $firstalert, $secondalert, @IGNORE_PAGERALERTS
    # Updates: %data (advances the alert stage), %notify (devices to email),
    #          $NOTIFY (set to 1 when any email is due),
    #          $PAGERNOTIFY (incremented per device counted toward paging),
    #          $SKIP (1 if the device last evaluated for paging was ignored)
    # Returns: nothing
    #
    # Each %data value is "name oTime lTime dTime alert" where alert is:
    #
    #   ALERT = 0       No further notifications for this device
    #   ALERT = 1       first email notification    (0 minute)
    #   ALERT = 2       single pager notification   (5 minute)
    #   ALERT = 3       second email notification   (15 minute)
    #   ALERT = 4       third email notification    (1 hour)
    #
    # History: the original script notified immediately upon a failure, which
    # produced many false positives.  Requiring 30+ seconds of downtime
    # instead led to RECOVERY notifications for devices that had never been
    # reported down.  The time requirement on the first email was therefore
    # removed (February 18, 2005).

    # Seconds a device must be down before it counts toward a page
    my $DownTimeToPage = 5*60;

    logit("DEBUG: (check_alert) checking for email notifications",1) if (DEBUG);

    $NOTIFY = 0;
    $SKIP = 0;

    for my $index (sort keys %data) {

        my ($name, $oTime, $lTime, $dTime, $alert) = split(' ', $data{$index});

        # A record without all five fields cannot be staged; skip it
        # instead of comparing undefined values.
        if (!defined $alert) {
            logit("DEBUG: (check_alert) malformed record for $index skipping.",1) if (DEBUG);
            next;
        }

        # Threshold checking
        if ( exists( $thres{$name} ) && ($thres{$name} > $thresholdEvents)) {
            logit("DEBUG: (check_alert) threshold reached on device $name skipping alert notification.",1) if (DEBUG);
            next;
        }

        my $downFor = $currentTime - $oTime;

        if ( $alert == 1 ) {

            # First email: sent on the first run after the device is
            # recorded down.
            logit("DEBUG: (check_alert) alerting on $name for 5 minutes",1) if (DEBUG);
            $data{$index} = "$name $oTime $lTime $dTime 2";
            $notify{$index} = "$name $oTime $lTime $dTime $alert";
            $NOTIFY = 1;

        } elsif ( ($alert == 2) and ($downFor >= $DownTimeToPage) ) {

            logit("DEBUG: (check_alert) resetting alert for $name to 3",1) if (DEBUG);
            $data{$index} = "$name $oTime $lTime $dTime 3";

            # Decide for THIS device only whether it is exempt from paging.
            # The flag is recomputed on every device; previously it stayed
            # set after the first ignored asset and suppressed the pager
            # count for every device that followed in the loop.
            $SKIP = (grep { $name =~ /$_/ } @IGNORE_PAGERALERTS) ? 1 : 0;

            if ($SKIP) {
                logit("DEBUG: (check_alert) pager alert skipping ignore asset",1) if (DEBUG);
            } else {
                $PAGERNOTIFY++;
                logit("DEBUG: (check_alert) pager alert now at $PAGERNOTIFY",1) if (DEBUG);
            }

        } elsif ( ($alert == 3) and ($downFor >= $firstalert) ) {

            logit("DEBUG: (check_alert) 15 minute alert for $name and clearing marker",1) if (DEBUG);
            $data{$index} = "$name $oTime $lTime $dTime 4";
            $notify{$index} = "$name $oTime $lTime $dTime $alert";
            $NOTIFY = 1;

        } elsif ( ($alert == 4) and ($downFor >= $secondalert) ) {

            logit("DEBUG: (check_alert) 60 minute alert for $name and clearing marker",1) if (DEBUG);
            $data{$index} = "$name $oTime $lTime $dTime 0";
            $notify{$index} = "$name $oTime $lTime $dTime $alert";
            $NOTIFY = 1;

        }

    } #endfor $index

    return;
} #end sub check_alert

#############################################################################
# Subroutine record_downtime
#
# Purpose: check to see what alerts are necessary and
set flags ############################################################################# sub record_downtime { # Declare Local variables my $name; my $oTime; my $key; my $now; my $lTime; my $dTime; my $alert; my $tdtime; # Find time $now = time; if ($RECOVER) { logit("DEBUG: (record_downtime) there is something todo",1) if (DEBUG); # Open data file open DATA, ">>$recordfile"; # print DATA <$templock") or die "$NOW: ERROR: Creating templock $templock: $!"; $main::Cleanfile = $templock; if (!link($templock,$lockfile)) { # Lock file exists - deal with it. my($nlink,$lockage) = (stat($lockfile))[3,9]; $lockage = time() - $lockage; if ($nlink < 2 or $lockage > 30*60) { #lockfile is alone and old unlink($lockfile) || do{ unlink $templock; die "$NOW: ERROR: Can't unlink stale lockfile ($lockfile). Permissions?\n"}; link($templock,$lockfile) || do{ unlink $templock; die "$NOW: ERROR: Can't create lockfile ($lockfile).\n". "Permission problem or another mrtg locking succesfully?\n"}; } else { unlink $templock; die "$NOW: ERROR: It looks as if you are running two copies of mrtg in parallel on\n". " the same config file. There is a lockfile ($lockfile) and it is\n". " is only $lockage seconds old ... Check your crontab.\n". " (/etc/crontab and /var/spool/cron/root) \n" if $lockage < 4; die "$NOW: ERROR: I guess another mrtg is running. A lockfile ($lockfile) aged\n". "$lockage seconds is hanging around. If you are sure that no other mrtg\n". 
"is running you can remove the lockfile\n"; } } } ############################################################################# # Subroutine load_database # # Purpose: output the gathered SNMP data to a text file ############################################################################# sub load_database { # Declare Local Variables my ($hostname, $ip, $sysDescr, $sysObjectID, $sysUpTime, $sysContact, $sysName, $sysLocation); # Let's open a DBI connection to the MySQL database my $dbh = DBI->connect($dsn,$dbuser,$dbpasswd) or die "ERROR: cannot open database: $DBI::errstr\n"; my $query_sth = $dbh->prepare("SELECT hostname FROM $table WHERE ping = ?"); # Walk through data and count ports my $rows = $query_sth->execute('Y'); # FETCHROW ARRAY my @results; while (@results = $query_sth->fetchrow()) { print "$results[0] $results[1]\n"; #push(@list, $hostnamt); } $query_sth->finish(); $dbh->disconnect || die "Failed to disconnect\n"; return 1; } #end sub load_database ############################################################################# # Subroutine send_mail # # Purpose: send SMTP messages to selected users and devices # Variables: MAILTO # MAILFROM # MAILSUBJECT # MAILMESSAGE ############################################################################# sub send_mail { # Local Variables my ($mailto, $mailfrom, $mailsubject, $mailmessage, $attachments); $mailto = $_[0]; $mailfrom = $_[1]; $mailsubject = $_[2]; $mailmessage = $_[3]; $attachments = $_[4]; ### Adjust the filenames my $powered_by_perl = '/var/www/html/images/circle_power_perl.gif'; my $technology_logo = '/var/www/html/image6.gif'; ### Create the multipart container my $msg = MIME::Lite->new ( From => $mailfrom, To => $mailto, Subject => $mailsubject, Type =>'multipart/mixed' ) or die "Error creating multipart container: $!\n"; ### Add the text message part $msg->attach ( Type => 'text/html', Data => $mailmessage ) or die "Error adding the text message part: $!\n"; ### Add the GIF file 1 
#$msg->attach ( # Type => 'image/gif', # Id => 'powered_by_perl', # Path => $powered_by_perl, # Encoding => 'base64', # Disposition => 'attachment' #) or die "Error adding GIF 1 $!\n"; ### Add the GIF file 2 #$msg->attach ( # Type => 'image/gif', # Id => 'technology_logo', # Path => $technology_logo, # Encoding => 'base64', # Disposition => 'attachment' #) or die "Error adding GIF 2 $!\n"; ### Send the Message MIME::Lite->send('smtp', $MAILHOST, Timeout=>60); $msg->send; logit("DEBUG: (sendmail) sending SMTP message to $mailto with $mailsubject", 1); return 1; }