Add monitoring for object-replicator logs disappearing
As noted in lp#1691570, there is an issue with storage I/O during coro-thread cleanup that affects the swift-object-replicator's ability to complete replication successfully. This is most easily observed as the absence of the "replicated" percentage-complete messages that the swift-object-replicator daemon should write to syslog every 5 minutes. This patch monitors for, and alerts on, the "replicated" line being missing from syslog within the past 15 minutes. Change-Id: Ieb15da3f3f67fa9bcad03151e36c70faae4c36c9 Closes-Bug: 1691570
This commit is contained in:
parent
971df56d61
commit
74daa465d6
14
config.yaml
14
config.yaml
|
@ -130,6 +130,20 @@ options:
|
|||
default: "-m -r 60 180 10 20"
|
||||
type: string
|
||||
description: String appended to nagios check
|
||||
nagios-replication-check-params:
|
||||
default: "replicated 15 2 1"
|
||||
type: string
|
||||
description: |
|
||||
Space delimited parameters for check_swift_replicator_logs.sh.
|
||||
search_pattern
|
||||
interval_in_minutes
|
||||
minimum_hits_before_warning
|
||||
minimum_hits_before_critical
|
||||
Default of "replicated 15 2 1" leads to warning alert when there
|
||||
have not been at least 2 lines matching "replicated" in the last 15
|
||||
minutes, and critical if there have been no matching lines in the
|
||||
last 15 minutes.
|
||||
Set to blank string "" to disable the check.
|
||||
nagios_context:
|
||||
default: "juju"
|
||||
type: string
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
#!/bin/bash
# NRPE wrapper that runs check_timed_logs.pl against /var/log/syslog in
# -reverse mode, i.e. it alerts when too FEW matching lines were logged.
#
# Positional arguments (all optional, defaults in brackets):
#   $1  search pattern handed to -pattern              [replicated]
#   $2  look-back interval in minutes                  [15]
#   $3  minimum number of hits before WARNING          [2]
#   $4  minimum number of hits before CRITICAL         [1]

pattern=${1:-replicated}
interval=${2:-15}
warn_min=${3:-2}
crit_min=${4:-1}

# Every expansion is quoted so that a pattern containing whitespace or glob
# characters reaches the plugin as a single, unmodified argument.
# stderr is folded into stdout so that error text ends up in the check output.
exec sudo -u root /usr/local/lib/nagios/plugins/check_timed_logs.pl \
    -pattern "$pattern" \
    -logfile /var/log/syslog \
    -interval "$interval" \
    -w "$warn_min" \
    -c "$crit_min" \
    -reverse 2>&1
|
|
@ -0,0 +1,195 @@
|
|||
#!/usr/bin/perl
|
||||
##############################################################################
|
||||
#
|
||||
# NAME: check_timed_logs.pl
|
||||
#
|
||||
# AUTHOR: Gerd Radecke
|
||||
#
|
||||
# COMMENT: Script searches a text file for the appearance of a given RegEx within a given time period.
|
||||
# Using additional parameters you can adjust: Time string format,
|
||||
# time string position, number of pattern matches required to be "successful".
|
||||
#
|
||||
# Return Values for NRPE:
|
||||
# OK - There are only 0 instances of $pattern in the last $interval minutes (0)
|
||||
# CRITICAL - There are $hits instances of \"$pattern\" in the last $interval minutes (2)
|
||||
# WARNING - There are $hits instances of \"$pattern\" in the last $interval minutes (1)
|
||||
# UNKNOWN - There were no files matching the passed filename (3)
|
||||
#
|
||||
# REQUIRES: perl-Time-Piece perl-File-ReadBackwards
|
||||
# ON RHEL-based systems you can run: yum install perl-Time-Piece perl-File-ReadBackwards
|
||||
#
|
||||
# CHANGELOG:
|
||||
# 1.0 2013-02-19 - initial version
|
||||
# 1.0.1 2013-02-27 - fixed false variable reference
|
||||
# 1.0.2 2013-10-07 - integrated threshold comparison fix by Christoph Tavan - thanks ;)
|
||||
# 1.0.3 2019-12-23 - Added --reverse flag to check for presence of lines within last $interval - drewn3ss
|
||||
# 1.0.4 2019-12-30 - Updated time_pattern default to match ubuntu syslog timepattern
|
||||
#
|
||||
##############################################################################
|
||||
|
||||
|
||||
use File::ReadBackwards; # EPEL RPM: perl-File-ReadBackwards.noarch
|
||||
use Getopt::Long;
|
||||
use Time::Piece; # RHEL package: perl-Time-Piece
|
||||
use File::Find;
|
||||
|
||||
# Run under the C locale (presumably so month/day names in the log timestamps
# are parsed the same way regardless of the system locale).
$ENV{"LC_ALL"} = "C";

# Defaults; each one can be overridden by the matching command line option.
$time_pattern  = '%b %e %H:%M:%S';   # Ubuntu syslog time stamp layout
$warning       = 1;
$critical      = 1;
$reverse       = 0;
$time_position = 0;

# Parse the command line. $result holds the return value of GetOptions.
# Note: the "?" alias belongs to "help" only; it used to be declared on
# "reverse" as well, which is a duplicate option specification.
$result = GetOptions (
    "pattern=s"      => \$pattern,        # string e.g. "CRITICAL"
    "logfile=s"      => \$logfile,        # string e.g. "/var/log/messages"
    "interval=i"     => \$interval,       # int e.g. 30 for half an hour
    "timepattern=s"  => \$time_pattern,   # string e.g. '%Y-%m-%d %H:%M:%S'
    "timeposition=i" => \$time_position,  # int, each line is split into strings on the space character, this provides the index of the first string block for the time
    "warning|w=i"    => \$warning,        # int e.g. 3
    "critical|c=i"   => \$critical,       # int e.g. 5
    "debug|d|vv"     => \$debug,          # flag/boolean
    "verbose|v"      => \$verbose,        # flag/boolean
    "reverse|r"      => \$reverse,        # flag/boolean - should we report on absence of pattern rather than presence
    "help|h|?"       => \$usage           # flag/boolean - is help called?
);
|
||||
|
||||
# Print the usage text and stop when help was requested or when one of the
# mandatory options (-pattern, -logfile, -interval) is missing or invalid.
# The interval is compared numerically; it must be larger than zero.
if ($usage
    || !(defined($pattern) && $pattern ne "")
    || !(defined($logfile) && $logfile ne "")
    || !(defined($interval) && $interval > 0)) {
    print "\nUsage: $0
\t -pattern <regex-pattern>
\t -logfile <path to log file>
\t -interval <minutes>
\t -reverse # report on absence of enough entries in the timeframe
\t [-timepattern <POSIX time pattern>]
\t [-warning|w <number_of_required_hits>] [-critical|c <number_of_required_hits>]
\t [-timeposition <time_string_index_on_line>] \n\n";
    print "To allow for rotating logfiles, any file that matches the passed filename and was changed within the passed interval is checked. e.g. If you pass /var/log/applog, this could match /var/log/applog.0, /var/log/applog.old and so on. However, it does not handle compressed (e.g. gzip/bzip) files. \n\n";
    # Keep this text in sync with the default assigned to $time_pattern.
    print "Default time pattern is: %b %e %H:%M:%S => Dec 31 17:20:40\n";
    print "Example Time patterns (from a RHEL system):
BSD/Syslog: %b %d %H:%M:%S => Dec 31 17:20:40
Apache Logs: %d/%b/%Y:%H:%M:%S (with -timeposition 3) => 31/Dec/2012:17:20:40
Websphere Logs: %d-%b-%Y %I:%M:%S %p => 31-Dec-2012 05:20:40 PM
Nagios logs: %s => 1361260238 (seconds since 01-01-1970) \n";
    print "For a posix time format documentation check out: http://linux.die.net/man/3/strftime \n\n";
    print "Default warning/critical threshold of pattern matches to find is: 1 -> unless you change this, you will only get OK or CRITICAL, but never WARNING\n\n";
    print "Default time position is 0 \n";
    print "\t Time Position: each line is split into an array of strings on the space character, this provides the index for the first time string.\n";
    print "\t Note: If the line starts with the time, that means we start at index 0.\n\n";
    print "The values for interval and warning/critical need to be larger than zero \n";
    # Exit with the UNKNOWN status (3): a plain "exit" returns 0, which a
    # monitoring system would interpret as OK for a check that never ran.
    exit 3;
}
|
||||
|
||||
# Current local time as a Time::Piece object (tzoffset is called on it below).
my $now = localtime;

# Lines with a time stamp at or before this moment are outside the window.
$oldestDate = $now - $interval*60;
if ($debug) { print "Now: $now and tzoffset: ". ($now)->tzoffset ."\n"; }
if ($debug) { print "Oldest date: $oldestDate and tzoffset: ". ($oldestDate)->tzoffset ."\n"; }

$hits = 0;            # number of matches for the regex within the log files will be counted in this variable
$validFileNames = 0;  # number of files that match the given filename

# The time stamp spans (number of spaces in the time pattern) + 1 fields of a
# log line; process() uses this count to reassemble the date string.
my @dateFields = $time_pattern =~ / /g;  # one element per space in the time pattern
my $dateFieldsCount = @dateFields;       # number of spaces in the date format

if ($debug) {
    $verbose = 1;  # if we debug, we want to have all information
    print "Interval: $interval equals " . ($interval/1440) . " Fraction of days.\n";
}

# Directory part of the log file path: everything up to and including the last
# slash. A bare file name has no directory part, so fall back to the current
# directory instead of handing an undefined value to find().
if ($logfile =~ m{^(.*/)}) {
    $DIR = $1;
}
else {
    $DIR = './';
}

# Walk the directory; process() is called for every entry found and updates
# $hits and $validFileNames.
@files = find(\&process, $DIR);
|
||||
# File::Find callback, called once for every entry found below $DIR.
#
# For each text file whose path contains $logfile and whose modification time
# lies within the last $interval minutes, the file is read backwards (newest
# line first). Every line inside the time window that matches $pattern
# increments $hits; the scan of a file stops at the first line that is older
# than $oldestDate.
#
# Reads the globals $logfile, $interval, $pattern, $time_pattern,
# $time_position, $oldestDate, $debug, $verbose and the file lexicals $now and
# $dateFieldsCount. Updates the globals $validFileNames and $hits.
# Dies if a matching file cannot be opened.
sub process {

    # Match only files that are text files (-T) and whose path contains the
    # file name. \Q..\E makes the name a literal string, so characters such as
    # "." in the path are not treated as regex metacharacters.
    return unless $File::Find::name =~ m/\Q$logfile\E/ && (-T);

    $validFileNames += 1;
    if ($debug) { print "Found: $File::Find::name has age " . (-M) ." (in Fraction of days) \n"; }

    # perldoc defines -M : Script start time minus file modification time, in days.
    # e.g. 24 hours ago -> 1, 6 hours ago -> 0.25
    # Skip files whose last change is not within the interval.
    return unless (-M) < ($interval/1440);

    my $LOGS = File::ReadBackwards->new($File::Find::name) or
        die "Can't read file: $File::Find::name\n";

    while (defined(my $line = $LOGS->readline)) {
        my @fields = split ' ', $line;  # split the line into an array, split on ' '(space)

        # Concatenate all fields that belong to the time stamp into one parseable string.
        my $dateString = "";
        for my $i (0 .. $dateFieldsCount) {
            $dateString .= $fields[$time_position + $i] . " ";
        }
        $dateString =~ s/^\s+|\s+$//g;   # remove both leading and trailing whitespace
        $dateString =~ s/<|>|\]|\[//g;   # remove brackets
        #if ($debug) { print "Datestring: $dateString \n";} # this is only needed if you are unsure which strings of the array are part of your datestring

        # strptime dies on input it cannot parse (blank lines, continuation
        # lines without a time stamp). Skip such lines rather than aborting
        # the whole check.
        my $dt = eval { Time::Piece->strptime($dateString, $time_pattern) };
        next unless defined $dt;

        # Time::Piece assumes the parsed dates are UTC, we need to adjust to the local tz offset.
        my $dt_tzadjusted = $dt - $now->tzoffset;

        # Some date formats don't have the year information e.g. Dec 31 15:50:57
        # -> the year is parsed as 1970, which is probably never correct.
        # Correct it to this year or last year.
        if ($dt->year == 1970) {
            # We cannot set the year directly, so add the years that have passed since 1970.
            $dt = $dt->add_years($now->year - 1970);
            $dt_tzadjusted = $dt - $now->tzoffset;
            # If $now is early January and the line is from the end of last
            # year, we added one year too many: a date in the future is moved
            # back by one year.
            if ($dt_tzadjusted > $now) {
                $dt = $dt->add_years(-1);
                $dt_tzadjusted = $dt - $now->tzoffset;
            }
        }

        # The file is read newest-first: once a line is not newer than the
        # oldest date we care about, leave the loop -> next file if available.
        last unless $dt_tzadjusted > $oldestDate;

        if ($line =~ m/$pattern/) {  # the line contains the regex pattern
            if ($debug) { print $dt . " => "; }
            if ($verbose) { print $line; }
            $hits++;
        }
    }

    # Close the handle held by the File::ReadBackwards object.
    $LOGS->close;
    return;
}## the find sub process ends here
|
||||
|
||||
|
||||
|
||||
# Translate the hit count into a Nagios status line and exit code:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.
#
# The "no matching file" case is tested first. Without any file $hits is
# always 0, so in reverse mode the CRITICAL branch would otherwise fire and
# the UNKNOWN status could never be reported.
if ($validFileNames == 0) {
    print "UNKNOWN - There were no files matching the passed filename: \"$logfile\"\n";
    exit 3;
}

if (!$reverse) {
    # Normal mode: alert when the pattern shows up too often.
    if ($hits >= ($critical + 0)) {
        print "CRITICAL - There are $hits instances of \"$pattern\" in the last $interval minutes\n";
        exit 2;
    }
    if ($hits >= ($warning + 0)) {
        print "WARNING - There are $hits instances of \"$pattern\" in the last $interval minutes\n";
        exit 1;
    }
    print "OK - There are only $hits instances of \"$pattern\" in the last $interval minutes - Warning threshold is $warning\n";
    exit 0;
}
else {
    # Reverse mode: alert when the pattern does not show up often enough.
    if ($hits < ($critical + 0)) {
        print "CRITICAL - There are only $hits instances of \"$pattern\" in the last $interval minutes\n";
        exit 2;
    }
    if ($hits < ($warning + 0)) {
        print "WARNING - There are only $hits instances of \"$pattern\" in the last $interval minutes\n";
        exit 1;
    }
    print "OK - There are at least $hits instances of \"$pattern\" in the last $interval minutes - Warning threshold is $warning\n";
    exit 0;
}
|
||||
|
|
@ -1 +1,2 @@
|
|||
nagios ALL=(swift) NOPASSWD:/usr/bin/swift-init status *
|
||||
nagios ALL=NOPASSWD:/usr/local/lib/nagios/plugins/check_timed_logs.pl *
|
||||
|
|
|
@ -385,6 +385,12 @@ def update_nrpe_config():
|
|||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nrpe-external-master',
|
||||
'check_swift_storage.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_swift_storage.py'))
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nrpe-external-master',
|
||||
'check_timed_logs.pl'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_timed_logs.pl'))
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nrpe-external-master',
|
||||
'check_swift_replicator_logs.sh'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_swift_replicator_logs.sh'))
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nrpe-external-master',
|
||||
'check_swift_service'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_swift_service'))
|
||||
|
@ -405,6 +411,16 @@ def update_nrpe_config():
|
|||
check_cmd='check_swift_storage.py {}'.format(
|
||||
config('nagios-check-params'))
|
||||
)
|
||||
if config('nagios-replication-check-params'):
|
||||
nrpe_setup.add_check(
|
||||
shortname='swift_replicator_health',
|
||||
description='Check swift object replicator log reporting',
|
||||
check_cmd='check_swift_replicator_logs.sh {}'.format(
|
||||
config('nagios-replication-check-params'))
|
||||
)
|
||||
else:
|
||||
nrpe_setup.remove_check(shortname='swift_replicator_health')
|
||||
|
||||
nrpe.add_init_service_checks(nrpe_setup, SWIFT_SVCS, current_unit)
|
||||
nrpe_setup.write()
|
||||
|
||||
|
|
|
@ -112,6 +112,8 @@ PACKAGES = [
|
|||
'python-psutil',
|
||||
'ufw',
|
||||
'xfsprogs',
|
||||
'libfile-readbackwards-perl',
|
||||
'libtime-piece-perl',
|
||||
]
|
||||
|
||||
PY3_PACKAGES = [
|
||||
|
|
|
@ -176,7 +176,8 @@ class SwiftStorageRelationsTests(CharmTestCase):
|
|||
self.apt_install.assert_called_with(
|
||||
['gdisk', 'lvm2', 'swift', 'swift-account',
|
||||
'swift-container', 'swift-object', 'python-jinja2',
|
||||
'python-psutil', 'ufw', 'xfsprogs'],
|
||||
'python-psutil', 'ufw', 'xfsprogs',
|
||||
'libfile-readbackwards-perl', 'libtime-piece-perl'],
|
||||
fatal=True)
|
||||
self.assertTrue(self.update_nrpe_config.called)
|
||||
self.assertTrue(mock_ensure_devs_tracked.called)
|
||||
|
|
|
@ -567,7 +567,8 @@ class SwiftStorageUtilsTests(CharmTestCase):
|
|||
options=dpkg_opts,
|
||||
packages=['gdisk', 'lvm2', 'swift', 'swift-account',
|
||||
'swift-container', 'swift-object', 'python-jinja2',
|
||||
'python-psutil', 'ufw', 'xfsprogs'],
|
||||
'python-psutil', 'ufw', 'xfsprogs',
|
||||
'libfile-readbackwards-perl', 'libtime-piece-perl'],
|
||||
fatal=True
|
||||
)
|
||||
self.assertTrue(mock_remove_old_packages.called)
|
||||
|
@ -600,6 +601,7 @@ class SwiftStorageUtilsTests(CharmTestCase):
|
|||
options=dpkg_opts,
|
||||
packages=['gdisk', 'lvm2', 'swift', 'swift-account',
|
||||
'swift-container', 'swift-object', 'ufw', 'xfsprogs',
|
||||
'libfile-readbackwards-perl', 'libtime-piece-perl',
|
||||
'python3-jinja2', 'python3-psutil', 'python3-six',
|
||||
'python3-swift'],
|
||||
fatal=True
|
||||
|
|
Loading…
Reference in New Issue