NRPE: Don't report paused hacluster nodes as a CRITICAL error
Previously, paused hacluster units showed up as a CRITICAL error in Nagios even though they were only in 'standby' mode in Corosync. The hacluster charm now passes the '-s' option to the check_crm NRPE script so that alerts for standby units are ignored. Change-Id: I976d5ff01d0156fbaa91f9028ac81b44c96881af Closes-Bug: #1880576
This commit is contained in:
parent
0ce34b17be
commit
c385fef7b0
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/perl
|
||||
#
|
||||
# check_crm_v0_8
|
||||
# check_crm_v0_10
|
||||
#
|
||||
# Copyright © 2013 Philip Garner, Sysnix Consultants Limited
|
||||
#
|
||||
|
@ -34,6 +34,8 @@
|
|||
# v0.8 06/11/2018 - Choose whether to ignore/warn/crit on failed actions
|
||||
# v0.9 18/02/2020 - Phase out failed actions check in favor of separate
|
||||
# failcount thresholds
|
||||
# v0.10 06/11/2020 - Don't report paused hacluster nodes if 'standbyignore' flag
|
||||
# (-s) is specified
|
||||
#
|
||||
# NOTE:- Requires Perl 5.8 or higher & either the Perl Module Nagios::Plugin
|
||||
# or Monitoring::Plugin, whichever is available for your system.
|
||||
|
@ -200,10 +202,24 @@ foreach my $line (<$fh>) {
|
|||
# Check Resources Stopped
|
||||
$np->add_message( $warn_or_crit, ": $1 Stopped" );
|
||||
}
|
||||
elsif ( $line =~ m/\s*stopped\:\s*\[(.*)\]/i ) {
|
||||
elsif ( $line =~ m/\s*stopped\:\s*\[\s(.*)\s\]/i ) {
|
||||
|
||||
# Check Master/Slave stopped
|
||||
$np->add_message( $warn_or_crit, ": $1 Stopped" );
|
||||
my @stopped_nodes = split ' ', $1;
|
||||
my $report_nodes = "";
|
||||
|
||||
for my $node (@stopped_nodes) {
|
||||
# Don't report standby nodes if 'standbyignore' is specified
|
||||
if ( $np->opts->standbyignore && grep { $node eq $_ } @standby ) {
|
||||
next
|
||||
}
|
||||
$report_nodes .= "${node} "
|
||||
}
|
||||
|
||||
if ( $report_nodes ne "") {
|
||||
chop $report_nodes;
|
||||
$np->add_message( $warn_or_crit, ": $report_nodes Stopped" );
|
||||
}
|
||||
}
|
||||
elsif ( $line =~ m/^failed actions\:/i ) {
|
||||
if ($np->opts->failedactions =~ /^(warning|critical)$/i) {
|
||||
|
|
|
@ -601,7 +601,7 @@ def update_nrpe_config():
|
|||
|
||||
apt_install('python-dbus')
|
||||
|
||||
check_crm_cmd = 'check_crm'
|
||||
check_crm_cmd = 'check_crm -s'
|
||||
check_crm_cmd += ' --failedactions={}'.format(
|
||||
config('failed_actions_alert_type').lower()
|
||||
)
|
||||
|
|
|
@ -573,8 +573,8 @@ class TestHooks(test_utils.CharmTestCase):
|
|||
nrpe.NRPE.assert_called_once_with(hostname='localhost')
|
||||
apt_install.assert_called_once_with('python-dbus')
|
||||
|
||||
check_crm_cmd = ('check_crm --failedactions={} --failcount-warn={}'
|
||||
' --failcount-crit={}'.format(
|
||||
check_crm_cmd = ('check_crm -s --failedactions={} '
|
||||
'--failcount-warn={} --failcount-crit={}'.format(
|
||||
cfg['failed_actions_alert_type'].lower(),
|
||||
cfg['res_failcount_warn'],
|
||||
cfg['res_failcount_crit']))
|
||||
|
|
Loading…
Reference in New Issue