NRPE: Don't report paused hacluster nodes as CRITICAL error

Previously, paused hacluster units showed up as a CRITICAL error
in Nagios even though they were only in 'standby' mode
in corosync.
The hacluster charm now passes the '-s' option to the check_crm
NRPE script so that alerts for standby units are ignored.

Change-Id: I976d5ff01d0156fbaa91f9028ac81b44c96881af
Closes-Bug: #1880576
This commit is contained in:
Martin Kalcok 2020-11-06 12:24:57 +01:00
parent 0ce34b17be
commit c385fef7b0
3 changed files with 37 additions and 21 deletions

View File

@ -1,6 +1,6 @@
#!/usr/bin/perl
#
# check_crm_v0_8
# check_crm_v0_10
#
# Copyright © 2013 Philip Garner, Sysnix Consultants Limited
#
@ -34,6 +34,8 @@
# v0.8 06/11/2018 - Choose whether to ignore/warn/crit on failed actions
# v0.9 18/02/2020 - Phase out failed actions check in favor of separate
# failcount thresholds
# v0.10 06/11/2020 - Don't report paused hacluster nodes if 'standbyignore' flag
# (-s) is specified
#
# NOTE:- Requires Perl 5.8 or higher & either the Perl Module Nagios::Plugin
# or Monitoring::Plugin, whichever is available for your system.
@ -200,10 +202,24 @@ foreach my $line (<$fh>) {
# Check Resources Stopped
$np->add_message( $warn_or_crit, ": $1 Stopped" );
}
elsif ( $line =~ m/\s*stopped\:\s*\[(.*)\]/i ) {
elsif ( $line =~ m/\s*stopped\:\s*\[\s(.*)\s\]/i ) {
# Check Master/Slave stopped
$np->add_message( $warn_or_crit, ": $1 Stopped" );
my @stopped_nodes = split ' ', $1;
my $report_nodes = "";
for my $node (@stopped_nodes) {
# Don't report standby nodes if 'standbyignore' is specified
if ( $np->opts->standbyignore && grep { $node eq $_ } @standby ) {
next
}
$report_nodes .= "${node} "
}
if ( $report_nodes ne "") {
chop $report_nodes;
$np->add_message( $warn_or_crit, ": $report_nodes Stopped" );
}
}
elsif ( $line =~ m/^failed actions\:/i ) {
if ($np->opts->failedactions =~ /^(warning|critical)$/i) {

View File

@ -601,7 +601,7 @@ def update_nrpe_config():
apt_install('python-dbus')
check_crm_cmd = 'check_crm'
check_crm_cmd = 'check_crm -s'
check_crm_cmd += ' --failedactions={}'.format(
config('failed_actions_alert_type').lower()
)

View File

@ -573,8 +573,8 @@ class TestHooks(test_utils.CharmTestCase):
nrpe.NRPE.assert_called_once_with(hostname='localhost')
apt_install.assert_called_once_with('python-dbus')
check_crm_cmd = ('check_crm --failedactions={} --failcount-warn={}'
' --failcount-crit={}'.format(
check_crm_cmd = ('check_crm -s --failedactions={} '
'--failcount-warn={} --failcount-crit={}'.format(
cfg['failed_actions_alert_type'].lower(),
cfg['res_failcount_warn'],
cfg['res_failcount_crit']))