From c385fef7b0659f673844a317eb6ba14f8f1821c8 Mon Sep 17 00:00:00 2001 From: Martin Kalcok Date: Fri, 6 Nov 2020 12:24:57 +0100 Subject: [PATCH] NRPE: Don't report paused hacluster nodes as CRITICAL error Previously, paused hacluster units showed up as CRITICAL errors in Nagios even though they were only in 'standby' mode in corosync. The hacluster charm now uses the '-s' option of the check_crm nrpe script to ignore alerts for standby units. Change-Id: I976d5ff01d0156fbaa91f9028ac81b44c96881af Closes-Bug: #1880576 --- files/nrpe/check_crm | 52 +++++++++++++++++++----------- hooks/hooks.py | 2 +- unit_tests/test_hacluster_hooks.py | 4 +-- 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/files/nrpe/check_crm b/files/nrpe/check_crm index 7732d4c..5dc0772 100755 --- a/files/nrpe/check_crm +++ b/files/nrpe/check_crm @@ -1,6 +1,6 @@ #!/usr/bin/perl # -# check_crm_v0_8 +# check_crm_v0_10 # # Copyright © 2013 Philip Garner, Sysnix Consultants Limited # @@ -19,21 +19,23 @@ # # Authors: Phil Garner - phil@sysnix.com & Peter Mottram - peter@sysnix.com # -# v0.1 09/01/2011 -# v0.2 11/01/2011 -# v0.3 22/08/2011 - bug fix and changes suggested by Vadym Chepkov -# v0.4 23/08/2011 - update for spelling and anchor regex capture (Vadym Chepkov) -# v0.5 29/09/2011 - Add standby warn/crit suggested by Sönke Martens & removal -# of 'our' to 'my' to completely avoid problems with ePN -# v0.6 14/03/2013 - Change from \w+ to \S+ in stopped check to cope with -# Servers that have non word charachters in. Suggested by -# Igal Baevsky. -# v0.7 01/09/2013 - In testing as still not fully tested. Adds optional -# constraints check (Boris Wesslowski). 
Adds fail count -# threshold ( Zoran Bosnjak & Marko Hrastovec ) -# v0.8 06/11/2018 - Choose whether to ignore/warn/crit on failed actions -# v0.9 18/02/2020 - Phase out failed actions check in favor of separate -# failcount thresholds +# v0.1 09/01/2011 +# v0.2 11/01/2011 +# v0.3 22/08/2011 - bug fix and changes suggested by Vadym Chepkov +# v0.4 23/08/2011 - update for spelling and anchor regex capture (Vadym Chepkov) +# v0.5 29/09/2011 - Add standby warn/crit suggested by Sönke Martens & removal +# of 'our' to 'my' to completely avoid problems with ePN +# v0.6 14/03/2013 - Change from \w+ to \S+ in stopped check to cope with +# Servers that have non word charachters in. Suggested by +# Igal Baevsky. +# v0.7 01/09/2013 - In testing as still not fully tested. Adds optional +# constraints check (Boris Wesslowski). Adds fail count +# threshold ( Zoran Bosnjak & Marko Hrastovec ) +# v0.8 06/11/2018 - Choose whether to ignore/warn/crit on failed actions +# v0.9 18/02/2020 - Phase out failed actions check in favor of separate +# failcount thresholds +# v0.10 06/11/2020 - Don't report paused hacluster nodes if 'standbyignore' flag +# (-s) is specified # # NOTE:- Requires Perl 5.8 or higher & either the Perl Module Nagios::Plugin # or Monitoring::Plugin, whichever is available for your system. 
@@ -200,10 +202,24 @@ foreach my $line (<$fh>) { # Check Resources Stopped $np->add_message( $warn_or_crit, ": $1 Stopped" ); } - elsif ( $line =~ m/\s*stopped\:\s*\[(.*)\]/i ) { + elsif ( $line =~ m/\s*stopped\:\s*\[\s(.*)\s\]/i ) { # Check Master/Slave stopped - $np->add_message( $warn_or_crit, ": $1 Stopped" ); + my @stopped_nodes = split ' ', $1; + my $report_nodes = ""; + + for my $node (@stopped_nodes) { + # Don't report standby nodes if 'standbyignore' is specified + if ( $np->opts->standbyignore && grep { $node eq $_ } @standby ) { + next + } + $report_nodes .= "${node} " + } + + if ( $report_nodes ne "") { + chop $report_nodes; + $np->add_message( $warn_or_crit, ": $report_nodes Stopped" ); + } } elsif ( $line =~ m/^failed actions\:/i ) { if ($np->opts->failedactions =~ /^(warning|critical)$/i) { diff --git a/hooks/hooks.py b/hooks/hooks.py index 268dda3..f93d938 100755 --- a/hooks/hooks.py +++ b/hooks/hooks.py @@ -601,7 +601,7 @@ def update_nrpe_config(): apt_install('python-dbus') - check_crm_cmd = 'check_crm' + check_crm_cmd = 'check_crm -s' check_crm_cmd += ' --failedactions={}'.format( config('failed_actions_alert_type').lower() ) diff --git a/unit_tests/test_hacluster_hooks.py b/unit_tests/test_hacluster_hooks.py index 6668f95..64c5919 100644 --- a/unit_tests/test_hacluster_hooks.py +++ b/unit_tests/test_hacluster_hooks.py @@ -573,8 +573,8 @@ class TestHooks(test_utils.CharmTestCase): nrpe.NRPE.assert_called_once_with(hostname='localhost') apt_install.assert_called_once_with('python-dbus') - check_crm_cmd = ('check_crm --failedactions={} --failcount-warn={}' - ' --failcount-crit={}'.format( + check_crm_cmd = ('check_crm -s --failedactions={} ' + '--failcount-warn={} --failcount-crit={}'.format( cfg['failed_actions_alert_type'].lower(), cfg['res_failcount_warn'], cfg['res_failcount_crit']))