Allow tuning for check_crm failure handling

This commit adds two new options, failed_actions_alert_type and
failed_actions_threshold, which map onto the check_crm options
--failedactions and --failcounts, respectively.
With the default option values, check_crm generates a critical alert as
soon as a single failed action has been recorded.
The actions check can be entirely bypassed if failed_actions_alert_type
is set to 'ignore'.

Closes-Bug: #1796400
Change-Id: I72f65bacba8bf17a13db19d2a3472f760776019a
This commit is contained in:
Andrea Ieri 2019-05-13 15:58:46 +02:00 committed by Ryan A Farrell
parent e28f8a9adc
commit 4d391e8107
3 changed files with 117 additions and 5 deletions

View File

@ -171,3 +171,17 @@ options:
description: |
A comma-separated list of nagios servicegroups. If left empty, the
nagios_context will be used as the servicegroup.
failed_actions_alert_type:
type: string
default: 'critical'
description: |
If the CRM status has recorded failed actions in any of the registered
resource agents, check_crm can optionally generate an alert.
Valid options: ignore/warning/critical
failed_actions_threshold:
type: int
default: 1
description: |
check_crm will not generate an alert unless at least this many failed
actions have been recorded. Has no effect if failed_actions_alert_type
is set to 'ignore'.

View File

@ -505,6 +505,19 @@ def stop():
@hooks.hook('nrpe-external-master-relation-joined',
'nrpe-external-master-relation-changed')
def update_nrpe_config():
# Validate options
valid_alerts = ['ignore', 'warning', 'critical']
if config('failed_actions_alert_type').lower() not in valid_alerts:
status_set('blocked',
'The value of option failed_actions_alert_type must be '
'among {}'.format(valid_alerts))
return
if config('failed_actions_threshold') <= 0:
status_set('blocked',
'The value of option failed_actions_threshold must be a '
'positive integer')
return
scripts_src = os.path.join(os.environ["CHARM_DIR"], "files",
"nrpe")
@ -531,25 +544,32 @@ def update_nrpe_config():
apt_install('python-dbus')
if config('failed_actions_alert_type').lower() == 'ignore':
check_crm_cmd = 'check_crm --failedactions=ignore'
else:
check_crm_cmd = ('check_crm --failcounts={} --failedactions={}'.format(
config('failed_actions_threshold'),
config('failed_actions_alert_type').lower()))
# corosync/crm checks
nrpe_setup.add_check(
shortname='corosync_rings',
description='Check Corosync rings {%s}' % current_unit,
description='Check Corosync rings {}'.format(current_unit),
check_cmd='check_corosync_rings')
nrpe_setup.add_check(
shortname='crm_status',
description='Check crm status {%s}' % current_unit,
check_cmd='check_crm')
description='Check crm status {}'.format(current_unit),
check_cmd=check_crm_cmd)
# process checks
nrpe_setup.add_check(
shortname='corosync_proc',
description='Check Corosync process {%s}' % current_unit,
description='Check Corosync process {}'.format(current_unit),
check_cmd='check_procs -c 1:1 -C corosync'
)
nrpe_setup.add_check(
shortname='pacemakerd_proc',
description='Check Pacemakerd process {%s}' % current_unit,
description='Check Pacemakerd process {}'.format(current_unit),
check_cmd='check_procs -c 1:1 -C pacemakerd'
)

View File

@ -441,3 +441,81 @@ class TestHooks(test_utils.CharmTestCase):
relation_set.assert_called_once_with(
relation_id='relid1',
**{'pacemaker-key': 'pcmkrkey'})
@mock.patch.object(hooks, 'apt_install')
@mock.patch('hooks.nrpe', autospec=True)
@mock.patch('hooks.os')
@mock.patch('hooks.glob')
@mock.patch.object(hooks, 'status_set')
@mock.patch.object(hooks, 'config')
def test_update_nrpe_config(self, config, status_set, mock_glob, mock_os,
                            nrpe, apt_install):
    """Check update_nrpe_config against valid and invalid option values.

    For every accepted spelling of failed_actions_alert_type the hook is
    expected to install python-dbus, build one NRPE object, register the
    four corosync/pacemaker checks (with a check_crm command line that
    matches the options) and write the NRPE configuration once.

    For an unknown alert type or a non-positive threshold the test
    expects a single status_set('blocked', ...) call with the matching
    message.
    """
    settings = {'failed_actions_alert_type': 'ignore',
                'failed_actions_threshold': 5}
    config.side_effect = lambda key: settings.get(key)

    # Mixed-case spellings: the expected command always uses lower case.
    for raw_alert in ("IGNore", "warning", "CRITICAL"):
        settings['failed_actions_alert_type'] = raw_alert
        nrpe.get_nagios_hostname.return_value = 'localhost'
        nrpe.get_nagios_unit_name.return_value = 'nagios/1'
        nrpe_setup = mock.MagicMock()
        nrpe.NRPE.return_value = nrpe_setup

        hooks.update_nrpe_config()

        nrpe.NRPE.assert_called_once_with(hostname='localhost')
        apt_install.assert_called_once_with('python-dbus')

        alert = raw_alert.lower()
        if alert == 'ignore':
            expected_crm_cmd = 'check_crm --failedactions=ignore'
        else:
            expected_crm_cmd = (
                'check_crm --failcounts={} --failedactions={}'.format(
                    settings['failed_actions_threshold'], alert))

        # (shortname, description, check_cmd) for each expected check.
        expected_checks = (
            ('corosync_rings',
             'Check Corosync rings nagios/1',
             'check_corosync_rings'),
            ('crm_status',
             'Check crm status nagios/1',
             expected_crm_cmd),
            ('corosync_proc',
             'Check Corosync process nagios/1',
             'check_procs -c 1:1 -C corosync'),
            ('pacemakerd_proc',
             'Check Pacemakerd process nagios/1',
             'check_procs -c 1:1 -C pacemakerd'),
        )
        for shortname, description, check_cmd in expected_checks:
            nrpe_setup.add_check.assert_any_call(
                shortname=shortname,
                description=description,
                check_cmd=check_cmd)

        nrpe_setup.write.assert_called_once()

        # Clear recorded calls so the "called once" assertions hold on
        # the next iteration.
        nrpe.reset_mock()
        apt_install.reset_mock()

    # An alert type outside the accepted set must block the unit.
    settings['failed_actions_alert_type'] = 'unsupported'
    settings['failed_actions_threshold'] = 1
    hooks.update_nrpe_config()
    accepted = ['ignore', 'warning', 'critical']
    status_set.assert_called_once_with(
        'blocked',
        'The value of option failed_actions_alert_type must be '
        'among {}'.format(accepted))
    status_set.reset_mock()

    # A threshold of zero must block the unit, even with a valid type.
    settings['failed_actions_alert_type'] = 'ignore'
    settings['failed_actions_threshold'] = 0
    hooks.update_nrpe_config()
    status_set.assert_called_once_with(
        'blocked',
        'The value of option failed_actions_threshold must be a '
        'positive integer')