Creates nrpe check for number of OSDs

An alert is triggered when the number of known OSDs in the osdmap differs
from the number of "in" or "up" OSDs.

Change-Id: Id3d43f0146452d0bbd73e1ce98616a994eaee090
Partial-Bug: 1735579
This commit is contained in:
Marian Gasparovic 2019-04-26 16:54:02 +02:00
parent 3d9ea3908e
commit caa1cd8d6a
6 changed files with 89 additions and 3 deletions

View File

@ -204,6 +204,12 @@ options:
description: |
Whether to report Critical instead of Warning when the nodeep-scrub
flag is set.
nagios_check_num_osds:
default: False
type: boolean
description: |
Whether to report an error when the number of known OSDs does not equal
the number of OSDs that are in or up.
nagios_additional_checks:
default: ""
type: string
@ -214,8 +220,8 @@ options:
.
Example:
.
{'noout': 'noout', 'too_few': 'too few PGs', 'clock': 'clock skew',
'osd-down': 'osds down', 'degraded_redundancy': 'Degraded data redundancy'}
{'noout_set': 'noout', 'too_few_PGs': 'too few PGs', 'clock': 'clock skew',
'degraded_redundancy': 'Degraded data redundancy'}
.
nagios_additional_checks_critical:
default: False

View File

@ -175,6 +175,20 @@ def check_ceph_status(args):
print(message_all_ok)
return message_all_ok
# if it is just --check_num_osds, deal with it and ignore overall health
if args.check_num_osds:
osdmap = status_data['osdmap']['osdmap']
num_osds = osdmap['num_osds']
num_up_osds = osdmap['num_up_osds']
num_in_osds = osdmap['num_in_osds']
if num_osds != num_up_osds or num_up_osds != num_in_osds:
msg = "CRITICAL: OSDs: {}, OSDs up: {}, OSDs in: {}".format(
num_osds, num_up_osds, num_in_osds)
raise CriticalError(msg)
message_ok = "OK: {} OSDs, all up and in".format(num_osds)
print(message_ok)
return message_ok
if overall_status != 'HEALTH_OK':
# Health is not OK, collect status message(s) and
# decide whether to return warning or critical
@ -265,6 +279,11 @@ def parse_args(args):
"positive. If the argument is not provided,"
"check returns a warning. Otherwise it "
"returns an error condition.")
parser.add_argument('--check_num_osds',
dest='check_num_osds', default=False,
action='store_true',
help="Check whether all OSDs are up and in")
return parser.parse_args(args)

View File

@ -877,6 +877,14 @@ def update_nrpe_config():
current_unit),
check_cmd=check_cmd
)
if config('nagios_check_num_osds'):
check_cmd = 'check_ceph_status.py -f {} --check_num_osds'.format(
STATUS_FILE)
nrpe_setup.add_check(
shortname='ceph_num_osds',
description='Check whether all OSDs are up and in',
check_cmd=check_cmd
)
nrpe_setup.write()

View File

@ -1 +1 @@
{"health":{"health":{"health_services":[{"mons":[{"name":"juju-2691ab-1-lxd-1","kb_total":155284096,"kb_used":1247744,"kb_avail":154036352,"avail_percent":99,"last_updated":"2017-05-17 03:31:35.562497","store_stats":{"bytes_total":1012055342,"bytes_sst":0,"bytes_log":29673298,"bytes_misc":982382044,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"juju-2691ab-13-lxd-0","kb_total":153820288,"kb_used":1361280,"kb_avail":152459008,"avail_percent":99,"last_updated":"2017-05-17 03:31:04.097201","store_stats":{"bytes_total":1370003168,"bytes_sst":0,"bytes_log":29813159,"bytes_misc":1340190009,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"juju-2691ab-2-lxd-1","kb_total":155251072,"kb_used":1373440,"kb_avail":153877632,"avail_percent":99,"last_updated":"2017-05-17 03:31:20.684777","store_stats":{"bytes_total":1400974192,"bytes_sst":0,"bytes_log":1129945,"bytes_misc":1399844247,"last_updated":"0.000000"},"health":"HEALTH_OK"}]}]},"timechecks":{"epoch":32,"round":24492,"round_status":"finished","mons":[{"name":"juju-2691ab-1-lxd-1","skew":0.000000,"latency":0.000000,"health":"HEALTH_OK"},{"name":"juju-2691ab-13-lxd-0","skew":0.000919,"latency":0.001036,"health":"HEALTH_OK"},{"name":"juju-2691ab-2-lxd-1","skew":0.000000,"latency":0.001009,"health":"HEALTH_OK"}]},"summary":[{"severity":"HEALTH_WARN","summary":"48 pgs backfill_wait"},{"severity":"HEALTH_WARN","summary":"45 pgs backfilling"},{"severity":"HEALTH_WARN","summary":"1 pgs degraded"},{"severity":"HEALTH_WARN","summary":"1 pgs recovery_wait"},{"severity":"HEALTH_WARN","summary":"22 pgs stuck unclean"},{"severity":"HEALTH_WARN","summary":"recovery 14\/46842755 objects degraded (0.000%)"},{"severity":"HEALTH_WARN","summary":"recovery 448540\/46842755 objects misplaced 
(0.958%)"}],"overall_status":"HEALTH_WARN","detail":[]},"fsid":"ca9451f1-5c4f-4e85-bb14-a08dfc0568f7","election_epoch":32,"quorum":[0,1,2],"quorum_names":["juju-2691ab-1-lxd-1","juju-2691ab-13-lxd-0","juju-2691ab-2-lxd-1"],"monmap":{"epoch":1,"fsid":"ca9451f1-5c4f-4e85-bb14-a08dfc0568f7","modified":"2016-12-03 08:09:21.854671","created":"2016-12-03 08:09:21.854671","mons":[{"rank":0,"name":"juju-2691ab-1-lxd-1","addr":"10.182.254.221:6789\/0"},{"rank":1,"name":"juju-2691ab-13-lxd-0","addr":"10.182.254.229:6789\/0"},{"rank":2,"name":"juju-2691ab-2-lxd-1","addr":"10.182.254.242:6789\/0"}]},"osdmap":{"osdmap":{"epoch":141540,"num_osds":314,"num_up_osds":314,"num_in_osds":314,"full":false,"nearfull":false,"num_remapped_pgs":92}},"pgmap":{"pgs_by_state":[{"state_name":"active+clean","count":9274},{"state_name":"active+remapped+wait_backfill","count":48},{"state_name":"active+remapped+backfilling","count":45},{"state_name":"active+clean+scrubbing+deep","count":9},{"state_name":"active+remapped","count":2},{"state_name":"active+recovery_wait+degraded","count":1},{"state_name":"active+clean+scrubbing","count":1}],"version":13885884,"num_pgs":9380,"data_bytes":64713222471610,"bytes_used":193613093122048,"bytes_avail":690058090491904,"bytes_total":883671183613952,"degraded_objects":14,"degraded_total":46842755,"degraded_ratio":0.000000,"misplaced_objects":448540,"misplaced_total":46842755,"misplaced_ratio":0.009575,"recovering_objects_per_sec":389,"recovering_bytes_per_sec":1629711746,"recovering_keys_per_sec":0,"num_objects_recovered":218,"num_bytes_recovered":912252928,"num_keys_recovered":0,"read_bytes_sec":117041457,"write_bytes_sec":293414043,"read_op_per_sec":5282,"write_op_per_sec":5270},"fsmap":{"epoch":1,"by_rank":[]}}
{"health":{"health":{"health_services":[{"mons":[{"name":"juju-2691ab-1-lxd-1","kb_total":155284096,"kb_used":1247744,"kb_avail":154036352,"avail_percent":99,"last_updated":"2017-05-17 03:31:35.562497","store_stats":{"bytes_total":1012055342,"bytes_sst":0,"bytes_log":29673298,"bytes_misc":982382044,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"juju-2691ab-13-lxd-0","kb_total":153820288,"kb_used":1361280,"kb_avail":152459008,"avail_percent":99,"last_updated":"2017-05-17 03:31:04.097201","store_stats":{"bytes_total":1370003168,"bytes_sst":0,"bytes_log":29813159,"bytes_misc":1340190009,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"juju-2691ab-2-lxd-1","kb_total":155251072,"kb_used":1373440,"kb_avail":153877632,"avail_percent":99,"last_updated":"2017-05-17 03:31:20.684777","store_stats":{"bytes_total":1400974192,"bytes_sst":0,"bytes_log":1129945,"bytes_misc":1399844247,"last_updated":"0.000000"},"health":"HEALTH_OK"}]}]},"timechecks":{"epoch":32,"round":24492,"round_status":"finished","mons":[{"name":"juju-2691ab-1-lxd-1","skew":0.000000,"latency":0.000000,"health":"HEALTH_OK"},{"name":"juju-2691ab-13-lxd-0","skew":0.000919,"latency":0.001036,"health":"HEALTH_OK"},{"name":"juju-2691ab-2-lxd-1","skew":0.000000,"latency":0.001009,"health":"HEALTH_OK"}]},"summary":[{"severity":"HEALTH_WARN","summary":"48 pgs backfill_wait"},{"severity":"HEALTH_WARN","summary":"45 pgs backfilling"},{"severity":"HEALTH_WARN","summary":"1 pgs degraded"},{"severity":"HEALTH_WARN","summary":"1 pgs recovery_wait"},{"severity":"HEALTH_WARN","summary":"22 pgs stuck unclean"},{"severity":"HEALTH_WARN","summary":"recovery 14\/46842755 objects degraded (0.000%)"},{"severity":"HEALTH_WARN","summary":"recovery 448540\/46842755 objects misplaced 
(0.958%)"}],"overall_status":"HEALTH_WARN","detail":[]},"fsid":"ca9451f1-5c4f-4e85-bb14-a08dfc0568f7","election_epoch":32,"quorum":[0,1,2],"quorum_names":["juju-2691ab-1-lxd-1","juju-2691ab-13-lxd-0","juju-2691ab-2-lxd-1"],"monmap":{"epoch":1,"fsid":"ca9451f1-5c4f-4e85-bb14-a08dfc0568f7","modified":"2016-12-03 08:09:21.854671","created":"2016-12-03 08:09:21.854671","mons":[{"rank":0,"name":"juju-2691ab-1-lxd-1","addr":"10.182.254.221:6789\/0"},{"rank":1,"name":"juju-2691ab-13-lxd-0","addr":"10.182.254.229:6789\/0"},{"rank":2,"name":"juju-2691ab-2-lxd-1","addr":"10.182.254.242:6789\/0"}]},"osdmap":{"osdmap":{"epoch":141540,"num_osds":314,"num_up_osds":311,"num_in_osds":311,"full":false,"nearfull":false,"num_remapped_pgs":92}},"pgmap":{"pgs_by_state":[{"state_name":"active+clean","count":9274},{"state_name":"active+remapped+wait_backfill","count":48},{"state_name":"active+remapped+backfilling","count":45},{"state_name":"active+clean+scrubbing+deep","count":9},{"state_name":"active+remapped","count":2},{"state_name":"active+recovery_wait+degraded","count":1},{"state_name":"active+clean+scrubbing","count":1}],"version":13885884,"num_pgs":9380,"data_bytes":64713222471610,"bytes_used":193613093122048,"bytes_avail":690058090491904,"bytes_total":883671183613952,"degraded_objects":14,"degraded_total":46842755,"degraded_ratio":0.000000,"misplaced_objects":448540,"misplaced_total":46842755,"misplaced_ratio":0.009575,"recovering_objects_per_sec":389,"recovering_bytes_per_sec":1629711746,"recovering_keys_per_sec":0,"num_objects_recovered":218,"num_bytes_recovered":912252928,"num_keys_recovered":0,"read_bytes_sec":117041457,"write_bytes_sec":293414043,"read_op_per_sec":5282,"write_op_per_sec":5270},"fsmap":{"epoch":1,"by_rank":[]}}

View File

@ -52,6 +52,7 @@ CHARM_CONFIG = {'config-flags': '',
'nagios_raise_nodeepscrub': True,
'nagios_additional_checks': "",
'nagios_additional_checks_critical': False,
'nagios_check_num_osds': False,
'disable-pg-max-object-skew': False}

View File

@ -288,3 +288,55 @@ class NagiosTestCase(unittest.TestCase):
'--additional_check_critical'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Num OSD OK, pre-luminous
@patch('check_ceph_status.get_ceph_version')
def test_num_osds_ok_pre_luminous(self,
                                  mock_ceph_version,
                                  mock_subprocess):
    """--check_num_osds returns an OK message for ceph_ok.json.

    The Ceph version is mocked to 10.2.9 and the mocked subprocess
    output is the content of the ceph_ok.json fixture.
    """
    with open('unit_tests/ceph_ok.json') as fixture:
        status_json = fixture.read()
    mock_subprocess.return_value = status_json.encode('UTF-8')
    mock_ceph_version.return_value = [10, 2, 9]
    cli_args = check_ceph_status.parse_args(['--check_num_osds'])
    # The returned status message must start with "OK".
    self.assertRegex(check_ceph_status.check_ceph_status(cli_args),
                     r"^OK")
# Num OSD error, pre-luminous
@patch('check_ceph_status.get_ceph_version')
def test_num_osds_error_pre_luminous(self,
                                     mock_ceph_version,
                                     mock_subprocess):
    """--check_num_osds raises CriticalError for ceph_warn.json.

    The Ceph version is mocked to 10.2.9 and the mocked subprocess
    output is the content of the ceph_warn.json fixture.
    """
    with open('unit_tests/ceph_warn.json') as fixture:
        status_json = fixture.read()
    mock_subprocess.return_value = status_json.encode('UTF-8')
    mock_ceph_version.return_value = [10, 2, 9]
    cli_args = check_ceph_status.parse_args(['--check_num_osds'])
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(cli_args)
# Num OSD OK, luminous
@patch('check_ceph_status.get_ceph_version')
def test_num_osds_ok_luminous(self,
                              mock_ceph_version,
                              mock_subprocess):
    """--check_num_osds returns an OK message for the luminous fixture.

    The Ceph version is mocked to 12.2.0 and the mocked subprocess
    output is the content of ceph_many_warnings_luminous.json.
    """
    with open('unit_tests/ceph_many_warnings_luminous.json') as fixture:
        status_json = fixture.read()
    mock_subprocess.return_value = status_json.encode('UTF-8')
    mock_ceph_version.return_value = [12, 2, 0]
    cli_args = check_ceph_status.parse_args(['--check_num_osds'])
    # The returned status message must start with "OK".
    self.assertRegex(check_ceph_status.check_ceph_status(cli_args),
                     r"^OK")
# Num OSD error, luminous
@patch('check_ceph_status.get_ceph_version')
def test_num_osds_error_luminous(self,
                                 mock_ceph_version,
                                 mock_subprocess):
    """--check_num_osds raises CriticalError for the degraded fixture.

    The Ceph version is mocked to 12.2.0 and the mocked subprocess
    output is the content of ceph_degraded_luminous.json.
    """
    with open('unit_tests/ceph_degraded_luminous.json') as fixture:
        status_json = fixture.read()
    mock_subprocess.return_value = status_json.encode('UTF-8')
    mock_ceph_version.return_value = [12, 2, 0]
    cli_args = check_ceph_status.parse_args(['--check_num_osds'])
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(cli_args)