Merge "Update Nagios check for Luminous"

This commit is contained in:
Zuul 2018-05-09 13:05:18 +00:00 committed by Gerrit Code Review
commit 2aa99d2af9
4 changed files with 455 additions and 12 deletions

View File

@ -68,6 +68,23 @@ def check_file_freshness(filename, newer_than=3600):
% (filename, time.ctime(mtime)))
def get_ceph_version():
    """
    Uses CLI to get the ceph version, because the status output changes from
    Luminous onwards (12.2.0 or higher)

    :returns: list of integers, just the actual version number
              (e.g. [12, 2, 2])
    :raises: UnknownError if the 'ceph' command cannot be run, exits
             non-zero, or prints something that does not contain a dotted
             numeric version as its third whitespace-separated field
    """
    try:
        raw_output = subprocess.check_output(['ceph', '--version'])
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable 'ceph' binary; without
        # it the check would die with a traceback instead of reporting
        # UNKNOWN.
        raise UnknownError(
            "UNKNOWN: could not determine Ceph version, error: {}".format(e))
    try:
        # Expected form: "ceph version 12.2.2 (<sha1>) luminous (stable)";
        # the third whitespace-separated field is the dotted version.
        out_string = raw_output.decode('UTF-8')
        out_version = [int(x) for x in out_string.split()[2].split(".")]
    except (IndexError, ValueError):
        # IndexError: fewer than three fields; ValueError: undecodable
        # bytes or a non-numeric version component.
        raise UnknownError(
            "UNKNOWN: could not parse Ceph version from output: {!r}".format(
                raw_output))
    return out_version
def check_ceph_status(args):
"""
Used to check the status of a Ceph cluster. Uses the output of 'ceph
@ -109,15 +126,27 @@ def check_ceph_status(args):
required_keys = ['health', 'monmap', 'pgmap']
if not all(key in status_data.keys() for key in required_keys):
raise UnknownError('UNKNOWN: status data is incomplete')
ceph_version = get_ceph_version()
if ceph_version[0] >= 12 and ceph_version[1] >= 2:
# This is Luminous or above
overall_status = status_data['health'].get('status')
luminous = True
else:
overall_status = status_data['health'].get('overall_status')
luminous = False
if status_data['health']['overall_status'] != 'HEALTH_OK':
if overall_status != 'HEALTH_OK':
# Health is not OK, check if any lines are not in our list of OK
# any lines that don't match, check is critical
status_msg = []
for status in status_data['health']['summary']:
if not re.match(ignorable, status['summary']):
if luminous:
status_messages = [x['summary']['message'] for x in status_data['health'].get('checks').values()]
else:
status_messages = [x['summary'] for x in status_data['health']['summary']]
for status in status_messages:
if not re.match(ignorable, status):
status_critical = True
status_msg.append(status['summary'])
status_msg.append(status)
# If we got this far, then the status is not OK but the status lines
# are all in our list of things we consider to be operational tasks.
# Check the thresholds and return CRITICAL if exceeded,
@ -138,10 +167,10 @@ def check_ceph_status(args):
status_msg.append("Recovering objects/sec {}".format(recovering))
if status_critical:
msg = 'CRITICAL: ceph health: "{} {}"'.format(
status_data['health']['overall_status'],
overall_status,
", ".join(status_msg))
raise CriticalError(msg)
if status_data['health']['overall_status'] == 'HEALTH_WARN':
if overall_status == 'HEALTH_WARN':
msg = "WARNING: {}".format(", ".join(status_msg))
raise WarnError(msg)
message = "All OK"

View File

@ -0,0 +1,196 @@
{
"fsid": "a7285ad8-3961-11e8-b715-00163e030140",
"health": {
"checks": {
"OSD_DOWN": {
"severity": "HEALTH_WARN",
"summary": {
"message": "1 osds down"
}
},
"PG_DEGRADED": {
"severity": "HEALTH_WARN",
"summary": {
"message": "Degraded data redundancy: 31/906 objects degraded (3.422%), 74 pgs unclean, 74 pgs degraded"
}
}
},
"status": "HEALTH_WARN"
},
"election_epoch": 28,
"quorum": [
0,
1,
2
],
"quorum_names": [
"juju-7cfc1d-1-lxd-0",
"juju-7cfc1d-0-lxd-0",
"juju-7cfc1d-12-lxd-0"
],
"monmap": {
"epoch": 2,
"fsid": "a7285ad8-3961-11e8-b715-00163e030140",
"modified": "2018-04-06 06:37:04.978765",
"created": "2018-04-06 06:35:06.513449",
"features": {
"persistent": [
"kraken",
"luminous"
],
"optional": []
},
"mons": [
{
"rank": 0,
"name": "juju-7cfc1d-1-lxd-0",
"addr": "172.18.250.75:6789/0",
"public_addr": "172.18.250.75:6789/0"
},
{
"rank": 1,
"name": "juju-7cfc1d-0-lxd-0",
"addr": "172.18.250.76:6789/0",
"public_addr": "172.18.250.76:6789/0"
},
{
"rank": 2,
"name": "juju-7cfc1d-12-lxd-0",
"addr": "172.18.250.84:6789/0",
"public_addr": "172.18.250.84:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 257,
"num_osds": 33,
"num_up_osds": 32,
"num_in_osds": 33,
"full": false,
"nearfull": false,
"num_remapped_pgs": 0
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 958
},
{
"state_name": "active+undersized+degraded",
"count": 74
}
],
"num_pgs": 1032,
"num_pools": 20,
"num_objects": 302,
"data_bytes": 580388173,
"bytes_used": 2971890057216,
"bytes_avail": 128989599563776,
"bytes_total": 131961489620992,
"degraded_objects": 31,
"degraded_total": 906,
"degraded_ratio": 0.034216
},
"fsmap": {
"epoch": 1,
"by_rank": []
},
"mgrmap": {
"epoch": 4,
"active_gid": 4131,
"active_name": "juju-7cfc1d-1-lxd-0",
"active_addr": "172.18.250.75:6800/88914",
"available": true,
"standbys": [
{
"gid": 4134,
"name": "juju-7cfc1d-0-lxd-0",
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
]
},
{
"gid": 4299,
"name": "juju-7cfc1d-12-lxd-0",
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
]
}
],
"modules": [
"balancer",
"restful",
"status"
],
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
],
"services": {}
},
"servicemap": {
"epoch": 22,
"modified": "2018-04-14 06:25:03.499825",
"services": {
"rgw": {
"daemons": {
"summary": "",
"radosgw.gateway": {
"start_epoch": 22,
"start_stamp": "2018-04-14 06:25:02.277715",
"gid": 156351,
"addr": "172.18.250.74:0/2962286796",
"metadata": {
"arch": "x86_64",
"ceph_version": "ceph version 12.2.2 (cf0baeeeeba3b47f9427c6c97e2144b094b7e5ba) luminous (stable)",
"cpu": "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz",
"distro": "ubuntu",
"distro_description": "Ubuntu 16.04.4 LTS",
"distro_version": "16.04",
"frontend_config#0": "civetweb port=60",
"frontend_type#0": "civetweb",
"hostname": "juju-7cfc1d-1-lxd-1",
"kernel_description": "#43~16.04.1-Ubuntu SMP Wed Mar 14 17:48:43 UTC 2018",
"kernel_version": "4.13.0-38-generic",
"mem_swap_kb": "8388604",
"mem_total_kb": "528154640",
"num_handles": "1",
"os": "Linux",
"pid": "225019",
"zone_id": "34009c14-e608-47e6-84c5-bf2cefbe94f8",
"zone_name": "default",
"zonegroup_id": "7771c284-f980-41f0-861b-66c95357cb3d",
"zonegroup_name": "default"
}
}
}
}
}
}
}

View File

@ -0,0 +1,180 @@
{
"fsid": "1111111-11111-1111-1111-111111111111",
"health": {
"checks": {},
"status": "HEALTH_OK"
},
"election_epoch": 28,
"quorum": [
0,
1,
2
],
"quorum_names": [
"juju-badbad-1-lxd-0",
"juju-badbad-0-lxd-0",
"juju-badbad-12-lxd-0"
],
"monmap": {
"epoch": 2,
"fsid": "1111111-11111-1111-1111-111111111111",
"modified": "2018-04-06 06:37:04.978765",
"created": "2018-04-06 06:35:06.513449",
"features": {
"persistent": [
"kraken",
"luminous"
],
"optional": []
},
"mons": [
{
"rank": 0,
"name": "juju-badbad-1-lxd-0",
"addr": "10.11.12.75:6789/0",
"public_addr": "10.11.12.75:6789/0"
},
{
"rank": 1,
"name": "juju-badbad-0-lxd-0",
"addr": "10.11.12.76:6789/0",
"public_addr": "10.11.12.76:6789/0"
},
{
"rank": 2,
"name": "juju-badbad-12-lxd-0",
"addr": "10.11.12.84:6789/0",
"public_addr": "10.11.12.84:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 262,
"num_osds": 33,
"num_up_osds": 32,
"num_in_osds": 32,
"full": false,
"nearfull": false,
"num_remapped_pgs": 0
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 1032
}
],
"num_pgs": 1032,
"num_pools": 20,
"num_objects": 561,
"data_bytes": 1584814720,
"bytes_used": 2884842602496,
"bytes_avail": 125077821714432,
"bytes_total": 127962664316928,
"read_bytes_sec": 1513,
"read_op_per_sec": 1,
"write_op_per_sec": 0
},
"fsmap": {
"epoch": 1,
"by_rank": []
},
"mgrmap": {
"epoch": 4,
"active_gid": 4131,
"active_name": "juju-badbad-1-lxd-0",
"active_addr": "10.11.12.75:6800/88914",
"available": true,
"standbys": [
{
"gid": 4134,
"name": "juju-badbad-0-lxd-0",
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
]
},
{
"gid": 4299,
"name": "juju-badbad-12-lxd-0",
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
]
}
],
"modules": [
"balancer",
"restful",
"status"
],
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
],
"services": {}
},
"servicemap": {
"epoch": 29,
"modified": "2018-04-18 06:25:04.076050",
"services": {
"rgw": {
"daemons": {
"summary": "",
"radosgw.gateway": {
"start_epoch": 29,
"start_stamp": "2018-04-18 06:25:02.612368",
"gid": 231504,
"addr": "10.11.12.78:0/2747422053",
"metadata": {
"arch": "x86_64",
"ceph_version": "ceph version 12.2.2 (cf0baeeeeba3b47f9427c6c97e2144b094b7e5ba) luminous (stable)",
"cpu": "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz",
"distro": "ubuntu",
"distro_description": "Ubuntu 16.04.4 LTS",
"distro_version": "16.04",
"frontend_config#0": "civetweb port=60",
"frontend_type#0": "civetweb",
"hostname": "juju-badbad-0-lxd-1",
"kernel_description": "#43~16.04.1-Ubuntu SMP Wed Mar 14 17:48:43 UTC 2018",
"kernel_version": "4.13.0-38-generic",
"mem_swap_kb": "8388604",
"mem_total_kb": "528154640",
"num_handles": "1",
"os": "Linux",
"pid": "225487",
"zone_id": "11111111-1111-1111-1111-111111111111",
"zone_name": "default",
"zonegroup_id": "11111111-1111-1111-1111-111111111111",
"zonegroup_name": "default"
}
}
}
}
}
}
}

View File

@ -26,7 +26,15 @@ import check_ceph_status
@patch('subprocess.check_output')
class NagiosTestCase(unittest.TestCase):
def test_health_ok(self, mock_subprocess):
def test_get_ceph_version(self, mock_subprocess):
    # Feed a Jewel-style 'ceph --version' banner through the mocked
    # subprocess call and expect the dotted version back as integers.
    banner = ('ceph version 10.2.9 '
              '(2ee413f77150c0f375ff6f10edd6c8f9c7d060d0)')
    mock_subprocess.return_value = banner.encode('UTF-8')
    self.assertEqual(check_ceph_status.get_ceph_version(), [10, 2, 9])
@patch('check_ceph_status.get_ceph_version')
def test_health_ok(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_ok.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
@ -34,7 +42,19 @@ class NagiosTestCase(unittest.TestCase):
check_output = check_ceph_status.check_ceph_status(args)
self.assertRegex(check_output, r"^All OK$")
def test_health_warn(self, mock_subprocess):
@patch('check_ceph_status.get_ceph_version')
def test_health_ok_luminous(self, mock_ceph_version, mock_subprocess):
    # Healthy Luminous-format status fixture: the check should return the
    # plain "All OK" message.
    with open('unit_tests/ceph_ok_luminous.json') as fixture:
        status_json = fixture.read()
    mock_ceph_version.return_value = [12, 2, 0]
    mock_subprocess.return_value = status_json.encode('UTF-8')
    cli_args = check_ceph_status.parse_args(['--degraded_thresh', '1'])
    result = check_ceph_status.check_ceph_status(cli_args)
    self.assertRegex(result, r"^All OK$")
@patch('check_ceph_status.get_ceph_version')
def test_health_warn(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_warn.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
@ -42,7 +62,9 @@ class NagiosTestCase(unittest.TestCase):
self.assertRaises(check_ceph_status.WarnError,
lambda: check_ceph_status.check_ceph_status(args))
def test_health_crit(self, mock_subprocess):
@patch('check_ceph_status.get_ceph_version')
def test_health_crit(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_crit.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
@ -50,7 +72,19 @@ class NagiosTestCase(unittest.TestCase):
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
def test_health_lotsdegraded(self, mock_subprocess):
@patch('check_ceph_status.get_ceph_version')
def test_health_crit_luminous(self, mock_ceph_version, mock_subprocess):
    # Luminous-format fixture for a cluster in a critical state: the check
    # is expected to raise CriticalError.
    with open('unit_tests/ceph_crit_luminous.json') as fixture:
        status_json = fixture.read()
    mock_ceph_version.return_value = [12, 2, 0]
    mock_subprocess.return_value = status_json.encode('UTF-8')
    cli_args = check_ceph_status.parse_args(['--degraded_thresh', '1'])
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(cli_args)
@patch('check_ceph_status.get_ceph_version')
def test_health_lotsdegraded(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_params.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
@ -58,7 +92,9 @@ class NagiosTestCase(unittest.TestCase):
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
def test_health_nodeepscrub(self, mock_subprocess):
@patch('check_ceph_status.get_ceph_version')
def test_health_nodeepscrub(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_nodeepscrub.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
@ -66,7 +102,9 @@ class NagiosTestCase(unittest.TestCase):
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
def test_health_nodeepscrubok(self, mock_subprocess):
@patch('check_ceph_status.get_ceph_version')
def test_health_nodeepscrubok(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_nodeepscrub.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')