def get_ceph_version():
    """
    Uses CLI to get the ceph version, because the status output changes from
    Luminous onwards (12.2.0 or higher)

    The version is read from the first dotted run of digits printed by
    ``ceph --version``, so builds carrying a suffix (for example
    ``13.0.2-1234-gdeadbee`` or ``12.2.4-0ubuntu1``) parse the same way as
    plain releases such as ``10.2.9``.

    Callers that need an "at least X.Y" test should compare leading
    components as a tuple, e.g. ``tuple(version[:2]) >= (12, 2)``, rather
    than comparing each component independently.

    :returns: list of integers, just the actual version number,
              e.g. ``[12, 2, 4]``
    :raises UnknownError: if the ``ceph`` command cannot be executed, exits
                          with a non-zero status, or prints no recognisable
                          version number
    """
    try:
        out_string = subprocess.check_output(['ceph',
                                              '--version']).decode('UTF-8')
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError (incl. FileNotFoundError) is raised when the ceph binary
        # is missing or not executable; report it as UNKNOWN instead of
        # letting a raw traceback escape from the check.
        raise UnknownError(
            "UNKNOWN: could not determine Ceph version, error: {}".format(e))
    # Only the leading numeric components matter; anything after them
    # (git-describe or distro suffix, commit hash, codename) is ignored.
    version_match = re.search(r'\d+(?:\.\d+)+', out_string)
    if version_match is None:
        raise UnknownError(
            "UNKNOWN: could not parse Ceph version from: {!r}".format(
                out_string))
    return [int(part) for part in version_match.group(0).split('.')]
re.match(ignorable, status): status_critical = True - status_msg.append(status['summary']) + status_msg.append(status) # If we got this far, then the status is not OK but the status lines # are all in our list of things we consider to be operational tasks. # Check the thresholds and return CRITICAL if exceeded, @@ -138,10 +167,10 @@ def check_ceph_status(args): status_msg.append("Recovering objects/sec {}".format(recovering)) if status_critical: msg = 'CRITICAL: ceph health: "{} {}"'.format( - status_data['health']['overall_status'], + overall_status, ", ".join(status_msg)) raise CriticalError(msg) - if status_data['health']['overall_status'] == 'HEALTH_WARN': + if overall_status == 'HEALTH_WARN': msg = "WARNING: {}".format(", ".join(status_msg)) raise WarnError(msg) message = "All OK" diff --git a/unit_tests/ceph_crit_luminous.json b/unit_tests/ceph_crit_luminous.json new file mode 100644 index 00000000..c81a3f36 --- /dev/null +++ b/unit_tests/ceph_crit_luminous.json @@ -0,0 +1,196 @@ +{ + "fsid": "a7285ad8-3961-11e8-b715-00163e030140", + "health": { + "checks": { + "OSD_DOWN": { + "severity": "HEALTH_WARN", + "summary": { + "message": "1 osds down" + } + }, + "PG_DEGRADED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "Degraded data redundancy: 31/906 objects degraded (3.422%), 74 pgs unclean, 74 pgs degraded" + } + } + }, + "status": "HEALTH_WARN" + }, + "election_epoch": 28, + "quorum": [ + 0, + 1, + 2 + ], + "quorum_names": [ + "juju-7cfc1d-1-lxd-0", + "juju-7cfc1d-0-lxd-0", + "juju-7cfc1d-12-lxd-0" + ], + "monmap": { + "epoch": 2, + "fsid": "a7285ad8-3961-11e8-b715-00163e030140", + "modified": "2018-04-06 06:37:04.978765", + "created": "2018-04-06 06:35:06.513449", + "features": { + "persistent": [ + "kraken", + "luminous" + ], + "optional": [] + }, + "mons": [ + { + "rank": 0, + "name": "juju-7cfc1d-1-lxd-0", + "addr": "172.18.250.75:6789/0", + "public_addr": "172.18.250.75:6789/0" + }, + { + "rank": 1, + "name": "juju-7cfc1d-0-lxd-0", + 
"addr": "172.18.250.76:6789/0", + "public_addr": "172.18.250.76:6789/0" + }, + { + "rank": 2, + "name": "juju-7cfc1d-12-lxd-0", + "addr": "172.18.250.84:6789/0", + "public_addr": "172.18.250.84:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 257, + "num_osds": 33, + "num_up_osds": 32, + "num_in_osds": 33, + "full": false, + "nearfull": false, + "num_remapped_pgs": 0 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 958 + }, + { + "state_name": "active+undersized+degraded", + "count": 74 + } + ], + "num_pgs": 1032, + "num_pools": 20, + "num_objects": 302, + "data_bytes": 580388173, + "bytes_used": 2971890057216, + "bytes_avail": 128989599563776, + "bytes_total": 131961489620992, + "degraded_objects": 31, + "degraded_total": 906, + "degraded_ratio": 0.034216 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + }, + "mgrmap": { + "epoch": 4, + "active_gid": 4131, + "active_name": "juju-7cfc1d-1-lxd-0", + "active_addr": "172.18.250.75:6800/88914", + "available": true, + "standbys": [ + { + "gid": 4134, + "name": "juju-7cfc1d-0-lxd-0", + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ] + }, + { + "gid": 4299, + "name": "juju-7cfc1d-12-lxd-0", + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ] + } + ], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} + }, + "servicemap": { + "epoch": 22, + "modified": "2018-04-14 06:25:03.499825", + "services": { + "rgw": { + "daemons": { + "summary": "", + "radosgw.gateway": { + "start_epoch": 22, + "start_stamp": "2018-04-14 06:25:02.277715", + "gid": 156351, + "addr": "172.18.250.74:0/2962286796", + "metadata": { + 
"arch": "x86_64", + "ceph_version": "ceph version 12.2.2 (cf0baeeeeba3b47f9427c6c97e2144b094b7e5ba) luminous (stable)", + "cpu": "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz", + "distro": "ubuntu", + "distro_description": "Ubuntu 16.04.4 LTS", + "distro_version": "16.04", + "frontend_config#0": "civetweb port=60", + "frontend_type#0": "civetweb", + "hostname": "juju-7cfc1d-1-lxd-1", + "kernel_description": "#43~16.04.1-Ubuntu SMP Wed Mar 14 17:48:43 UTC 2018", + "kernel_version": "4.13.0-38-generic", + "mem_swap_kb": "8388604", + "mem_total_kb": "528154640", + "num_handles": "1", + "os": "Linux", + "pid": "225019", + "zone_id": "34009c14-e608-47e6-84c5-bf2cefbe94f8", + "zone_name": "default", + "zonegroup_id": "7771c284-f980-41f0-861b-66c95357cb3d", + "zonegroup_name": "default" + } + } + } + } + } + } +} diff --git a/unit_tests/ceph_ok_luminous.json b/unit_tests/ceph_ok_luminous.json new file mode 100644 index 00000000..8a489d48 --- /dev/null +++ b/unit_tests/ceph_ok_luminous.json @@ -0,0 +1,180 @@ +{ + "fsid": "1111111-11111-1111-1111-111111111111", + "health": { + "checks": {}, + "status": "HEALTH_OK" + }, + "election_epoch": 28, + "quorum": [ + 0, + 1, + 2 + ], + "quorum_names": [ + "juju-badbad-1-lxd-0", + "juju-badbad-0-lxd-0", + "juju-badbad-12-lxd-0" + ], + "monmap": { + "epoch": 2, + "fsid": "1111111-11111-1111-1111-111111111111", + "modified": "2018-04-06 06:37:04.978765", + "created": "2018-04-06 06:35:06.513449", + "features": { + "persistent": [ + "kraken", + "luminous" + ], + "optional": [] + }, + "mons": [ + { + "rank": 0, + "name": "juju-badbad-1-lxd-0", + "addr": "10.11.12.75:6789/0", + "public_addr": "10.11.12.75:6789/0" + }, + { + "rank": 1, + "name": "juju-badbad-0-lxd-0", + "addr": "10.11.12.76:6789/0", + "public_addr": "10.11.12.76:6789/0" + }, + { + "rank": 2, + "name": "juju-badbad-12-lxd-0", + "addr": "10.11.12.84:6789/0", + "public_addr": "10.11.12.84:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 262, + "num_osds": 33, + 
"num_up_osds": 32, + "num_in_osds": 32, + "full": false, + "nearfull": false, + "num_remapped_pgs": 0 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 1032 + } + ], + "num_pgs": 1032, + "num_pools": 20, + "num_objects": 561, + "data_bytes": 1584814720, + "bytes_used": 2884842602496, + "bytes_avail": 125077821714432, + "bytes_total": 127962664316928, + "read_bytes_sec": 1513, + "read_op_per_sec": 1, + "write_op_per_sec": 0 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + }, + "mgrmap": { + "epoch": 4, + "active_gid": 4131, + "active_name": "juju-badbad-1-lxd-0", + "active_addr": "10.11.12.75:6800/88914", + "available": true, + "standbys": [ + { + "gid": 4134, + "name": "juju-badbad-0-lxd-0", + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ] + }, + { + "gid": 4299, + "name": "juju-badbad-12-lxd-0", + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ] + } + ], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} + }, + "servicemap": { + "epoch": 29, + "modified": "2018-04-18 06:25:04.076050", + "services": { + "rgw": { + "daemons": { + "summary": "", + "radosgw.gateway": { + "start_epoch": 29, + "start_stamp": "2018-04-18 06:25:02.612368", + "gid": 231504, + "addr": "10.11.12.78:0/2747422053", + "metadata": { + "arch": "x86_64", + "ceph_version": "ceph version 12.2.2 (cf0baeeeeba3b47f9427c6c97e2144b094b7e5ba) luminous (stable)", + "cpu": "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz", + "distro": "ubuntu", + "distro_description": "Ubuntu 16.04.4 LTS", + "distro_version": "16.04", + "frontend_config#0": "civetweb port=60", + "frontend_type#0": "civetweb", + 
"hostname": "juju-badbad-0-lxd-1", + "kernel_description": "#43~16.04.1-Ubuntu SMP Wed Mar 14 17:48:43 UTC 2018", + "kernel_version": "4.13.0-38-generic", + "mem_swap_kb": "8388604", + "mem_total_kb": "528154640", + "num_handles": "1", + "os": "Linux", + "pid": "225487", + "zone_id": "11111111-1111-1111-1111-111111111111", + "zone_name": "default", + "zonegroup_id": "11111111-1111-1111-1111-111111111111", + "zonegroup_name": "default" + } + } + } + } + } + } +} + diff --git a/unit_tests/test_check_ceph_status.py b/unit_tests/test_check_ceph_status.py index eeb13606..caf89bc9 100644 --- a/unit_tests/test_check_ceph_status.py +++ b/unit_tests/test_check_ceph_status.py @@ -26,7 +26,15 @@ import check_ceph_status @patch('subprocess.check_output') class NagiosTestCase(unittest.TestCase): - def test_health_ok(self, mock_subprocess): + def test_get_ceph_version(self, mock_subprocess): + mock_subprocess.return_value = 'ceph version 10.2.9 ' \ + '(2ee413f77150c0f375ff6f10edd6c8f9c7d060d0)'.encode('UTF-8') + ceph_version = check_ceph_status.get_ceph_version() + self.assertEqual(ceph_version, [10, 2, 9]) + + @patch('check_ceph_status.get_ceph_version') + def test_health_ok(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] with open('unit_tests/ceph_ok.json') as f: tree = f.read() mock_subprocess.return_value = tree.encode('UTF-8') @@ -34,7 +42,19 @@ class NagiosTestCase(unittest.TestCase): check_output = check_ceph_status.check_ceph_status(args) self.assertRegex(check_output, r"^All OK$") - def test_health_warn(self, mock_subprocess): + @patch('check_ceph_status.get_ceph_version') + def test_health_ok_luminous(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_ok_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--degraded_thresh', '1']) + check_output = 
check_ceph_status.check_ceph_status(args) + self.assertRegex(check_output, r"^All OK$") + + @patch('check_ceph_status.get_ceph_version') + def test_health_warn(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] with open('unit_tests/ceph_warn.json') as f: tree = f.read() mock_subprocess.return_value = tree.encode('UTF-8') @@ -42,7 +62,9 @@ class NagiosTestCase(unittest.TestCase): self.assertRaises(check_ceph_status.WarnError, lambda: check_ceph_status.check_ceph_status(args)) - def test_health_crit(self, mock_subprocess): + @patch('check_ceph_status.get_ceph_version') + def test_health_crit(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] with open('unit_tests/ceph_crit.json') as f: tree = f.read() mock_subprocess.return_value = tree.encode('UTF-8') @@ -50,7 +72,19 @@ class NagiosTestCase(unittest.TestCase): self.assertRaises(check_ceph_status.CriticalError, lambda: check_ceph_status.check_ceph_status(args)) - def test_health_lotsdegraded(self, mock_subprocess): + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_luminous(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_crit_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--degraded_thresh', '1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + @patch('check_ceph_status.get_ceph_version') + def test_health_lotsdegraded(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] with open('unit_tests/ceph_params.json') as f: tree = f.read() mock_subprocess.return_value = tree.encode('UTF-8') @@ -58,7 +92,9 @@ class NagiosTestCase(unittest.TestCase): self.assertRaises(check_ceph_status.CriticalError, lambda: check_ceph_status.check_ceph_status(args)) - def 
test_health_nodeepscrub(self, mock_subprocess): + @patch('check_ceph_status.get_ceph_version') + def test_health_nodeepscrub(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] with open('unit_tests/ceph_nodeepscrub.json') as f: tree = f.read() mock_subprocess.return_value = tree.encode('UTF-8') @@ -66,7 +102,9 @@ class NagiosTestCase(unittest.TestCase): self.assertRaises(check_ceph_status.CriticalError, lambda: check_ceph_status.check_ceph_status(args)) - def test_health_nodeepscrubok(self, mock_subprocess): + @patch('check_ceph_status.get_ceph_version') + def test_health_nodeepscrubok(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] with open('unit_tests/ceph_nodeepscrub.json') as f: tree = f.read() mock_subprocess.return_value = tree.encode('UTF-8')