From 35c8e40e83e526755541a3fd7b8e366c6a96b2d4 Mon Sep 17 00:00:00 2001 From: Marian Gasparovic Date: Tue, 6 Nov 2018 12:20:25 +0100 Subject: [PATCH] Don't return Critical when ceph is in warning state. Current implementation returns Critical when Ceph is in warning state, checking for some known exceptions which are considered operational tasks. However this causes many Alarms. This patch changes the behavior to report Warning when Ceph is in HEALTH_WARN. If known operational tasks are exceeding thresholds, Critical is returned. Change-Id: I7a330189da8f0ba9168cedb534823c5e8f4795ba --- config.yaml | 14 +- files/nagios/check_ceph_status.py | 93 +++--- hooks/ceph_hooks.py | 4 +- unit_tests/ceph_degraded_luminous.json | 147 ++++++++ unit_tests/ceph_error.json | 118 +++++++ unit_tests/ceph_many_warnings_luminous.json | 147 ++++++++ unit_tests/ceph_nodeepscrub.json | 353 +++++++++++--------- unit_tests/ceph_nodeepscrub_luminous.json | 102 ++++++ unit_tests/test_ceph_hooks.py | 2 +- unit_tests/test_check_ceph_status.py | 203 ++++++++--- 10 files changed, 919 insertions(+), 264 deletions(-) create mode 100644 unit_tests/ceph_degraded_luminous.json create mode 100644 unit_tests/ceph_error.json create mode 100644 unit_tests/ceph_many_warnings_luminous.json create mode 100644 unit_tests/ceph_nodeepscrub_luminous.json diff --git a/config.yaml b/config.yaml index f66d96fc..9e23a7ba 100644 --- a/config.yaml +++ b/config.yaml @@ -189,17 +189,21 @@ options: type: float description: "Threshold for degraded ratio (0.1 = 10%)" nagios_misplaced_thresh: - default: 10.0 + default: 1.0 type: float description: "Threshold for misplaced ratio (0.1 = 10%)" nagios_recovery_rate: default: '1' type: string - description: Recovery rate below which we consider recovery to be stalled - nagios_ignore_nodeepscub: - default: False + description: | + Recovery rate (in objects/s) below which we consider recovery + to be stalled. 
+ nagios_raise_nodeepscrub: + default: True type: boolean - description: Whether to ignore the nodeep-scrub flag + description: | + Whether to report Critical instead of Warning when the nodeep-scrub + flag is set. use-direct-io: type: boolean default: True diff --git a/files/nagios/check_ceph_status.py b/files/nagios/check_ceph_status.py index ef978023..9839bb85 100755 --- a/files/nagios/check_ceph_status.py +++ b/files/nagios/check_ceph_status.py @@ -102,10 +102,6 @@ def check_ceph_status(args): :returns string, describing the status of the ceph cluster. """ - ignorable = (r'\d+ pgs (?:backfill|degraded|recovery_wait|stuck unclean)|' - 'recovery \d+\/\d+ objects (?:degraded|misplaced)') - if args.ignore_nodeepscrub: - ignorable = ignorable + '|nodeep-scrub flag\(s\) set' status_critical = False if args.status_file: check_file_freshness(args.status_file) @@ -136,41 +132,60 @@ def check_ceph_status(args): luminous = False if overall_status != 'HEALTH_OK': - # Health is not OK, check if any lines are not in our list of OK - # any lines that don't match, check is critical + # Health is not OK, collect status message(s) and + # decide whether to return warning or critical + status_critical = False status_msg = [] if luminous: - status_messages = [x['summary']['message'] for x in status_data['health'].get('checks').values()] + status_messages = [x['summary']['message'] + for x in + status_data['health'].get('checks').values()] else: - status_messages = [x['summary'] for x in status_data['health']['summary']] + status_messages = [x['summary'] + for x in + status_data['health']['summary']] for status in status_messages: - if not re.match(ignorable, status): + status_msg.append(status) + # Check if nodeepscrub is set and whether it should raise an error + if args.raise_nodeepscrub: + if re.match("nodeep-scrub flag", status): + status_critical = True + if overall_status == 'HEALTH_CRITICAL' or \ + overall_status == 'HEALTH_ERR': + # HEALTH_ERR, report critical + 
status_critical = True + else: + # HEALTH_WARN + # Check the threshold for a list of operational tasks, + # and return CRITICAL if exceeded + degraded_ratio = float(status_data['pgmap'].get('degraded_ratio', + 0.0)) + if degraded_ratio > args.degraded_thresh: status_critical = True - status_msg.append(status) - # If we got this far, then the status is not OK but the status lines - # are all in our list of things we consider to be operational tasks. - # Check the thresholds and return CRITICAL if exceeded, - # otherwise there's something not accounted for and we want to know - # about it with a WARN alert. - degraded_ratio = status_data['pgmap'].get('degraded_ratio', 0.0) - if degraded_ratio > args.degraded_thresh: - status_critical = True - status_msg.append("Degraded ratio: {}".format(degraded_ratio)) - misplaced_ratio = status_data['pgmap'].get('misplaced_ratio', 0.0) - if misplaced_ratio > args.misplaced_thresh: - status_critical = True - status_msg.append("Misplaced ratio: {}".format(misplaced_ratio)) - recovering = status_data['pgmap'].get('recovering_objects_per_sec', - 0.0) - if recovering < args.recovery_rate: - status_critical = True - status_msg.append("Recovering objects/sec {}".format(recovering)) + if degraded_ratio > 0: + status_msg.append("Degraded ratio: {}".format(degraded_ratio)) + misplaced_ratio = float(status_data['pgmap'].get('misplaced_ratio', + 0.0)) + if misplaced_ratio > args.misplaced_thresh: + status_critical = True + if misplaced_ratio > 0: + status_msg.append("Misplaced ratio: {}". + format(misplaced_ratio)) + recovering = float(status_data['pgmap']. 
+ get('recovering_objects_per_sec', 0.0)) + if (degraded_ratio > 0 or misplaced_ratio > 0) \ + and recovering > 0 \ + and recovering < args.recovery_rate: + status_critical = True + if recovering > 0: + status_msg.append("Recovering objects/s {}".format(recovering)) if status_critical: msg = 'CRITICAL: ceph health: "{} {}"'.format( overall_status, ", ".join(status_msg)) raise CriticalError(msg) - if overall_status == 'HEALTH_WARN': + else: + # overall_status == 'HEALTH_WARN': msg = "WARNING: {}".format(", ".join(status_msg)) raise WarnError(msg) message = "All OK" @@ -187,21 +202,21 @@ def parse_args(args): 'user account does not have rights for the Ceph ' 'config files.') parser.add_argument('--degraded_thresh', dest='degraded_thresh', - default=1, type=float, + default=1.0, type=float, help="Threshold for degraded ratio (0.1 = 10%)") parser.add_argument('--misplaced_thresh', dest='misplaced_thresh', - default=10, type=float, + default=1.0, type=float, help="Threshold for misplaced ratio (0.1 = 10%)") parser.add_argument('--recovery_rate', dest='recovery_rate', default=1, type=int, - help="Recovery rate below which we consider recovery " - "to be stalled") - parser.add_argument('--ignore_nodeepscrub', dest='ignore_nodeepscrub', + help="Recovery rate (in objects/s) below which we" + "consider recovery to be stalled") + parser.add_argument('--raise_nodeepscrub', dest='raise_nodeepscrub', default=False, action='store_true', - help="Whether to ignore the nodeep-scrub flag. If " - "the nodeep-scrub flag is set, the check returns " - "warning if this param is passed, otherwise " - "returns critical.") + help="Whether to raise an error for the nodeep-scrub" + "flag. 
If the nodeep-scrub flag is set," + "the check returns critical if this param is" + "passed, otherwise it returns warning.") return parser.parse_args(args) @@ -218,7 +233,7 @@ def main(args): exitcode = 'critical' except WarnError as msg: print(msg) - exitcode = 'critical' + exitcode = 'warning' except: print("%s raised unknown exception '%s'" % ('check_ceph_status', sys.exc_info()[0])) diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py index ccbe7970..f007181c 100755 --- a/hooks/ceph_hooks.py +++ b/hooks/ceph_hooks.py @@ -748,8 +748,8 @@ def update_nrpe_config(): config('nagios_degraded_thresh'), config('nagios_misplaced_thresh'), config('nagios_recovery_rate')) - if config('nagios_ignore_nodeepscub'): - check_cmd = check_cmd + ' --ignore_nodeepscrub' + if config('nagios_raise_nodeepscrub'): + check_cmd = check_cmd + ' --raise_nodeepscrub' nrpe_setup.add_check( shortname="ceph", description='Check Ceph health {{{}}}'.format(current_unit), diff --git a/unit_tests/ceph_degraded_luminous.json b/unit_tests/ceph_degraded_luminous.json new file mode 100644 index 00000000..3cf3bdd3 --- /dev/null +++ b/unit_tests/ceph_degraded_luminous.json @@ -0,0 +1,147 @@ +{ + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "health": { + "checks": { + "OSD_DOWN": { + "severity": "HEALTH_WARN", + "summary": { + "message": "3 osds down" + } + }, + "OSD_HOST_DOWN": { + "severity": "HEALTH_WARN", + "summary": { + "message": "1 host (3 osds) down" + } + }, + "OBJECT_MISPLACED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "9883/43779 objects misplaced (22.575%)" + } + }, + "PG_DEGRADED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "Degraded data redundancy: 14001/43779 objects degraded (31.981%), 32 pgs degraded" + } + }, + "POOL_APP_NOT_ENABLED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "application not enabled on 1 pool(s)" + } + }, + "TOO_FEW_PGS": { + "severity": "HEALTH_WARN", + "summary": { + "message": "too few PGs per OSD (7 < min 
30)" + } + } + }, + "status": "HEALTH_WARN" + }, + "election_epoch": 5, + "quorum": [ + 0 + ], + "quorum_names": [ + "juju-460e0f-11" + ], + "monmap": { + "epoch": 1, + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "modified": "2018-11-07 14:17:12.324408", + "created": "2018-11-07 14:17:12.324408", + "features": { + "persistent": [ + "kraken", + "luminous" + ], + "optional": [] + }, + "mons": [ + { + "rank": 0, + "name": "juju-460e0f-11", + "addr": "192.168.100.81:6789/0", + "public_addr": "192.168.100.81:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 72, + "num_osds": 9, + "num_up_osds": 6, + "num_in_osds": 9, + "full": false, + "nearfull": false, + "num_remapped_pgs": 16 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+undersized+degraded", + "count": 16 + }, + { + "state_name": "active+undersized+degraded+remapped+backfill_wait", + "count": 14 + }, + { + "state_name": "active+undersized+degraded+remapped+backfilling", + "count": 2 + } + ], + "num_pgs": 32, + "num_pools": 1, + "num_objects": 14593, + "data_bytes": 61169729807, + "bytes_used": 14540595200, + "bytes_avail": 14889525248, + "bytes_total": 29430120448, + "degraded_objects": 14001, + "degraded_total": 43779, + "degraded_ratio": 0.319811, + "misplaced_objects": 9883, + "misplaced_total": 43779, + "misplaced_ratio": 0.225748 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + }, + "mgrmap": { + "epoch": 5, + "active_gid": 14097, + "active_name": "juju-460e0f-11", + "active_addr": "192.168.100.81:6800/204", + "available": true, + "standbys": [], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} + }, + "servicemap": { + "epoch": 1, + "modified": "0.000000", + "services": {} + } +} + diff --git a/unit_tests/ceph_error.json b/unit_tests/ceph_error.json new file mode 100644 index 00000000..eb9a161c 
--- /dev/null +++ b/unit_tests/ceph_error.json @@ -0,0 +1,118 @@ +{ + "health": { + "health": { + "health_services": [ + { + "mons": [ + { + "name": "juju-460e0f-12", + "kb_total": 1829760, + "kb_used": 835072, + "kb_avail": 994688, + "avail_percent": 54, + "last_updated": "2018-11-07 18:46:32.308592", + "store_stats": { + "bytes_total": 15678387, + "bytes_sst": 0, + "bytes_log": 420953, + "bytes_misc": 15257434, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + } + ] + } + ] + }, + "timechecks": { + "epoch": 3, + "round": 0, + "round_status": "finished" + }, + "summary": [ + { + "severity": "HEALTH_ERR", + "summary": "6 pgs are stuck inactive for more than 300 seconds" + }, + { + "severity": "HEALTH_WARN", + "summary": "7 pgs peering" + }, + { + "severity": "HEALTH_WARN", + "summary": "6 pgs stuck inactive" + }, + { + "severity": "HEALTH_WARN", + "summary": "6 pgs stuck unclean" + } + ], + "overall_status": "HEALTH_ERR", + "detail": [] + }, + "fsid": "68a9ca14-e297-11e8-843c-00163e64b0c0", + "election_epoch": 3, + "quorum": [ + 0 + ], + "quorum_names": [ + "juju-460e0f-12" + ], + "monmap": { + "epoch": 1, + "fsid": "68a9ca14-e297-11e8-843c-00163e64b0c0", + "modified": "2018-11-07 14:17:27.659064", + "created": "2018-11-07 14:17:27.659064", + "mons": [ + { + "rank": 0, + "name": "juju-460e0f-12", + "addr": "192.168.100.26:6789\/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 28, + "num_osds": 9, + "num_up_osds": 9, + "num_in_osds": 9, + "full": false, + "nearfull": false, + "num_remapped_pgs": 0 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "creating", + "count": 113 + }, + { + "state_name": "active+clean", + "count": 64 + }, + { + "state_name": "activating", + "count": 8 + }, + { + "state_name": "peering", + "count": 7 + } + ], + "version": 7831, + "num_pgs": 192, + "data_bytes": 1790967809, + "bytes_used": 9995157504, + "bytes_avail": 9157476352, + "bytes_total": 19152633856, + "write_bytes_sec": 89844495, + "read_op_per_sec": 0, 
+ "write_op_per_sec": 21 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + } +} diff --git a/unit_tests/ceph_many_warnings_luminous.json b/unit_tests/ceph_many_warnings_luminous.json new file mode 100644 index 00000000..3e5c11e8 --- /dev/null +++ b/unit_tests/ceph_many_warnings_luminous.json @@ -0,0 +1,147 @@ +{ + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "health": { + "checks": { + "OBJECT_MISPLACED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "1560/12264 objects misplaced (12.720%)" + } + }, + "PG_AVAILABILITY": { + "severity": "HEALTH_WARN", + "summary": { + "message": "Reduced data availability: 27 pgs inactive, 30 pgs peering" + } + }, + "POOL_APP_NOT_ENABLED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "application not enabled on 1 pool(s)" + } + }, + "TOO_FEW_PGS": { + "severity": "HEALTH_WARN", + "summary": { + "message": "too few PGs per OSD (21 < min 30)" + } + } + }, + "status": "HEALTH_WARN" + }, + "election_epoch": 5, + "quorum": [ + 0 + ], + "quorum_names": [ + "juju-460e0f-11" + ], + "monmap": { + "epoch": 1, + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "modified": "2018-11-07 14:17:12.324408", + "created": "2018-11-07 14:17:12.324408", + "features": { + "persistent": [ + "kraken", + "luminous" + ], + "optional": [] + }, + "mons": [ + { + "rank": 0, + "name": "juju-460e0f-11", + "addr": "192.168.100.81:6789/0", + "public_addr": "192.168.100.81:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 118, + "num_osds": 9, + "num_up_osds": 9, + "num_in_osds": 9, + "full": false, + "nearfull": false, + "num_remapped_pgs": 15 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "unknown", + "count": 65 + }, + { + "state_name": "peering", + "count": 31 + }, + { + "state_name": "activating", + "count": 17 + }, + { + "state_name": "activating+remapped", + "count": 15 + } + ], + "num_pgs": 128, + "num_pools": 1, + "num_objects": 4088, + "data_bytes": 17187733578, + "bytes_used": 14360064000, + 
"bytes_avail": 15023263744, + "bytes_total": 29383327744, + "unknown_pgs_ratio": 0.507812, + "inactive_pgs_ratio": 0.492188, + "misplaced_objects": 1560, + "misplaced_total": 12264, + "misplaced_ratio": 0.127202, + "recovering_objects_per_sec": 14, + "recovering_bytes_per_sec": 60779755, + "recovering_keys_per_sec": 0, + "num_objects_recovered": 113, + "num_bytes_recovered": 471859200, + "num_keys_recovered": 0, + "read_bytes_sec": 0, + "write_bytes_sec": 244132150, + "read_op_per_sec": 0, + "write_op_per_sec": 116 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + }, + "mgrmap": { + "epoch": 5, + "active_gid": 14097, + "active_name": "juju-460e0f-11", + "active_addr": "192.168.100.81:6800/204", + "available": true, + "standbys": [], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} + }, + "servicemap": { + "epoch": 1, + "modified": "0.000000", + "services": {} + } +} diff --git a/unit_tests/ceph_nodeepscrub.json b/unit_tests/ceph_nodeepscrub.json index fe3aedf8..2488fabb 100644 --- a/unit_tests/ceph_nodeepscrub.json +++ b/unit_tests/ceph_nodeepscrub.json @@ -1,177 +1,202 @@ { - "health": { "health": { - "health_services": [ - { - "mons": [ + "health": { + "health_services": [ + { + "mons": [ + { + "name": "juju-c62a41-21-lxd-0", + "kb_total": 334602320, + "kb_used": 2127960, + "kb_avail": 315454468, + "avail_percent": 94, + "last_updated": "2018-11-08 09:47:09.932189", + "store_stats": { + "bytes_total": 34880542, + "bytes_sst": 0, + "bytes_log": 1647123, + "bytes_misc": 33233419, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-c62a41-24-lxd-0", + "kb_total": 334602320, + "kb_used": 2128116, + "kb_avail": 315454312, + "avail_percent": 94, + "last_updated": "2018-11-08 09:47:16.418007", + "store_stats": { + "bytes_total": 36811676, + "bytes_sst": 0, + 
"bytes_log": 3574345, + "bytes_misc": 33237331, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-c62a41-25-lxd-0", + "kb_total": 334602320, + "kb_used": 2128860, + "kb_avail": 315453568, + "avail_percent": 94, + "last_updated": "2018-11-08 09:47:21.198816", + "store_stats": { + "bytes_total": 37388424, + "bytes_sst": 0, + "bytes_log": 4151569, + "bytes_misc": 33236855, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + } + ] + } + ] + }, + "timechecks": { + "epoch": 14, + "round": 4480, + "round_status": "finished", + "mons": [ + { + "name": "juju-c62a41-21-lxd-0", + "skew": 0.000000, + "latency": 0.000000, + "health": "HEALTH_OK" + }, + { + "name": "juju-c62a41-24-lxd-0", + "skew": 0.000282, + "latency": 0.000989, + "health": "HEALTH_OK" + }, + { + "name": "juju-c62a41-25-lxd-0", + "skew": -0.001223, + "latency": 0.000776, + "health": "HEALTH_OK" + } + ] + }, + "summary": [ { - "name": "node1", - "kb_total": 140956600, - "kb_used": 15916132, - "kb_avail": 117857208, - "avail_percent": 83, - "last_updated": "2017-05-17 03:23:11.248297", - "store_stats": { - "bytes_total": 140014259, - "bytes_sst": 0, - "bytes_log": 13670758, - "bytes_misc": 126343501, - "last_updated": "0.000000" - }, - "health": "HEALTH_OK" + "severity": "HEALTH_WARN", + "summary": "19 pgs backfill_wait" }, { - "name": "node2", - "kb_total": 70395920, - "kb_used": 10532504, - "kb_avail": 56264436, - "avail_percent": 79, - "last_updated": "2017-05-17 03:23:16.952673", - "store_stats": { - "bytes_total": 315512452, - "bytes_sst": 0, - "bytes_log": 21691698, - "bytes_misc": 293820754, - "last_updated": "0.000000" - }, - "health": "HEALTH_OK" + "severity": "HEALTH_WARN", + "summary": "4 pgs backfilling" }, { - "name": "juju-machine-85-lxc-10", - "kb_total": 131927524, - "kb_used": 79521024, - "kb_avail": 45954016, - "avail_percent": 34, - "last_updated": "2017-05-17 03:23:13.794034", - "store_stats": { - "bytes_total": 89036349, - "bytes_sst": 0, - 
"bytes_log": 21055337, - "bytes_misc": 67981012, - "last_updated": "0.000000" - }, - "health": "HEALTH_OK" + "severity": "HEALTH_WARN", + "summary": "1 pgs peering" + }, + { + "severity": "HEALTH_WARN", + "summary": "24 pgs stuck unclean" + }, + { + "severity": "HEALTH_WARN", + "summary": "recovery 17386\/112794 objects misplaced (15.414%)" + }, + { + "severity": "HEALTH_WARN", + "summary": "pool pool1 has many more objects per pg than average (too few pgs?)" + }, + { + "severity": "HEALTH_WARN", + "summary": "nodeep-scrub flag(s) set" } - ] - } - ] + ], + "overall_status": "HEALTH_WARN", + "detail": [] }, - "timechecks": { - "epoch": 280, - "round": 19874, - "round_status": "finished", - "mons": [ - { - "name": "node1", - "skew": "0.000000", - "latency": "0.000000", - "health": "HEALTH_OK" - }, - { - "name": "node2", - "skew": "-0.000000", - "latency": "0.000866", - "health": "HEALTH_OK" - }, - { - "name": "juju-machine-85-lxc-10", - "skew": "-0.000000", - "latency": "0.018848", - "health": "HEALTH_OK" - } - ] - }, - "summary": [ - { - "severity": "HEALTH_WARN", - "summary": "nodeep-scrub flag(s) set" - } + "fsid": "66af7af5-2f60-4e0e-94dc-49f49bd37284", + "election_epoch": 14, + "quorum": [ + 0, + 1, + 2 ], - "overall_status": "HEALTH_WARN", - "detail": [] - }, - "fsid": "some_fsid", - "election_epoch": 280, - "quorum": [ - 0, - 1, - 2 - ], - "quorum_names": [ - "node1", - "node2", - "juju-machine-85-lxc-10" - ], - "monmap": { - "epoch": 3, - "fsid": "some_fsid", - "modified": "2016-11-25 00:08:51.235813", - "created": "0.000000", - "mons": [ - { - "rank": 0, - "name": "node1", - "addr": "10.24.0.15:6789/0" - }, - { - "rank": 1, - "name": "node2", - "addr": "10.24.0.17:6789/0" - }, - { - "rank": 2, - "name": "juju-machine-85-lxc-10", - "addr": "10.24.0.195:6789/0" - } - ] - }, - "osdmap": { + "quorum_names": [ + "juju-c62a41-21-lxd-0", + "juju-c62a41-24-lxd-0", + "juju-c62a41-25-lxd-0" + ], + "monmap": { + "epoch": 2, + "fsid": 
"66af7af5-2f60-4e0e-94dc-49f49bd37284", + "modified": "2018-10-31 15:37:56.902830", + "created": "2018-10-31 15:37:40.288870", + "mons": [ + { + "rank": 0, + "name": "juju-c62a41-21-lxd-0", + "addr": "100.84.195.4:6789\/0" + }, + { + "rank": 1, + "name": "juju-c62a41-24-lxd-0", + "addr": "100.84.196.4:6789\/0" + }, + { + "rank": 2, + "name": "juju-c62a41-25-lxd-0", + "addr": "100.84.196.5:6789\/0" + } + ] + }, "osdmap": { - "epoch": 37820, - "num_osds": 46, - "num_up_osds": 46, - "num_in_osds": 46, - "full": false, - "nearfull": false + "osdmap": { + "epoch": 316, + "num_osds": 48, + "num_up_osds": 48, + "num_in_osds": 48, + "full": false, + "nearfull": false, + "num_remapped_pgs": 22 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 3448 + }, + { + "state_name": "active+remapped+wait_backfill", + "count": 19 + }, + { + "state_name": "active+remapped+backfilling", + "count": 4 + }, + { + "state_name": "peering", + "count": 1 + } + ], + "version": 141480, + "num_pgs": 3472, + "data_bytes": 157009583781, + "bytes_used": 487185850368, + "bytes_avail": 75282911256576, + "bytes_total": 75770097106944, + "misplaced_objects": 17386, + "misplaced_total": 112794, + "misplaced_ratio": 0.154139, + "recovering_objects_per_sec": 436, + "recovering_bytes_per_sec": 1832614589, + "recovering_keys_per_sec": 0, + "num_objects_recovered": 446, + "num_bytes_recovered": 1870659584, + "num_keys_recovered": 0 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] } - }, - "pgmap": { - "pgs_by_state": [ - { - "state_name": "active+clean", - "count": 1988 - }, - { - "state_name": "active+remapped+wait_backfill", - "count": 3 - }, - { - "state_name": "active+remapped+backfilling", - "count": 1 - } - ], - "version": 58873447, - "num_pgs": 1992, - "data_bytes": 35851846298041, - "bytes_used": 107730678743040, - "bytes_avail": 63413590548480, - "bytes_total": 171144269291520, - "degraded_objects": 0, - "degraded_total": 25759217, - "degraded_ratio": 0, - 
"recovering_objects_per_sec": 17, - "recovering_bytes_per_sec": 72552794, - "recovering_keys_per_sec": 0, - "read_bytes_sec": 23935944, - "write_bytes_sec": 7024503, - "op_per_sec": 5332 - }, - "mdsmap": { - "epoch": 1, - "up": 0, - "in": 0, - "max": 1, - "by_rank": [] - } } - diff --git a/unit_tests/ceph_nodeepscrub_luminous.json b/unit_tests/ceph_nodeepscrub_luminous.json new file mode 100644 index 00000000..3d161fba --- /dev/null +++ b/unit_tests/ceph_nodeepscrub_luminous.json @@ -0,0 +1,102 @@ +{ + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "health": { + "checks": { + "OSDMAP_FLAGS": { + "severity": "HEALTH_WARN", + "summary": { + "message": "nodeep-scrub flag(s) set" + } + } + }, + "status": "HEALTH_WARN" + }, + "election_epoch": 5, + "quorum": [ + 0 + ], + "quorum_names": [ + "juju-460e0f-11" + ], + "monmap": { + "epoch": 1, + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "modified": "2018-11-07 14:17:12.324408", + "created": "2018-11-07 14:17:12.324408", + "features": { + "persistent": [ + "kraken", + "luminous" + ], + "optional": [] + }, + "mons": [ + { + "rank": 0, + "name": "juju-460e0f-11", + "addr": "192.168.100.81:6789/0", + "public_addr": "192.168.100.81:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 518, + "num_osds": 9, + "num_up_osds": 9, + "num_in_osds": 9, + "full": false, + "nearfull": false, + "num_remapped_pgs": 0 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 128 + } + ], + "num_pgs": 128, + "num_pools": 1, + "num_objects": 14896, + "data_bytes": 62440603919, + "bytes_used": 14225776640, + "bytes_avail": 9450938368, + "bytes_total": 23676715008 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + }, + "mgrmap": { + "epoch": 5, + "active_gid": 14097, + "active_name": "juju-460e0f-11", + "active_addr": "192.168.100.81:6800/204", + "available": true, + "standbys": [], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + 
"influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} + }, + "servicemap": { + "epoch": 1, + "modified": "0.000000", + "services": {} + } +} diff --git a/unit_tests/test_ceph_hooks.py b/unit_tests/test_ceph_hooks.py index 0f96d2da..fa792042 100644 --- a/unit_tests/test_ceph_hooks.py +++ b/unit_tests/test_ceph_hooks.py @@ -48,7 +48,7 @@ CHARM_CONFIG = {'config-flags': '', 'nagios_degraded_thresh': '1', 'nagios_misplaced_thresh': '10', 'nagios_recovery_rate': '1', - 'nagios_ignore_nodeepscub': False, + 'nagios_raise_nodeepscrub': True, 'disable-pg-max-object-skew': False} diff --git a/unit_tests/test_check_ceph_status.py b/unit_tests/test_check_ceph_status.py index caf89bc9..69ac4177 100644 --- a/unit_tests/test_check_ceph_status.py +++ b/unit_tests/test_check_ceph_status.py @@ -32,6 +32,7 @@ class NagiosTestCase(unittest.TestCase): ceph_version = check_ceph_status.get_ceph_version() self.assertEqual(ceph_version, [10, 2, 9]) + # All OK, pre-luminous @patch('check_ceph_status.get_ceph_version') def test_health_ok(self, mock_ceph_version, mock_subprocess): mock_ceph_version.return_value = [10, 2, 9] @@ -42,6 +43,84 @@ class NagiosTestCase(unittest.TestCase): check_output = check_ceph_status.check_ceph_status(args) self.assertRegex(check_output, r"^All OK$") + # Warning, pre-luminous + @patch('check_ceph_status.get_ceph_version') + def test_health_warn(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_warn.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, health_critical status + @patch('check_ceph_status.get_ceph_version') + def test_health_err(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with 
open('unit_tests/ceph_crit.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, overall HEALTH_ERR + @patch('check_ceph_status.get_ceph_version') + def test_health_crit(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_error.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, because misplaced ratio is too big + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_misplaced(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_params.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--misplaced_thresh', '0.1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, because recovery rate is too low + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_recovery(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_params.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--recovery_rate', '400']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Warning, pre-luminous, deepscrub + @patch('check_ceph_status.get_ceph_version') + def test_health_warn_deepscrub(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with 
open('unit_tests/ceph_nodeepscrub.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, deepscrub + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_deepscrub(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_nodeepscrub.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--raise_nodeepscrub']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # All OK, luminous @patch('check_ceph_status.get_ceph_version') def test_health_ok_luminous(self, mock_ceph_version, mock_subprocess): mock_ceph_version.return_value = [12, 2, 0] @@ -52,62 +131,80 @@ class NagiosTestCase(unittest.TestCase): check_output = check_ceph_status.check_ceph_status(args) self.assertRegex(check_output, r"^All OK$") + # Warning, luminous @patch('check_ceph_status.get_ceph_version') - def test_health_warn(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_warn.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.WarnError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_crit(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_crit.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.CriticalError, - lambda: 
check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_crit_luminous(self, mock_ceph_version, mock_subprocess): + def test_health_warn_luminous(self, mock_ceph_version, mock_subprocess): mock_ceph_version.return_value = [12, 2, 0] - with open('unit_tests/ceph_crit_luminous.json') as f: + with open('unit_tests/ceph_many_warnings_luminous.json') as f: tree = f.read() mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.CriticalError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_lotsdegraded(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_params.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.CriticalError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_nodeepscrub(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_nodeepscrub.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.CriticalError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_nodeepscrubok(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_nodeepscrub.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--ignore_nodeepscrub']) + args = check_ceph_status.parse_args("") 
self.assertRaises(check_ceph_status.WarnError, lambda: check_ceph_status.check_ceph_status(args)) + +# Error, luminous, because of overall status + + # Error, luminous, because misplaced ratio is too big + @patch('check_ceph_status.get_ceph_version') + def test_health_critical_misplaced_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_many_warnings_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--misplaced_thresh', '0.1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, luminous, because degraded ratio is too big + @patch('check_ceph_status.get_ceph_version') + def test_health_critical_degraded_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_degraded_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--degraded_thresh', '0.1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, luminous, because recovery rate is too low + @patch('check_ceph_status.get_ceph_version') + def test_health_critical_recovery_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_many_warnings_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--recovery_rate', '20']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Warning, luminous, deepscrub + @patch('check_ceph_status.get_ceph_version') + def test_health_warn_deepscrub_luminous(self, + mock_ceph_version, + mock_subprocess): + 
mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_nodeepscrub_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, luminous, deepscrub + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_deepscrub_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_nodeepscrub_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--raise_nodeepscrub']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args))