diff --git a/config.yaml b/config.yaml index f66d96fc..9e23a7ba 100644 --- a/config.yaml +++ b/config.yaml @@ -189,17 +189,21 @@ options: type: float description: "Threshold for degraded ratio (0.1 = 10%)" nagios_misplaced_thresh: - default: 10.0 + default: 1.0 type: float description: "Threshold for misplaced ratio (0.1 = 10%)" nagios_recovery_rate: default: '1' type: string - description: Recovery rate below which we consider recovery to be stalled - nagios_ignore_nodeepscub: - default: False + description: | + Recovery rate (in objects/s) below which we consider recovery + to be stalled. + nagios_raise_nodeepscrub: + default: True type: boolean - description: Whether to ignore the nodeep-scrub flag + description: | + Whether to report Critical instead of Warning when the nodeep-scrub + flag is set. use-direct-io: type: boolean default: True diff --git a/files/nagios/check_ceph_status.py b/files/nagios/check_ceph_status.py index ef978023..9839bb85 100755 --- a/files/nagios/check_ceph_status.py +++ b/files/nagios/check_ceph_status.py @@ -102,10 +102,6 @@ def check_ceph_status(args): :returns string, describing the status of the ceph cluster. """ - ignorable = (r'\d+ pgs (?:backfill|degraded|recovery_wait|stuck unclean)|' - 'recovery \d+\/\d+ objects (?:degraded|misplaced)') - if args.ignore_nodeepscrub: - ignorable = ignorable + '|nodeep-scrub flag\(s\) set' status_critical = False if args.status_file: check_file_freshness(args.status_file) @@ -136,41 +132,60 @@ def check_ceph_status(args): luminous = False if overall_status != 'HEALTH_OK': - # Health is not OK, check if any lines are not in our list of OK - # any lines that don't match, check is critical + # Health is not OK, collect status message(s) and + # decide whether to return warning or critical + status_critical = False status_msg = [] if luminous: - status_messages = [x['summary']['message'] for x in status_data['health'].get('checks').values()] + status_messages = [x['summary']['message'] + for x in + status_data['health'].get('checks').values()] else: - status_messages = [x['summary'] for x in status_data['health']['summary']] + status_messages = [x['summary'] + for x in + status_data['health']['summary']] for status in status_messages: - if not re.match(ignorable, status): + status_msg.append(status) + # Check if nedeepscrub is set and whether it should raise an error + if args.raise_nodeepscrub: + if re.match("nodeep-scrub flag", status): + status_critical = True + if overall_status == 'HEALTH_CRITICAL' or \ + overall_status == 'HEALTH_ERR': + # HEALTH_ERR, report critical + status_critical = True + else: + # HEALTH_WARN + # Check the threshold for a list of operational tasks, + # and return CRITICAL if exceeded + degraded_ratio = float(status_data['pgmap'].get('degraded_ratio', + 0.0)) + if degraded_ratio > args.degraded_thresh: status_critical = True - status_msg.append(status) - # If we got this far, then the status is not OK but the status lines - # are all in our list of things we consider to be operational tasks. - # Check the thresholds and return CRITICAL if exceeded, - # otherwise there's something not accounted for and we want to know - # about it with a WARN alert. - degraded_ratio = status_data['pgmap'].get('degraded_ratio', 0.0) - if degraded_ratio > args.degraded_thresh: - status_critical = True - status_msg.append("Degraded ratio: {}".format(degraded_ratio)) - misplaced_ratio = status_data['pgmap'].get('misplaced_ratio', 0.0) - if misplaced_ratio > args.misplaced_thresh: - status_critical = True - status_msg.append("Misplaced ratio: {}".format(misplaced_ratio)) - recovering = status_data['pgmap'].get('recovering_objects_per_sec', - 0.0) - if recovering < args.recovery_rate: - status_critical = True - status_msg.append("Recovering objects/sec {}".format(recovering)) + if degraded_ratio > 0: + status_msg.append("Degraded ratio: {}".format(degraded_ratio)) + misplaced_ratio = float(status_data['pgmap'].get('misplaced_ratio', + 0.0)) + if misplaced_ratio > args.misplaced_thresh: + status_critical = True + if misplaced_ratio > 0: + status_msg.append("Misplaced ratio: {}". + format(misplaced_ratio)) + recovering = float(status_data['pgmap']. + get('recovering_objects_per_sec', 0.0)) + if (degraded_ratio > 0 or misplaced_ratio > 0) \ + and recovering > 0 \ + and recovering < args.recovery_rate: + status_critical = True + if recovering > 0: + status_msg.append("Recovering objects/s {}".format(recovering)) if status_critical: msg = 'CRITICAL: ceph health: "{} {}"'.format( overall_status, ", ".join(status_msg)) raise CriticalError(msg) - if overall_status == 'HEALTH_WARN': + else: + # overall_status == 'HEALTH_WARN': msg = "WARNING: {}".format(", ".join(status_msg)) raise WarnError(msg) message = "All OK" @@ -187,21 +202,21 @@ def parse_args(args): 'user account does not have rights for the Ceph ' 'config files.') parser.add_argument('--degraded_thresh', dest='degraded_thresh', - default=1, type=float, + default=1.0, type=float, help="Threshold for degraded ratio (0.1 = 10%)") parser.add_argument('--misplaced_thresh', dest='misplaced_thresh', - default=10, type=float, + default=1.0, type=float, help="Threshold for misplaced ratio (0.1 = 10%)") parser.add_argument('--recovery_rate', dest='recovery_rate', default=1, type=int, - help="Recovery rate below which we consider recovery " - "to be stalled") - parser.add_argument('--ignore_nodeepscrub', dest='ignore_nodeepscrub', + help="Recovery rate (in objects/s) below which we" + "consider recovery to be stalled") + parser.add_argument('--raise_nodeepscrub', dest='raise_nodeepscrub', default=False, action='store_true', - help="Whether to ignore the nodeep-scrub flag. If " - "the nodeep-scrub flag is set, the check returns " - "warning if this param is passed, otherwise " - "returns critical.") + help="Whether to raise an error for the nodeep-scrub" + "flag. If the nodeep-scrub flag is set," + "the check returns critical if this param is" + "passed, otherwise it returns warning.") return parser.parse_args(args) @@ -218,7 +233,7 @@ def main(args): exitcode = 'critical' except WarnError as msg: print(msg) - exitcode = 'critical' + exitcode = 'warning' except: print("%s raised unknown exception '%s'" % ('check_ceph_status', sys.exc_info()[0])) diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py index ccbe7970..f007181c 100755 --- a/hooks/ceph_hooks.py +++ b/hooks/ceph_hooks.py @@ -748,8 +748,8 @@ def update_nrpe_config(): config('nagios_degraded_thresh'), config('nagios_misplaced_thresh'), config('nagios_recovery_rate')) - if config('nagios_ignore_nodeepscub'): - check_cmd = check_cmd + ' --ignore_nodeepscrub' + if config('nagios_raise_nodeepscrub'): + check_cmd = check_cmd + ' --raise_nodeepscrub' nrpe_setup.add_check( shortname="ceph", description='Check Ceph health {{{}}}'.format(current_unit), diff --git a/unit_tests/ceph_degraded_luminous.json b/unit_tests/ceph_degraded_luminous.json new file mode 100644 index 00000000..3cf3bdd3 --- /dev/null +++ b/unit_tests/ceph_degraded_luminous.json @@ -0,0 +1,147 @@ +{ + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "health": { + "checks": { + "OSD_DOWN": { + "severity": "HEALTH_WARN", + "summary": { + "message": "3 osds down" + } + }, + "OSD_HOST_DOWN": { + "severity": "HEALTH_WARN", + "summary": { + "message": "1 host (3 osds) down" + } + }, + "OBJECT_MISPLACED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "9883/43779 objects misplaced (22.575%)" + } + }, + "PG_DEGRADED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "Degraded data redundancy: 14001/43779 objects degraded (31.981%), 32 pgs degraded" + } + }, + "POOL_APP_NOT_ENABLED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "application not enabled on 1 pool(s)" + } + }, + "TOO_FEW_PGS": { + "severity": "HEALTH_WARN", + "summary": { + "message": "too few PGs per OSD (7 < min 30)" + } + } + }, + "status": "HEALTH_WARN" + }, + "election_epoch": 5, + "quorum": [ + 0 + ], + "quorum_names": [ + "juju-460e0f-11" + ], + "monmap": { + "epoch": 1, + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "modified": "2018-11-07 14:17:12.324408", + "created": "2018-11-07 14:17:12.324408", + "features": { + "persistent": [ + "kraken", + "luminous" + ], + "optional": [] + }, + "mons": [ + { + "rank": 0, + "name": "juju-460e0f-11", + "addr": "192.168.100.81:6789/0", + "public_addr": "192.168.100.81:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 72, + "num_osds": 9, + "num_up_osds": 6, + "num_in_osds": 9, + "full": false, + "nearfull": false, + "num_remapped_pgs": 16 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+undersized+degraded", + "count": 16 + }, + { + "state_name": "active+undersized+degraded+remapped+backfill_wait", + "count": 14 + }, + { + "state_name": "active+undersized+degraded+remapped+backfilling", + "count": 2 + } + ], + "num_pgs": 32, + "num_pools": 1, + "num_objects": 14593, + "data_bytes": 61169729807, + "bytes_used": 14540595200, + "bytes_avail": 14889525248, + "bytes_total": 29430120448, + "degraded_objects": 14001, + "degraded_total": 43779, + "degraded_ratio": 0.319811, + "misplaced_objects": 9883, + "misplaced_total": 43779, + "misplaced_ratio": 0.225748 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + }, + "mgrmap": { + "epoch": 5, + "active_gid": 14097, + "active_name": "juju-460e0f-11", + "active_addr": "192.168.100.81:6800/204", + "available": true, + "standbys": [], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} + }, + "servicemap": { + "epoch": 1, + "modified": "0.000000", + "services": {} + } +} + diff --git a/unit_tests/ceph_error.json b/unit_tests/ceph_error.json new file mode 100644 index 00000000..eb9a161c --- /dev/null +++ b/unit_tests/ceph_error.json @@ -0,0 +1,118 @@ +{ + "health": { + "health": { + "health_services": [ + { + "mons": [ + { + "name": "juju-460e0f-12", + "kb_total": 1829760, + "kb_used": 835072, + "kb_avail": 994688, + "avail_percent": 54, + "last_updated": "2018-11-07 18:46:32.308592", + "store_stats": { + "bytes_total": 15678387, + "bytes_sst": 0, + "bytes_log": 420953, + "bytes_misc": 15257434, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + } + ] + } + ] + }, + "timechecks": { + "epoch": 3, + "round": 0, + "round_status": "finished" + }, + "summary": [ + { + "severity": "HEALTH_ERR", + "summary": "6 pgs are stuck inactive for more than 300 seconds" + }, + { + "severity": "HEALTH_WARN", + "summary": "7 pgs peering" + }, + { + "severity": "HEALTH_WARN", + "summary": "6 pgs stuck inactive" + }, + { + "severity": "HEALTH_WARN", + "summary": "6 pgs stuck unclean" + } + ], + "overall_status": "HEALTH_ERR", + "detail": [] + }, + "fsid": "68a9ca14-e297-11e8-843c-00163e64b0c0", + "election_epoch": 3, + "quorum": [ + 0 + ], + "quorum_names": [ + "juju-460e0f-12" + ], + "monmap": { + "epoch": 1, + "fsid": "68a9ca14-e297-11e8-843c-00163e64b0c0", + "modified": "2018-11-07 14:17:27.659064", + "created": "2018-11-07 14:17:27.659064", + "mons": [ + { + "rank": 0, + "name": "juju-460e0f-12", + "addr": "192.168.100.26:6789\/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 28, + "num_osds": 9, + "num_up_osds": 9, + "num_in_osds": 9, + "full": false, + "nearfull": false, + "num_remapped_pgs": 0 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "creating", + "count": 113 + }, + { + "state_name": "active+clean", + "count": 64 + }, + { + "state_name": "activating", + "count": 8 + }, + { + "state_name": "peering", + "count": 7 + } + ], + "version": 7831, + "num_pgs": 192, + "data_bytes": 1790967809, + "bytes_used": 9995157504, + "bytes_avail": 9157476352, + "bytes_total": 19152633856, + "write_bytes_sec": 89844495, + "read_op_per_sec": 0, + "write_op_per_sec": 21 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + } +} diff --git a/unit_tests/ceph_many_warnings_luminous.json b/unit_tests/ceph_many_warnings_luminous.json new file mode 100644 index 00000000..3e5c11e8 --- /dev/null +++ b/unit_tests/ceph_many_warnings_luminous.json @@ -0,0 +1,147 @@ +{ + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "health": { + "checks": { + "OBJECT_MISPLACED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "1560/12264 objects misplaced (12.720%)" + } + }, + "PG_AVAILABILITY": { + "severity": "HEALTH_WARN", + "summary": { + "message": "Reduced data availability: 27 pgs inactive, 30 pgs peering" + } + }, + "POOL_APP_NOT_ENABLED": { + "severity": "HEALTH_WARN", + "summary": { + "message": "application not enabled on 1 pool(s)" + } + }, + "TOO_FEW_PGS": { + "severity": "HEALTH_WARN", + "summary": { + "message": "too few PGs per OSD (21 < min 30)" + } + } + }, + "status": "HEALTH_WARN" + }, + "election_epoch": 5, + "quorum": [ + 0 + ], + "quorum_names": [ + "juju-460e0f-11" + ], + "monmap": { + "epoch": 1, + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "modified": "2018-11-07 14:17:12.324408", + "created": "2018-11-07 14:17:12.324408", + "features": { + "persistent": [ + "kraken", + "luminous" + ], + "optional": [] + }, + "mons": [ + { + "rank": 0, + "name": "juju-460e0f-11", + "addr": "192.168.100.81:6789/0", + "public_addr": "192.168.100.81:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 118, + "num_osds": 9, + "num_up_osds": 9, + "num_in_osds": 9, + "full": false, + "nearfull": false, + "num_remapped_pgs": 15 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "unknown", + "count": 65 + }, + { + "state_name": "peering", + "count": 31 + }, + { + "state_name": "activating", + "count": 17 + }, + { + "state_name": "activating+remapped", + "count": 15 + } + ], + "num_pgs": 128, + "num_pools": 1, + "num_objects": 4088, + "data_bytes": 17187733578, + "bytes_used": 14360064000, + "bytes_avail": 15023263744, + "bytes_total": 29383327744, + "unknown_pgs_ratio": 0.507812, + "inactive_pgs_ratio": 0.492188, + "misplaced_objects": 1560, + "misplaced_total": 12264, + "misplaced_ratio": 0.127202, + "recovering_objects_per_sec": 14, + "recovering_bytes_per_sec": 60779755, + "recovering_keys_per_sec": 0, + "num_objects_recovered": 113, + "num_bytes_recovered": 471859200, + "num_keys_recovered": 0, + "read_bytes_sec": 0, + "write_bytes_sec": 244132150, + "read_op_per_sec": 0, + "write_op_per_sec": 116 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + }, + "mgrmap": { + "epoch": 5, + "active_gid": 14097, + "active_name": "juju-460e0f-11", + "active_addr": "192.168.100.81:6800/204", + "available": true, + "standbys": [], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} + }, + "servicemap": { + "epoch": 1, + "modified": "0.000000", + "services": {} + } +} diff --git a/unit_tests/ceph_nodeepscrub.json b/unit_tests/ceph_nodeepscrub.json index fe3aedf8..2488fabb 100644 --- a/unit_tests/ceph_nodeepscrub.json +++ b/unit_tests/ceph_nodeepscrub.json @@ -1,177 +1,202 @@ { - "health": { "health": { - "health_services": [ - { - "mons": [ + "health": { + "health_services": [ + { + "mons": [ + { + "name": "juju-c62a41-21-lxd-0", + "kb_total": 334602320, + "kb_used": 2127960, + "kb_avail": 315454468, + "avail_percent": 94, + "last_updated": "2018-11-08 09:47:09.932189", + "store_stats": { + "bytes_total": 34880542, + "bytes_sst": 0, + "bytes_log": 1647123, + "bytes_misc": 33233419, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-c62a41-24-lxd-0", + "kb_total": 334602320, + "kb_used": 2128116, + "kb_avail": 315454312, + "avail_percent": 94, + "last_updated": "2018-11-08 09:47:16.418007", + "store_stats": { + "bytes_total": 36811676, + "bytes_sst": 0, + "bytes_log": 3574345, + "bytes_misc": 33237331, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-c62a41-25-lxd-0", + "kb_total": 334602320, + "kb_used": 2128860, + "kb_avail": 315453568, + "avail_percent": 94, + "last_updated": "2018-11-08 09:47:21.198816", + "store_stats": { + "bytes_total": 37388424, + "bytes_sst": 0, + "bytes_log": 4151569, + "bytes_misc": 33236855, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + } + ] + } + ] + }, + "timechecks": { + "epoch": 14, + "round": 4480, + "round_status": "finished", + "mons": [ + { + "name": "juju-c62a41-21-lxd-0", + "skew": 0.000000, + "latency": 0.000000, + "health": "HEALTH_OK" + }, + { + "name": "juju-c62a41-24-lxd-0", + "skew": 0.000282, + "latency": 0.000989, + "health": "HEALTH_OK" + }, + { + "name": "juju-c62a41-25-lxd-0", + "skew": -0.001223, + "latency": 0.000776, + "health": "HEALTH_OK" + } + ] + }, + "summary": [ { - "name": "node1", - "kb_total": 140956600, - "kb_used": 15916132, - "kb_avail": 117857208, - "avail_percent": 83, - "last_updated": "2017-05-17 03:23:11.248297", - "store_stats": { - "bytes_total": 140014259, - "bytes_sst": 0, - "bytes_log": 13670758, - "bytes_misc": 126343501, - "last_updated": "0.000000" - }, - "health": "HEALTH_OK" + "severity": "HEALTH_WARN", + "summary": "19 pgs backfill_wait" }, { - "name": "node2", - "kb_total": 70395920, - "kb_used": 10532504, - "kb_avail": 56264436, - "avail_percent": 79, - "last_updated": "2017-05-17 03:23:16.952673", - "store_stats": { - "bytes_total": 315512452, - "bytes_sst": 0, - "bytes_log": 21691698, - "bytes_misc": 293820754, - "last_updated": "0.000000" - }, - "health": "HEALTH_OK" + "severity": "HEALTH_WARN", + "summary": "4 pgs backfilling" }, { - "name": "juju-machine-85-lxc-10", - "kb_total": 131927524, - "kb_used": 79521024, - "kb_avail": 45954016, - "avail_percent": 34, - "last_updated": "2017-05-17 03:23:13.794034", - "store_stats": { - "bytes_total": 89036349, - "bytes_sst": 0, - "bytes_log": 21055337, - "bytes_misc": 67981012, - "last_updated": "0.000000" - }, - "health": "HEALTH_OK" + "severity": "HEALTH_WARN", + "summary": "1 pgs peering" + }, + { + "severity": "HEALTH_WARN", + "summary": "24 pgs stuck unclean" + }, + { + "severity": "HEALTH_WARN", + "summary": "recovery 17386\/112794 objects misplaced (15.414%)" + }, + { + "severity": "HEALTH_WARN", + "summary": "pool pool1 has many more objects per pg than average (too few pgs?)" + }, + { + "severity": "HEALTH_WARN", + "summary": "nodeep-scrub flag(s) set" } - ] - } - ] + ], + "overall_status": "HEALTH_WARN", + "detail": [] }, - "timechecks": { - "epoch": 280, - "round": 19874, - "round_status": "finished", - "mons": [ - { - "name": "node1", - "skew": "0.000000", - "latency": "0.000000", - "health": "HEALTH_OK" - }, - { - "name": "node2", - "skew": "-0.000000", - "latency": "0.000866", - "health": "HEALTH_OK" - }, - { - "name": "juju-machine-85-lxc-10", - "skew": "-0.000000", - "latency": "0.018848", - "health": "HEALTH_OK" - } - ] - }, - "summary": [ - { - "severity": "HEALTH_WARN", - "summary": "nodeep-scrub flag(s) set" - } + "fsid": "66af7af5-2f60-4e0e-94dc-49f49bd37284", + "election_epoch": 14, + "quorum": [ + 0, + 1, + 2 ], - "overall_status": "HEALTH_WARN", - "detail": [] - }, - "fsid": "some_fsid", - "election_epoch": 280, - "quorum": [ - 0, - 1, - 2 - ], - "quorum_names": [ - "node1", - "node2", - "juju-machine-85-lxc-10" - ], - "monmap": { - "epoch": 3, - "fsid": "some_fsid", - "modified": "2016-11-25 00:08:51.235813", - "created": "0.000000", - "mons": [ - { - "rank": 0, - "name": "node1", - "addr": "10.24.0.15:6789/0" - }, - { - "rank": 1, - "name": "node2", - "addr": "10.24.0.17:6789/0" - }, - { - "rank": 2, - "name": "juju-machine-85-lxc-10", - "addr": "10.24.0.195:6789/0" - } - ] - }, - "osdmap": { + "quorum_names": [ + "juju-c62a41-21-lxd-0", + "juju-c62a41-24-lxd-0", + "juju-c62a41-25-lxd-0" + ], + "monmap": { + "epoch": 2, + "fsid": "66af7af5-2f60-4e0e-94dc-49f49bd37284", + "modified": "2018-10-31 15:37:56.902830", + "created": "2018-10-31 15:37:40.288870", + "mons": [ + { + "rank": 0, + "name": "juju-c62a41-21-lxd-0", + "addr": "100.84.195.4:6789\/0" + }, + { + "rank": 1, + "name": "juju-c62a41-24-lxd-0", + "addr": "100.84.196.4:6789\/0" + }, + { + "rank": 2, + "name": "juju-c62a41-25-lxd-0", + "addr": "100.84.196.5:6789\/0" + } + ] + }, "osdmap": { - "epoch": 37820, - "num_osds": 46, - "num_up_osds": 46, - "num_in_osds": 46, - "full": false, - "nearfull": false + "osdmap": { + "epoch": 316, + "num_osds": 48, + "num_up_osds": 48, + "num_in_osds": 48, + "full": false, + "nearfull": false, + "num_remapped_pgs": 22 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 3448 + }, + { + "state_name": "active+remapped+wait_backfill", + "count": 19 + }, + { + "state_name": "active+remapped+backfilling", + "count": 4 + }, + { + "state_name": "peering", + "count": 1 + } + ], + "version": 141480, + "num_pgs": 3472, + "data_bytes": 157009583781, + "bytes_used": 487185850368, + "bytes_avail": 75282911256576, + "bytes_total": 75770097106944, + "misplaced_objects": 17386, + "misplaced_total": 112794, + "misplaced_ratio": 0.154139, + "recovering_objects_per_sec": 436, + "recovering_bytes_per_sec": 1832614589, + "recovering_keys_per_sec": 0, + "num_objects_recovered": 446, + "num_bytes_recovered": 1870659584, + "num_keys_recovered": 0 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] } - }, - "pgmap": { - "pgs_by_state": [ - { - "state_name": "active+clean", - "count": 1988 - }, - { - "state_name": "active+remapped+wait_backfill", - "count": 3 - }, - { - "state_name": "active+remapped+backfilling", - "count": 1 - } - ], - "version": 58873447, - "num_pgs": 1992, - "data_bytes": 35851846298041, - "bytes_used": 107730678743040, - "bytes_avail": 63413590548480, - "bytes_total": 171144269291520, - "degraded_objects": 0, - "degraded_total": 25759217, - "degraded_ratio": 0, - "recovering_objects_per_sec": 17, - "recovering_bytes_per_sec": 72552794, - "recovering_keys_per_sec": 0, - "read_bytes_sec": 23935944, - "write_bytes_sec": 7024503, - "op_per_sec": 5332 - }, - "mdsmap": { - "epoch": 1, - "up": 0, - "in": 0, - "max": 1, - "by_rank": [] - } } - diff --git a/unit_tests/ceph_nodeepscrub_luminous.json b/unit_tests/ceph_nodeepscrub_luminous.json new file mode 100644 index 00000000..3d161fba --- /dev/null +++ b/unit_tests/ceph_nodeepscrub_luminous.json @@ -0,0 +1,102 @@ +{ + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "health": { + "checks": { + "OSDMAP_FLAGS": { + "severity": "HEALTH_WARN", + "summary": { + "message": "nodeep-scrub flag(s) set" + } + } + }, + "status": "HEALTH_WARN" + }, + "election_epoch": 5, + "quorum": [ + 0 + ], + "quorum_names": [ + "juju-460e0f-11" + ], + "monmap": { + "epoch": 1, + "fsid": "b03a2900-e297-11e8-a7db-00163ed10659", + "modified": "2018-11-07 14:17:12.324408", + "created": "2018-11-07 14:17:12.324408", + "features": { + "persistent": [ + "kraken", + "luminous" + ], + "optional": [] + }, + "mons": [ + { + "rank": 0, + "name": "juju-460e0f-11", + "addr": "192.168.100.81:6789/0", + "public_addr": "192.168.100.81:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 518, + "num_osds": 9, + "num_up_osds": 9, + "num_in_osds": 9, + "full": false, + "nearfull": false, + "num_remapped_pgs": 0 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 128 + } + ], + "num_pgs": 128, + "num_pools": 1, + "num_objects": 14896, + "data_bytes": 62440603919, + "bytes_used": 14225776640, + "bytes_avail": 9450938368, + "bytes_total": 23676715008 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + }, + "mgrmap": { + "epoch": 5, + "active_gid": 14097, + "active_name": "juju-460e0f-11", + "active_addr": "192.168.100.81:6800/204", + "available": true, + "standbys": [], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} + }, + "servicemap": { + "epoch": 1, + "modified": "0.000000", + "services": {} + } +} diff --git a/unit_tests/test_ceph_hooks.py b/unit_tests/test_ceph_hooks.py index 0f96d2da..fa792042 100644 --- a/unit_tests/test_ceph_hooks.py +++ b/unit_tests/test_ceph_hooks.py @@ -48,7 +48,7 @@ CHARM_CONFIG = {'config-flags': '', 'nagios_degraded_thresh': '1', 'nagios_misplaced_thresh': '10', 'nagios_recovery_rate': '1', - 'nagios_ignore_nodeepscub': False, + 'nagios_raise_nodeepscrub': True, 'disable-pg-max-object-skew': False} diff --git a/unit_tests/test_check_ceph_status.py b/unit_tests/test_check_ceph_status.py index caf89bc9..69ac4177 100644 --- a/unit_tests/test_check_ceph_status.py +++ b/unit_tests/test_check_ceph_status.py @@ -32,6 +32,7 @@ class NagiosTestCase(unittest.TestCase): ceph_version = check_ceph_status.get_ceph_version() self.assertEqual(ceph_version, [10, 2, 9]) + # All OK, pre-luminoius @patch('check_ceph_status.get_ceph_version') def test_health_ok(self, mock_ceph_version, mock_subprocess): mock_ceph_version.return_value = [10, 2, 9] @@ -42,6 +43,84 @@ class NagiosTestCase(unittest.TestCase): check_output = check_ceph_status.check_ceph_status(args) self.assertRegex(check_output, r"^All OK$") + # Warning, pre-luminous + @patch('check_ceph_status.get_ceph_version') + def test_health_warn(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_warn.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, health_critical status + @patch('check_ceph_status.get_ceph_version') + def test_health_err(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_crit.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, overall HEALTH_ERR + @patch('check_ceph_status.get_ceph_version') + def test_health_crit(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_error.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, because misplaced ratio is too big + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_misplaced(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_params.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--misplaced_thresh', '0.1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, because recovery rate is too low + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_recovery(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_params.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--recovery_rate', '400']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Warning, pre-luminous, deepscrub + @patch('check_ceph_status.get_ceph_version') + def test_health_warn_deepscrub(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_nodeepscrub.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, pre-luminous, deepscrub + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_deepscrub(self, mock_ceph_version, mock_subprocess): + mock_ceph_version.return_value = [10, 2, 9] + with open('unit_tests/ceph_nodeepscrub.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--raise_nodeepscrub']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # All OK, luminous @patch('check_ceph_status.get_ceph_version') def test_health_ok_luminous(self, mock_ceph_version, mock_subprocess): mock_ceph_version.return_value = [12, 2, 0] @@ -52,62 +131,80 @@ class NagiosTestCase(unittest.TestCase): check_output = check_ceph_status.check_ceph_status(args) self.assertRegex(check_output, r"^All OK$") + # Warning, luminous @patch('check_ceph_status.get_ceph_version') - def test_health_warn(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_warn.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.WarnError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_crit(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_crit.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.CriticalError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_crit_luminous(self, mock_ceph_version, mock_subprocess): + def test_health_warn_luminous(self, mock_ceph_version, mock_subprocess): mock_ceph_version.return_value = [12, 2, 0] - with open('unit_tests/ceph_crit_luminous.json') as f: + with open('unit_tests/ceph_many_warnings_luminous.json') as f: tree = f.read() mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.CriticalError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_lotsdegraded(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_params.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.CriticalError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_nodeepscrub(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_nodeepscrub.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--degraded_thresh', '1']) - self.assertRaises(check_ceph_status.CriticalError, - lambda: check_ceph_status.check_ceph_status(args)) - - @patch('check_ceph_status.get_ceph_version') - def test_health_nodeepscrubok(self, mock_ceph_version, mock_subprocess): - mock_ceph_version.return_value = [10, 2, 9] - with open('unit_tests/ceph_nodeepscrub.json') as f: - tree = f.read() - mock_subprocess.return_value = tree.encode('UTF-8') - args = check_ceph_status.parse_args(['--ignore_nodeepscrub']) + args = check_ceph_status.parse_args("") self.assertRaises(check_ceph_status.WarnError, lambda: check_ceph_status.check_ceph_status(args)) + +# Error, luminous, because of overall status + + # Error, luminous, because misplaced ratio is too big + @patch('check_ceph_status.get_ceph_version') + def test_health_critical_misplaced_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_many_warnings_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--misplaced_thresh', '0.1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, luminous, because degraded ratio is too big + @patch('check_ceph_status.get_ceph_version') + def test_health_critical_degraded_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_degraded_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--degraded_thresh', '0.1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, luminous, because recovery rate is too low + @patch('check_ceph_status.get_ceph_version') + def test_health_critical_recovery_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_many_warnings_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--recovery_rate', '20']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Warning, luminous, deepscrub + @patch('check_ceph_status.get_ceph_version') + def test_health_warn_deepscrub_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_nodeepscrub_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args("") + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Error, luminous, deepscrub + @patch('check_ceph_status.get_ceph_version') + def test_health_crit_deepscrub_luminous(self, + mock_ceph_version, + mock_subprocess): + mock_ceph_version.return_value = [12, 2, 0] + with open('unit_tests/ceph_nodeepscrub_luminous.json') as f: + tree = f.read() + mock_subprocess.return_value = tree.encode('UTF-8') + args = check_ceph_status.parse_args(['--raise_nodeepscrub']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args))