From e93c4a903a79e73f979f31cbf26d35344bf972af Mon Sep 17 00:00:00 2001 From: Xav Paice Date: Fri, 5 May 2017 21:50:56 +1200 Subject: [PATCH] Update ceph nagios plugin Changes ceph plugin so it ignores ceph rebalancing unless there is a large percentage of misplaced/degraded objects (return warning for that). Adds config options to tweak that monitoring, and also just warn if nodeep-scrub was deliberately set. Includes some basic unit tests. Change-Id: I317448cd769597068a706d3944d9d5419e0445c1 --- config.yaml | 16 ++ files/nagios/check_ceph_status.py | 224 +++++++++++++++++++++----- files/nagios/collect_ceph_status.sh | 2 +- hooks/ceph_hooks.py | 10 +- unit_tests/ceph_crit.json | 226 +++++++++++++++++++++++++++ unit_tests/ceph_nodeepscrub.json | 177 +++++++++++++++++++++ unit_tests/ceph_ok.json | 1 + unit_tests/ceph_params.json | 222 ++++++++++++++++++++++++++ unit_tests/ceph_warn.json | 1 + unit_tests/test_ceph_hooks.py | 18 ++- unit_tests/test_check_ceph_status.py | 75 +++++++++ 11 files changed, 931 insertions(+), 41 deletions(-) create mode 100644 unit_tests/ceph_crit.json create mode 100644 unit_tests/ceph_nodeepscrub.json create mode 100644 unit_tests/ceph_ok.json create mode 100644 unit_tests/ceph_params.json create mode 100644 unit_tests/ceph_warn.json create mode 100644 unit_tests/test_check_ceph_status.py diff --git a/config.yaml b/config.yaml index 39688be6..e718c372 100644 --- a/config.yaml +++ b/config.yaml @@ -195,6 +195,22 @@ options: description: | A comma-separated list of nagios servicegroups. If left empty, the nagios_context will be used as the servicegroup. 
+ nagios_degraded_thresh: + default: 1.0 + type: float + description: "Threshold for degraded ratio (0.1 = 10%)" + nagios_misplaced_thresh: + default: 10.0 + type: float + description: "Threshold for misplaced ratio (0.1 = 10%)" + nagios_recovery_rate: + default: '1' + type: string + description: Recovery rate below which we consider recovery to be stalled + nagios_ignore_nodeepscub: + default: False + type: boolean + description: Whether to ignore the nodeep-scrub flag use-direct-io: type: boolean default: True diff --git a/files/nagios/check_ceph_status.py b/files/nagios/check_ceph_status.py index 09ee5f8d..2df223ff 100755 --- a/files/nagios/check_ceph_status.py +++ b/files/nagios/check_ceph_status.py @@ -1,53 +1,205 @@ #!/usr/bin/env python -# Copyright (C) 2014 Canonical -# All Rights Reserved -# Author: Jacek Nykis +# Copyright (C) 2005, 2006, 2007, 2012 James Troup +# Copyright (C) 2014, 2017 Canonical +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Authors: Jacek Nykis +# Xav Paice +# James Troup import re import argparse +import json +import os import subprocess -import nagios_plugin +import sys +import time +import traceback + + +class CriticalError(Exception): + """This indicates a critical error.""" + pass + + +class WarnError(Exception): + """This indicates a warning condition.""" + pass + + +class UnknownError(Exception): + """This indicates an unknown error was encountered.""" + pass + + +def check_file_freshness(filename, newer_than=3600): + """ + Check a file exists, is readable and is newer than N seconds (where + N is `newer_than`, which defaults to 3600). + """ + # First check the file exists and is readable + if not os.path.exists(filename): + raise CriticalError("%s: does not exist." % (filename)) + if os.access(filename, os.R_OK) == 0: + raise CriticalError("%s: is not readable." % (filename)) + + # Then ensure the file is up-to-date enough + mtime = os.stat(filename).st_mtime + last_modified = time.time() - mtime + if last_modified > newer_than: + raise CriticalError("%s: was last modified on %s and is too old " + "(> %s seconds)." + % (filename, time.ctime(mtime), newer_than)) + if last_modified < 0: + raise CriticalError("%s: was last modified on %s which is in the " + "future." + % (filename, time.ctime(mtime))) def check_ceph_status(args): + """ + Used to check the status of a Ceph cluster. Uses the output of 'ceph + status' to determine if health is OK, and if not, should we alert on that + situation. + + If status is HEALTH_OK then this function returns OK with no further check. + Otherwise, look for known situations which could cause ceph status to + return not OK, but things which represent general operations and don't + warrant a pager event. These include OSD reweight actions, and + nodeep-scrub flag setting, with limits for the amount of misplaced data. + + :param args: argparse object formatted in the convention of generic Nagios + checks + :returns string, describing the status of the ceph cluster. 
+ """ + + ignorable = (r'\d+ pgs (?:backfill|degraded|recovery_wait|stuck unclean)|' + 'recovery \d+\/\d+ objects (?:degraded|misplaced)') + if args.ignore_nodeepscrub: + ignorable = ignorable + '|nodeep-scrub flag\(s\) set' + status_critical = False if args.status_file: - nagios_plugin.check_file_freshness(args.status_file, 3600) - with open(args.status_file, "r") as f: - lines = f.readlines() + check_file_freshness(args.status_file) + with open(args.status_file) as f: + tree = f.read() + status_data = json.loads(tree) else: - lines = subprocess.check_output(["ceph", "status"]).split('\n') - status_data = dict(l.strip().split(' ', 1) for l in lines if len(l) > 1) + try: + tree = subprocess.check_output(['ceph', + 'status', + '--format', 'json']) + except subprocess.CalledProcessError as e: + raise UnknownError( + "UNKNOWN: ceph status command failed with error: {}".format(e)) + status_data = json.loads(tree) - if ('health' not in status_data or - 'monmap' not in status_data or - 'osdmap' not in status_data): - raise nagios_plugin.UnknownError('UNKNOWN: status data is incomplete') + required_keys = ['health', 'monmap', 'pgmap'] + if not all(key in status_data.keys() for key in required_keys): + raise UnknownError('UNKNOWN: status data is incomplete') - if status_data['health'] != 'HEALTH_OK': - msg = 'CRITICAL: ceph health status: "{}'.format(status_data['health']) - if (len(status_data['health'].split(' '))) == 1: - a = iter(lines) - for line in a: - if re.search('health', line) is not None: - msg1 = next(a) - msg += " " - msg += msg1.strip() - break - msg += '"' - raise nagios_plugin.CriticalError(msg) + if status_data['health']['overall_status'] != 'HEALTH_OK': + # Health is not OK, check if any lines are not in our list of OK + # any lines that don't match, check is critical + status_msg = [] + for status in status_data['health']['summary']: + if not re.match(ignorable, status['summary']): + status_critical = True + status_msg.append(status['summary']) + # If 
we got this far, then the status is not OK but the status lines + # are all in our list of things we consider to be operational tasks. + # Check the thresholds and return CRITICAL if exceeded, + # otherwise there's something not accounted for and we want to know + # about it with a WARN alert. + degraded_ratio = status_data['pgmap'].get('degraded_ratio', 0.0) + if degraded_ratio > args.degraded_thresh: + status_critical = True + status_msg.append("Degraded ratio: {}".format(degraded_ratio)) + misplaced_ratio = status_data['pgmap'].get('misplaced_ratio', 0.0) + if misplaced_ratio > args.misplaced_thresh: + status_critical = True + status_msg.append("Misplaced ratio: {}".format(misplaced_ratio)) + recovering = status_data['pgmap'].get('recovering_objects_per_sec', + 0.0) + if recovering < args.recovery_rate: + status_critical = True + status_msg.append("Recovering objects/sec {}".format(recovering)) + if status_critical: + msg = 'CRITICAL: ceph health: "{} {}"'.format( + status_data['health']['overall_status'], + ", ".join(status_msg)) + raise CriticalError(msg) + if status_data['health']['overall_status'] == 'HEALTH_WARN': + msg = "WARNING: {}".format(", ".join(status_msg)) + raise WarnError(msg) + message = "All OK" + print(message) + return message - osds = re.search("^.*: (\d+) osds: (\d+) up, (\d+) in", status_data['osdmap']) - if osds.group(1) > osds.group(2): # not all OSDs are "up" - msg = 'CRITICAL: Some OSDs are not up. Total: {}, up: {}'.format( - osds.group(1), osds.group(2)) - raise nagios_plugin.CriticalError(msg) - print "All OK" + +def parse_args(args): + parser = argparse.ArgumentParser(description='Check ceph status') + parser.add_argument('-f', '--file', dest='status_file', + default=False, + help='Optional file with "ceph status" output. 
' + 'Generally useful for testing, and if the Nagios ' + 'user account does not have rights for the Ceph ' + 'config files.') + parser.add_argument('--degraded_thresh', dest='degraded_thresh', + default=1, type=float, + help="Threshold for degraded ratio (0.1 = 10%%)") + parser.add_argument('--misplaced_thresh', dest='misplaced_thresh', + default=10, type=float, + help="Threshold for misplaced ratio (0.1 = 10%%)") + parser.add_argument('--recovery_rate', dest='recovery_rate', + default=1, type=int, + help="Recovery rate below which we consider recovery " + "to be stalled") + parser.add_argument('--ignore_nodeepscrub', dest='ignore_nodeepscrub', + default=False, action='store_true', + help="Whether to ignore the nodeep-scrub flag. If " + "the nodeep-scrub flag is set, the check returns " + "warning if this param is passed, otherwise " + "returns critical.") + return parser.parse_args(args) + + +def main(args): + EXIT_CODES = {'ok': 0, 'warning': 1, 'critical': 2, 'unknown': 3} + exitcode = 'ok' + try: + msg = check_ceph_status(args) + except UnknownError as msg: + print(msg) + exitcode = 'unknown' + except CriticalError as msg: + print(msg) + exitcode = 'critical' + except WarnError as msg: + print(msg) + exitcode = 'warning'  # WarnError must exit 1 (Nagios WARNING) + except: + print("%s raised unknown exception '%s'" % ('check_ceph_status', + sys.exc_info()[0])) + print('=' * 60) + traceback.print_exc(file=sys.stdout) + print('=' * 60) + exitcode = 'unknown' + return EXIT_CODES[exitcode] if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Check ceph status') - parser.add_argument('-f', '--file', dest='status_file', - default=False, help='Optional file with "ceph status" output') - args = parser.parse_args() - nagios_plugin.try_check(check_ceph_status, args) + args = parse_args(sys.argv[1:]) + status = main(args) + sys.exit(status) diff --git a/files/nagios/collect_ceph_status.sh b/files/nagios/collect_ceph_status.sh index dbdd3acf..2f72a42c 100755 --- 
a/files/nagios/collect_ceph_status.sh +++ b/files/nagios/collect_ceph_status.sh @@ -15,4 +15,4 @@ if [ ! -d $DATA_DIR ]; then mkdir -p $DATA_DIR fi -ceph status >${DATA_DIR}/cat-ceph-status.txt +ceph status --format json >${DATA_DIR}/cat-ceph-status.txt diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py index 9de2cd75..7e158383 100755 --- a/hooks/ceph_hooks.py +++ b/hooks/ceph_hooks.py @@ -654,10 +654,18 @@ def update_nrpe_config(): hostname = nrpe.get_nagios_hostname() current_unit = nrpe.get_nagios_unit_name() nrpe_setup = nrpe.NRPE(hostname=hostname) + check_cmd = 'check_ceph_status.py -f {} --degraded_thresh {}' \ + ' --misplaced_thresh {}' \ + ' --recovery_rate {}'.format(STATUS_FILE, + config('nagios_degraded_thresh'), + config('nagios_misplaced_thresh'), + config('nagios_recovery_rate')) + if config('nagios_ignore_nodeepscub'): + check_cmd = check_cmd + ' --ignore_nodeepscrub' nrpe_setup.add_check( shortname="ceph", description='Check Ceph health {%s}' % current_unit, - check_cmd='check_ceph_status.py -f {}'.format(STATUS_FILE) + check_cmd=check_cmd ) nrpe_setup.write() diff --git a/unit_tests/ceph_crit.json b/unit_tests/ceph_crit.json new file mode 100644 index 00000000..faa23cef --- /dev/null +++ b/unit_tests/ceph_crit.json @@ -0,0 +1,226 @@ +{ + "health": { + "health": { + "health_services": [ + { + "mons": [ + { + "name": "juju-2691ab-1-lxd-1", + "kb_total": 155284096, + "kb_used": 1247744, + "kb_avail": 154036352, + "avail_percent": 99, + "last_updated": "2017-05-17 03:31:35.562497", + "store_stats": { + "bytes_total": 1012055342, + "bytes_sst": 0, + "bytes_log": 29673298, + "bytes_misc": 982382044, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-2691ab-13-lxd-0", + "kb_total": 153820288, + "kb_used": 1361280, + "kb_avail": 152459008, + "avail_percent": 99, + "last_updated": "2017-05-17 03:31:04.097201", + "store_stats": { + "bytes_total": 1370003168, + "bytes_sst": 0, + "bytes_log": 29813159, + "bytes_misc": 
1340190009, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-2691ab-2-lxd-1", + "kb_total": 155251072, + "kb_used": 1373440, + "kb_avail": 153877632, + "avail_percent": 99, + "last_updated": "2017-05-17 03:31:20.684777", + "store_stats": { + "bytes_total": 1400974192, + "bytes_sst": 0, + "bytes_log": 1129945, + "bytes_misc": 1399844247, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + } + ] + } + ] + }, + "timechecks": { + "epoch": 32, + "round": 24492, + "round_status": "finished", + "mons": [ + { + "name": "juju-2691ab-1-lxd-1", + "skew": 0, + "latency": 0, + "health": "HEALTH_OK" + }, + { + "name": "juju-2691ab-13-lxd-0", + "skew": 0.000919, + "latency": 0.001036, + "health": "HEALTH_OK" + }, + { + "name": "juju-2691ab-2-lxd-1", + "skew": 0, + "latency": 0.001009, + "health": "HEALTH_OK" + } + ] + }, + "summary": [ + { + "severity": "HEALTH_WARN", + "summary": "48 pgs backfill_wait" + }, + { + "severity": "HEALTH_WARN", + "summary": "45 pgs backfilling" + }, + { + "severity": "HEALTH_WARN", + "summary": "1 pgs degraded" + }, + { + "severity": "HEALTH_WARN", + "summary": "1 pgs recovery_wait" + }, + { + "severity": "HEALTH_WARN", + "summary": "22 pgs stuck unclean" + }, + { + "severity": "HEALTH_WARN", + "summary": "recovery 14/46842755 objects degraded (0.000%)" + }, + { + "severity": "HEALTH_WARN", + "summary": "recovery 448540/46842755 objects misplaced (0.958%)" + }, + { + "severity": "HEALTH_CRITICAL", + "summary": "Test critical status message" + } + ], + "overall_status": "HEALTH_CRITICAL", + "detail": [] + }, + "fsid": "ca9451f1-5c4f-4e85-bb14-a08dfc0568f7", + "election_epoch": 32, + "quorum": [ + 0, + 1, + 2 + ], + "quorum_names": [ + "juju-2691ab-1-lxd-1", + "juju-2691ab-13-lxd-0", + "juju-2691ab-2-lxd-1" + ], + "monmap": { + "epoch": 1, + "fsid": "ca9451f1-5c4f-4e85-bb14-a08dfc0568f7", + "modified": "2016-12-03 08:09:21.854671", + "created": "2016-12-03 08:09:21.854671", + "mons": [ + { + "rank": 0, + 
"name": "juju-2691ab-1-lxd-1", + "addr": "10.182.254.221:6789/0" + }, + { + "rank": 1, + "name": "juju-2691ab-13-lxd-0", + "addr": "10.182.254.229:6789/0" + }, + { + "rank": 2, + "name": "juju-2691ab-2-lxd-1", + "addr": "10.182.254.242:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 141540, + "num_osds": 314, + "num_up_osds": 314, + "num_in_osds": 314, + "full": false, + "nearfull": false, + "num_remapped_pgs": 92 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 9274 + }, + { + "state_name": "active+remapped+wait_backfill", + "count": 48 + }, + { + "state_name": "active+remapped+backfilling", + "count": 45 + }, + { + "state_name": "active+clean+scrubbing+deep", + "count": 9 + }, + { + "state_name": "active+remapped", + "count": 2 + }, + { + "state_name": "active+recovery_wait+degraded", + "count": 1 + }, + { + "state_name": "active+clean+scrubbing", + "count": 1 + } + ], + "version": 13885884, + "num_pgs": 9380, + "data_bytes": 64713222471610, + "bytes_used": 193613093122048, + "bytes_avail": 690058090491904, + "bytes_total": 883671183613952, + "degraded_objects": 14, + "degraded_total": 46842755, + "degraded_ratio": 0, + "misplaced_objects": 448540, + "misplaced_total": 46842755, + "misplaced_ratio": 0.009575, + "recovering_objects_per_sec": 389, + "recovering_bytes_per_sec": 1629711746, + "recovering_keys_per_sec": 0, + "num_objects_recovered": 218, + "num_bytes_recovered": 912252928, + "num_keys_recovered": 0, + "read_bytes_sec": 117041457, + "write_bytes_sec": 293414043, + "read_op_per_sec": 5282, + "write_op_per_sec": 5270 + }, + "fsmap": { + "epoch": 1, + "by_rank": [] + } +} + diff --git a/unit_tests/ceph_nodeepscrub.json b/unit_tests/ceph_nodeepscrub.json new file mode 100644 index 00000000..fe3aedf8 --- /dev/null +++ b/unit_tests/ceph_nodeepscrub.json @@ -0,0 +1,177 @@ +{ + "health": { + "health": { + "health_services": [ + { + "mons": [ + { + "name": "node1", + "kb_total": 140956600, + "kb_used": 
15916132, + "kb_avail": 117857208, + "avail_percent": 83, + "last_updated": "2017-05-17 03:23:11.248297", + "store_stats": { + "bytes_total": 140014259, + "bytes_sst": 0, + "bytes_log": 13670758, + "bytes_misc": 126343501, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "node2", + "kb_total": 70395920, + "kb_used": 10532504, + "kb_avail": 56264436, + "avail_percent": 79, + "last_updated": "2017-05-17 03:23:16.952673", + "store_stats": { + "bytes_total": 315512452, + "bytes_sst": 0, + "bytes_log": 21691698, + "bytes_misc": 293820754, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-machine-85-lxc-10", + "kb_total": 131927524, + "kb_used": 79521024, + "kb_avail": 45954016, + "avail_percent": 34, + "last_updated": "2017-05-17 03:23:13.794034", + "store_stats": { + "bytes_total": 89036349, + "bytes_sst": 0, + "bytes_log": 21055337, + "bytes_misc": 67981012, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + } + ] + } + ] + }, + "timechecks": { + "epoch": 280, + "round": 19874, + "round_status": "finished", + "mons": [ + { + "name": "node1", + "skew": "0.000000", + "latency": "0.000000", + "health": "HEALTH_OK" + }, + { + "name": "node2", + "skew": "-0.000000", + "latency": "0.000866", + "health": "HEALTH_OK" + }, + { + "name": "juju-machine-85-lxc-10", + "skew": "-0.000000", + "latency": "0.018848", + "health": "HEALTH_OK" + } + ] + }, + "summary": [ + { + "severity": "HEALTH_WARN", + "summary": "nodeep-scrub flag(s) set" + } + ], + "overall_status": "HEALTH_WARN", + "detail": [] + }, + "fsid": "some_fsid", + "election_epoch": 280, + "quorum": [ + 0, + 1, + 2 + ], + "quorum_names": [ + "node1", + "node2", + "juju-machine-85-lxc-10" + ], + "monmap": { + "epoch": 3, + "fsid": "some_fsid", + "modified": "2016-11-25 00:08:51.235813", + "created": "0.000000", + "mons": [ + { + "rank": 0, + "name": "node1", + "addr": "10.24.0.15:6789/0" + }, + { + "rank": 1, + "name": "node2", + "addr": 
"10.24.0.17:6789/0" + }, + { + "rank": 2, + "name": "juju-machine-85-lxc-10", + "addr": "10.24.0.195:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 37820, + "num_osds": 46, + "num_up_osds": 46, + "num_in_osds": 46, + "full": false, + "nearfull": false + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 1988 + }, + { + "state_name": "active+remapped+wait_backfill", + "count": 3 + }, + { + "state_name": "active+remapped+backfilling", + "count": 1 + } + ], + "version": 58873447, + "num_pgs": 1992, + "data_bytes": 35851846298041, + "bytes_used": 107730678743040, + "bytes_avail": 63413590548480, + "bytes_total": 171144269291520, + "degraded_objects": 0, + "degraded_total": 25759217, + "degraded_ratio": 0, + "recovering_objects_per_sec": 17, + "recovering_bytes_per_sec": 72552794, + "recovering_keys_per_sec": 0, + "read_bytes_sec": 23935944, + "write_bytes_sec": 7024503, + "op_per_sec": 5332 + }, + "mdsmap": { + "epoch": 1, + "up": 0, + "in": 0, + "max": 1, + "by_rank": [] + } +} + diff --git a/unit_tests/ceph_ok.json b/unit_tests/ceph_ok.json new file mode 100644 index 00000000..2eafbc15 --- /dev/null +++ b/unit_tests/ceph_ok.json @@ -0,0 +1 @@ +{"health":{"health":{"health_services":[{"mons":[{"name":"somehost-2","kb_total":384443444,"kb_used":254122936,"kb_avail":110768868,"avail_percent":28,"last_updated":"2017-06-28 07:22:57.268852","store_stats":{"bytes_total":563914940,"bytes_sst":0,"bytes_log":1201349,"bytes_misc":562713591,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"somehost-3","kb_total":384443444,"kb_used":181563008,"kb_avail":183328796,"avail_percent":47,"last_updated":"2017-06-28 07:22:09.013733","store_stats":{"bytes_total":584703758,"bytes_sst":0,"bytes_log":17361907,"bytes_misc":567341851,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"somehost-4","kb_total":384443444,"kb_used":278218520,"kb_avail":86673284,"avail_percent":22,"last_updated":"2017-06-28 
07:22:31.725105","store_stats":{"bytes_total":598087748,"bytes_sst":0,"bytes_log":26273616,"bytes_misc":571814132,"last_updated":"0.000000"},"health":"HEALTH_OK"}]}]},"timechecks":{"epoch":52,"round":35412,"round_status":"finished","mons":[{"name":"somehost-2","skew":0.000000,"latency":0.000000,"health":"HEALTH_OK"},{"name":"somehost-3","skew":-0.001662,"latency":0.000531,"health":"HEALTH_OK"},{"name":"somehost-4","skew":-0.000034,"latency":0.000425,"health":"HEALTH_OK"}]},"summary":[],"overall_status":"HEALTH_OK","detail":[]},"fsid":"9486fd14-676d-481c-aa16-77b071a315d8","election_epoch":52,"quorum":[0,1,2],"quorum_names":["somehost-2","somehost-3","somehost-4"],"monmap":{"epoch":1,"fsid":"9486fd14-676d-481c-aa16-77b071a315d8","modified":"2016-08-09 06:33:15.685755","created":"2016-08-09 06:33:15.685755","mons":[{"rank":0,"name":"somehost-2","addr":"10.28.2.21:6789\/0"},{"rank":1,"name":"somehost-3","addr":"10.28.2.22:6789\/0"},{"rank":2,"name":"somehost-4","addr":"10.28.2.23:6789\/0"}]},"osdmap":{"osdmap":{"epoch":11122,"num_osds":42,"num_up_osds":42,"num_in_osds":42,"full":false,"nearfull":false,"num_remapped_pgs":0}},"pgmap":{"pgs_by_state":[{"state_name":"active+clean","count":12350},{"state_name":"active+clean+scrubbing+deep","count":2}],"version":25999715,"num_pgs":12352,"data_bytes":13428555112092,"bytes_used":40180090028032,"bytes_avail":43795596517376,"bytes_total":83975686545408,"read_bytes_sec":92475,"write_bytes_sec":5309194,"read_op_per_sec":367,"write_op_per_sec":506},"fsmap":{"epoch":1,"by_rank":[]}} diff --git a/unit_tests/ceph_params.json b/unit_tests/ceph_params.json new file mode 100644 index 00000000..4b4f6efb --- /dev/null +++ b/unit_tests/ceph_params.json @@ -0,0 +1,222 @@ +{ + "health": { + "health": { + "health_services": [ + { + "mons": [ + { + "name": "juju-2691ab-1-lxd-1", + "kb_total": 155284096, + "kb_used": 1247744, + "kb_avail": 154036352, + "avail_percent": 99, + "last_updated": "2017-05-17 03:31:35.562497", + "store_stats": { + 
"bytes_total": 1012055342, + "bytes_sst": 0, + "bytes_log": 29673298, + "bytes_misc": 982382044, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-2691ab-13-lxd-0", + "kb_total": 153820288, + "kb_used": 1361280, + "kb_avail": 152459008, + "avail_percent": 99, + "last_updated": "2017-05-17 03:31:04.097201", + "store_stats": { + "bytes_total": 1370003168, + "bytes_sst": 0, + "bytes_log": 29813159, + "bytes_misc": 1340190009, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + }, + { + "name": "juju-2691ab-2-lxd-1", + "kb_total": 155251072, + "kb_used": 1373440, + "kb_avail": 153877632, + "avail_percent": 99, + "last_updated": "2017-05-17 03:31:20.684777", + "store_stats": { + "bytes_total": 1400974192, + "bytes_sst": 0, + "bytes_log": 1129945, + "bytes_misc": 1399844247, + "last_updated": "0.000000" + }, + "health": "HEALTH_OK" + } + ] + } + ] + }, + "timechecks": { + "epoch": 32, + "round": 24492, + "round_status": "finished", + "mons": [ + { + "name": "juju-2691ab-1-lxd-1", + "skew": 0, + "latency": 0, + "health": "HEALTH_OK" + }, + { + "name": "juju-2691ab-13-lxd-0", + "skew": 0.000919, + "latency": 0.001036, + "health": "HEALTH_OK" + }, + { + "name": "juju-2691ab-2-lxd-1", + "skew": 0, + "latency": 0.001009, + "health": "HEALTH_OK" + } + ] + }, + "summary": [ + { + "severity": "HEALTH_WARN", + "summary": "48 pgs backfill_wait" + }, + { + "severity": "HEALTH_WARN", + "summary": "45 pgs backfilling" + }, + { + "severity": "HEALTH_WARN", + "summary": "1 pgs degraded" + }, + { + "severity": "HEALTH_WARN", + "summary": "1 pgs recovery_wait" + }, + { + "severity": "HEALTH_WARN", + "summary": "22 pgs stuck unclean" + }, + { + "severity": "HEALTH_WARN", + "summary": "recovery lots/bignumber objects degraded (15%)" + }, + { + "severity": "HEALTH_WARN", + "summary": "recovery 448540/46842755 objects misplaced (0.958%)" + } + ], + "overall_status": "HEALTH_WARN", + "detail": [] + }, + "fsid": "ca9451f1-5c4f-4e85-bb14-a08dfc0568f7", + 
"election_epoch": 32, + "quorum": [ + 0, + 1, + 2 + ], + "quorum_names": [ + "juju-2691ab-1-lxd-1", + "juju-2691ab-13-lxd-0", + "juju-2691ab-2-lxd-1" + ], + "monmap": { + "epoch": 1, + "fsid": "ca9451f1-5c4f-4e85-bb14-a08dfc0568f7", + "modified": "2016-12-03 08:09:21.854671", + "created": "2016-12-03 08:09:21.854671", + "mons": [ + { + "rank": 0, + "name": "juju-2691ab-1-lxd-1", + "addr": "10.182.254.221:6789/0" + }, + { + "rank": 1, + "name": "juju-2691ab-13-lxd-0", + "addr": "10.182.254.229:6789/0" + }, + { + "rank": 2, + "name": "juju-2691ab-2-lxd-1", + "addr": "10.182.254.242:6789/0" + } + ] + }, + "osdmap": { + "osdmap": { + "epoch": 141540, + "num_osds": 314, + "num_up_osds": 314, + "num_in_osds": 314, + "full": false, + "nearfull": false, + "num_remapped_pgs": 92 + } + }, + "pgmap": { + "pgs_by_state": [ + { + "state_name": "active+clean", + "count": 9274 + }, + { + "state_name": "active+remapped+wait_backfill", + "count": 48 + }, + { + "state_name": "active+remapped+backfilling", + "count": 45 + }, + { + "state_name": "active+clean+scrubbing+deep", + "count": 9 + }, + { + "state_name": "active+remapped", + "count": 2 + }, + { + "state_name": "active+recovery_wait+degraded", + "count": 1 + }, + { + "state_name": "active+clean+scrubbing", + "count": 1 + } + ], + "version": 13885884, + "num_pgs": 9380, + "data_bytes": 64713222471610, + "bytes_used": 193613093122048, + "bytes_avail": 690058090491904, + "bytes_total": 883671183613952, + "degraded_objects": 14, + "degraded_total": 46842755, + "degraded_ratio": 0, + "misplaced_objects": 448540, + "misplaced_total": 46842755, + "misplaced_ratio": 0.15, + "recovering_objects_per_sec": 389, + "recovering_bytes_per_sec": 1629711746, + "recovering_keys_per_sec": 0, + "num_objects_recovered": 218, + "num_bytes_recovered": 912252928, + "num_keys_recovered": 0, + "read_bytes_sec": 117041457, + "write_bytes_sec": 293414043, + "read_op_per_sec": 5282, + "write_op_per_sec": 5270 + }, + "fsmap": { + "epoch": 1, + "by_rank": 
[] + } +} + diff --git a/unit_tests/ceph_warn.json b/unit_tests/ceph_warn.json new file mode 100644 index 00000000..45c81578 --- /dev/null +++ b/unit_tests/ceph_warn.json @@ -0,0 +1 @@ +{"health":{"health":{"health_services":[{"mons":[{"name":"juju-2691ab-1-lxd-1","kb_total":155284096,"kb_used":1247744,"kb_avail":154036352,"avail_percent":99,"last_updated":"2017-05-17 03:31:35.562497","store_stats":{"bytes_total":1012055342,"bytes_sst":0,"bytes_log":29673298,"bytes_misc":982382044,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"juju-2691ab-13-lxd-0","kb_total":153820288,"kb_used":1361280,"kb_avail":152459008,"avail_percent":99,"last_updated":"2017-05-17 03:31:04.097201","store_stats":{"bytes_total":1370003168,"bytes_sst":0,"bytes_log":29813159,"bytes_misc":1340190009,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"juju-2691ab-2-lxd-1","kb_total":155251072,"kb_used":1373440,"kb_avail":153877632,"avail_percent":99,"last_updated":"2017-05-17 03:31:20.684777","store_stats":{"bytes_total":1400974192,"bytes_sst":0,"bytes_log":1129945,"bytes_misc":1399844247,"last_updated":"0.000000"},"health":"HEALTH_OK"}]}]},"timechecks":{"epoch":32,"round":24492,"round_status":"finished","mons":[{"name":"juju-2691ab-1-lxd-1","skew":0.000000,"latency":0.000000,"health":"HEALTH_OK"},{"name":"juju-2691ab-13-lxd-0","skew":0.000919,"latency":0.001036,"health":"HEALTH_OK"},{"name":"juju-2691ab-2-lxd-1","skew":0.000000,"latency":0.001009,"health":"HEALTH_OK"}]},"summary":[{"severity":"HEALTH_WARN","summary":"48 pgs backfill_wait"},{"severity":"HEALTH_WARN","summary":"45 pgs backfilling"},{"severity":"HEALTH_WARN","summary":"1 pgs degraded"},{"severity":"HEALTH_WARN","summary":"1 pgs recovery_wait"},{"severity":"HEALTH_WARN","summary":"22 pgs stuck unclean"},{"severity":"HEALTH_WARN","summary":"recovery 14\/46842755 objects degraded (0.000%)"},{"severity":"HEALTH_WARN","summary":"recovery 448540\/46842755 objects misplaced 
(0.958%)"}],"overall_status":"HEALTH_WARN","detail":[]},"fsid":"ca9451f1-5c4f-4e85-bb14-a08dfc0568f7","election_epoch":32,"quorum":[0,1,2],"quorum_names":["juju-2691ab-1-lxd-1","juju-2691ab-13-lxd-0","juju-2691ab-2-lxd-1"],"monmap":{"epoch":1,"fsid":"ca9451f1-5c4f-4e85-bb14-a08dfc0568f7","modified":"2016-12-03 08:09:21.854671","created":"2016-12-03 08:09:21.854671","mons":[{"rank":0,"name":"juju-2691ab-1-lxd-1","addr":"10.182.254.221:6789\/0"},{"rank":1,"name":"juju-2691ab-13-lxd-0","addr":"10.182.254.229:6789\/0"},{"rank":2,"name":"juju-2691ab-2-lxd-1","addr":"10.182.254.242:6789\/0"}]},"osdmap":{"osdmap":{"epoch":141540,"num_osds":314,"num_up_osds":314,"num_in_osds":314,"full":false,"nearfull":false,"num_remapped_pgs":92}},"pgmap":{"pgs_by_state":[{"state_name":"active+clean","count":9274},{"state_name":"active+remapped+wait_backfill","count":48},{"state_name":"active+remapped+backfilling","count":45},{"state_name":"active+clean+scrubbing+deep","count":9},{"state_name":"active+remapped","count":2},{"state_name":"active+recovery_wait+degraded","count":1},{"state_name":"active+clean+scrubbing","count":1}],"version":13885884,"num_pgs":9380,"data_bytes":64713222471610,"bytes_used":193613093122048,"bytes_avail":690058090491904,"bytes_total":883671183613952,"degraded_objects":14,"degraded_total":46842755,"degraded_ratio":0.000000,"misplaced_objects":448540,"misplaced_total":46842755,"misplaced_ratio":0.009575,"recovering_objects_per_sec":389,"recovering_bytes_per_sec":1629711746,"recovering_keys_per_sec":0,"num_objects_recovered":218,"num_bytes_recovered":912252928,"num_keys_recovered":0,"read_bytes_sec":117041457,"write_bytes_sec":293414043,"read_op_per_sec":5282,"write_op_per_sec":5270},"fsmap":{"epoch":1,"by_rank":[]}} diff --git a/unit_tests/test_ceph_hooks.py b/unit_tests/test_ceph_hooks.py index ef7a0786..7c98fd02 100644 --- a/unit_tests/test_ceph_hooks.py +++ b/unit_tests/test_ceph_hooks.py @@ -44,7 +44,11 @@ CHARM_CONFIG = {'config-flags': '', 'osd-format': 
'ext4', 'monitor-hosts': '', 'prefer-ipv6': False, - 'default-rbd-features': None} + 'default-rbd-features': None, + 'nagios_degraded_thresh': '1', + 'nagios_misplaced_thresh': '10', + 'nagios_recovery_rate': '1', + 'nagios_ignore_nodeepscub': False} class CephHooksTestCase(unittest.TestCase): @@ -168,7 +172,10 @@ class CephHooksTestCase(unittest.TestCase): 'use_syslog': 'true'} self.assertEqual(ctxt, expected) - def test_nrpe_dependency_installed(self): + @patch.object(ceph_hooks, 'config') + def test_nrpe_dependency_installed(self, mock_config): + config = copy.deepcopy(CHARM_CONFIG) + mock_config.side_effect = lambda key: config[key] with patch.multiple(ceph_hooks, apt_install=DEFAULT, rsync=DEFAULT, @@ -179,7 +186,12 @@ class CephHooksTestCase(unittest.TestCase): mocks["apt_install"].assert_called_once_with( ["python-dbus", "lockfile-progs"]) - def test_upgrade_charm_with_nrpe_relation_installs_dependencies(self): + @patch.object(ceph_hooks, 'config') + def test_upgrade_charm_with_nrpe_relation_installs_dependencies( + self, + mock_config): + config = copy.deepcopy(CHARM_CONFIG) + mock_config.side_effect = lambda key: config[key] with patch.multiple( ceph_hooks, apt_install=DEFAULT, diff --git a/unit_tests/test_check_ceph_status.py b/unit_tests/test_check_ceph_status.py new file mode 100644 index 00000000..64c3e903 --- /dev/null +++ b/unit_tests/test_check_ceph_status.py @@ -0,0 +1,75 @@ +# Copyright 2016 Canonical Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import sys + +from mock import patch + +# import the module we want to test +os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios')) +import check_ceph_status + + +@patch('subprocess.check_output') +class NagiosTestCase(unittest.TestCase): + + def test_health_ok(self, mock_subprocess): + with open('unit_tests/ceph_ok.json') as f: + tree = f.read() + mock_subprocess.return_value = tree + args = check_ceph_status.parse_args(['--degraded_thresh', '1']) + check_output = check_ceph_status.check_ceph_status(args) + self.assertRegexpMatches(check_output, r"^All OK$") + + def test_health_warn(self, mock_subprocess): + with open('unit_tests/ceph_warn.json') as f: + tree = f.read() + mock_subprocess.return_value = tree + args = check_ceph_status.parse_args(['--degraded_thresh', '1']) + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + def test_health_crit(self, mock_subprocess): + with open('unit_tests/ceph_crit.json') as f: + tree = f.read() + mock_subprocess.return_value = tree + args = check_ceph_status.parse_args(['--degraded_thresh', '1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + def test_health_lotsdegraded(self, mock_subprocess): + with open('unit_tests/ceph_params.json') as f: + tree = f.read() + mock_subprocess.return_value = tree + args = check_ceph_status.parse_args(['--degraded_thresh', '1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + def test_health_nodeepscrub(self, mock_subprocess): + with open('unit_tests/ceph_nodeepscrub.json') as f: + tree = f.read() + mock_subprocess.return_value = tree + args = check_ceph_status.parse_args(['--degraded_thresh', '1']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: 
check_ceph_status.check_ceph_status(args)) + + def test_health_nodeepscrubok(self, mock_subprocess): + with open('unit_tests/ceph_nodeepscrub.json') as f: + tree = f.read() + mock_subprocess.return_value = tree + args = check_ceph_status.parse_args(['--ignore_nodeepscrub']) + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args))