diff --git a/config.yaml b/config.yaml
index 83e5462e..23dff176 100644
--- a/config.yaml
+++ b/config.yaml
@@ -260,6 +260,29 @@ options:
     description: |
       Whether additional checks report warning or error when their checks
       are positive.
+  nagios_rgw_zones:
+    default: ""
+    type: string
+    description: |
+      Comma-separated list of zones that are expected to be connected to this
+      radosgw. These will be checked by the line "data sync source...
+      (zone-name)" in the output of `radosgw-admin sync status`.
+      .
+      Example:
+      .
+        zone1,zone2
+  nagios_rgw_additional_checks:
+    default: ""
+    type: string
+    description: |
+      List describing additional checks. Each item is a regular expression to
+      search in the output of radosgw-admin sync status. Note, this is a
+      list unlike `nagios_additional_checks` which uses a dictionary.
+      .
+      Example:
+      .
+        ['data is behind on']
+      .
   use-direct-io:
     type: boolean
     default: True
diff --git a/files/nagios/check_radosgw_sync_status.py b/files/nagios/check_radosgw_sync_status.py
new file mode 100755
index 00000000..01edfa24
--- /dev/null
+++ b/files/nagios/check_radosgw_sync_status.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2005, 2006, 2007, 2012 James Troup
+# Copyright (C) 2014, 2017 Canonical
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Authors: Danny Cocks
+# Based on check_ceph_status.py and authors therein
+
+import re
+import argparse
+import os
+import subprocess
+import sys
+import time
+import traceback
+
+
+class CriticalError(Exception):
+    """This indicates a critical error."""
+    pass
+
+
+class UnknownError(Exception):
+    """This indicates an unknown error was encountered."""
+    pass
+
+
+def check_file_freshness(filename, newer_than=3600):
+    """
+    Check that a file exists, is readable and was modified recently.
+
+    :param filename: path of the file to check
+    :param newer_than: maximum allowed age of the file in seconds
+                       (defaults to 3600)
+    :raises: CriticalError if the file is missing, unreadable, older than
+             `newer_than` seconds or has a modification time in the future
+    """
+    # First check the file exists and is readable
+    if not os.path.exists(filename):
+        raise CriticalError("%s: does not exist." % (filename))
+    if not os.access(filename, os.R_OK):
+        raise CriticalError("%s: is not readable." % (filename))
+
+    # Then ensure the file is up-to-date enough
+    mtime = os.stat(filename).st_mtime
+    last_modified = time.time() - mtime
+    if last_modified > newer_than:
+        raise CriticalError("%s: was last modified on %s and is too old "
+                            "(> %s seconds)."
+                            % (filename, time.ctime(mtime), newer_than))
+    if last_modified < 0:
+        raise CriticalError("%s: was last modified on %s which is in the "
+                            "future."
+                            % (filename, time.ctime(mtime)))
+
+
+def check_radosgw_status(args):
+    """
+    Used to check the status of multizone RadosGW Ceph. Uses the output of
+    'radosgw-admin sync status', generated during the separate cronjob, to
+    determine if health is OK, and if not, should we alert on that situation.
+
+    As this is the first iteration of this function, we will only do a very
+    basic check and will rely on the charm config options `nagios_rgw_zones`
+    and `nagios_rgw_additional_checks`, which are passed to this script via
+    `args.zones` and `args.additional_checks`.
+
+    :param args: argparse object formatted in the convention of generic Nagios
+                 checks
+    :returns: string describing the multizone sync status
+    :raises: UnknownError, CriticalError
+    """
+
+    if args.status_file:
+        check_file_freshness(args.status_file)
+        with open(args.status_file, encoding='UTF-8') as f:
+            status_data = f.read()
+    else:
+        try:
+            status_data = (subprocess.check_output(['radosgw-admin',
+                                                    'sync',
+                                                    'status'])
+                           .decode('UTF-8'))
+        except (subprocess.CalledProcessError, OSError) as e:
+            # OSError covers radosgw-admin being missing or not executable.
+            raise UnknownError(
+                "UNKNOWN: radosgw-admin sync status command "
+                "failed with error: {}".format(e))
+
+    # If the realm name is empty, i.e. the first line is
+    #     realm ()
+    # then we assume this means this is not multizone, so exit early.
+    first_line = status_data.split('\n', 1)[0].strip()
+    if re.match(r"realm .* \(\)", first_line):
+        return "No multizone detected"
+
+    # This is a hangover from check_ceph_status.py and not directly applicable
+    # here. I include it for an additional check.
+    required_strings = ['realm', 'zonegroup', 'zone']
+    if not all(s in status_data for s in required_strings):
+        raise UnknownError('UNKNOWN: status data is incomplete')
+
+    # The list to collect messages
+    msgs = []
+
+    # The always-done checks go here.
+    # Currently none
+
+    # Handle checks to do with given expected zones that should be connected.
+    if args.zones:
+        # Tolerate whitespace around the commas and ignore empty entries.
+        zones = [zone.strip() for zone in args.zones.split(',')]
+        for zone in filter(None, zones):
+            # Zone names are literal text, not patterns, so escape them.
+            search_regex = r"data sync source:.*\(" + re.escape(zone) + r"\)"
+            if re.search(search_regex, status_data) is None:
+                msg = ("CRITICAL: Missing expected sync source '{}'"
+                       .format(zone))
+                msgs.append(msg)
+
+    # For additional checks, also test these things
+    for check in args.additional_checks or []:
+        try:
+            m = re.search(check, status_data)
+        except re.error as e:
+            raise UnknownError(
+                "UNKNOWN: invalid additional check '{}': {}".format(check, e))
+        if m is not None:
+            msgs.append("CRITICAL: {}".format(m.group(0)))
+
+    # Every message collected above is critical.
+    if msgs:
+        raise CriticalError('\n'.join(msgs))
+    return "All OK"
+
+
+def parse_args(args):
+    """
+    Parse the command line arguments of this check.
+
+    :param args: list of command line arguments, without the program name
+    :returns: argparse.Namespace with `status_file`, `zones` and
+              `additional_checks` attributes
+    """
+    parser = argparse.ArgumentParser(
+        description='Check radosgw multizone sync status')
+    parser.add_argument('-f', '--file', dest='status_file',
+                        default=False,
+                        help='Optional file with "radosgw-admin sync status" '
+                             'output. Generally useful for testing, and if '
+                             'the Nagios user account does not have rights '
+                             'for the Ceph config files.')
+    parser.add_argument('--zones', dest='zones',
+                        default=None,
+                        help="Check if the given zones, as a comma-separated "
+                             "list, are present in the output. If they are "
+                             "missing report critical.")
+    parser.add_argument('--additional_check', dest='additional_checks',
+                        action='append',
+                        help="Check if a given pattern exists in any status "
+                             "message. If it does, report critical")
+
+    return parser.parse_args(args)
+
+
+def main(args):
+    """
+    Run the check, print its result and map it to an NRPE return code.
+
+    :param args: parsed arguments as returned by parse_args()
+    :returns: integer exit code: 0 (ok), 2 (critical) or 3 (unknown)
+    """
+    # Note: leaving "warning" in here, as a reminder for the expected NRPE
+    # returncodes, even though this script doesn't output any warnings.
+    EXIT_CODES = {'ok': 0, 'warning': 1, 'critical': 2, 'unknown': 3}
+    exitcode = 'unknown'
+    try:
+        output_msg = check_radosgw_status(args)
+        print(output_msg)
+        exitcode = 'ok'
+    except UnknownError as msg:
+        print(msg)
+        exitcode = 'unknown'
+    except CriticalError as msg:
+        print(msg)
+        exitcode = 'critical'
+    except Exception:
+        print("%s raised unknown exception '%s'" % (
+            'check_radosgw_sync_status', sys.exc_info()[0]))
+        print('=' * 60)
+        traceback.print_exc(file=sys.stdout)
+        print('=' * 60)
+        exitcode = 'unknown'
+    return EXIT_CODES[exitcode]
+
+
+if __name__ == '__main__':
+    args = parse_args(sys.argv[1:])
+    status = main(args)
+    sys.exit(status)
diff --git a/files/nagios/collect_ceph_status.sh b/files/nagios/collect_ceph_status.sh
index 514c219c..962687a3 100755
--- a/files/nagios/collect_ceph_status.sh
+++ b/files/nagios/collect_ceph_status.sh
@@ -31,3 +31,14 @@ ceph osd tree --format json > ${TMP_FILE}
 chown root:nagios ${TMP_FILE}
 chmod 0640 ${TMP_FILE}
 mv ${TMP_FILE} ${DATA_FILE}
+
+
+# Note: radosgw-admin sync status doesn't support outputting in json at time of writing
+DATA_FILE="${DATA_DIR}/current-radosgw-admin-sync-status.raw"
+TMP_FILE=$(mktemp -p ${DATA_DIR})
+
+radosgw-admin sync status > ${TMP_FILE}
+
+chown root:nagios ${TMP_FILE}
+chmod 0640 ${TMP_FILE}
+mv ${TMP_FILE} ${DATA_FILE}
diff --git a/src/ceph_hooks.py b/src/ceph_hooks.py
index 6310e60f..6c0bf919 100755
--- a/src/ceph_hooks.py
+++ b/src/ceph_hooks.py
@@ -109,6 +109,8 @@ NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
 NAGIOS_FILE_FOLDER = '/var/lib/nagios'
 SCRIPTS_DIR = '/usr/local/bin'
 STATUS_FILE = '{}/cat-ceph-status.txt'.format(NAGIOS_FILE_FOLDER)
+RADOSGW_STATUS_FILE = ('{}/current-radosgw-admin-sync-status.raw'
+                       .format(NAGIOS_FILE_FOLDER))
 STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health'
 HOST_OSD_COUNT_REPORT = '{}/host-osd-report.json'.format(NAGIOS_FILE_FOLDER)
 
@@ -1198,6 +1200,10 @@ def update_nrpe_config():
                        'check_ceph_osd_count.py'),
           os.path.join(NAGIOS_PLUGINS,
                        'check_ceph_osd_count.py'))
+    rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
+                       'check_radosgw_sync_status.py'),
+          os.path.join(NAGIOS_PLUGINS, 'check_radosgw_sync_status.py'))
+
     script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh')
     rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
                        'collect_ceph_status.sh'),
@@ -1257,6 +1263,21 @@ def update_nrpe_config():
             description='Check whether all OSDs are up and in',
             check_cmd=check_cmd
         )
+
+    check_cmd = ('check_radosgw_sync_status.py -f {}'
+                 .format(RADOSGW_STATUS_FILE))
+    if config('nagios_rgw_zones'):
+        check_cmd += ' --zones "{}"'.format(config('nagios_rgw_zones'))
+    if config('nagios_rgw_additional_checks'):
+        x = ast.literal_eval(config('nagios_rgw_additional_checks'))
+        for check in x:
+            check_cmd += ' --additional_check \"{}\"'.format(check)
+    nrpe_setup.add_check(
+        shortname='radosgw_multizone',
+        description='Check multizone radosgw health',
+        check_cmd=check_cmd
+    )
+
     nrpe_setup.write()
 
 
diff --git a/unit_tests/test_ceph_hooks.py b/unit_tests/test_ceph_hooks.py
index 497bb212..16972402 100644
--- a/unit_tests/test_ceph_hooks.py
+++ b/unit_tests/test_ceph_hooks.py
@@ -55,6 +55,8 @@ CHARM_CONFIG = {'config-flags': '',
                 'nagios_raise_nodeepscrub': True,
                 'nagios_additional_checks': "",
                 'nagios_additional_checks_critical': False,
+                'nagios_rgw_zones': "",
+                'nagios_rgw_additional_checks': "",
                 'nagios_check_num_osds': False,
                 'disable-pg-max-object-skew': False,
                 'rbd-stats-pools': 'foo'}