From faefe90ce6beb5d2b3721cfb637dd7d661cbc21f Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Thu, 10 Jan 2019 18:01:23 +0000
Subject: [PATCH] Fix nrpe ceph-osd status check to respect permissions

The referenced bug (below) was caused by the nrpe check needing to
access the ceph-owned directories, which it cannot do as the nagios
user.  This change splits the check into a 'collect' phase, which runs
as root via a cronjob each minute and writes a file to the tmp
directory, and an nrpe 'check' phase, which then reads that file and
reports back to nagios.  The 'check' part deletes the 'collect' file,
so that fresh information is available for each nrpe check.

The cron task runs every minute (as it is lightweight), so the nrpe
checks should not be scheduled more frequently than once a minute.

Change-Id: I4f4594a479eed47cc66643d0c6acece491ae854d
Closes-Bug: #1810749
---
 files/nagios/check_ceph_osd_services.py   | 65 ++++++++++++++++
 files/nagios/check_ceph_status.py         | 10 ++-
 files/nagios/collect_ceph_osd_services.py | 90 +++++++++++++++++++++++
 hooks/ceph_hooks.py                       | 78 +++++++++++++++-----
 tests/bundles/bionic-rocky.yaml           |  2 +-
 tox.ini                                   |  2 +-
 6 files changed, 223 insertions(+), 24 deletions(-)
 create mode 100755 files/nagios/check_ceph_osd_services.py
 create mode 100755 files/nagios/collect_ceph_osd_services.py

diff --git a/files/nagios/check_ceph_osd_services.py b/files/nagios/check_ceph_osd_services.py
new file mode 100755
index 00000000..669160d4
--- /dev/null
+++ b/files/nagios/check_ceph_osd_services.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2018 Canonical
+# All Rights Reserved
+# Author: Alex Kavanagh
+
+import os
+import sys
+import tempfile
+
+CRON_CHECK_TMPFILE = 'ceph-osd-checks'
+
+STATE_OK = 0
+STATE_WARNING = 1
+STATE_CRITICAL = 2
+STATE_UNKNOWN = 3
+
+
+def run_main():
+    """Process the CRON_CHECK_TMPFILE and see if any line is not OK.
+
+    If a line is not OK, the check returns STATE_CRITICAL.
+    If there are no lines, or the file doesn't exist, it returns
+    STATE_UNKNOWN.  Otherwise it returns STATE_OK.
+
+    :returns: nagios state 0, 2 or 3
+    """
+    _tmp_file = os.path.join(tempfile.gettempdir(), CRON_CHECK_TMPFILE)
+
+    if not os.path.isfile(_tmp_file):
+        print("File '{}' doesn't exist".format(_tmp_file))
+        return STATE_UNKNOWN
+
+    try:
+        with open(_tmp_file, 'rt') as f:
+            lines = f.readlines()
+    except Exception as e:
+        print("Something went wrong reading the file: {}".format(str(e)))
+        return STATE_UNKNOWN
+
+    # Now remove the file, so that stale results can't satisfy the next
+    # check if the next collect run fails.
+    try:
+        os.remove(_tmp_file)
+    except Exception:
+        pass
+
+    if not lines:
+        print("checked status file is empty: {}".format(_tmp_file))
+        return STATE_UNKNOWN
+
+    # Finally, check that the file contains all OK lines.  Unfortunately, the
+    # OK format is not consistent across releases, but what is consistent is
+    # that the check command in the collect phase does fail, and so the line
+    # starts with 'Failed'.
+    state = STATE_OK
+    for line in lines:
+        print(line, end='')
+        if line.startswith('Failed'):
+            state = STATE_CRITICAL
+
+    return state
+
+
+if __name__ == '__main__':
+    sys.exit(run_main())
diff --git a/files/nagios/check_ceph_status.py b/files/nagios/check_ceph_status.py
index cc21591a..358fafd9 100755
--- a/files/nagios/check_ceph_status.py
+++ b/files/nagios/check_ceph_status.py
@@ -40,7 +40,8 @@ def check_ceph_status(args):
             msg += '"'
             raise nagios_plugin.CriticalError(msg)
 
-    osds = re.search("^.*: (\d+) osds: (\d+) up, (\d+) in", status_data['osdmap'])
+    osds = re.search(r"^.*: (\d+) osds: (\d+) up, (\d+) in",
+                     status_data['osdmap'])
     if osds.group(1) > osds.group(2):  # not all OSDs are "up"
         msg = 'CRITICAL: Some OSDs are not up. Total: {}, up: {}'.format(
             osds.group(1), osds.group(2))
@@ -50,7 +51,10 @@ def check_ceph_status(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Check ceph status')
-    parser.add_argument('-f', '--file', dest='status_file',
-                        default=False, help='Optional file with "ceph status" output')
+    parser.add_argument('-f',
+                        '--file',
+                        dest='status_file',
+                        default=False,
+                        help='Optional file with "ceph status" output')
     args = parser.parse_args()
     nagios_plugin.try_check(check_ceph_status, args)
diff --git a/files/nagios/collect_ceph_osd_services.py b/files/nagios/collect_ceph_osd_services.py
new file mode 100755
index 00000000..7133a75e
--- /dev/null
+++ b/files/nagios/collect_ceph_osd_services.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2018 Canonical
+# All Rights Reserved
+# Author: Alex Kavanagh
+
+import os
+import subprocess
+import tempfile
+
+# fasteners is only packaged from bionic onwards, so this import will fail
+# on xenial and trusty
+try:
+    import fasteners
+except ImportError:
+    fasteners = None
+
+SYSTEMD_SYSTEM = '/run/systemd/system'
+LOCKFILE = '/var/lock/check-osds.lock'
+CRON_CHECK_TMPFILE = 'ceph-osd-checks'
+
+
+def init_is_systemd():
+    """Return True if the host system uses systemd, False otherwise."""
+    if lsb_release()['DISTRIB_CODENAME'] == 'trusty':
+        return False
+    return os.path.isdir(SYSTEMD_SYSTEM)
+
+
+def lsb_release():
+    """Return the contents of /etc/lsb-release as a dict."""
+    d = {}
+    with open('/etc/lsb-release', 'r') as lsb:
+        for line in lsb:
+            k, v = line.split('=', 1)
+            d[k.strip()] = v.strip()
+    return d
+
+
+def get_osd_units():
+    """Return a list of strings, one for each unit that is live."""
+    cmd = '/bin/cat /var/lib/ceph/osd/ceph-*/whoami'
+    try:
+        output = (subprocess
+                  .check_output([cmd], shell=True).decode('utf-8')
+                  .split('\n'))
+        return [u for u in output if u]
+    except subprocess.CalledProcessError:
+        return []
+
+
+def do_status():
+    if init_is_systemd():
+        cmd = "/usr/local/lib/nagios/plugins/check_systemd.py ceph-osd@{}"
+    else:
+        cmd = "/sbin/status ceph-osd id={}"
+
+    lines = []
+
+    for unit in get_osd_units():
+        try:
+            output = (subprocess
+                      .check_output(cmd.format(unit).split(),
+                                    stderr=subprocess.STDOUT)
+                      .decode('utf-8'))
+        except subprocess.CalledProcessError as e:
+            output = ("Failed: check command raised: {}"
                      .format(e.output.decode('utf-8')))
+        lines.append(output)
+
+    _tmp_file = os.path.join(tempfile.gettempdir(), CRON_CHECK_TMPFILE)
+    with open(_tmp_file, 'wt') as f:
+        f.writelines(lines)
+
+
+def run_main():
+    # on bionic we can use an interprocess lock; we don't on older platforms
+    if fasteners is not None:
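+        # The non-blocking acquire below means that if a previous collect
+        # run is still holding the lock, this run simply skips the cycle
+        # rather than queueing up behind it.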
+        lock = fasteners.InterProcessLock(LOCKFILE)
+
+        if lock.acquire(blocking=False):
+            try:
+                do_status()
+            finally:
+                lock.release()
+    else:
+        do_status()
+
+
+if __name__ == '__main__':
+    run_main()
diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py
index c69d693b..7828613f 100755
--- a/hooks/ceph_hooks.py
+++ b/hooks/ceph_hooks.py
@@ -15,14 +15,14 @@
 # limitations under the License.
 
 import base64
-import json
 import glob
+import json
+import netifaces
 import os
 import shutil
-import sys
 import socket
 import subprocess
-import netifaces
+import sys
 
 sys.path.append('lib')
 import ceph.utils as ceph
@@ -37,6 +37,7 @@ from charmhelpers.core.hookenv import (
     related_units,
     relation_get,
     relation_set,
+    relations_of_type,
     Hooks,
     UnregisteredHookError,
     service_name,
@@ -47,16 +48,17 @@ from charmhelpers.core.hookenv import (
     application_version_set,
 )
 from charmhelpers.core.host import (
-    umount,
-    mkdir,
+    add_to_updatedb_prunepath,
     cmp_pkgrevno,
+    is_container,
+    lsb_release,
+    mkdir,
+    restart_on_change,
     service_reload,
     service_restart,
-    add_to_updatedb_prunepath,
-    restart_on_change,
+    umount,
     write_file,
-    is_container,
-    init_is_systemd,
+    CompareHostReleases,
 )
 from charmhelpers.fetch import (
     add_source,
@@ -110,6 +112,9 @@ import charmhelpers.contrib.openstack.vaultlocker as vaultlocker
 hooks = Hooks()
 STORAGE_MOUNT_PATH = '/var/lib/ceph'
 
+# cron.d related files
+CRON_CEPH_CHECK_FILE = '/etc/cron.d/check-osd-services'
+
 
 def check_for_upgrade():
     if not os.path.exists(ceph._upgrade_keyring):
@@ -601,6 +606,7 @@ def upgrade_charm():
                 fatal=True)
     install_udev_rules()
     remap_resolved_targets()
+    maybe_refresh_nrpe_files()
 
 
 def remap_resolved_targets():
@@ -632,28 +638,62 @@ def remap_resolved_targets():
              'nrpe-external-master-relation-changed')
 def update_nrpe_config():
     # python-dbus is used by check_upstart_job
-    apt_install('python3-dbus')
+    # fasteners is used by collect_ceph_osd_services.py
+    pkgs = ['python3-dbus']
+    if CompareHostReleases(lsb_release()['DISTRIB_CODENAME']) >= 'bionic':
+        pkgs.append('python3-fasteners')
+    apt_install(pkgs)
+
+    # copy the check and collect files over to the plugins directory
+    charm_dir = os.environ.get('CHARM_DIR', '')
+    nagios_plugins = '/usr/local/lib/nagios/plugins'
+    # grab the nagios user/group IDs from the existing plugins directory
+    _dir = os.stat(nagios_plugins)
+    uid = _dir.st_uid
+    gid = _dir.st_gid
+    for name in ('collect_ceph_osd_services.py',
+                 'check_ceph_osd_services.py'):
+        target = os.path.join(nagios_plugins, name)
+        shutil.copy(os.path.join(charm_dir, 'files', 'nagios', name), target)
+        os.chown(target, uid, gid)
+
     hostname = nrpe.get_nagios_hostname()
     current_unit = nrpe.get_nagios_unit_name()
 
-    # create systemd or upstart check
-    cmd = '/bin/cat /var/lib/ceph/osd/ceph-*/whoami |'
-    if init_is_systemd():
-        cmd += 'xargs -I_@ /usr/local/lib/nagios/plugins/check_systemd.py'
-        cmd += ' ceph-osd@_@'
-    else:
-        cmd += 'xargs -I@ status ceph-osd id=@'
-    cmd += ' && exit 0 || exit 2'
+    # BUG#1810749 - the nagios user can't access /var/lib/ceph/.., and
+    # that's a GOOD THING, as it keeps ceph secure from Nagios.  However, to
+    # check whether ceph is okay, check_systemd.py (or 'status ceph-osd')
+    # still has to be called with the contents of the ../osd/ceph-*/whoami
+    # files.  To get around this conundrum, a cron.d job that runs as root
+    # performs the checks every minute and writes the results to a temporary
+    # file; the nrpe check then reads that file and errors out (returns 2)
+    # if any line starts with 'Failed'.
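+    # Note that because the cron job only runs once a minute, the nrpe
+    # check should not be scheduled more frequently than that.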
+
+    cmd = ('MAILTO=""\n'
+           '* * * * * root '
+           '/usr/local/lib/nagios/plugins/collect_ceph_osd_services.py'
+           ' 2>&1 | logger -t check-osd\n')
+    with open(CRON_CEPH_CHECK_FILE, "wt") as f:
+        f.write(cmd)
+
+    nrpe_cmd = '/usr/local/lib/nagios/plugins/check_ceph_osd_services.py'
 
     nrpe_setup = nrpe.NRPE(hostname=hostname)
     nrpe_setup.add_check(
         shortname='ceph-osd',
         description='process check {%s}' % current_unit,
-        check_cmd=cmd
+        check_cmd=nrpe_cmd
     )
     nrpe_setup.write()
 
 
+def maybe_refresh_nrpe_files():
+    """If the nrpe-external-master relation exists then refresh the nrpe
+    configuration -- this is called during a charm upgrade.
+    """
+    if relations_of_type('nrpe-external-master'):
+        update_nrpe_config()
+
+
 @hooks.hook('secrets-storage-relation-joined')
 def secrets_storage_joined(relation_id=None):
     relation_set(relation_id=relation_id,
diff --git a/tests/bundles/bionic-rocky.yaml b/tests/bundles/bionic-rocky.yaml
index ae37a596..36ea73ba 100644
--- a/tests/bundles/bionic-rocky.yaml
+++ b/tests/bundles/bionic-rocky.yaml
@@ -57,7 +57,7 @@ applications:
     charm: cs:~openstack-charmers-next/cinder-ceph
   nova-cloud-controller:
     expose: True
-    charm: cs::~openstack-charmers-next/nova-cloud-controller
+    charm: cs:~openstack-charmers-next/nova-cloud-controller
     num_units: 1
     options:
       openstack-origin: cloud:bionic-rocky/proposed
diff --git a/tox.ini b/tox.ini
index 6b24dddf..e9553173 100644
--- a/tox.ini
+++ b/tox.ini
@@ -38,7 +38,7 @@ deps = -r{toxinidir}/requirements.txt
 basepython = python3
 deps = -r{toxinidir}/requirements.txt
        -r{toxinidir}/test-requirements.txt
-commands = flake8 {posargs} hooks unit_tests tests actions lib
+commands = flake8 {posargs} hooks unit_tests tests actions lib files
            charm-proof
 
 [testenv:venv]
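
A quick way to exercise the new collect/check pair on a deployed unit is to
seed the temp file the way the collector would and confirm the nrpe check
reacts.  The sketch below is illustrative only and not part of the patch:
the 'OK: ...' line text is invented (the check only cares about the
'Failed' prefix), and it assumes the charm has already installed the
scripts into /usr/local/lib/nagios/plugins.

#!/usr/bin/env python3
# Illustrative smoke test for the collect/check handoff -- not part of the
# patch.  The file contents below are invented; only the 'Failed' prefix is
# significant to check_ceph_osd_services.py.
import os
import subprocess
import tempfile

CHECK = '/usr/local/lib/nagios/plugins/check_ceph_osd_services.py'
tmp_file = os.path.join(tempfile.gettempdir(), 'ceph-osd-checks')

# Simulate a collect run that saw one healthy OSD unit and one failed one.
with open(tmp_file, 'wt') as f:
    f.writelines(['OK: ceph-osd@0 is running\n',
                  'Failed: check command raised: ...\n'])

rc = subprocess.call([CHECK])
print('exit code:', rc)  # expect 2 (STATE_CRITICAL)
print('tmp file consumed:', not os.path.exists(tmp_file))  # expect True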