From c0b936c7392c554bd3b72cc669484ab0b8960b8d Mon Sep 17 00:00:00 2001 From: James Hebden Date: Tue, 14 Nov 2017 10:34:26 +0100 Subject: [PATCH] Add Nagios check for cluster partitions This commit introduces the check_rabbitmq_cluster.py Nagios check, which uses the HTTP management API to check for the presence of cluster partitions. The check will only be installed if the management_plugin charm configuration is set to True, enabling the HTTP API plugin. The previous request to use administrator privileges to monitor cluster health is no longer required. The create_user and user_exists logic has been reworked to accommodate different tags, and the monitoring tag, which is respected by the HTTP management API used to monitor cluster health, has been used instead of the administrator privilege. Also cleans up usage of os.getenv('CHARM_DIR') in several places in the file hooks/rabbitmq_server_relations.py Change-Id: Ib7eb1afe258931cc917c151a2b6d72dc56d30c95 Closes-Bug: #1548679 --- hooks/install | 2 +- hooks/rabbit_utils.py | 25 ++++--- hooks/rabbitmq_server_relations.py | 39 +++++++++-- scripts/check_rabbitmq_cluster.py | 107 +++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 18 deletions(-) create mode 100755 scripts/check_rabbitmq_cluster.py diff --git a/hooks/install b/hooks/install index 29ff6894..9be898b7 100755 --- a/hooks/install +++ b/hooks/install @@ -2,7 +2,7 @@ # Wrapper to deal with newer Ubuntu versions that don't have py2 installed # by default. 
-declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython') +declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests') check_and_install() { pkg="${1}-${2}" diff --git a/hooks/rabbit_utils.py b/hooks/rabbit_utils.py index 96435987..ac3142b8 100644 --- a/hooks/rabbit_utils.py +++ b/hooks/rabbit_utils.py @@ -20,6 +20,7 @@ import glob import tempfile import time import socket + from collections import OrderedDict from rabbitmq_context import ( @@ -220,27 +221,25 @@ def user_exists(user): for line in out.split('\n')[1:]: _user = line.split('\t')[0] if _user == user: - admin = line.split('\t')[1] - return True, (admin == '[administrator]') - return False, False + return True + return False -def create_user(user, password, admin=False): - exists, is_admin = user_exists(user) +def create_user(user, password, tags=[]): + exists = user_exists(user) if not exists: log('Creating new user (%s).' % user) rabbitmqctl('add_user', user, password) - if admin == is_admin: - return + if 'administrator' in tags: + log('Granting admin access to {}'.format(user)) - if admin: - log('Granting user (%s) admin access.' % user) - rabbitmqctl('set_user_tags', user, 'administrator') - else: - log('Revoking user (%s) admin access.' 
% user) - rabbitmqctl('set_user_tags', user) + log('Adding tags [{}] to user {}'.format( + ', '.join(tags), + user + )) + rabbitmqctl('set_user_tags', user, ' '.join(tags)) def grant_permissions(user, vhost): diff --git a/hooks/rabbitmq_server_relations.py b/hooks/rabbitmq_server_relations.py index f066aab0..0cd29c45 100755 --- a/hooks/rabbitmq_server_relations.py +++ b/hooks/rabbitmq_server_relations.py @@ -29,6 +29,17 @@ except ImportError: subprocess.check_call(['apt-get', 'install', '-y', 'python3-yaml']) import yaml # flake8: noqa +try: + import requests # flake8: noqa +except ImportError: + if sys.version_info.major == 2: + subprocess.check_call(['apt-get', 'install', '-y', + 'python-requests']) + else: + subprocess.check_call(['apt-get', 'install', '-y', + 'python3-requests']) + import requests # flake8: noqa + import rabbit_utils as rabbit import ssl_utils @@ -189,7 +200,10 @@ def configure_amqp(username, vhost, relation_id, admin=False): # update vhost rabbit.create_vhost(vhost) - rabbit.create_user(username, password, admin) + if admin: + rabbit.create_user(username, password, ['administrator']) + else: + rabbit.create_user(username, password) rabbit.grant_permissions(username, vhost) # NOTE(freyes): after rabbitmq-server 3.0 the method to define HA in the @@ -584,10 +598,10 @@ def ceph_changed(): @hooks.hook('nrpe-external-master-relation-changed') def update_nrpe_checks(): if os.path.isdir(NAGIOS_PLUGINS): - rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts', + rsync(os.path.join(charm_dir(), 'scripts', 'check_rabbitmq.py'), os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py')) - rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts', + rsync(os.path.join(charm_dir(), 'scripts', 'check_rabbitmq_queues.py'), os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py')) if config('stats_cron_schedule'): @@ -598,6 +612,10 @@ def update_nrpe_checks(): rsync(os.path.join(charm_dir(), 'scripts', 'collect_rabbitmq_stats.sh'), script) write_file(STATS_CRONFILE, 
cronjob) + if config('management_plugin'): + rsync(os.path.join(charm_dir(), 'scripts', + 'check_rabbitmq_cluster.py'), + os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_cluster.py')) elif os.path.isfile(STATS_CRONFILE): os.remove(STATS_CRONFILE) @@ -612,7 +630,7 @@ def update_nrpe_checks(): password = rabbit.get_rabbit_password(user, local=True) rabbit.create_vhost(vhost) - rabbit.create_user(user, password) + rabbit.create_user(user, password, ['monitoring']) rabbit.grant_permissions(user, vhost) nrpe_compat = nrpe.NRPE(hostname=hostname) @@ -633,6 +651,19 @@ def update_nrpe_checks(): check_cmd='{}/check_rabbitmq_queues.py{} {}'.format( NAGIOS_PLUGINS, cmd, STATS_DATAFILE) ) + if config('management_plugin'): + # add NRPE check + nrpe_compat.add_check( + shortname=rabbit.RABBIT_USER + '_cluster', + description='Check RabbitMQ Cluster', + check_cmd='{}/check_rabbitmq_cluster.py --port {} --user {} --password {}'.format( + NAGIOS_PLUGINS, + rabbit.get_managment_port(), + user, + password + ) + ) + nrpe_compat.write() diff --git a/scripts/check_rabbitmq_cluster.py b/scripts/check_rabbitmq_cluster.py new file mode 100755 index 00000000..251da161 --- /dev/null +++ b/scripts/check_rabbitmq_cluster.py @@ -0,0 +1,107 @@ +#!/usr/bin/python +""" +Checks for RabbitMQ cluster partitions. + +Copyright (C) 2017 Canonical +All Rights Reserved +Author: James Hebden + +This Nagios check will use the HTTP management API +to fetch cluster status, and check it for problems +such as partitions and offline nodes. 
+"""
+
+from optparse import OptionParser
+import json
+import requests
+import socket
+import sys
+
+if __name__ == '__main__':
+
+    hostname = socket.gethostname()
+
+    parser = OptionParser()
+    parser.add_option("--host", dest="host",
+                      help="RabbitMQ host to connect to [default=%default]",
+                      metavar="HOST", default="localhost")
+    # The status is fetched from the HTTP management API, whose default
+    # listener is 15672; 5672 is the AMQP port and cannot serve /api/.
+    parser.add_option("--port", dest="port", type="int",
+                      help="RabbitMQ management API port [default=%default]",
+                      metavar="PORT", default=15672)
+    parser.add_option("-v", "--verbose", default=False, action="store_true",
+                      help="verbose run")
+    parser.add_option("-u", "--user", dest="user", default="guest",
+                      help="RabbitMQ user [default=%default]",
+                      metavar="USER")
+    parser.add_option("-p", "--password", dest="password", default="guest",
+                      help="RabbitMQ password [default=%default]",
+                      metavar="PASSWORD")
+    parser.add_option("-t", "--tls", dest="tls", default=False,
+                      help="Use TLS to talk to RabbitMQ? [default=%default]",
+                      metavar="TLS")
+    parser.add_option("-H", "--hostname",
+                      dest="hostname",
+                      default=hostname,
+                      help="Override hostname used when querying "
+                           "cluster status [default=%default]")
+    parser.add_option("-R", "--rabbitname",
+                      dest="rabbitname",
+                      default="rabbit",
+                      help="Override rabbit user ID used when querying "
+                           "cluster status [default=%default]")
+    (options, args) = parser.parse_args()
+
+    if options.verbose:
+        # Format inside the call: "print(...) % (...)" only works as a
+        # Python 2 print statement and raises TypeError on Python 3.
+        print("Checking host: %s@%s:%d" % (
+            options.user, options.host, options.port))
+
+    proto = 'https' if options.tls else 'http'
+
+    # Credentials are sent via HTTP basic auth instead of being embedded
+    # in the URL, so passwords containing '@' or '/' still work.
+    query = '{0}://{1}:{2}/api/nodes/{3}@{4}'.format(
+        proto,
+        options.host,
+        options.port,
+        options.rabbitname,
+        options.hostname,
+    )
+
+    try:
+        partition_data = requests.get(
+            query, auth=(options.user, options.password)).text
+    except requests.RequestException as error:
+        # Covers refused connections as well as timeouts, TLS errors etc.
+        print("ERROR: could not connect to cluster: {0}".format(error))
+        sys.exit(3)
+
+    if options.verbose:
+        print(partition_data)
+
+    try:
+        node_status = json.loads(partition_data)
+        partitions = len(node_status['partitions'])
+        cluster = len(node_status['cluster_links'])
+    except (ValueError, KeyError, TypeError):
+        # Invalid JSON or a document without the expected keys. A bare
+        # "except:" would also trap SystemExit and KeyboardInterrupt.
+        print(
+            "UNKNOWN: Could not parse cluster status data returned by RabbitMQ"
+        )
+        sys.exit(3)
+
+    # NOTE(review): len() is never negative, so "cluster < 0" cannot fire;
+    # left unchanged because the intended threshold is not clear from here.
+    if partitions > 0 or cluster < 0:
+        print(
+            "CRITICAL: %d partitions detected, %d nodes online."
+            % (partitions, cluster)
+        )
+        sys.exit(2)
+    else:
+        print("OK: No partitions detected")
+        sys.exit(0)