Add Nagios check for cluster partitions

This commit introduces the check_rabbitmq_cluster.py Nagios check, which
uses the HTTP management API to check for the presence of cluster
partitions. The check will only be installed if the management_plugin
charm configuration is set to True, enabling the HTTP API plugin.

The previous request to use administrator privileges to monitor cluster
health is no longer required. The create_user and user_exists logic has
been reworked to accommodate different tags, and the monitoring tag,
which is respected by the HTTP management API used to monitor cluster
health, has been used instead of the administrator privilege.

Also cleans up usage of os.getenv('CHARM_DIR') in several places in the
file hooks/rabbitmq_server_relations.py

Change-Id: Ib7eb1afe258931cc917c151a2b6d72dc56d30c95
Closes-Bug: #1548679
This commit is contained in:
James Hebden 2017-11-14 10:34:26 +01:00
parent dcfe275529
commit c0b936c739
4 changed files with 155 additions and 18 deletions

View File

@ -2,7 +2,7 @@
# Wrapper to deal with newer Ubuntu versions that don't have py2 installed
# by default.
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython')
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests')
check_and_install() {
pkg="${1}-${2}"

View File

@ -20,6 +20,7 @@ import glob
import tempfile
import time
import socket
from collections import OrderedDict
from rabbitmq_context import (
@ -220,27 +221,25 @@ def user_exists(user):
for line in out.split('\n')[1:]:
_user = line.split('\t')[0]
if _user == user:
admin = line.split('\t')[1]
return True, (admin == '[administrator]')
return False, False
return True
return False
def create_user(user, password, admin=False):
exists, is_admin = user_exists(user)
def create_user(user, password, tags=[]):
exists = user_exists(user)
if not exists:
log('Creating new user (%s).' % user)
rabbitmqctl('add_user', user, password)
if admin == is_admin:
return
if 'administrator' in tags:
log('Granting admin access to {}'.format(user))
if admin:
log('Granting user (%s) admin access.' % user)
rabbitmqctl('set_user_tags', user, 'administrator')
else:
log('Revoking user (%s) admin access.' % user)
rabbitmqctl('set_user_tags', user)
log('Adding tags [{}] to user {}'.format(
', '.join(tags),
user
))
rabbitmqctl('set_user_tags', user, ' '.join(tags))
def grant_permissions(user, vhost):

View File

@ -29,6 +29,17 @@ except ImportError:
subprocess.check_call(['apt-get', 'install', '-y', 'python3-yaml'])
import yaml # flake8: noqa
try:
import requests # flake8: noqa
except ImportError:
if sys.version_info.major == 2:
subprocess.check_call(['apt-get', 'install', '-y',
'python-requests'])
else:
subprocess.check_call(['apt-get', 'install', '-y',
'python3-requests'])
import requests # flake8: noqa
import rabbit_utils as rabbit
import ssl_utils
@ -189,7 +200,10 @@ def configure_amqp(username, vhost, relation_id, admin=False):
# update vhost
rabbit.create_vhost(vhost)
rabbit.create_user(username, password, admin)
if admin:
rabbit.create_user(username, password, ['administrator'])
else:
rabbit.create_user(username, password)
rabbit.grant_permissions(username, vhost)
# NOTE(freyes): after rabbitmq-server 3.0 the method to define HA in the
@ -584,10 +598,10 @@ def ceph_changed():
@hooks.hook('nrpe-external-master-relation-changed')
def update_nrpe_checks():
if os.path.isdir(NAGIOS_PLUGINS):
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
rsync(os.path.join(charm_dir(), 'scripts',
'check_rabbitmq.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
rsync(os.path.join(charm_dir(), 'scripts',
'check_rabbitmq_queues.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py'))
if config('stats_cron_schedule'):
@ -598,6 +612,10 @@ def update_nrpe_checks():
rsync(os.path.join(charm_dir(), 'scripts',
'collect_rabbitmq_stats.sh'), script)
write_file(STATS_CRONFILE, cronjob)
if config('management_plugin'):
rsync(os.path.join(charm_dir(), 'scripts',
'check_rabbitmq_cluster.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_cluster.py'))
elif os.path.isfile(STATS_CRONFILE):
os.remove(STATS_CRONFILE)
@ -612,7 +630,7 @@ def update_nrpe_checks():
password = rabbit.get_rabbit_password(user, local=True)
rabbit.create_vhost(vhost)
rabbit.create_user(user, password)
rabbit.create_user(user, password, ['monitoring'])
rabbit.grant_permissions(user, vhost)
nrpe_compat = nrpe.NRPE(hostname=hostname)
@ -633,6 +651,19 @@ def update_nrpe_checks():
check_cmd='{}/check_rabbitmq_queues.py{} {}'.format(
NAGIOS_PLUGINS, cmd, STATS_DATAFILE)
)
if config('management_plugin'):
# add NRPE check
nrpe_compat.add_check(
shortname=rabbit.RABBIT_USER + '_cluster',
description='Check RabbitMQ Cluster',
check_cmd='{}/check_rabbitmq_cluster.py --port {} --user {} --password {}'.format(
NAGIOS_PLUGINS,
rabbit.get_managment_port(),
user,
password
)
)
nrpe_compat.write()

107
scripts/check_rabbitmq_cluster.py Executable file
View File

@ -0,0 +1,107 @@
#!/usr/bin/python
"""
Checks for RabbitMQ cluster partitions.
Copyright (C) 2017 Canonical
All Rights Reserved
Author: James Hebden
This Nagios check will use the HTTP management API
to fetch cluster status, and check it for problems
such as partitions and offline nodes.
"""
from optparse import OptionParser
import json
import requests
import socket
import sys
def is_enabled(value):
    """Interpret a command-line flag value as a boolean.

    ``--tls`` takes a value rather than being a bare switch, so optparse
    hands back either the ``False`` default or whatever string the caller
    typed.  Plain truthiness would treat ``--tls false`` as enabled, so
    the usual "off" spellings are recognised explicitly.

    :param value: ``bool`` default or the raw option string.
    :returns: ``True`` when the flag should be considered switched on.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() not in ('', '0', 'false', 'no', 'off')


def build_query_url(host, port, rabbitname, hostname, tls=False):
    """Build the management API URL for a single node's status.

    Credentials are deliberately not embedded in the URL; they are passed
    to ``requests`` separately so that passwords containing characters
    such as ``@``, ``/`` or ``:`` cannot corrupt the URL.

    :param host: host the HTTP management API is reachable on.
    :param port: TCP port of the HTTP management API.
    :param rabbitname: node name prefix (the part before ``@``).
    :param hostname: node hostname (the part after ``@``).
    :param tls: use ``https`` instead of ``http`` when true.
    :returns: the URL as a string.
    """
    proto = 'https' if tls else 'http'
    return '{0}://{1}:{2}/api/nodes/{3}@{4}'.format(
        proto, host, port, rabbitname, hostname
    )


def evaluate_cluster_status(partition_data):
    """Turn the node status document into a Nagios result.

    :param partition_data: response body text returned by the API.
    :returns: ``(exit_code, message)`` where the exit code is 0 (OK),
        2 (CRITICAL) or 3 (UNKNOWN).
    """
    try:
        # Parse once and reuse the document for both lookups.
        status = json.loads(partition_data)
        partitions = len(status['partitions'])
        cluster = len(status['cluster_links'])
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError: expected field missing
        # (e.g. an error document); TypeError: JSON is not an object or
        # the field has no length.
        return (
            3,
            "UNKNOWN: Could not parse cluster status data returned by RabbitMQ"
        )

    # The number of cluster links is reported for context only: a length
    # can never be negative, so the former ``cluster < 0`` comparison was
    # dead code and never influenced the outcome.
    if partitions > 0:
        return (
            2,
            "CRITICAL: %d partitions detected, %d nodes online." % (
                partitions, cluster
            )
        )
    return 0, "OK: No partitions detected"


def main():
    """Parse the command line, query the node and exit Nagios-style."""
    hostname = socket.gethostname()
    parser = OptionParser()
    parser.add_option("--host", dest="host",
                      help="RabbitMQ host to connect to [default=%default]",
                      metavar="HOST", default="localhost")
    # NOTE(review): 5672 is conventionally the AMQP port, while this check
    # talks to the HTTP management API; callers appear to be expected to
    # pass --port explicitly. Default kept for compatibility -- confirm.
    parser.add_option("--port", dest="port", type="int",
                      help="port RabbitMQ is running on [default=%default]",
                      metavar="PORT", default=5672)
    parser.add_option("-v", "--verbose", default=False, action="store_true",
                      help="verbose run")
    parser.add_option("-u", "--user", dest="user", default="guest",
                      help="RabbitMQ user [default=%default]",
                      metavar="USER")
    parser.add_option("-p", "--password", dest="password", default="guest",
                      help="RabbitMQ password [default=%default]",
                      metavar="PASSWORD")
    parser.add_option("-t", "--tls", dest="tls", default=False,
                      help="Use TLS to talk to RabbitMQ? [default=%default]",
                      metavar="TLS")
    parser.add_option("-H", "--hostname",
                      dest="hostname",
                      default=hostname,
                      help="Override hostname used when querying "
                           "cluster status [default=%default]")
    parser.add_option("-R", "--rabbitname",
                      dest="rabbitname",
                      default="rabbit",
                      help="Override rabbit user ID used when querying "
                           "cluster status [default=%default]")
    parser.add_option("--timeout", dest="timeout", type="float",
                      default=30.0, metavar="SECONDS",
                      help="Seconds to wait for the management API "
                           "[default=%default]")

    (options, _args) = parser.parse_args()

    if options.verbose:
        # Format inside the call: applying % to the result of print()
        # fails on Python 3 because print() returns None.
        print("Checking host: %s@%s:%d" % (
            options.user,
            options.host,
            options.port
        ))

    query = build_query_url(
        options.host,
        options.port,
        options.rabbitname,
        options.hostname,
        tls=is_enabled(options.tls),
    )

    try:
        # A timeout keeps a wedged API from hanging the check, and
        # RequestException also covers timeouts and malformed responses,
        # not just refused connections.
        partition_data = requests.get(
            query,
            auth=(options.user, options.password),
            timeout=options.timeout,
        ).text
    except requests.RequestException as error:
        print(
            "ERROR: could not connect to cluster: {0}".format(
                error
            )
        )
        sys.exit(3)

    if options.verbose:
        print(partition_data)

    exit_code, message = evaluate_cluster_status(partition_data)
    print(message)
    sys.exit(exit_code)


if __name__ == '__main__':
    main()