Merge "Add Nagios check for cluster partitions"

This commit is contained in:
Zuul 2018-05-14 11:26:19 +00:00 committed by Gerrit Code Review
commit 4b74f45412
4 changed files with 155 additions and 18 deletions

View File

@ -2,7 +2,7 @@
# Wrapper to deal with newer Ubuntu versions that don't have py2 installed
# by default.
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython')
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests')
check_and_install() {
pkg="${1}-${2}"

View File

@ -20,6 +20,7 @@ import glob
import tempfile
import time
import socket
from collections import OrderedDict
from rabbitmq_context import (
@ -220,27 +221,25 @@ def user_exists(user):
for line in out.split('\n')[1:]:
_user = line.split('\t')[0]
if _user == user:
admin = line.split('\t')[1]
return True, (admin == '[administrator]')
return False, False
return True
return False
def create_user(user, password, admin=False):
exists, is_admin = user_exists(user)
def create_user(user, password, tags=[]):
exists = user_exists(user)
if not exists:
log('Creating new user (%s).' % user)
rabbitmqctl('add_user', user, password)
if admin == is_admin:
return
if 'administrator' in tags:
log('Granting admin access to {}'.format(user))
if admin:
log('Granting user (%s) admin access.' % user)
rabbitmqctl('set_user_tags', user, 'administrator')
else:
log('Revoking user (%s) admin access.' % user)
rabbitmqctl('set_user_tags', user)
log('Adding tags [{}] to user {}'.format(
', '.join(tags),
user
))
rabbitmqctl('set_user_tags', user, ' '.join(tags))
def grant_permissions(user, vhost):

View File

@ -29,6 +29,17 @@ except ImportError:
subprocess.check_call(['apt-get', 'install', '-y', 'python3-yaml'])
import yaml # flake8: noqa
try:
import requests # flake8: noqa
except ImportError:
if sys.version_info.major == 2:
subprocess.check_call(['apt-get', 'install', '-y',
'python-requests'])
else:
subprocess.check_call(['apt-get', 'install', '-y',
'python3-requests'])
import requests # flake8: noqa
import rabbit_utils as rabbit
import ssl_utils
@ -189,7 +200,10 @@ def configure_amqp(username, vhost, relation_id, admin=False):
# update vhost
rabbit.create_vhost(vhost)
rabbit.create_user(username, password, admin)
if admin:
rabbit.create_user(username, password, ['administrator'])
else:
rabbit.create_user(username, password)
rabbit.grant_permissions(username, vhost)
# NOTE(freyes): after rabbitmq-server 3.0 the method to define HA in the
@ -584,10 +598,10 @@ def ceph_changed():
@hooks.hook('nrpe-external-master-relation-changed')
def update_nrpe_checks():
if os.path.isdir(NAGIOS_PLUGINS):
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
rsync(os.path.join(charm_dir(), 'scripts',
'check_rabbitmq.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
rsync(os.path.join(charm_dir(), 'scripts',
'check_rabbitmq_queues.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py'))
if config('stats_cron_schedule'):
@ -598,6 +612,10 @@ def update_nrpe_checks():
rsync(os.path.join(charm_dir(), 'scripts',
'collect_rabbitmq_stats.sh'), script)
write_file(STATS_CRONFILE, cronjob)
if config('management_plugin'):
rsync(os.path.join(charm_dir(), 'scripts',
'check_rabbitmq_cluster.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_cluster.py'))
elif os.path.isfile(STATS_CRONFILE):
os.remove(STATS_CRONFILE)
@ -612,7 +630,7 @@ def update_nrpe_checks():
password = rabbit.get_rabbit_password(user, local=True)
rabbit.create_vhost(vhost)
rabbit.create_user(user, password)
rabbit.create_user(user, password, ['monitoring'])
rabbit.grant_permissions(user, vhost)
nrpe_compat = nrpe.NRPE(hostname=hostname)
@ -633,6 +651,19 @@ def update_nrpe_checks():
check_cmd='{}/check_rabbitmq_queues.py{} {}'.format(
NAGIOS_PLUGINS, cmd, STATS_DATAFILE)
)
if config('management_plugin'):
# add NRPE check
nrpe_compat.add_check(
shortname=rabbit.RABBIT_USER + '_cluster',
description='Check RabbitMQ Cluster',
check_cmd='{}/check_rabbitmq_cluster.py --port {} --user {} --password {}'.format(
NAGIOS_PLUGINS,
rabbit.get_managment_port(),
user,
password
)
)
nrpe_compat.write()

107
scripts/check_rabbitmq_cluster.py Executable file
View File

@ -0,0 +1,107 @@
#!/usr/bin/python
"""
Checks for RabbitMQ cluster partitions.
Copyright (C) 2017 Canonical
All Rights Reserved
Author: James Hebden
This Nagios check will use the HTTP management API
to fetch cluster status, and check it for problems
such as partitions and offline nodes.
"""
from optparse import OptionParser
import json
import requests
import socket
import sys
# Default TCP port of the RabbitMQ HTTP management API. The AMQP port (5672)
# does not speak HTTP, so it is not a usable default for this check.
DEFAULT_MANAGEMENT_PORT = 15672

# Seconds to wait for the management API before giving up, so that a hung
# broker yields an UNKNOWN result instead of a check that never returns.
REQUEST_TIMEOUT = 10

# Nagios plugin exit codes used by this check.
STATUS_OK = 0
STATUS_CRITICAL = 2
STATUS_UNKNOWN = 3

# Values of --tls that mean "do not use TLS" (compared case-insensitively).
_FALSE_STRINGS = ('', '0', 'false', 'no', 'off')


def parse_options(argv=None):
    """Parse the check's command line.

    :param argv: list of arguments to parse; ``None`` means ``sys.argv[1:]``.
    :returns: the optparse ``Values`` object holding the parsed options.
    """
    parser = OptionParser()
    parser.add_option("--host", dest="host",
                      help="RabbitMQ host to connect to [default=%default]",
                      metavar="HOST", default="localhost")
    parser.add_option("--port", dest="port", type="int",
                      help="RabbitMQ management API port [default=%default]",
                      metavar="PORT", default=DEFAULT_MANAGEMENT_PORT)
    parser.add_option("-v", "--verbose", default=False, action="store_true",
                      help="verbose run")
    parser.add_option("-u", "--user", dest="user", default="guest",
                      help="RabbitMQ user [default=%default]",
                      metavar="USER")
    parser.add_option("-p", "--password", dest="password", default="guest",
                      help="RabbitMQ password [default=%default]",
                      metavar="PASSWORD")
    # Kept as a value-taking option for backward compatibility; the value is
    # interpreted by tls_enabled() so that "--tls false" really disables TLS.
    parser.add_option("-t", "--tls", dest="tls", default=False,
                      help="Use TLS to talk to RabbitMQ? [default=%default]",
                      metavar="TLS")
    parser.add_option("-H", "--hostname",
                      dest="hostname",
                      default=socket.gethostname(),
                      help="Override hostname used when querying "
                           "cluster status [default=%default]")
    parser.add_option("-R", "--rabbitname",
                      dest="rabbitname",
                      default="rabbit",
                      help="Override rabbit user ID used when querying "
                           "cluster status [default=%default]")
    options, _ = parser.parse_args(argv)
    return options


def tls_enabled(value):
    """Interpret the ``--tls`` option value as a boolean.

    :param value: the raw option value (``False`` when the option is absent,
                  otherwise the string given on the command line).
    :returns: ``False`` for the unset default and for "0", "false", "no",
              "off" or an empty string; ``True`` for anything else.
    """
    return str(value).strip().lower() not in _FALSE_STRINGS


def build_query_url(options):
    """Build the management API URL for this node's status.

    Credentials are deliberately not embedded in the URL: they are sent via
    HTTP basic auth instead, so characters such as ``@``, ``:`` or ``/`` in
    the password cannot corrupt the URL.

    :param options: parsed options (uses tls, host, port, rabbitname and
                    hostname).
    :returns: URL of the form ``proto://host:port/api/nodes/name@hostname``.
    """
    proto = 'https' if tls_enabled(options.tls) else 'http'
    return '{0}://{1}:{2}/api/nodes/{3}@{4}'.format(
        proto,
        options.host,
        options.port,
        options.rabbitname,
        options.hostname
    )


def evaluate_cluster_status(partition_data):
    """Turn the node status returned by RabbitMQ into a Nagios result.

    :param partition_data: response body of the node status request, expected
                           to be a JSON object with ``partitions`` and
                           ``cluster_links`` lists.
    :returns: ``(exit_code, message)`` -- UNKNOWN (3) when the data cannot be
              parsed, CRITICAL (2) when at least one partition is reported,
              OK (0) otherwise.
    """
    try:
        node = json.loads(partition_data)
        partitions = len(node['partitions'])
        cluster = len(node['cluster_links'])
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError: expected field missing
        # (e.g. an error document); TypeError: unexpected JSON shape.
        return (
            STATUS_UNKNOWN,
            "UNKNOWN: Could not parse cluster status data returned by RabbitMQ"
        )

    # Only partitions make the check critical. The number of cluster links is
    # reported for context; a length can never be negative, so the former
    # "cluster < 0" test could not trigger and has been dropped.
    if partitions > 0:
        return (
            STATUS_CRITICAL,
            "CRITICAL: %d partitions detected, %d nodes online."
            % (partitions, cluster)
        )
    return (STATUS_OK, "OK: No partitions detected")


def main(argv=None):
    """Run the cluster partition check and return the Nagios exit code.

    :param argv: list of arguments to parse; ``None`` means ``sys.argv[1:]``.
    :returns: 0 (OK), 2 (CRITICAL) or 3 (UNKNOWN).
    """
    options = parse_options(argv)

    if options.verbose:
        # Format inside the call: "print(...) % (...)" only works with the
        # Python 2 print statement and raises TypeError on Python 3.
        print("Checking host: %s@%s:%d" % (
            options.user,
            options.host,
            options.port
        ))

    query = build_query_url(options)

    try:
        partition_data = requests.get(
            query,
            auth=(options.user, options.password),
            timeout=REQUEST_TIMEOUT
        ).text
    except requests.RequestException as error:
        # RequestException covers connection failures as well as timeouts, so
        # every transport problem produces a clean UNKNOWN result.
        print(
            "ERROR: could not connect to cluster: {0}".format(
                error
            )
        )
        return STATUS_UNKNOWN

    if options.verbose:
        print(partition_data)

    status, message = evaluate_cluster_status(partition_data)
    print(message)
    return status


if __name__ == '__main__':
    sys.exit(main())