Added stats cronjob and queue monitoring nagios plugin

Rebase for https://code.launchpad.net/~jacekn/charms/precise/rabbitmq-server/queue-monitoring/+merge/218580
This commit is contained in:
David Ames 2015-04-01 10:55:33 -07:00
commit 818a0d3ad9
5 changed files with 191 additions and 1 deletions

View File

@ -166,3 +166,17 @@ options:
order for this charm to function correctly, the privacy extension must be
disabled and a non-temporary address must be configured/available on
your network interface.
stats_cron_schedule:
type: string
default: ""
description: |
Cron schedule used to generate rabbitmq stats. If unset,
no stats will be generated.
queue_thresholds:
type: string
default: ""
description: |
List of RabbitMQ queue size check thresholds. Interpreted as YAML
in format [<vhost>, <queue>, <warn>, <crit>]
- ['/', 'queue1', 10, 20]
- ['/', 'queue2', 200, 300]

View File

@ -6,6 +6,7 @@ import sys
import subprocess
import glob
import socket
import yaml
import rabbit_utils as rabbit
from lib.utils import (
@ -59,6 +60,7 @@ from charmhelpers.core.host import (
rsync,
service_stop,
service_restart,
write_file,
)
from charmhelpers.contrib.charmsupport import nrpe
from charmhelpers.contrib.ssl.service import ServiceCA
@ -81,6 +83,10 @@ RABBIT_DIR = '/var/lib/rabbitmq'
RABBIT_USER = 'rabbitmq'
RABBIT_GROUP = 'rabbitmq'
NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
SCRIPTS_DIR = '/usr/local/bin'
STATS_CRONFILE = '/etc/cron.d/rabbitmq-stats'
STATS_DATAFILE = os.path.join(RABBIT_DIR, 'data',
subprocess.check_output(['hostname', '-s']).strip() + '_queue_stats.dat')
@hooks.hook('install')
@ -472,6 +478,17 @@ def update_nrpe_checks():
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
'check_rabbitmq.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
'check_rabbitmq_queues.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py'))
if config('stats_cron_schedule'):
script = os.path.join(SCRIPTS_DIR, 'collect_rabbitmq_stats.sh')
cronjob = "{} root {}\n".format(config('stats_cron_schedule'), script)
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
'collect_rabbitmq_stats.sh'), script)
write_file(STATS_CRONFILE, cronjob)
elif os.path.isfile(STATS_CRONFILE):
os.remove(STATS_CRONFILE)
# Find out if nrpe set nagios_hostname
hostname = nrpe.get_nagios_hostname()
@ -494,6 +511,17 @@ def update_nrpe_checks():
check_cmd='{}/check_rabbitmq.py --user {} --password {} --vhost {}'
''.format(NAGIOS_PLUGINS, user, password, vhost)
)
if config('queue_thresholds'):
cmd = ""
# If value of queue_thresholds is incorrect we want the hook to fail
for item in yaml.safe_load(config('queue_thresholds')):
cmd += ' -c "{}" "{}" {} {}'.format(*item)
nrpe_compat.add_check(
shortname=rabbit.RABBIT_USER + '_queue',
description='Check RabbitMQ Queues',
check_cmd='{}/check_rabbitmq_queues.py{} {}'.format(
NAGIOS_PLUGINS, cmd, STATS_DATAFILE)
)
nrpe_compat.write()

View File

@ -1 +1 @@
128
150

View File

@ -0,0 +1,99 @@
#!/usr/bin/python
# Copyright (C) 2011, 2012, 2014 Canonical
# All Rights Reserved
# Author: Liam Young, Jacek Nykis
from collections import defaultdict
from fnmatch import fnmatchcase
from itertools import chain
import argparse
import sys
def gen_data_lines(filename):
    """Yield the data lines of *filename*, skipping comment lines.

    Lines beginning with '#' (such as the header row written by
    collect_rabbitmq_stats.sh) are ignored.
    """
    # Open in text mode: the startswith("#") test compares against a
    # str, which would raise TypeError on bytes under Python 3 (the
    # original opened with "rb"; on Python 2 / Linux "r" is identical).
    with open(filename, "r") as fin:
        for line in fin:
            if not line.startswith("#"):
                yield line
def gen_stats(data_lines):
    """Parse stats lines, yielding (vhost, queue, message_count) tuples.

    Each line must have at least six whitespace-separated fields:
    vhost, queue, ready, unacked, all-messages, rest.  Exits with
    status 2 (nagios CRITICAL) if a line cannot be parsed.
    """
    for line in data_lines:
        try:
            vhost, queue, _, _, m_all, _ = line.split(None, 5)
        except ValueError:
            # Parenthesized print works under both Python 2 and 3
            # (the original print statement is a SyntaxError on 3).
            print("ERROR: problem parsing the stats file")
            sys.exit(2)
        # NOTE(review): assert is stripped under -O; kept as-is to
        # preserve the original failure mode (AssertionError).
        assert m_all.isdigit(), "Message count is not a number: %r" % m_all
        yield vhost, queue, int(m_all)
def collate_stats(stats, limits):
    """Sum message counts, grouping stats by matching limit patterns.

    Each (vhost, queue, count) stat is matched fnmatch-style against
    the (l_vhost, l_queue, ...) entries of *limits*; the first match
    wins and the count accumulates under that limit's key.  A stat
    matching no limit accumulates under its own (vhost, queue) key.
    """
    totals = defaultdict(int)
    for vhost, queue, m_all in stats:
        # Default to the stat's own identity; a matching limit
        # definition overrides it.
        key = (vhost, queue)
        for l_vhost, l_queue, _, _ in limits:
            if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
                key = (l_vhost, l_queue)
                break
        totals[key] += m_all
    return totals
def check_stats(stats_collated, limits):
    """Compare collated stats against thresholds, yielding problems.

    Yields (queue, vhost, message_count, status) tuples where status
    is "CRIT", "WARN" or "UNKNOWN" (no threshold defined for that
    queue).  An empty stats dict yields a single synthetic CRIT entry.
    Queues below their warning threshold yield nothing.
    """
    # Threshold lookup keyed on (vhost, queue).
    thresholds = {}
    for l_vhost, l_queue, t_warning, t_critical in limits:
        thresholds[(l_vhost, l_queue)] = (int(t_warning), int(t_critical))
    if not stats_collated:
        yield 'No Queues Found', 'No Vhosts Found', None, "CRIT"
    # Walk the stats in sorted order and compare against limits, if any.
    for l_vhost, l_queue in sorted(stats_collated):
        m_all = stats_collated[l_vhost, l_queue]
        if (l_vhost, l_queue) not in thresholds:
            yield l_queue, l_vhost, m_all, "UNKNOWN"
            continue
        t_warning, t_critical = thresholds[(l_vhost, l_queue)]
        if m_all >= t_critical:
            yield l_queue, l_vhost, m_all, "CRIT"
        elif m_all >= t_warning:
            yield l_queue, l_vhost, m_all, "WARN"
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='RabbitMQ queue size nagios check.')
    parser.add_argument('-c', nargs=4, action='append', required=True,
                        metavar=('vhost', 'queue', 'warn', 'crit'),
                        help=('Vhost and queue to check. '
                              'Can be used multiple times'))
    parser.add_argument('stats_file', nargs='*', type=str,
                        help='file containing queue stats')
    args = parser.parse_args()
    # Start generating stats lazily from all files on the command line.
    stats = gen_stats(
        chain.from_iterable(
            gen_data_lines(filename) for filename in args.stats_file))
    # Collate stats according to limit definitions and check.
    stats_collated = collate_stats(stats, args.c)
    stats_checked = check_stats(stats_collated, args.c)
    criticals, warnings = [], []
    for queue, vhost, message_no, status in stats_checked:
        if status == "CRIT":
            criticals.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))
        elif status == "WARN":
            warnings.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))
    # Nagios exit codes: 2 = CRITICAL, 1 = WARNING, 0 = OK.
    # Parenthesized print so the script runs under Python 2 and 3
    # (the original print statements are a SyntaxError on 3).
    if criticals:
        print("CRITICALS: %s" % ", ".join(criticals))
        sys.exit(2)
    # XXX: warnings are suppressed whenever any critical is present.
    elif warnings:
        print("WARNINGS: %s" % ", ".join(warnings))
        sys.exit(1)
    else:
        print("OK")
        sys.exit(0)

View File

@ -0,0 +1,49 @@
#!/bin/bash
# Copyright (C) 2011, 2014 Canonical
# All Rights Reserved
# Author: Liam Young, Jacek Nykis
#
# Produce queue data for every vhost. Useful for graphing and Nagios checks.
LOCK=/var/lock/rabbitmq-gather-metrics.lock
# Take the lock (retry twice); exit quietly if another run holds it.
lockfile-create -r2 --lock-name $LOCK > /dev/null 2>&1
if [ $? -ne 0 ]; then
    exit 1
fi
# Release the lock whenever the script exits, however it exits.
trap "rm -f $LOCK > /dev/null 2>&1" exit
# Required to fix the bug about start-stop-daemon not being found in
# rabbitmq-server 2.7.1-0ubuntu4.
# '/usr/sbin/rabbitmqctl: 33: /usr/sbin/rabbitmqctl: start-stop-daemon: not found'
export PATH=${PATH}:/sbin/
# Set HOSTNAME to the short hostname BEFORE it is used in the pid-file
# grep below.  Bash pre-sets $HOSTNAME (possibly to the FQDN); the
# original script only assigned it after use, so the grep could miss
# the "rabbit@<shorthost>" entry.
HOSTNAME=$(hostname -s)
if [ -f /var/lib/rabbitmq/pids ]; then
    RABBIT_PID=$(grep "{rabbit\@${HOSTNAME}," /var/lib/rabbitmq/pids | sed -e 's!^.*,\([0-9]*\).*!\1!')
elif [ -f /var/run/rabbitmq/pid ]; then
    RABBIT_PID=$(cat /var/run/rabbitmq/pid)
else
    echo "No PID file found"
    exit 3
fi
DATA_DIR="/var/lib/rabbitmq/data"
DATA_FILE="${DATA_DIR}/$(hostname -s)_queue_stats.dat"
LOG_DIR="/var/lib/rabbitmq/logs"
RABBIT_STATS_DATA_FILE="${DATA_DIR}/$(hostname -s)_general_stats.dat"
NOW=$(date +'%s')
MNESIA_DB_SIZE=$(du -sm /var/lib/rabbitmq/mnesia | cut -f1)
RABBIT_RSS=$(ps -p $RABBIT_PID -o rss=)
if [ ! -d $DATA_DIR ]; then
    mkdir -p $DATA_DIR
fi
if [ ! -d $LOG_DIR ]; then
    mkdir -p $LOG_DIR
fi
# One header line, then one row per queue per vhost.
echo "#Vhost Name Messages_ready Messages_unacknowledged Messages Consumers Memory Time" > $DATA_FILE
/usr/sbin/rabbitmqctl -q list_vhosts | \
while read VHOST; do
    # NOTE(review): each row gets its own timestamp rather than $NOW;
    # kept as-is to preserve the original output.
    /usr/sbin/rabbitmqctl -q list_queues -p $VHOST name messages_ready messages_unacknowledged messages consumers memory | \
        awk "{print \"$VHOST \" \$0 \" $(date +'%s') \"}" >> $DATA_FILE 2>${LOG_DIR}/list_queues.log
done
echo "mnesia_size: ${MNESIA_DB_SIZE}@$NOW" > $RABBIT_STATS_DATA_FILE
echo "rss_size: ${RABBIT_RSS}@$NOW" >> $RABBIT_STATS_DATA_FILE