Display busiest queues in check_queues NRPE plugin

When invoking the check_rabbitmq_queues script with wildcards for vhost
and/or queue parameters, the script output does not reflect precisely
which queues have a high number of outstanding messages, as the
information is consolidated under the wildcard.

This change fixes this behaviour by adding a new charm configuration
parameter which allows the user to specify the number of busiest queues,
n, to display should the check_rabbitmq_queues script report any
warnings or errors.  The default, n=0, keeps the current script output.
This option is applicable regardless of the vhost:queue combination but
is specifically relevant when wildcards are passed as arguments.

The implementation displays the first n items of the stats list, sorted
in decreasing message count order.

Closes-Bug: #1939084
Change-Id: I5a32cb6bf37bd2a0f30861eace3c0e6cb5c2559d
This commit is contained in:
Julien Thieffry 2021-08-06 02:16:54 +00:00
parent fd8d018bab
commit 242167b6ba
4 changed files with 60 additions and 4 deletions

View File

@ -137,6 +137,13 @@ options:
Wildcards '*' are accepted to exclude, for example, single queue on all
hosts. Note that the wildcard asterisk must be double-escaped. Example:
[['\\*', 'queue1']]
busiest_queues:
type: int
default: 0
description: |
Number of the busiest RabbitMQ queues to display when a warning or
critical checking threshold is exceeded. Queues are displayed in
decreasing message count order. The default, 0, disables the display.
connection-backlog:
type: int
default:

View File

@ -38,7 +38,7 @@ def gen_stats(data_lines):
yield vhost, queue, int(m_all)
def collate_stats(stats, limits, exclude):
def collate_stats(stats, limits, exclude, busiest_queues):
# Create a dict with stats collated according to the definitions in the
# limits file. If none of the definitions in the limits file is matched,
# store the stat without collating.
@ -57,6 +57,9 @@ def collate_stats(stats, limits, exclude):
for l_vhost, l_queue, _, _ in limits:
if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
collated[l_vhost, l_queue] += m_all
# Save vhost and queue names when using wildcards as arguments.
if busiest_queues > 0:
collated[vhost, queue] += m_all
break
else:
collated[vhost, queue] += m_all
@ -104,6 +107,20 @@ def check_stats_file_freshness(stats_file, oldest_timestamp):
return ("OK", "")
def top_n_queues(stats, busiest_queues):
    """Build the "Top Queues" fragments appended to the check output.

    :param stats: mapping of (vhost, queue) tuples to message counts.
    :param busiest_queues: maximum number of queues to report; a value
        less than or equal to zero disables the report.
    :returns: a list starting with the " - Top Queues" header followed by
        one "vhost:queue -> messages" string per queue, in decreasing
        message count order. An empty list is returned when the report is
        disabled or when there are no stats, so that callers never emit a
        header with nothing behind it.
    """
    if busiest_queues <= 0 or not stats:
        return []
    # sorted() is stable, so queues with equal counts keep the order in
    # which they appear in the stats mapping.
    ranked = sorted(stats.items(), key=lambda item: item[1], reverse=True)
    tqueues = [" - Top Queues"]
    tqueues.extend(
        "{0}:{1} -> {2}".format(key[0], key[1], messages)
        for key, messages in ranked[:busiest_queues]
    )
    return tqueues
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='RabbitMQ queue size nagios check.')
@ -139,6 +156,14 @@ if __name__ == "__main__":
'raised'
)
)
parser.add_argument(
'-d',
type=int,
required=False,
default=0,
metavar=('n'),
help='Display the n busiest queues'
)
parser.add_argument(
'stats_file',
nargs='*',
@ -151,7 +176,7 @@ if __name__ == "__main__":
chain.from_iterable(
gen_data_lines(filename) for filename in args.stats_file))
# Collate stats according to limit definitions and check.
stats_collated = collate_stats(stats, args.c, args.e)
stats_collated = collate_stats(stats, args.c, args.e, args.d)
stats_checked = check_stats(stats_collated, args.c)
criticals, warnings = [], []
for queue, vhost, message_no, status in stats_checked:
@ -170,12 +195,16 @@ if __name__ == "__main__":
msg for status, msg in freshness_results if status == "CRIT"
)
tqueues = top_n_queues(stats_collated, args.d)
if len(criticals) > 0:
print("CRITICAL: {}".format(", ".join(criticals)))
print("CRITICAL: {0} {1}".format(", ".join(criticals),
" | ".join(tqueues)))
sys.exit(2)
# XXX: No warnings if there are criticals?
elif len(warnings) > 0:
print("WARNING: {}".format(", ".join(warnings)))
print("WARNING: {0} {1}".format(", ".join(warnings),
" | ".join(tqueues)))
sys.exit(1)
else:
print("OK")

View File

@ -1531,6 +1531,9 @@ def nrpe_update_queues_check(nrpe_compat, rabbit_dir):
cmd += ' -c "{}" "{}" {} {}'.format(*item)
for item in yaml.safe_load(config('exclude_queues')):
cmd += ' -e "{}" "{}"'.format(*item)
busiest_queues = config('busiest_queues')
if busiest_queues is not None and int(busiest_queues) > 0:
cmd += ' -d "{}"'.format(busiest_queues)
max_age = get_max_stats_file_age()
if max_age > 0:

View File

@ -1303,6 +1303,23 @@ class UtilsTests(CharmTestCase):
self.nrpe_compat.reset_mock()
# call with set busiest_queues > 0
queues_number = 3
self.test_config.set('busiest_queues', str(queues_number))
self.test_config.set('exclude_queues', "[]")
rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir)
busiest_queues = '-d "{}" '.format(queues_number)
self.nrpe_compat.add_check.assert_called_with(
shortname='rabbitmq_queue',
description='Check RabbitMQ Queues',
check_cmd='{0}/check_rabbitmq_queues.py -c "\\*" "\\*" 100 200 {1}'
'-m 600 '
'{0}/data/test_queue_stats.dat'.format(self.tmp_dir,
busiest_queues))
self.nrpe_compat.remove_check.assert_not_called()
self.nrpe_compat.reset_mock()
# call with unset stats_cron_schedule
self.test_config.unset('stats_cron_schedule')
rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir)