Display busiest queues in check_queues NRPE plugin
When invoking the check_rabbitmq_queues script with wildcards for vhost and/or queue parameters, the script output does not reflect precisely which queues have a high number of outstanding messages, as the information is consolidated under the wildcard. This change fixes this behaviour by adding a new charm configuration parameter which allows the user to specify the number of busiest queues, n, to display should the check_rabbitmq_queues script report any warnings or errors. The default, n=0, keeps the current script output. This option is applicable regardless of the vhost:queue combination but is specifically relevant when wildcards are passed as arguments. The implementation displays the first n items of the stats list, re-organized in decreasing message count order. Closes-Bug: #1939084 Change-Id: I5a32cb6bf37bd2a0f30861eace3c0e6cb5c2559d
This commit is contained in:
parent
fd8d018bab
commit
242167b6ba
|
@ -137,6 +137,13 @@ options:
|
|||
Wildcards '*' are accepted to exclude, for example, single queue on all
|
||||
hosts. Note that the wildcard asterisk must be double-escaped. Example:
|
||||
[['\\*', 'queue1']]
|
||||
busiest_queues:
|
||||
type: int
|
||||
default: 0
|
||||
description: |
|
||||
Number of the busiest RabbitMQ queues to display when warning and
|
||||
critical checking thresholds are exceeded. Queues are displayed in
|
||||
decreasing message count order.
|
||||
connection-backlog:
|
||||
type: int
|
||||
default:
|
||||
|
|
|
@ -38,7 +38,7 @@ def gen_stats(data_lines):
|
|||
yield vhost, queue, int(m_all)
|
||||
|
||||
|
||||
def collate_stats(stats, limits, exclude):
|
||||
def collate_stats(stats, limits, exclude, busiest_queues):
|
||||
# Create a dict with stats collated according to the definitions in the
|
||||
# limits file. If none of the definitions in the limits file is matched,
|
||||
# store the stat without collating.
|
||||
|
@ -57,6 +57,9 @@ def collate_stats(stats, limits, exclude):
|
|||
for l_vhost, l_queue, _, _ in limits:
|
||||
if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
|
||||
collated[l_vhost, l_queue] += m_all
|
||||
# Save vhost and queue names when using wildcards as arguments.
|
||||
if busiest_queues > 0:
|
||||
collated[vhost, queue] += m_all
|
||||
break
|
||||
else:
|
||||
collated[vhost, queue] += m_all
|
||||
|
@ -104,6 +107,20 @@ def check_stats_file_freshness(stats_file, oldest_timestamp):
|
|||
return ("OK", "")
|
||||
|
||||
|
||||
def top_n_queues(stats, busiest_queues):
    """Build display lines for the queues with the most messages.

    :param stats: mapping of (vhost, queue) -> message count
    :param busiest_queues: how many of the busiest queues to report;
        values <= 0 disable the report entirely
    :returns: list of strings — a " - Top Queues" header followed by one
        "vhost:queue -> count" line per queue, ordered by decreasing
        message count; an empty list when the feature is disabled
    """
    if busiest_queues <= 0:
        return []
    # Rank all collated entries by message count, highest first.
    ranked = sorted(stats.items(), key=lambda item: item[1], reverse=True)
    lines = [" - Top Queues"]
    for (vhost, queue), messages in ranked[:busiest_queues]:
        lines.append("{0}:{1} -> {2}".format(vhost, queue, messages))
    return lines
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description='RabbitMQ queue size nagios check.')
|
||||
|
@ -139,6 +156,14 @@ if __name__ == "__main__":
|
|||
'raised'
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
'-d',
|
||||
type=int,
|
||||
required=False,
|
||||
default=0,
|
||||
metavar=('n'),
|
||||
help='Display the n busiest queues'
|
||||
)
|
||||
parser.add_argument(
|
||||
'stats_file',
|
||||
nargs='*',
|
||||
|
@ -151,7 +176,7 @@ if __name__ == "__main__":
|
|||
chain.from_iterable(
|
||||
gen_data_lines(filename) for filename in args.stats_file))
|
||||
# Collate stats according to limit definitions and check.
|
||||
stats_collated = collate_stats(stats, args.c, args.e)
|
||||
stats_collated = collate_stats(stats, args.c, args.e, args.d)
|
||||
stats_checked = check_stats(stats_collated, args.c)
|
||||
criticals, warnings = [], []
|
||||
for queue, vhost, message_no, status in stats_checked:
|
||||
|
@ -170,12 +195,16 @@ if __name__ == "__main__":
|
|||
msg for status, msg in freshness_results if status == "CRIT"
|
||||
)
|
||||
|
||||
tqueues = top_n_queues(stats_collated, args.d)
|
||||
|
||||
if len(criticals) > 0:
|
||||
print("CRITICAL: {}".format(", ".join(criticals)))
|
||||
print("CRITICAL: {0} {1}".format(", ".join(criticals),
|
||||
" | ".join(tqueues)))
|
||||
sys.exit(2)
|
||||
# XXX: No warnings if there are criticals?
|
||||
elif len(warnings) > 0:
|
||||
print("WARNING: {}".format(", ".join(warnings)))
|
||||
print("WARNING: {0} {1}".format(", ".join(warnings),
|
||||
" | ".join(tqueues)))
|
||||
sys.exit(1)
|
||||
else:
|
||||
print("OK")
|
||||
|
|
|
@ -1531,6 +1531,9 @@ def nrpe_update_queues_check(nrpe_compat, rabbit_dir):
|
|||
cmd += ' -c "{}" "{}" {} {}'.format(*item)
|
||||
for item in yaml.safe_load(config('exclude_queues')):
|
||||
cmd += ' -e "{}" "{}"'.format(*item)
|
||||
busiest_queues = config('busiest_queues')
|
||||
if busiest_queues is not None and int(busiest_queues) > 0:
|
||||
cmd += ' -d "{}"'.format(busiest_queues)
|
||||
|
||||
max_age = get_max_stats_file_age()
|
||||
if max_age > 0:
|
||||
|
|
|
@ -1303,6 +1303,23 @@ class UtilsTests(CharmTestCase):
|
|||
|
||||
self.nrpe_compat.reset_mock()
|
||||
|
||||
# call with set busiest_queues > 0
|
||||
queues_number = 3
|
||||
self.test_config.set('busiest_queues', str(queues_number))
|
||||
self.test_config.set('exclude_queues', "[]")
|
||||
rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir)
|
||||
busiest_queues = '-d "{}" '.format(queues_number)
|
||||
self.nrpe_compat.add_check.assert_called_with(
|
||||
shortname='rabbitmq_queue',
|
||||
description='Check RabbitMQ Queues',
|
||||
check_cmd='{0}/check_rabbitmq_queues.py -c "\\*" "\\*" 100 200 {1}'
|
||||
'-m 600 '
|
||||
'{0}/data/test_queue_stats.dat'.format(self.tmp_dir,
|
||||
busiest_queues))
|
||||
self.nrpe_compat.remove_check.assert_not_called()
|
||||
|
||||
self.nrpe_compat.reset_mock()
|
||||
|
||||
# call with unset stats_cron_schedule
|
||||
self.test_config.unset('stats_cron_schedule')
|
||||
rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir)
|
||||
|
|
Loading…
Reference in New Issue