diff --git a/config.yaml b/config.yaml index 287b9ea3..2245653a 100644 --- a/config.yaml +++ b/config.yaml @@ -137,6 +137,13 @@ options: Wildcards '*' are accepted to exclude, for example, single queue on all hosts. Note that the wildcard asterisk must be double-escaped. Example: [['\\*', 'queue1']] + busiest_queues: + type: int + default: 0 + description: | + Number of the busiest RabbitMQ queues to display when warning and + critical checking thresholds are exceeded. Queues are displayed in + decreasing message count order. connection-backlog: type: int default: diff --git a/files/check_rabbitmq_queues.py b/files/check_rabbitmq_queues.py index bd1bedb1..219208e6 100755 --- a/files/check_rabbitmq_queues.py +++ b/files/check_rabbitmq_queues.py @@ -38,7 +38,7 @@ def gen_stats(data_lines): yield vhost, queue, int(m_all) -def collate_stats(stats, limits, exclude): +def collate_stats(stats, limits, exclude, busiest_queues): # Create a dict with stats collated according to the definitions in the # limits file. If none of the definitions in the limits file is matched, # store the stat without collating. @@ -57,6 +57,9 @@ def collate_stats(stats, limits, exclude): for l_vhost, l_queue, _, _ in limits: if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue): collated[l_vhost, l_queue] += m_all + # Save vhost and queue names when using wildcards as arguments. 
def top_n_queues(stats, busiest_queues):
    """Return display strings for the busiest queues in *stats*.

    Args:
        stats: Mapping whose keys are ``(vhost, queue)`` pairs and whose
            values are message counts.
        busiest_queues: Maximum number of queues to report. ``None``, zero
            or a negative value disables the report.

    Returns:
        An empty list when the report is disabled. Otherwise a list whose
        first element is the header ``" - Top Queues"``, followed by at most
        ``busiest_queues`` entries formatted as ``"vhost:queue -> count"``
        in decreasing message-count order. Entries with equal counts keep
        the iteration order of *stats*.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import block.
    import heapq

    # ``None`` cannot be compared with an int on Python 3; treat it (like any
    # non-positive value) as "report disabled" instead of raising TypeError.
    if busiest_queues is None or busiest_queues <= 0:
        return []

    # nlargest() is documented as equivalent to
    # sorted(items, key=..., reverse=True)[:n], but it does not need to sort
    # every queue when only a handful are displayed.
    busiest = heapq.nlargest(
        busiest_queues,
        stats.items(),
        key=lambda item: item[1],
    )

    # NOTE(review): the caller appears to pass the collated stats, which may
    # also contain limit-pattern keys (e.g. wildcard aggregates) next to real
    # queue names -- confirm whether those should be listed here.
    tqueues = [" - Top Queues"]
    tqueues.extend(
        "{0}:{1} -> {2}".format(key[0], key[1], messages)
        for key, messages in busiest
    )
    return tqueues
elif len(warnings) > 0: - print("WARNING: {}".format(", ".join(warnings))) + print("WARNING: {0} {1}".format(", ".join(warnings), + " | ".join(tqueues))) sys.exit(1) else: print("OK") diff --git a/hooks/rabbit_utils.py b/hooks/rabbit_utils.py index 40b22f2b..e808046a 100644 --- a/hooks/rabbit_utils.py +++ b/hooks/rabbit_utils.py @@ -1531,6 +1531,9 @@ def nrpe_update_queues_check(nrpe_compat, rabbit_dir): cmd += ' -c "{}" "{}" {} {}'.format(*item) for item in yaml.safe_load(config('exclude_queues')): cmd += ' -e "{}" "{}"'.format(*item) + busiest_queues = config('busiest_queues') + if busiest_queues is not None and int(busiest_queues) > 0: + cmd += ' -d "{}"'.format(busiest_queues) max_age = get_max_stats_file_age() if max_age > 0: diff --git a/unit_tests/test_rabbit_utils.py b/unit_tests/test_rabbit_utils.py index f7405944..5ff5746f 100644 --- a/unit_tests/test_rabbit_utils.py +++ b/unit_tests/test_rabbit_utils.py @@ -1303,6 +1303,23 @@ class UtilsTests(CharmTestCase): self.nrpe_compat.reset_mock() + # call with set busiest_queues > 0 + queues_number = 3 + self.test_config.set('busiest_queues', str(queues_number)) + self.test_config.set('exclude_queues', "[]") + rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir) + busiest_queues = '-d "{}" '.format(queues_number) + self.nrpe_compat.add_check.assert_called_with( + shortname='rabbitmq_queue', + description='Check RabbitMQ Queues', + check_cmd='{0}/check_rabbitmq_queues.py -c "\\*" "\\*" 100 200 {1}' + '-m 600 ' + '{0}/data/test_queue_stats.dat'.format(self.tmp_dir, + busiest_queues)) + self.nrpe_compat.remove_check.assert_not_called() + + self.nrpe_compat.reset_mock() + # call with unset stats_cron_schedule self.test_config.unset('stats_cron_schedule') rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir)