Display busiest queues in check_queues NRPE plugin

When invoking the check_rabbitmq_queues script with wildcards for the vhost and/or queue parameters, the script output does not show precisely which queues have a high number of outstanding messages, because the information is consolidated under the wildcard. This change fixes this behaviour by adding a new charm configuration parameter which allows the user to specify the number of busiest queues, n, to display should the check_rabbitmq_queues script report any warnings or errors. The default, n=0, keeps the current script output. This option is applicable regardless of the vhost:queue combination but is specifically relevant when wildcards are passed as arguments. The implementation displays the first n items of the stats list, sorted in decreasing message count order. Closes-Bug: #1939084 Change-Id: I5a32cb6bf37bd2a0f30861eace3c0e6cb5c2559d
2021-08-06 02:16:54 +00:00 · 2021-08-06 02:16:54 +00:00 · 242167b6ba
parent fd8d018bab
commit 242167b6ba
4 changed files with 60 additions and 4 deletions
--- a/config.yaml
+++ b/config.yaml
@ -137,6 +137,13 @@ options:
      Wildcards '*' are accepted to exclude, for example, single queue on all
      hosts. Note that the wildcard asterisk must be double-escaped. Example:
      [['\\*', 'queue1']]
+  busiest_queues:
+    type: int
+    default: 0
+    description: |
+      Number of the busiest RabbitMQ queues to display when a warning or
+      critical threshold is exceeded. Queues are displayed in decreasing
+      message count order. The default, 0, disables this output.
  connection-backlog:
    type: int
    default:
--- a/files/check_rabbitmq_queues.py
+++ b/files/check_rabbitmq_queues.py
@ -38,7 +38,7 @@ def gen_stats(data_lines):
        yield vhost, queue, int(m_all)


-def collate_stats(stats, limits, exclude):
+def collate_stats(stats, limits, exclude, busiest_queues):
    # Create a dict with stats collated according to the definitions in the
    # limits file. If none of the definitions in the limits file is matched,
    # store the stat without collating.
@ -57,6 +57,9 @@ def collate_stats(stats, limits, exclude):
        for l_vhost, l_queue, _, _ in limits:
            if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
                collated[l_vhost, l_queue] += m_all
+                # Save vhost and queue names when using wildcards as arguments.
+                if busiest_queues > 0:
+                    collated[vhost, queue] += m_all
                break
        else:
            collated[vhost, queue] += m_all
@ -104,6 +107,20 @@ def check_stats_file_freshness(stats_file, oldest_timestamp):
    return ("OK", "")


+def top_n_queues(stats, busiest_queues):
+    if busiest_queues <= 0:
+        return []
+    tqueues = [" - Top Queues"]
+    sorted_messages_stats = sorted(stats.items(),
+                                   key=lambda y: y[1],
+                                   reverse=True)
+    for stat in sorted_messages_stats[:busiest_queues]:
+        tqueues.append("{0}:{1} -> {2}".format(stat[0][0],  # vhost
+                                               stat[0][1],  # queue
+                                               stat[1]))    # messages
+    return tqueues
+
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='RabbitMQ queue size nagios check.')
@ -139,6 +156,14 @@ if __name__ == "__main__":
            'raised'
        )
    )
+    parser.add_argument(
+        '-d',
+        type=int,
+        required=False,
+        default=0,
+        metavar=('n'),
+        help='Display the n busiest queues'
+    )
    parser.add_argument(
        'stats_file',
        nargs='*',
@ -151,7 +176,7 @@ if __name__ == "__main__":
        chain.from_iterable(
            gen_data_lines(filename) for filename in args.stats_file))
    # Collate stats according to limit definitions and check.
-    stats_collated = collate_stats(stats, args.c, args.e)
+    stats_collated = collate_stats(stats, args.c, args.e, args.d)
    stats_checked = check_stats(stats_collated, args.c)
    criticals, warnings = [], []
    for queue, vhost, message_no, status in stats_checked:
@ -170,12 +195,16 @@ if __name__ == "__main__":
            msg for status, msg in freshness_results if status == "CRIT"
        )

+    tqueues = top_n_queues(stats_collated, args.d)
+
    if len(criticals) > 0:
-        print("CRITICAL: {}".format(", ".join(criticals)))
+        print("CRITICAL: {0} {1}".format(", ".join(criticals),
+                                         " | ".join(tqueues)))
        sys.exit(2)
        # XXX: No warnings if there are criticals?
    elif len(warnings) > 0:
-        print("WARNING: {}".format(", ".join(warnings)))
+        print("WARNING: {0} {1}".format(", ".join(warnings),
+                                        " | ".join(tqueues)))
        sys.exit(1)
    else:
        print("OK")
--- a/hooks/rabbit_utils.py
+++ b/hooks/rabbit_utils.py
@ -1531,6 +1531,9 @@ def nrpe_update_queues_check(nrpe_compat, rabbit_dir):
            cmd += ' -c "{}" "{}" {} {}'.format(*item)
        for item in yaml.safe_load(config('exclude_queues')):
            cmd += ' -e "{}" "{}"'.format(*item)
+        busiest_queues = config('busiest_queues')
+        if busiest_queues is not None and int(busiest_queues) > 0:
+            cmd += ' -d "{}"'.format(busiest_queues)

        max_age = get_max_stats_file_age()
        if max_age > 0:
--- a/unit_tests/test_rabbit_utils.py
+++ b/unit_tests/test_rabbit_utils.py
@ -1303,6 +1303,23 @@ class UtilsTests(CharmTestCase):

        self.nrpe_compat.reset_mock()

+        # call with set busiest_queues > 0
+        queues_number = 3
+        self.test_config.set('busiest_queues', str(queues_number))
+        self.test_config.set('exclude_queues', "[]")
+        rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir)
+        busiest_queues = '-d "{}" '.format(queues_number)
+        self.nrpe_compat.add_check.assert_called_with(
+            shortname='rabbitmq_queue',
+            description='Check RabbitMQ Queues',
+            check_cmd='{0}/check_rabbitmq_queues.py -c "\\*" "\\*" 100 200 {1}'
+                      '-m 600 '
+                      '{0}/data/test_queue_stats.dat'.format(self.tmp_dir,
+                                                             busiest_queues))
+        self.nrpe_compat.remove_check.assert_not_called()
+
+        self.nrpe_compat.reset_mock()
+
        # call with unset stats_cron_schedule
        self.test_config.unset('stats_cron_schedule')
        rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir)