From 9376aeb8e66c9672baf8af950ce4164b1da4c466 Mon Sep 17 00:00:00 2001 From: Tianqi Date: Thu, 31 Mar 2022 17:45:03 +0000 Subject: [PATCH] Handle non-uniform queue stats output RabbitMQ server sometimes creates non-uniform outputs that nrpe can't parse. Instead of breaking the check, this commit outputs the error messages and continues the check. This problem is most likely caused by queue state being "down" [1]. However, because the current charm doesn't show such information and the bug is hard to manually reproduce, this commit adds the state attribute when creating queue_state file for future debugging. [1] https://www.rabbitmq.com/rabbitmqctl.8.html#state_2 Closes-Bug: #1850948 Change-Id: Iaa493c8270f344cde8ad7c89bd2bb548f0ad71bd --- files/check_rabbitmq_queues.py | 6 +++--- files/collect_rabbitmq_stats.sh | 4 ++-- unit_tests/test_check_rabbitmq_queues.py | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 unit_tests/test_check_rabbitmq_queues.py diff --git a/files/check_rabbitmq_queues.py b/files/check_rabbitmq_queues.py index 219208e6..469baf65 100755 --- a/files/check_rabbitmq_queues.py +++ b/files/check_rabbitmq_queues.py @@ -29,10 +29,10 @@ def gen_data_lines(filename): def gen_stats(data_lines): for line in data_lines: try: - vhost, queue, _, _, m_all, _ = line.split(None, 5) + vhost, queue, _, _, m_all, _, _ = line.split(None, 6) except ValueError: - print("ERROR: problem parsing the stats file") - sys.exit(2) + print("ERROR: problem parsing the line {}".format(line)) + continue assert m_all.isdigit(), ("Message count is not a number: {0!r}" .format(m_all)) yield vhost, queue, int(m_all) diff --git a/files/collect_rabbitmq_stats.sh b/files/collect_rabbitmq_stats.sh index 6fc64bbe..6c31621b 100755 --- a/files/collect_rabbitmq_stats.sh +++ b/files/collect_rabbitmq_stats.sh @@ -44,10 +44,10 @@ if [ ! 
-d $LOG_DIR ]; then mkdir -p $LOG_DIR fi TMP_DATA_FILE=$(mktemp -p ${DATA_DIR}) -echo "#Vhost Name Messages_ready Messages_unacknowledged Messages Consumers Memory Time" > ${TMP_DATA_FILE} +echo "#Vhost Name Messages_ready Messages_unacknowledged Messages Consumers Memory State Time" > ${TMP_DATA_FILE} /usr/sbin/rabbitmqctl -q list_vhosts | \ while read VHOST; do - /usr/sbin/rabbitmqctl -q list_queues -p $VHOST name messages_ready messages_unacknowledged messages consumers memory | \ + /usr/sbin/rabbitmqctl -q list_queues -p $VHOST name messages_ready messages_unacknowledged messages consumers memory state | \ awk "{print \"$VHOST \" \$0 \" $(date +'%s') \"}" >> ${TMP_DATA_FILE} 2>${LOG_DIR}/list_queues.log done mv ${TMP_DATA_FILE} ${DATA_FILE} diff --git a/unit_tests/test_check_rabbitmq_queues.py b/unit_tests/test_check_rabbitmq_queues.py new file mode 100644 index 00000000..dfabf1c3 --- /dev/null +++ b/unit_tests/test_check_rabbitmq_queues.py @@ -0,0 +1,23 @@ +import unittest + +import check_rabbitmq_queues + + +class TestCheckRabbitmqQueues(unittest.TestCase): + def test_gen_stats(self): + incomplete_queue = ["landscape " + "landscape.notifications-queue." + "aed6fb68-b1ff-4a68-980e-df6adf786beb " + "DOWN 1572621605"] + x = list(check_rabbitmq_queues.gen_stats(incomplete_queue)) + self.assertEqual(x, []) + + complete_queue = ["landscape " + "landscape.notifications-queue." + "b9557ad1-9908-425e-a860-d424e34f63d7 " + "0 0 0 0 34952 RUNNING 1572621605"] + y = list(check_rabbitmq_queues.gen_stats(complete_queue)) + self.assertEqual(y, [("landscape", + "landscape.notifications-queue." + "b9557ad1-9908-425e-a860-d424e34f63d7", + 0)])