Fix: nrpe queue check should check for freshness

Make the rabbitmq queue check also check if its input data file was
recently updated. This input data is created via cronjob; if that gets
stuck we might not actually be getting meaningful data.

The charm supports configuring the check interval via a full cron time
specification, so technically one could have that updated only once a
year even if this doesn't make much sense in a monitoring scenario.

Also fix a small bug in the nrpe update hook function: only deploy a
queue check if the cron job hasn't been deconfigured by setting its
schedule to the empty string.

Change-Id: I60141397f39e3b1b0274230db8d984934c98a08d
Closes-Bug: #1898523
This commit is contained in:
Peter Sabaini 2020-10-13 23:14:32 +02:00
parent 33af068075
commit 943f4f63ab
8 changed files with 115 additions and 4 deletions

View File

@ -90,7 +90,8 @@ options:
Run a command with a time limit specified in seconds in cron.
This timeout governs the rabbitmq stats capture: once
the timeout is reached a SIGINT is sent to the program; if it doesn't
exits before 10 seconds a SIGKILL is sent.
exit within 10 seconds a SIGKILL is sent. Note that from xenial onwards
the nrpe queue check will alert if stats are not updated as expected.
queue_thresholds:
type: string
default: "[['\\*', '\\*', 100, 200]]"

View File

@ -5,11 +5,19 @@
# Author: Liam Young, Jacek Nykis
from collections import defaultdict
from datetime import datetime
from fnmatch import fnmatchcase
from itertools import chain
import argparse
import os
import sys
from charmhelpers.core.hookenv import config
from charmhelpers.core.host import CompareHostReleases, get_distrib_codename
if CompareHostReleases(get_distrib_codename()) > 'trusty':
from croniter import croniter
def gen_data_lines(filename):
with open(filename, "rt") as fin:
@ -66,6 +74,43 @@ def check_stats(stats_collated, limits):
yield l_queue, l_vhost, m_all, "WARN"
def get_cron_interval(cronspec, base):
    """Estimate how far apart runs of a cron schedule are.

    The estimate is the difference between the result of croniter's
    get_next() and the result of a get_prev() call made right after it
    on the same iterator.

    :param cronspec: Cronjob schedule string
    :param base: datetime from when to check cron schedule
    :return: timedelta
    """
    schedule = croniter(cronspec, base)
    # Keep this call order: get_next() first, then get_prev() on the same
    # iterator object.
    upcoming = schedule.get_next(datetime)
    preceding = schedule.get_prev(datetime)
    return upcoming - preceding
def check_stats_file_freshness(stats_file, asof=None):
    """Report whether a rabbitmq stats file has been updated recently.

    The file counts as fresh when its modification time is not older than
    two intervals of the ``stats_cron_schedule`` cron job, counted back
    from ``asof``.

    :param stats_file: file name to check
    :param asof: datetime from when to check, defaults to datetime.now()
    :return: tuple (status, message)
    """
    reference = datetime.now() if asof is None else asof
    modified = datetime.fromtimestamp(os.path.getmtime(stats_file))
    period = get_cron_interval(config("stats_cron_schedule"), reference)
    # Tolerate two full cron intervals before calling the file stale
    oldest_allowed = reference - (2 * period)
    if modified >= oldest_allowed:
        return ("OK", "")
    message = "Rabbit stats file not updated since {}".format(modified)
    return ("CRIT", message)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='RabbitMQ queue size nagios check.')
@ -98,6 +143,14 @@ if __name__ == "__main__":
elif status == "WARN":
warnings.append(
"%s in %s has %s messages" % (queue, vhost, message_no))
if "croniter" in sys.modules:  # not on trusty and imported croniter
    # Check every stats file given on the command line for staleness.
    freshness_results = [check_stats_file_freshness(f)
                         for f in args.stats_file]
    # extend(), not append(): each CRIT message must become its own list
    # element. Appending the generator itself would make criticals
    # non-empty on every run and break the ", ".join(criticals) below.
    criticals.extend(
        msg for status, msg in freshness_results if status == "CRIT"
    )
if len(criticals) > 0:
print("CRITICAL: {}".format(", ".join(criticals)))
sys.exit(2)

View File

@ -17,4 +17,7 @@ for dep in ${DEPS[@]}; do
check_and_install ${PYTHON} ${dep}
done
# python3-croniter not available on trusty
[ "$( lsb_release -sc )" != "trusty" ] && check_and_install ${PYTHON} croniter
exec ./hooks/install.real

View File

@ -719,7 +719,8 @@ def update_nrpe_checks():
description=description,
check_cmd=cmd)
if config('queue_thresholds'):
if config('queue_thresholds') and config('stats_cron_schedule'):
# Only add queue check if there's also a cronjob for creating stats
cmd = ""
# If value of queue_thresholds is incorrect we want the hook to fail
for item in yaml.safe_load(config('queue_thresholds')):

View File

@ -17,3 +17,5 @@ coverage>=4.5.2
pyudev # for ceph-* charm unit tests (need to fix the ceph-* charm unit tests/mocking)
git+https://github.com/openstack-charmers/zaza.git#egg=zaza;python_version>='3.0'
git+https://github.com/openstack-charmers/zaza-openstack-tests.git#egg=zaza.openstack
croniter # needed for charm-rabbitmq-server unit tests

View File

@ -15,9 +15,12 @@
import os
import sys
_path = os.path.dirname(os.path.realpath(__file__))
_actions = os.path.abspath(os.path.join(_path, '../actions'))
_hooks = os.path.abspath(os.path.join(_path, '../hooks'))
_files = os.path.abspath(os.path.join(_path, '../files'))
_tests = os.path.abspath(os.path.join(_path, '../unit_tests'))
def _add_path(path):
@ -27,3 +30,5 @@ def _add_path(path):
_add_path(_actions)
_add_path(_hooks)
_add_path(_files)
_add_path(_tests)

View File

@ -0,0 +1,45 @@
# Copyright 2016 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import datetime, timedelta
from tempfile import NamedTemporaryFile
import unittest
from mock import MagicMock, patch
import check_rabbitmq_queues
class CheckRabbitTest(unittest.TestCase):
    """Tests for check_rabbitmq_queues.check_stats_file_freshness.

    Both tests patch the charm config lookup so that it returns a cron
    schedule of "*/5 * * * *".
    """

    @patch(
        "check_rabbitmq_queues.config",
        MagicMock(return_value="*/5 * * * *"),
    )
    def test_check_stats_file_freshness_fresh(self):
        """A freshly created file checked as of now yields OK."""
        with NamedTemporaryFile() as tmp:
            outcome = check_rabbitmq_queues.check_stats_file_freshness(
                tmp.name
            )
            self.assertEqual(outcome[0], "OK")

    @patch(
        "check_rabbitmq_queues.config",
        MagicMock(return_value="*/5 * * * *"),
    )
    def test_check_stats_file_freshness_nonfresh(self):
        """A freshly created file checked as of one hour ahead yields CRIT."""
        with NamedTemporaryFile() as tmp:
            an_hour_ahead = datetime.now() + timedelta(hours=1)
            outcome = check_rabbitmq_queues.check_stats_file_freshness(
                tmp.name, asof=an_hour_ahead
            )
            self.assertEqual(outcome[0], "CRIT")

View File

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import unittest
import os
@ -150,12 +151,12 @@ def patch_open():
Yields the mock for "open" and "file", respectively.'''
mock_open = MagicMock(spec=open)
mock_file = MagicMock(spec=__file__)
mock_file = MagicMock(spec=io.FileIO)
@contextmanager
def stub_open(*args, **kwargs):
mock_open(*args, **kwargs)
yield mock_file
with patch('__builtin__.open', stub_open):
with patch('builtins.open', stub_open):
yield mock_open, mock_file