Fix: nrpe queue check should check for freshness
Make the rabbitmq queue check also verify that its input data file was recently updated. This input data is created via a cron job; if that job gets stuck we might not actually be getting meaningful data. The charm supports configuring the check interval via a full cron time specification, so technically one could have the file updated only once a year, even if this doesn't make much sense in a monitoring scenario.

Also fix a buglet in the nrpe update hook function: only deploy a queue check if the cron job hasn't been deconfigured by setting it to the empty string.

Change-Id: I60141397f39e3b1b0274230db8d984934c98a08d
Closes-Bug: #1898523
This commit is contained in:
parent
33af068075
commit
943f4f63ab
|
@ -90,7 +90,8 @@ options:
|
|||
Run a command with a time limit specified in seconds in cron.
|
||||
This timeout applies to the rabbitmq stats capture: once
|
||||
the timeout is reached a SIGINT is sent to the program, if it doesn't
|
||||
exits before 10 seconds a SIGKILL is sent.
|
||||
exit within 10 seconds a SIGKILL is sent. Note that from xenial onwards
|
||||
the nrpe queue check will alert if stats are not updated as expected
|
||||
queue_thresholds:
|
||||
type: string
|
||||
default: "[['\\*', '\\*', 100, 200]]"
|
||||
|
|
|
@ -5,11 +5,19 @@
|
|||
# Author: Liam Young, Jacek Nykis
|
||||
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from fnmatch import fnmatchcase
|
||||
from itertools import chain
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
from charmhelpers.core.hookenv import config
|
||||
from charmhelpers.core.host import CompareHostReleases, get_distrib_codename
|
||||
|
||||
if CompareHostReleases(get_distrib_codename()) > 'trusty':
|
||||
from croniter import croniter
|
||||
|
||||
|
||||
def gen_data_lines(filename):
|
||||
with open(filename, "rt") as fin:
|
||||
|
@ -66,6 +74,43 @@ def check_stats(stats_collated, limits):
|
|||
yield l_queue, l_vhost, m_all, "WARN"
|
||||
|
||||
|
||||
def get_cron_interval(cronspec, base):
    """Estimate the cron interval as next job runtime minus the previous one.

    :param cronspec: Cronjob schedule string
    :param base: datetime from which the cron schedule is evaluated
    :return: timedelta between the two job runtimes
    """
    schedule = croniter(cronspec, base)
    # Keep this call order: get_next() is invoked before get_prev() on the
    # same iterator, exactly as the original one-line subtraction did.
    upcoming_run = schedule.get_next(datetime)
    preceding_run = schedule.get_prev(datetime)
    return upcoming_run - preceding_run
|
||||
|
||||
|
||||
def check_stats_file_freshness(stats_file, asof=None):
    """Report whether a rabbitmq stats file was updated recently enough.

    The file counts as fresh when its modification time is not older than
    two cron job intervals before ``asof``. The cron schedule is read from
    the ``stats_cron_schedule`` config option.

    :param stats_file: file name to check
    :param asof: datetime to measure from, defaults to datetime.now()
    :return: tuple (status, message): ("OK", "") for a fresh file, otherwise
        ("CRIT", message naming the file's last modification time)
    """
    reference_time = datetime.now() if asof is None else asof
    last_modified = datetime.fromtimestamp(os.path.getmtime(stats_file))
    schedule = config("stats_cron_schedule")
    cron_period = get_cron_interval(schedule, reference_time)
    # Tolerate up to two full cron intervals without an update
    oldest_acceptable = reference_time - (2 * cron_period)
    if last_modified >= oldest_acceptable:
        return ("OK", "")
    stale_message = "Rabbit stats file not updated since {}".format(
        last_modified
    )
    return ("CRIT", stale_message)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description='RabbitMQ queue size nagios check.')
|
||||
|
@ -98,6 +143,14 @@ if __name__ == "__main__":
|
|||
elif status == "WARN":
|
||||
warnings.append(
|
||||
"%s in %s has %s messages" % (queue, vhost, message_no))
|
||||
|
||||
if "croniter" in sys.modules:  # not on trusty and imported croniter
    # check_stats_file_freshness returns one (status, message) tuple per
    # stats file; only the messages of CRIT results are reported.
    freshness_results = [check_stats_file_freshness(f)
                         for f in args.stats_file]
    # extend(), not append(): appending the generator itself would add a
    # single non-string object to criticals, making the list always
    # non-empty and breaking the ", ".join(criticals) that follows.
    criticals.extend(
        msg for status, msg in freshness_results if status == "CRIT"
    )
|
||||
|
||||
if len(criticals) > 0:
|
||||
print("CRITICAL: {}".format(", ".join(criticals)))
|
||||
sys.exit(2)
|
||||
|
|
|
@ -17,4 +17,7 @@ for dep in ${DEPS[@]}; do
|
|||
check_and_install ${PYTHON} ${dep}
|
||||
done
|
||||
|
||||
# python3-croniter not available on trusty
|
||||
[ "$( lsb_release -sc )" != "trusty" ] && check_and_install ${PYTHON} croniter
|
||||
|
||||
exec ./hooks/install.real
|
||||
|
|
|
@ -719,7 +719,8 @@ def update_nrpe_checks():
|
|||
description=description,
|
||||
check_cmd=cmd)
|
||||
|
||||
if config('queue_thresholds'):
|
||||
if config('queue_thresholds') and config('stats_cron_schedule'):
|
||||
# Only add queue check if there's also a cronjob for creating stats
|
||||
cmd = ""
|
||||
# If value of queue_thresholds is incorrect we want the hook to fail
|
||||
for item in yaml.safe_load(config('queue_thresholds')):
|
||||
|
|
|
@ -17,3 +17,5 @@ coverage>=4.5.2
|
|||
pyudev # for ceph-* charm unit tests (need to fix the ceph-* charm unit tests/mocking)
|
||||
git+https://github.com/openstack-charmers/zaza.git#egg=zaza;python_version>='3.0'
|
||||
git+https://github.com/openstack-charmers/zaza-openstack-tests.git#egg=zaza.openstack
|
||||
|
||||
croniter # needed for charm-rabbitmq-server unit tests
|
||||
|
|
|
@ -15,9 +15,12 @@
|
|||
import os
|
||||
import sys
|
||||
|
||||
|
||||
_path = os.path.dirname(os.path.realpath(__file__))
|
||||
_actions = os.path.abspath(os.path.join(_path, '../actions'))
|
||||
_hooks = os.path.abspath(os.path.join(_path, '../hooks'))
|
||||
_files = os.path.abspath(os.path.join(_path, '../files'))
|
||||
_tests = os.path.abspath(os.path.join(_path, '../unit_tests'))
|
||||
|
||||
|
||||
def _add_path(path):
|
||||
|
@ -27,3 +30,5 @@ def _add_path(path):
|
|||
|
||||
_add_path(_actions)
|
||||
_add_path(_hooks)
|
||||
_add_path(_files)
|
||||
_add_path(_tests)
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
# Copyright 2016 Canonical Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from tempfile import NamedTemporaryFile
|
||||
import unittest
|
||||
|
||||
from mock import MagicMock, patch
|
||||
import check_rabbitmq_queues
|
||||
|
||||
|
||||
class CheckRabbitTest(unittest.TestCase):
    """Unit tests for check_rabbitmq_queues.check_stats_file_freshness."""

    def _freshness_of_new_file(self, ahead=None):
        """Run the freshness check against a just-created temporary file.

        The module's ``config`` lookup is patched to return a cron schedule
        of every five minutes. When ``ahead`` (a timedelta) is given, the
        check is evaluated as of that far after the current time; otherwise
        the check's own default time is used.
        """
        fake_config = MagicMock(return_value="*/5 * * * *")
        with patch("check_rabbitmq_queues.config", fake_config):
            with NamedTemporaryFile() as stats_file:
                extra = {}
                if ahead is not None:
                    extra["asof"] = datetime.now() + ahead
                return check_rabbitmq_queues.check_stats_file_freshness(
                    stats_file.name, **extra
                )

    def test_check_stats_file_freshness_fresh(self):
        # A file created moments ago must be reported as OK.
        outcome = self._freshness_of_new_file()
        self.assertEqual(outcome[0], "OK")

    def test_check_stats_file_freshness_nonfresh(self):
        # Evaluated one hour ahead, the new file is older than two
        # five-minute intervals and must be reported as CRIT.
        outcome = self._freshness_of_new_file(ahead=timedelta(hours=1))
        self.assertEqual(outcome[0], "CRIT")
|
|
@ -12,6 +12,7 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import io
|
||||
import logging
|
||||
import unittest
|
||||
import os
|
||||
|
@ -150,12 +151,12 @@ def patch_open():
|
|||
|
||||
Yields the mock for "open" and "file", respectively.'''
|
||||
mock_open = MagicMock(spec=open)
|
||||
mock_file = MagicMock(spec=__file__)
|
||||
mock_file = MagicMock(spec=io.FileIO)
|
||||
|
||||
@contextmanager
|
||||
def stub_open(*args, **kwargs):
|
||||
mock_open(*args, **kwargs)
|
||||
yield mock_file
|
||||
|
||||
with patch('__builtin__.open', stub_open):
|
||||
with patch('builtins.open', stub_open):
|
||||
yield mock_open, mock_file
|
||||
|
|
Loading…
Reference in New Issue