Move cron max file age calculation to rabbit_utils

The check_rabbitmq_queues nrpe check reads the cron file created for
the collect-stats job in order to determine whether the stats are too
old and an alert should be raised. The nagios user cannot read that
cron file when running in a hardened environment where /etc/cron.d is
not readable.
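
For context, the removed code path (visible in the first diff hunks
below) derived the freshness cutoff by parsing the cron file itself; a
simplified sketch of that approach and why it breaks on hardened hosts:

    # Removed approach (simplified from the deleted code below): infer the
    # collection interval from the cron file. When /etc/cron.d is not readable
    # by the nagios user, the open() call fails and the whole check errors out.
    CRONJOB = "/etc/cron.d/rabbitmq-stats"
    with open(CRONJOB) as f:  # PermissionError for nagios on hardened hosts
        cron_spec = ' '.join(f.read().split()[:5])  # first 5 fields = schedule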

This change moves the calculation of the maximum age of a stats file
out of the check_rabbitmq_queues script and into the rabbit_utils code
that generates the nrpe configuration. A new (optional) -m parameter
is added to the check_rabbitmq_queues script to accept the maximum
age, in seconds, since the stats file was last modified.

This change also removes the trusty support from hooks/install and
hooks/upgrade-charm, as rabbit_utils.py now needs to import a
dependency that those scripts install. The hooks are cleaned up so
that the croniter package is always installed on install or upgrade.

Change-Id: If948fc921ee0b63682946c7cc879ac50e971e588
Closes-Bug: #1940495
Co-authored-by: Aurelien Lourot <aurelien.lourot@canonical.com>
Billy Olsen 2021-08-18 20:48:46 -07:00 committed by Aurelien Lourot
parent 8948fe7a49
commit fd8d018bab
8 changed files with 180 additions and 133 deletions


@@ -5,7 +5,7 @@
# Author: Liam Young, Jacek Nykis
from collections import defaultdict
from datetime import datetime
from datetime import datetime, timedelta
from fnmatch import fnmatchcase
from itertools import chain
import argparse
@@ -18,13 +18,6 @@ with open("/etc/lsb-release") as f:
lsb = [s.split("=") for s in f.readlines()]
lsb_dict = dict([(k, v.strip()) for k, v in lsb])
if lsb_dict.get("DISTRIB_CODENAME") != "trusty":
# Trusty doesn't have croniter
from croniter import croniter
CRONJOB = "/etc/cron.d/rabbitmq-stats"
def gen_data_lines(filename):
with open(filename, "rt") as fin:
@@ -91,54 +84,17 @@ def check_stats(stats_collated, limits):
yield l_queue, l_vhost, m_all, "WARN"
def get_cron_interval(cronspec, base):
"""Estimate cron interval by subtracting last from next job runtime
:param cronspec: Cronjob schedule string
:param base: datetime from when to check cron schedule
:return: timedelta
"""
it = croniter(cronspec, base)
return it.get_next(datetime) - it.get_prev(datetime)
def get_stats_cron_schedule():
"""Returns the cron schedule from the stats CRONJOB spec file.
:return: a string containing the cron schedule
:rtype: str
"""
# TODO(wolsen) in general, the layout of this code makes a lot of
# assumptions about the files that are written, etc and is somewhat
# brittle for anything not specifically laid out by the charm. This
# should be revisited in the future.
with open(CRONJOB) as f:
cronjob = f.read()
# The first 5 columns make up the cron spec, but the output of this
# function should be a string. Split the line on whitespace and reform
# the spec string from the necessary columns
# See LP#1939702
cron_spec = ' '.join(cronjob.split()[:5])
return cron_spec
def check_stats_file_freshness(stats_file, asof=None):
def check_stats_file_freshness(stats_file, oldest_timestamp):
"""Check if a rabbitmq stats file is fresh
Fresh here is defined as modified within the last 2* cron job intervals
:param stats_file: file name to check
:param asof: datetime from when to check, defaults to datetime.now()
:param oldest_timestamp: oldest timestamp the file can be last modified
:return: tuple (status, message)
"""
if asof is None:
asof = datetime.now()
file_mtime = datetime.fromtimestamp(os.path.getmtime(stats_file))
cronspec = get_stats_cron_schedule()
interval = get_cron_interval(cronspec, asof)
# We expect the file to be modified in the last 2 cron intervals
cutoff_time = asof - (2 * interval)
if file_mtime < cutoff_time:
if file_mtime < oldest_timestamp:
return (
"CRIT",
"Rabbit stats file not updated since {}".format(
@@ -166,8 +122,22 @@ if __name__ == "__main__":
required=False,
default=[],
metavar=('vhost', 'queue'),
help='Vhost and queue to exclude from checks. Can be used multiple \
times'
help=(
'Vhost and queue to exclude from checks. Can be used multiple '
'times'
)
)
parser.add_argument(
'-m',
nargs='?',
action='store',
required=False,
default=0,
type=int,
help=(
'Maximum age (in seconds) the stats files can be before a crit is '
'raised'
)
)
parser.add_argument(
'stats_file',
@@ -192,8 +162,9 @@ if __name__ == "__main__":
warnings.append(
"%s in %s has %s messages" % (queue, vhost, message_no))
if "croniter" in sys.modules.keys(): # not on trusty and imported croniter
freshness_results = [check_stats_file_freshness(f)
if args.m:
oldest = datetime.now() - timedelta(seconds=args.m)
freshness_results = [check_stats_file_freshness(f, oldest)
for f in args.stats_file]
criticals.extend(
msg for status, msg in freshness_results if status == "CRIT"
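
The new flag is optional by construction: it defaults to 0 and the
freshness block is gated on a truthy value, so NRPE definitions that
omit '-m' keep the previous behaviour of skipping the freshness check.
A self-contained toy parser (hypothetical, mirroring only the new flag)
illustrates the gating:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-m', nargs='?', default=0, type=int)
    parser.add_argument('stats_file', nargs='*')

    assert parser.parse_args(['q.dat']).m == 0                 # omitted: check skipped
    assert parser.parse_args(['-m', '600', 'q.dat']).m == 600  # CRIT if older than 600s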


@@ -2,7 +2,7 @@
# Wrapper to deal with newer Ubuntu versions that don't have py2 installed
# by default.
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests')
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests' 'croniter')
check_and_install() {
pkg="${1}-${2}"
@@ -13,11 +13,9 @@ check_and_install() {
PYTHON="python3"
apt-get update
for dep in ${DEPS[@]}; do
check_and_install ${PYTHON} ${dep}
done
# python3-croniter not available on trusty
[ "$( lsb_release -sc )" != "trusty" ] && check_and_install ${PYTHON} croniter
exec ./hooks/install.real


@@ -26,6 +26,17 @@ import yaml
from collections import OrderedDict, defaultdict
try:
from croniter import CroniterBadCronError
except ImportError:
# NOTE(lourot): CroniterBadCronError doesn't exist in croniter
# 0.3.12 and older, i.e. it doesn't exist in Bionic and older.
# croniter used to raise a ValueError on these older versions:
CroniterBadCronError = ValueError
from croniter import croniter
from datetime import datetime
from rabbitmq_context import (
RabbitMQSSLContext,
RabbitMQClusterContext,
@@ -1045,14 +1056,22 @@ def assess_status_func(configs):
# If the cluster has completed the series upgrade, run the
# complete-cluster-series-upgrade action to clear this setting.
if leader_get('cluster_series_upgrading'):
message += (", Run complete-cluster-series-upgrade when the "
"cluster has completed its upgrade.")
message += (", run complete-cluster-series-upgrade when the "
"cluster has completed its upgrade")
# Edge case when the first rabbitmq unit is upgraded it will show
# waiting for peers. Force "active" workload state for various
# testing suites like zaza to recognize a successful series upgrade
# of the first unit.
if state == "waiting":
state = "active"
# Validate that the cron schedule for nrpe status checks is correct. An
# invalid cron schedule will not prevent the rabbitmq service from
# running but may cause problems with nrpe checks.
schedule = config('stats_cron_schedule')
if schedule and not is_cron_schedule_valid(schedule):
message += ". stats_cron_schedule is invalid"
# Deferred restarts should be managed by _determine_os_workload_status
# but rabbits wlm code needs refactoring to make it consistent with
# other charms as any message returned by _determine_os_workload_status
@@ -1446,6 +1465,48 @@ def nrpe_update_vhost_ssl_check(nrpe_compat, unit, user, password, vhost):
check_cmd='{}/check_rabbitmq.py'.format(NAGIOS_PLUGINS))
def is_cron_schedule_valid(cron_schedule):
"""Returns whether or not the stats_cron_schedule can be properly parsed.
:param cron_schedule: the cron schedule to validate
:return: True if the cron schedule defined can be parsed by the croniter
library, False otherwise
"""
try:
croniter(cron_schedule).get_next(datetime)
return True
except CroniterBadCronError:
return False
def get_max_stats_file_age():
"""Returns the max stats file age for NRPE checks.
Max stats file age is determined by a heuristic of 2x the configured
interval in the stats_cron_schedule config value.
:return: the maximum age (in seconds) the queues check should consider
a stats file as aged. If a cron schedule is not defined,
then return 0.
:rtype: int
"""
cron_schedule = config('stats_cron_schedule')
if not cron_schedule:
return 0
try:
it = croniter(cron_schedule)
interval = it.get_next(datetime) - it.get_prev(datetime)
return int(interval.total_seconds() * 2)
except CroniterBadCronError as err:
# The config value is being passed straight into croniter and it may
# not be valid which will cause croniter to raise an error. Catch any
# of the errors raised.
log('Specified cron schedule is invalid: %s' % err,
level=ERROR)
return 0
def nrpe_update_queues_check(nrpe_compat, rabbit_dir):
"""Add/Remove the RabbitMQ queues check
@@ -1471,6 +1532,10 @@ def nrpe_update_queues_check(nrpe_compat, rabbit_dir):
for item in yaml.safe_load(config('exclude_queues')):
cmd += ' -e "{}" "{}"'.format(*item)
max_age = get_max_stats_file_age()
if max_age > 0:
cmd += ' -m {}'.format(max_age)
nrpe_compat.add_check(
shortname=RABBIT_USER + '_queue',
description='Check RabbitMQ Queues',
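
A quick worked example of the 2x heuristic above (a standalone
re-implementation for illustration, runnable wherever python3-croniter
is installed; the expected values match the unit test added further
down):

    from datetime import datetime
    from croniter import croniter

    def max_stats_file_age(cron_schedule):
        # Same calculation as get_max_stats_file_age(): twice the gap between
        # two consecutive runs of the schedule.
        it = croniter(cron_schedule)
        return int((it.get_next(datetime) - it.get_prev(datetime)).total_seconds() * 2)

    print(max_stats_file_age('*/5 * * * *'))    # 600  -> check_cmd gains ' -m 600'
    print(max_stats_file_age('*/15 * * * *'))   # 1800
    print(max_stats_file_age('30 23 * * SAT'))  # 1209600, i.e. 14 days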


@@ -2,7 +2,7 @@
# Wrapper to ensure that the required py3 versions are installed if upgrading
# from a py2 charm to a py3 based charm.
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests')
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests' 'croniter')
check_and_install() {
pkg="${1}-${2}"
@@ -13,11 +13,9 @@ check_and_install() {
PYTHON="python3"
apt-get update
for dep in ${DEPS[@]}; do
check_and_install ${PYTHON} ${dep}
done
# python3-croniter not available on trusty
[ "$( lsb_release -sc )" != "trusty" ] && check_and_install ${PYTHON} croniter
exec ./hooks/upgrade-charm.real


@@ -1,53 +0,0 @@
variables:
openstack-origin: &openstack-origin cloud:trusty-mitaka
series: trusty
relations:
- - cinder:amqp
- rabbitmq-server:amqp
- - cinder:shared-db
- percona-cluster:shared-db
- - cinder:identity-service
- keystone:identity-service
- - keystone:shared-db
- percona-cluster:shared-db
- - nrpe:nrpe-external-master
- rabbitmq-server:nrpe-external-master
- - nrpe:monitors
- nagios:monitors
applications:
rabbitmq-server:
charm: "../../../rabbitmq-server"
num_units: 3
constraints:
cpu-cores=2
options:
min-cluster-size: 3
max-cluster-tries: 6
ssl: "off"
management_plugin: "False"
stats_cron_schedule: "*/1 * * * *"
cinder:
charm: cs:~openstack-charmers-next/cinder
num_units: 1
options:
openstack-origin: *openstack-origin
percona-cluster:
charm: cs:~openstack-charmers-next/percona-cluster-340
num_units: 1
options:
source: *openstack-origin
max-connections: 1000
keystone:
charm: cs:~openstack-charmers-next/keystone
num_units: 1
options:
openstack-origin: *openstack-origin
admin-password: openstack
nagios:
charm: cs:nagios
num_units: 1
nrpe:
charm: cs:nrpe


@@ -12,7 +12,6 @@ gate_bundles:
- hirsute-wallaby
dev_bundles:
- trusty-mitaka
- xenial-ocata
- xenial-pike
- xenial-queens


@@ -30,30 +30,25 @@ class CheckRabbitTest(unittest.TestCase):
f.write("*/5 * * * * rabbitmq timeout -k 10s -s SIGINT 300 "
"/usr/local/bin/collect_rabbitmq_stats.sh 2>&1 | "
"logger -p local0.notice")
cls.old_cron = check_rabbitmq_queues.CRONJOB
check_rabbitmq_queues.CRONJOB = str(cronjob)
@classmethod
def tearDownClass(cls):
"""Tear down class fixture."""
cls.tmpdir.cleanup()
check_rabbitmq_queues.CRONJOB = cls.old_cron
def test_check_stats_file_freshness_fresh(self):
oldest = datetime.now() - timedelta(minutes=15)
with NamedTemporaryFile() as stats_file:
results = check_rabbitmq_queues.check_stats_file_freshness(
stats_file.name
stats_file.name,
oldest
)
self.assertEqual(results[0], "OK")
def test_check_stats_file_freshness_nonfresh(self):
with NamedTemporaryFile() as stats_file:
next_hour = datetime.now() + timedelta(hours=1)
oldest_timestamp = datetime.now() + timedelta(minutes=1)
results = check_rabbitmq_queues.check_stats_file_freshness(
stats_file.name, asof=next_hour
stats_file.name, oldest_timestamp
)
self.assertEqual(results[0], "CRIT")
def test_get_stats_cron_schedule(self):
schedule = check_rabbitmq_queues.get_stats_cron_schedule()
self.assertEqual(schedule, "*/5 * * * *")


@@ -18,6 +18,7 @@ import mock
import os
import sys
import tempfile
from datetime import timedelta
from unit_tests.test_utils import CharmTestCase
@@ -539,6 +540,7 @@ class UtilsTests(CharmTestCase):
callee.assert_called_once_with()
mock_application_version_set.assert_called_with('3.5.7')
@mock.patch.object(rabbit_utils, 'is_cron_schedule_valid')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_hooks')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_restarts')
@mock.patch.object(rabbit_utils, 'clustered')
@@ -553,7 +555,9 @@ class UtilsTests(CharmTestCase):
status_set,
clustered,
get_deferred_restarts,
get_deferred_hooks):
get_deferred_hooks,
is_cron_schedule_valid):
is_cron_schedule_valid.return_value = True
get_deferred_hooks.return_value = []
get_deferred_restarts.return_value = []
self.leader_get.return_value = None
@@ -568,6 +572,39 @@ class UtilsTests(CharmTestCase):
status_set.assert_called_once_with('active',
'Unit is ready and clustered')
@mock.patch.object(rabbit_utils, 'is_cron_schedule_valid')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_hooks')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_restarts')
@mock.patch.object(rabbit_utils, 'clustered')
@mock.patch.object(rabbit_utils, 'status_set')
@mock.patch.object(rabbit_utils, 'assess_cluster_status')
@mock.patch.object(rabbit_utils, 'services')
@mock.patch.object(rabbit_utils, '_determine_os_workload_status')
def test_assess_status_func_invalid_cron(self,
_determine_os_workload_status,
services,
assess_cluster_status,
status_set,
clustered,
get_deferred_restarts,
get_deferred_hooks,
is_cron_schedule_valid):
is_cron_schedule_valid.return_value = False
get_deferred_hooks.return_value = []
get_deferred_restarts.return_value = []
self.leader_get.return_value = None
services.return_value = 's1'
_determine_os_workload_status.return_value = ('active', '')
clustered.return_value = True
rabbit_utils.assess_status_func('test-config')()
# ports=None whilst port checks are disabled.
_determine_os_workload_status.assert_called_once_with(
'test-config', {}, charm_func=assess_cluster_status, services='s1',
ports=None)
msg = 'Unit is ready and clustered. stats_cron_schedule is invalid'
status_set.assert_called_once_with('active', msg)
@mock.patch.object(rabbit_utils, 'is_cron_schedule_valid')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_hooks')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_restarts')
@mock.patch.object(rabbit_utils, 'clustered')
@@ -578,7 +615,8 @@ class UtilsTests(CharmTestCase):
def test_assess_status_func_cluster_upgrading(
self, _determine_os_workload_status, services,
assess_cluster_status, status_set, clustered,
get_deferred_restarts, get_deferred_hooks):
get_deferred_restarts, get_deferred_hooks, is_cron_schedule_valid):
is_cron_schedule_valid.return_value = True
get_deferred_hooks.return_value = []
get_deferred_restarts.return_value = []
self.leader_get.return_value = True
@@ -591,10 +629,11 @@ class UtilsTests(CharmTestCase):
'test-config', {}, charm_func=assess_cluster_status, services='s1',
ports=None)
status_set.assert_called_once_with(
'active', 'Unit is ready and clustered, Run '
'active', 'Unit is ready and clustered, run '
'complete-cluster-series-upgrade when the cluster has completed '
'its upgrade.')
'its upgrade')
@mock.patch.object(rabbit_utils, 'is_cron_schedule_valid')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_hooks')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_restarts')
@mock.patch.object(rabbit_utils, 'clustered')
@@ -605,7 +644,8 @@ class UtilsTests(CharmTestCase):
def test_assess_status_func_cluster_upgrading_first_unit(
self, _determine_os_workload_status, services,
assess_cluster_status, status_set, clustered,
get_deferred_restarts, get_deferred_hooks):
get_deferred_restarts, get_deferred_hooks, is_cron_schedule_valid):
is_cron_schedule_valid.return_value = True
get_deferred_hooks.return_value = []
get_deferred_restarts.return_value = []
self.leader_get.return_value = True
@@ -618,9 +658,9 @@ class UtilsTests(CharmTestCase):
'test-config', {}, charm_func=assess_cluster_status, services='s1',
ports=None)
status_set.assert_called_once_with(
'active', 'No peers, Run '
'active', 'No peers, run '
'complete-cluster-series-upgrade when the cluster has completed '
'its upgrade.')
'its upgrade')
def test_pause_unit_helper(self):
with mock.patch.object(rabbit_utils, '_pause_resume_helper') as prh:
@@ -1256,6 +1296,7 @@ class UtilsTests(CharmTestCase):
shortname='rabbitmq_queue',
description='Check RabbitMQ Queues',
check_cmd='{0}/check_rabbitmq_queues.py -c "\\*" "\\*" 100 200 {1}'
'-m 600 '
'{0}/data/test_queue_stats.dat'.format(self.tmp_dir,
exclude_queues))
self.nrpe_compat.remove_check.assert_not_called()
@@ -1311,3 +1352,36 @@ class UtilsTests(CharmTestCase):
shortname='rabbitmq_cluster',
description='Remove check RabbitMQ Cluster',
check_cmd='{}/check_rabbitmq_cluster.py'.format(self.tmp_dir))
@mock.patch('rabbit_utils.config')
def test_get_max_stats_file_age(self, mock_config):
"""Testing the max stats file age"""
mock_config.side_effect = self.test_config
# default config value should show 10 minutes
max_age = rabbit_utils.get_max_stats_file_age()
self.assertEqual(600, max_age)
# changing to run every 15 minutes shows 30 minutes
self.test_config.set('stats_cron_schedule', '*/15 * * * *')
max_age = rabbit_utils.get_max_stats_file_age()
expected = timedelta(minutes=30).total_seconds()
self.assertEqual(expected, max_age)
# oddball cron schedule, just to validate. This runs every Saturday
# at 23:30, which means the max age would be 14 days since it calcs
# to twice the cron schedule
self.test_config.set('stats_cron_schedule', '30 23 * * SAT')
max_age = rabbit_utils.get_max_stats_file_age()
expected = timedelta(days=14).total_seconds()
self.assertEqual(expected, max_age)
# empty cron schedule will return 0 for the max_age
self.test_config.set('stats_cron_schedule', '')
max_age = rabbit_utils.get_max_stats_file_age()
self.assertEqual(0, max_age)
# poorly formatted cron schedule will return 0 for the max_age
self.test_config.set('stats_cron_schedule', 'poorly formatted')
max_age = rabbit_utils.get_max_stats_file_age()
self.assertEqual(0, max_age)
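
As a sanity check on the oddball case in the last test above: a weekly
schedule gives a one-week interval, and doubling it yields the asserted
14 days.

    from datetime import timedelta

    # '30 23 * * SAT' fires once a week; the heuristic allows twice that.
    assert 2 * timedelta(days=7).total_seconds() == timedelta(days=14).total_seconds() == 1209600.0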