Move cron max file age calculation to rabbit_utils

The check_rabbitmq_queues nrpe check reads the cron file created for
the collect-stats job in order to determine whether the stats are too
old and an alert should be raised. The nagios user cannot read that
cron file when running in a hardened environment where /etc/cron.d is
not readable.
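
For context, the removed code path (visible in the first diff hunks
below) derived the freshness cutoff by parsing the cron file itself; a
simplified sketch of that approach and why it breaks on hardened hosts:

    # Removed approach (simplified from the deleted code below): infer the
    # collection interval from the cron file. When /etc/cron.d is not readable
    # by the nagios user, the open() call fails and the whole check errors out.
    CRONJOB = "/etc/cron.d/rabbitmq-stats"
    with open(CRONJOB) as f:  # PermissionError for nagios on hardened hosts
        cron_spec = ' '.join(f.read().split()[:5])  # first 5 fields = schedule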

This change moves the calculation of the maximum age of a stats file
out of the check_rabbitmq_queues script and into the rabbit_utils code
that generates the nrpe configuration. A new (optional) -m parameter
is added to the check_rabbitmq_queues script to accept the maximum
age, in seconds, since the stats file was last modified.

This change also removes the trusty support from hooks/install and
hooks/upgrade-charm, as rabbit_utils.py now needs to import a
dependency that those scripts install. The hooks are cleaned up so
that the croniter package is always installed on install or upgrade.

Change-Id: If948fc921ee0b63682946c7cc879ac50e971e588
Closes-Bug: #1940495
Co-authored-by: Aurelien Lourot <aurelien.lourot@canonical.com>
Billy Olsen 2021-08-18 20:48:46 -07:00 committed by Aurelien Lourot
parent 8948fe7a49
commit fd8d018bab
8 changed files with 180 additions and 133 deletions


@@ -5,7 +5,7 @@
# Author: Liam Young, Jacek Nykis
from collections import defaultdict
from datetime import datetime
from datetime import datetime, timedelta
from fnmatch import fnmatchcase
from itertools import chain
import argparse
@@ -18,13 +18,6 @@ with open("/etc/lsb-release") as f:
lsb = [s.split("=") for s in f.readlines()]
lsb_dict = dict([(k, v.strip()) for k, v in lsb])
if lsb_dict.get("DISTRIB_CODENAME") != "trusty":
# Trusty doesn't have croniter
from croniter import croniter
CRONJOB = "/etc/cron.d/rabbitmq-stats"
def gen_data_lines(filename):
with open(filename, "rt") as fin:
@@ -91,54 +84,17 @@ def check_stats(stats_collated, limits):
yield l_queue, l_vhost, m_all, "WARN"
def get_cron_interval(cronspec, base):
"""Estimate cron interval by subtracting last from next job runtime
:param cronspec: Cronjob schedule string
:param base: datetime from when to check cron schedule
:return: timedelta
"""
it = croniter(cronspec, base)
return it.get_next(datetime) - it.get_prev(datetime)
def get_stats_cron_schedule():
"""Returns the cron schedule from the stats CRONJOB spec file.
:return: a string containing the cron schedule
:rtype: str
"""
# TODO(wolsen) in general, the layout of this code makes a lot of
# assumptions about the files that are written, etc and is somewhat
# brittle for anything not specifically laid out by the charm. This
# should be revisited in the future.
with open(CRONJOB) as f:
cronjob = f.read()
# The first 5 columns make up the cron spec, but the output of this
# function should be a string. Split the line on whitespace and reform
# the spec string from the necessary columns
# See LP#1939702
cron_spec = ' '.join(cronjob.split()[:5])
return cron_spec
def check_stats_file_freshness(stats_file, asof=None):
def check_stats_file_freshness(stats_file, oldest_timestamp):
"""Check if a rabbitmq stats file is fresh
Fresh here is defined as modified within the last 2* cron job intervals
:param stats_file: file name to check
:param asof: datetime from when to check, defaults to datetime.now()
:param oldest_timestamp: oldest timestamp the file can be last modified
:return: tuple (status, message)
"""
if asof is None:
asof = datetime.now()
file_mtime = datetime.fromtimestamp(os.path.getmtime(stats_file))
cronspec = get_stats_cron_schedule()
interval = get_cron_interval(cronspec, asof)
# We expect the file to be modified in the last 2 cron intervals
cutoff_time = asof - (2 * interval)
if file_mtime < cutoff_time:
if file_mtime < oldest_timestamp:
return (
"CRIT",
"Rabbit stats file not updated since {}".format(
@@ -166,8 +122,22 @@ if __name__ == "__main__":
required=False,
default=[],
metavar=('vhost', 'queue'),
help='Vhost and queue to exclude from checks. Can be used multiple \
times'
help=(
'Vhost and queue to exclude from checks. Can be used multiple '
'times'
)
)
parser.add_argument(
'-m',
nargs='?',
action='store',
required=False,
default=0,
type=int,
help=(
'Maximum age (in seconds) the stats files can be before a crit is '
'raised'
)
)
parser.add_argument(
'stats_file',
@@ -192,8 +162,9 @@ if __name__ == "__main__":
warnings.append(
"%s in %s has %s messages" % (queue, vhost, message_no))
if "croniter" in sys.modules.keys(): # not on trusty and imported croniter
freshness_results = [check_stats_file_freshness(f)
if args.m:
oldest = datetime.now() - timedelta(seconds=args.m)
freshness_results = [check_stats_file_freshness(f, oldest)
for f in args.stats_file]
criticals.extend(
msg for status, msg in freshness_results if status == "CRIT"
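
The new flag is optional by construction: it defaults to 0 and the
freshness block is gated on a truthy value, so NRPE definitions that
omit '-m' keep the previous behaviour of skipping the freshness check.
A self-contained toy parser (hypothetical, mirroring only the new flag)
illustrates the gating:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-m', nargs='?', default=0, type=int)
    parser.add_argument('stats_file', nargs='*')

    assert parser.parse_args(['q.dat']).m == 0                 # omitted: check skipped
    assert parser.parse_args(['-m', '600', 'q.dat']).m == 600  # CRIT if older than 600s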


@@ -2,7 +2,7 @@
# Wrapper to deal with newer Ubuntu versions that don't have py2 installed
# by default.
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests')
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests' 'croniter')
check_and_install() {
pkg="${1}-${2}"
@@ -13,11 +13,9 @@ check_and_install() {
PYTHON="python3"
apt-get update
for dep in ${DEPS[@]}; do
check_and_install ${PYTHON} ${dep}
done
# python3-croniter not available on trusty
[ "$( lsb_release -sc )" != "trusty" ] && check_and_install ${PYTHON} croniter
exec ./hooks/install.real


@@ -26,6 +26,17 @@ import yaml
from collections import OrderedDict, defaultdict
try:
from croniter import CroniterBadCronError
except ImportError:
# NOTE(lourot): CroniterBadCronError doesn't exist in croniter
# 0.3.12 and older, i.e. it doesn't exist in Bionic and older.
# croniter used to raise a ValueError on these older versions:
CroniterBadCronError = ValueError
from croniter import croniter
from datetime import datetime
from rabbitmq_context import (
RabbitMQSSLContext,
RabbitMQClusterContext,
@@ -1045,14 +1056,22 @@ def assess_status_func(configs):
# If the cluster has completed the series upgrade, run the
# complete-cluster-series-upgrade action to clear this setting.
if leader_get('cluster_series_upgrading'):
message += (", Run complete-cluster-series-upgrade when the "
"cluster has completed its upgrade.")
message += (", run complete-cluster-series-upgrade when the "
"cluster has completed its upgrade")
# Edge case when the first rabbitmq unit is upgraded it will show
# waiting for peers. Force "active" workload state for various
# testing suites like zaza to recognize a successful series upgrade
# of the first unit.
if state == "waiting":
state = "active"
# Validate that the cron schedule for nrpe status checks is correct. An
# invalid cron schedule will not prevent the rabbitmq service from
# running but may cause problems with nrpe checks.
schedule = config('stats_cron_schedule')
if schedule and not is_cron_schedule_valid(schedule):
message += ". stats_cron_schedule is invalid"
# Deferred restarts should be managed by _determine_os_workload_status
# but rabbits wlm code needs refactoring to make it consistent with
# other charms as any message returned by _determine_os_workload_status
@@ -1446,6 +1465,48 @@ def nrpe_update_vhost_ssl_check(nrpe_compat, unit, user, password, vhost):
check_cmd='{}/check_rabbitmq.py'.format(NAGIOS_PLUGINS))
def is_cron_schedule_valid(cron_schedule):
"""Returns whether or not the stats_cron_schedule can be properly parsed.
:param cron_schedule: the cron schedule to validate
:return: True if the cron schedule defined can be parsed by the croniter
library, False otherwise
"""
try:
croniter(cron_schedule).get_next(datetime)
return True
except CroniterBadCronError:
return False
def get_max_stats_file_age():
"""Returns the max stats file age for NRPE checks.
Max stats file age is determined by a heuristic of 2x the configured
interval in the stats_cron_schedule config value.
:return: the maximum age (in seconds) the queues check should consider
a stats file as aged. If a cron schedule is not defined,
then return 0.
:rtype: int
"""
cron_schedule = config('stats_cron_schedule')
if not cron_schedule:
return 0
try:
it = croniter(cron_schedule)
interval = it.get_next(datetime) - it.get_prev(datetime)
return int(interval.total_seconds() * 2)
except CroniterBadCronError as err:
# The config value is being passed straight into croniter and it may
# not be valid which will cause croniter to raise an error. Catch any
# of the errors raised.
log('Specified cron schedule is invalid: %s' % err,
level=ERROR)
return 0
def nrpe_update_queues_check(nrpe_compat, rabbit_dir):
"""Add/Remove the RabbitMQ queues check
@@ -1471,6 +1532,10 @@ def nrpe_update_queues_check(nrpe_compat, rabbit_dir):
for item in yaml.safe_load(config('exclude_queues')):
cmd += ' -e "{}" "{}"'.format(*item)
max_age = get_max_stats_file_age()
if max_age > 0:
cmd += ' -m {}'.format(max_age)
nrpe_compat.add_check(
shortname=RABBIT_USER + '_queue',
description='Check RabbitMQ Queues',
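
A quick worked example of the 2x heuristic above (a standalone
re-implementation for illustration, runnable wherever python3-croniter
is installed; the expected values match the unit test added further
down):

    from datetime import datetime
    from croniter import croniter

    def max_stats_file_age(cron_schedule):
        # Same calculation as get_max_stats_file_age(): twice the gap between
        # two consecutive runs of the schedule.
        it = croniter(cron_schedule)
        return int((it.get_next(datetime) - it.get_prev(datetime)).total_seconds() * 2)

    print(max_stats_file_age('*/5 * * * *'))    # 600  -> check_cmd gains ' -m 600'
    print(max_stats_file_age('*/15 * * * *'))   # 1800
    print(max_stats_file_age('30 23 * * SAT'))  # 1209600, i.e. 14 days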


@@ -2,7 +2,7 @@
# Wrapper to ensure that the required py3 versions are installed if upgrading
# from a py2 charm to a py3 based charm.
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests')
declare -a DEPS=('apt' 'netaddr' 'netifaces' 'pip' 'yaml' 'dnspython' 'requests' 'croniter')
check_and_install() {
pkg="${1}-${2}"
@@ -13,11 +13,9 @@ check_and_install() {
PYTHON="python3"
apt-get update
for dep in ${DEPS[@]}; do
check_and_install ${PYTHON} ${dep}
done
# python3-croniter not available on trusty
[ "$( lsb_release -sc )" != "trusty" ] && check_and_install ${PYTHON} croniter
exec ./hooks/upgrade-charm.real


@@ -1,53 +0,0 @@
variables:
openstack-origin: &openstack-origin cloud:trusty-mitaka
series: trusty
relations:
- - cinder:amqp
- rabbitmq-server:amqp
- - cinder:shared-db
- percona-cluster:shared-db
- - cinder:identity-service
- keystone:identity-service
- - keystone:shared-db
- percona-cluster:shared-db
- - nrpe:nrpe-external-master
- rabbitmq-server:nrpe-external-master
- - nrpe:monitors
- nagios:monitors
applications:
rabbitmq-server:
charm: "../../../rabbitmq-server"
num_units: 3
constraints:
cpu-cores=2
options:
min-cluster-size: 3
max-cluster-tries: 6
ssl: "off"
management_plugin: "False"
stats_cron_schedule: "*/1 * * * *"
cinder:
charm: cs:~openstack-charmers-next/cinder
num_units: 1
options:
openstack-origin: *openstack-origin
percona-cluster:
charm: cs:~openstack-charmers-next/percona-cluster-340
num_units: 1
options:
source: *openstack-origin
max-connections: 1000
keystone:
charm: cs:~openstack-charmers-next/keystone
num_units: 1
options:
openstack-origin: *openstack-origin
admin-password: openstack
nagios:
charm: cs:nagios
num_units: 1
nrpe:
charm: cs:nrpe


@@ -12,7 +12,6 @@ gate_bundles:
- hirsute-wallaby
dev_bundles:
- trusty-mitaka
- xenial-ocata
- xenial-pike
- xenial-queens


@@ -30,30 +30,25 @@ class CheckRabbitTest(unittest.TestCase):
f.write("*/5 * * * * rabbitmq timeout -k 10s -s SIGINT 300 "
"/usr/local/bin/collect_rabbitmq_stats.sh 2>&1 | "
"logger -p local0.notice")
cls.old_cron = check_rabbitmq_queues.CRONJOB
check_rabbitmq_queues.CRONJOB = str(cronjob)
@classmethod
def tearDownClass(cls):
"""Tear down class fixture."""
cls.tmpdir.cleanup()
check_rabbitmq_queues.CRONJOB = cls.old_cron
def test_check_stats_file_freshness_fresh(self):
oldest = datetime.now() - timedelta(minutes=15)
with NamedTemporaryFile() as stats_file:
results = check_rabbitmq_queues.check_stats_file_freshness(
stats_file.name
stats_file.name,
oldest
)
self.assertEqual(results[0], "OK")
def test_check_stats_file_freshness_nonfresh(self):
with NamedTemporaryFile() as stats_file:
next_hour = datetime.now() + timedelta(hours=1)
oldest_timestamp = datetime.now() + timedelta(minutes=1)
results = check_rabbitmq_queues.check_stats_file_freshness(
stats_file.name, asof=next_hour
stats_file.name, oldest_timestamp
)
self.assertEqual(results[0], "CRIT")
def test_get_stats_cron_schedule(self):
schedule = check_rabbitmq_queues.get_stats_cron_schedule()
self.assertEqual(schedule, "*/5 * * * *")


@@ -18,6 +18,7 @@ import mock
import os
import sys
import tempfile
from datetime import timedelta
from unit_tests.test_utils import CharmTestCase
@@ -539,6 +540,7 @@ class UtilsTests(CharmTestCase):
callee.assert_called_once_with()
mock_application_version_set.assert_called_with('3.5.7')
@mock.patch.object(rabbit_utils, 'is_cron_schedule_valid')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_hooks')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_restarts')
@mock.patch.object(rabbit_utils, 'clustered')
@@ -553,7 +555,9 @@ class UtilsTests(CharmTestCase):
status_set,
clustered,
get_deferred_restarts,
get_deferred_hooks):
get_deferred_hooks,
is_cron_schedule_valid):
is_cron_schedule_valid.return_value = True
get_deferred_hooks.return_value = []
get_deferred_restarts.return_value = []
self.leader_get.return_value = None
@@ -568,6 +572,39 @@ class UtilsTests(CharmTestCase):
status_set.assert_called_once_with('active',
'Unit is ready and clustered')
@mock.patch.object(rabbit_utils, 'is_cron_schedule_valid')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_hooks')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_restarts')
@mock.patch.object(rabbit_utils, 'clustered')
@mock.patch.object(rabbit_utils, 'status_set')
@mock.patch.object(rabbit_utils, 'assess_cluster_status')
@mock.patch.object(rabbit_utils, 'services')
@mock.patch.object(rabbit_utils, '_determine_os_workload_status')
def test_assess_status_func_invalid_cron(self,
_determine_os_workload_status,
services,
assess_cluster_status,
status_set,
clustered,
get_deferred_restarts,
get_deferred_hooks,
is_cron_schedule_valid):
is_cron_schedule_valid.return_value = False
get_deferred_hooks.return_value = []
get_deferred_restarts.return_value = []
self.leader_get.return_value = None
services.return_value = 's1'
_determine_os_workload_status.return_value = ('active', '')
clustered.return_value = True
rabbit_utils.assess_status_func('test-config')()
# ports=None whilst port checks are disabled.
_determine_os_workload_status.assert_called_once_with(
'test-config', {}, charm_func=assess_cluster_status, services='s1',
ports=None)
msg = 'Unit is ready and clustered. stats_cron_schedule is invalid'
status_set.assert_called_once_with('active', msg)
@mock.patch.object(rabbit_utils, 'is_cron_schedule_valid')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_hooks')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_restarts')
@mock.patch.object(rabbit_utils, 'clustered')
@@ -578,7 +615,8 @@ class UtilsTests(CharmTestCase):
def test_assess_status_func_cluster_upgrading(
self, _determine_os_workload_status, services,
assess_cluster_status, status_set, clustered,
get_deferred_restarts, get_deferred_hooks):
get_deferred_restarts, get_deferred_hooks, is_cron_schedule_valid):
is_cron_schedule_valid.return_value = True
get_deferred_hooks.return_value = []
get_deferred_restarts.return_value = []
self.leader_get.return_value = True
@@ -591,10 +629,11 @@ class UtilsTests(CharmTestCase):
'test-config', {}, charm_func=assess_cluster_status, services='s1',
ports=None)
status_set.assert_called_once_with(
'active', 'Unit is ready and clustered, Run '
'active', 'Unit is ready and clustered, run '
'complete-cluster-series-upgrade when the cluster has completed '
'its upgrade.')
'its upgrade')
@mock.patch.object(rabbit_utils, 'is_cron_schedule_valid')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_hooks')
@mock.patch.object(rabbit_utils.deferred_events, 'get_deferred_restarts')
@mock.patch.object(rabbit_utils, 'clustered')
@@ -605,7 +644,8 @@ class UtilsTests(CharmTestCase):
def test_assess_status_func_cluster_upgrading_first_unit(
self, _determine_os_workload_status, services,
assess_cluster_status, status_set, clustered,
get_deferred_restarts, get_deferred_hooks):
get_deferred_restarts, get_deferred_hooks, is_cron_schedule_valid):
is_cron_schedule_valid.return_value = True
get_deferred_hooks.return_value = []
get_deferred_restarts.return_value = []
self.leader_get.return_value = True
@@ -618,9 +658,9 @@ class UtilsTests(CharmTestCase):
'test-config', {}, charm_func=assess_cluster_status, services='s1',
ports=None)
status_set.assert_called_once_with(
'active', 'No peers, Run '
'active', 'No peers, run '
'complete-cluster-series-upgrade when the cluster has completed '
'its upgrade.')
'its upgrade')
def test_pause_unit_helper(self):
with mock.patch.object(rabbit_utils, '_pause_resume_helper') as prh:
@@ -1256,6 +1296,7 @@ class UtilsTests(CharmTestCase):
shortname='rabbitmq_queue',
description='Check RabbitMQ Queues',
check_cmd='{0}/check_rabbitmq_queues.py -c "\\*" "\\*" 100 200 {1}'
'-m 600 '
'{0}/data/test_queue_stats.dat'.format(self.tmp_dir,
exclude_queues))
self.nrpe_compat.remove_check.assert_not_called()
@@ -1311,3 +1352,36 @@ class UtilsTests(CharmTestCase):
shortname='rabbitmq_cluster',
description='Remove check RabbitMQ Cluster',
check_cmd='{}/check_rabbitmq_cluster.py'.format(self.tmp_dir))
@mock.patch('rabbit_utils.config')
def test_get_max_stats_file_age(self, mock_config):
"""Testing the max stats file age"""
mock_config.side_effect = self.test_config
# default config value should show 10 minutes
max_age = rabbit_utils.get_max_stats_file_age()
self.assertEqual(600, max_age)
# changing to run every 15 minutes shows 30 minutes
self.test_config.set('stats_cron_schedule', '*/15 * * * *')
max_age = rabbit_utils.get_max_stats_file_age()
expected = timedelta(minutes=30).total_seconds()
self.assertEqual(expected, max_age)
# oddball cron schedule, just to validate. This runs every Saturday
# at 23:30, which means the max age would be 14 days since it calcs
# to twice the cron schedule
self.test_config.set('stats_cron_schedule', '30 23 * * SAT')
max_age = rabbit_utils.get_max_stats_file_age()
expected = timedelta(days=14).total_seconds()
self.assertEqual(expected, max_age)
# empty cron schedule will return 0 for the max_age
self.test_config.set('stats_cron_schedule', '')
max_age = rabbit_utils.get_max_stats_file_age()
self.assertEqual(0, max_age)
# poorly formatted cron schedule will return 0 for the max_age
self.test_config.set('stats_cron_schedule', 'poorly formatted')
max_age = rabbit_utils.get_max_stats_file_age()
self.assertEqual(0, max_age)
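
As a sanity check on the oddball case in the last test above: a weekly
schedule gives a one-week interval, and doubling it yields the asserted
14 days.

    from datetime import timedelta

    # '30 23 * * SAT' fires once a week; the heuristic allows twice that.
    assert 2 * timedelta(days=7).total_seconds() == timedelta(days=14).total_seconds() == 1209600.0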