Merge "Add Prometheus evaluator"

2023-11-07 16:02:59 +00:00 · 2023-11-07 16:02:59 +00:00 · 197440c3c8
parent 99df2a40bf f932265290
commit 197440c3c8
10 changed files with 226 additions and 25 deletions
--- a/aodh/api/controllers/v2/alarm_rules/composite.py
+++ b/aodh/api/controllers/v2/alarm_rules/composite.py
@ -41,7 +41,8 @@ class CompositeRule(wtypes.UserType):
    threshold_plugins = None

    def __init__(self):
-        threshold_rules = ('gnocchi_resources_threshold',
+        threshold_rules = ('prometheus',
+                           'gnocchi_resources_threshold',
                           'gnocchi_aggregation_by_metrics_threshold',
                           'gnocchi_aggregation_by_resources_threshold')
        CompositeRule.threshold_plugins = named.NamedExtensionManager(
--- a/aodh/api/controllers/v2/alarm_rules/prometheus.py
+++ b/aodh/api/controllers/v2/alarm_rules/prometheus.py
@ -0,0 +1,46 @@
+#
+# Copyright 2023 Red Hat, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_log import log
+import wsme
+from wsme import types as wtypes
+
+from aodh.api.controllers.v2 import base
+
+
+LOG = log.getLogger(__name__)
+
+
+class PrometheusRule(base.AlarmRule):
+    """Alarm rule made of a Prometheus query, a threshold and an operator."""
+
+    comparison_operator = base.AdvEnum(
+        'comparison_operator', str,
+        'lt', 'le', 'eq', 'ne', 'ge', 'gt',
+        default='eq')
+    "The comparison against the alarm threshold"
+
+    threshold = wsme.wsattr(float, mandatory=True)
+    "The threshold of the alarm"
+
+    query = wsme.wsattr(wtypes.text, mandatory=True)
+    "The Prometheus query"
+
+    @staticmethod
+    def validate(rule):
+        """Return ``rule`` unchanged; no validation is performed yet."""
+        # TODO(mmagr): validate Prometheus query maybe?
+        return rule
+
+    def as_dict(self):
+        """Return ``as_dict_from_keys`` output for the three rule fields."""
+        exported_keys = ['comparison_operator', 'threshold', 'query']
+        return self.as_dict_from_keys(exported_keys)
--- a/aodh/evaluator/composite.py
+++ b/aodh/evaluator/composite.py
@ -116,7 +116,8 @@ class CompositeEvaluator(evaluator.Evaluator):
    @property
    def threshold_evaluators(self):
        if not self._threshold_evaluators:
-            threshold_types = ('gnocchi_resources_threshold',
+            threshold_types = ('prometheus',
+                               'gnocchi_resources_threshold',
                               'gnocchi_aggregation_by_metrics_threshold',
                               'gnocchi_aggregation_by_resources_threshold')
            self._threshold_evaluators = stevedore.NamedExtensionManager(
--- a/aodh/evaluator/prometheus.py
+++ b/aodh/evaluator/prometheus.py
@ -0,0 +1,78 @@
+#
+# Copyright 2023 Red Hat, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_config import cfg
+from oslo_log import log
+
+from observabilityclient import client
+
+from aodh.evaluator import threshold
+from aodh import keystone_client
+
+
+LOG = log.getLogger(__name__)
+# Configuration options defined by the Prometheus evaluator module.
+OPTS = [
+    cfg.BoolOpt('prometheus_disable_rbac',
+                default=False,
+                help='Disable RBAC for Prometheus evaluator.'),
+]
+
+
+class PrometheusBase(threshold.ThresholdEvaluator):
+    """Base for threshold evaluators that read their data from Prometheus.
+
+    Holds the observabilityclient instance used to run queries and the
+    ``prometheus_disable_rbac`` setting passed along with every query.
+    """
+
+    def __init__(self, conf):
+        super(PrometheusBase, self).__init__(conf)
+        self._set_obsclient(conf)
+        self._no_rbac = conf.prometheus_disable_rbac
+
+    def _set_obsclient(self, conf):
+        """Create the client from the service credentials in ``conf``."""
+        session = keystone_client.get_session(conf)
+        opts = {'interface': conf.service_credentials.interface,
+                'region_name': conf.service_credentials.region_name}
+        self._prom = client.Client('1', session, adapter_options=opts)
+
+    def _get_metric_data(self, query):
+        """Run ``query`` through the client and return its result."""
+        # Pass the query as a logging argument so the message is only
+        # formatted when debug logging is actually enabled.
+        LOG.debug('Querying Prometheus instance on: %s', query)
+        return self._prom.query.query(query, disable_rbac=self._no_rbac)
+
+
+class PrometheusEvaluator(PrometheusBase):
+    """Evaluate alarm rules against the result of a Prometheus query."""
+
+    def _sanitize(self, metric_data):
+        """Convert fetched metrics into a list of floats.
+
+        :param metric_data: iterable of objects exposing a ``value``
+                            attribute.
+        :returns: list with ``float(m.value)`` for each item.
+        """
+        sanitized = [float(m.value) for m in metric_data]
+        # Lazy logging arguments: nothing is formatted unless debug is on.
+        LOG.debug('Sanited Prometheus metric data: %s to statistics: %s',
+                  metric_data, sanitized)
+        return sanitized
+
+    def evaluate_rule(self, alarm_rule):
+        """Evaluate alarm rule.
+
+        :param alarm_rule: mapping holding the ``query`` to run plus the
+                           keys read by ``_process_statistics``.
+        :returns: state, trending state, statistics, number of samples outside
+                  threshold and reason
+        :raises threshold.InsufficientDataError: when the query yields no
+                                                 datapoints.
+        """
+        query = alarm_rule['query']
+        metrics = self._get_metric_data(query)
+        if not metrics:
+            LOG.warning('Empty result fetched from Prometheus for query %s',
+                        query)
+            # Normalize a missing result (e.g. None) to an empty list so it
+            # is reported as insufficient data below instead of failing
+            # with a TypeError while being iterated.
+            metrics = []
+
+        statistics = self._sanitize(metrics)
+        if not statistics:
+            raise threshold.InsufficientDataError('datapoints are unknown',
+                                                  statistics)
+        return self._process_statistics(alarm_rule, statistics)
+
+    def _unknown_reason_data(self, alarm, statistics):
+        """Log the transition and build reason data for the unknown state.
+
+        :param alarm: alarm being moved to the unknown state.
+        :param statistics: sequence of datapoints gathered for the alarm.
+        :returns: result of ``_reason_data`` for the 'unknown' disposition,
+                  the number of datapoints and the latest one (or None).
+        """
+        LOG.warning('Transfering alarm %s on unknown reason', alarm)
+        last = None if not statistics else statistics[-1]
+        return self._reason_data('unknown', len(statistics), last)
--- a/aodh/evaluator/threshold.py
+++ b/aodh/evaluator/threshold.py
@ -96,19 +96,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
                ' %(disposition)s threshold, most recent: %(most_recent)s'
                % dict(reason_data, state=state), reason_data)

-    def evaluate_rule(self, alarm_rule):
-        """Evaluate alarm rule.
-
-        :returns: state, trending state and statistics.
-        """
-        start, end = self._bound_duration(alarm_rule)
-        statistics = self._statistics(alarm_rule, start, end)
-        statistics = self._sanitize(alarm_rule, statistics)
-        sufficient = len(statistics) >= alarm_rule['evaluation_periods']
-        if not sufficient:
-            raise InsufficientDataError(
-                '%d datapoints are unknown' % alarm_rule['evaluation_periods'],
-                statistics)
+    def _process_statistics(self, alarm_rule, statistics):

        def _compare(value):
            op = COMPARATORS[alarm_rule['comparison_operator']]
@ -129,6 +117,31 @@ class ThresholdEvaluator(evaluator.Evaluator):
            trending_state = evaluator.ALARM if compared[-1] else evaluator.OK
            return None, trending_state, statistics, number_outside, None

+    def evaluate_rule(self, alarm_rule):
+        """Evaluate alarm rule.
+
+        Fetches the statistics for the rule's bounded duration, sanitizes
+        them and hands them over to ``_process_statistics``.
+
+        :returns: the result of ``_process_statistics``.
+        :raises InsufficientDataError: when fewer datapoints than the rule's
+                                       ``evaluation_periods`` are available.
+        """
+        start, end = self._bound_duration(alarm_rule)
+        statistics = self._sanitize(
+            alarm_rule, self._statistics(alarm_rule, start, end))
+        if len(statistics) < alarm_rule['evaluation_periods']:
+            raise InsufficientDataError(
+                '%d datapoints are unknown' % alarm_rule['evaluation_periods'],
+                statistics)
+
+        return self._process_statistics(alarm_rule, statistics)
+
+    def _unknown_reason_data(self, alarm, statistics):
+        """Log the datapoint shortfall and build 'unknown' reason data."""
+        LOG.warning(f'Expecting {alarm.rule["evaluation_periods"]} datapoints'
+                    f' but only get {len(statistics)}')
+        # The reason deliberately differs from the log message: its format
+        # is kept stable because third-party software may depend on it.
+        most_recent = statistics[-1] if statistics else None
+        return self._reason_data(
+            'unknown', alarm.rule['evaluation_periods'], most_recent)
+
    def _transition_alarm(self, alarm, state, trending_state, statistics,
                          outside_count, unknown_reason):
        unknown = alarm.state == evaluator.UNKNOWN
@ -143,16 +156,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
                return

        if state == evaluator.UNKNOWN and not unknown:
-            LOG.warning('Expecting %(expected)d datapoints but only get '
-                        '%(actual)d'
-                        % {'expected': alarm.rule['evaluation_periods'],
-                           'actual': len(statistics)})
-            # Reason is not same as log message because we want to keep
-            # consistent since thirdparty software may depend on old format.
-            last = None if not statistics else statistics[-1]
-            reason_data = self._reason_data('unknown',
-                                            alarm.rule['evaluation_periods'],
-                                            last)
+            reason_data = self._unknown_reason_data(alarm, statistics)
            self._refresh(alarm, state, unknown_reason, reason_data)

        elif state and (alarm.state != state or continuous):
--- a/aodh/opts.py
+++ b/aodh/opts.py
@ -23,6 +23,7 @@ import aodh.evaluator
 import aodh.evaluator.event
 import aodh.evaluator.gnocchi
 import aodh.evaluator.loadbalancer
+import aodh.evaluator.prometheus
 import aodh.evaluator.threshold
 import aodh.event
 import aodh.keystone_client
@ -38,6 +39,7 @@ def list_opts():
         itertools.chain(
             aodh.evaluator.OPTS,
             aodh.evaluator.event.OPTS,
+             aodh.evaluator.prometheus.OPTS,
             aodh.evaluator.threshold.OPTS,
             aodh.evaluator.loadbalancer.OPTS,
             aodh.notifier.rest.OPTS,
--- a/aodh/tests/unit/evaluator/test_composite.py
+++ b/aodh/tests/unit/evaluator/test_composite.py
@ -15,6 +15,7 @@
 from unittest import mock

 import fixtures
+import os
 from oslo_utils import timeutils
 from oslo_utils import uuidutils

@ -25,6 +26,12 @@ from aodh.tests import constants
 from aodh.tests.unit.evaluator import base


+# NOTE(mmagr): Overriding PrometheusEvaluator settings to avoid
+# complaints during init.
+os.environ['PROMETHEUS_HOST'] = '127.0.0.1'
+os.environ['PROMETHEUS_PORT'] = '666'
+
+
 class BaseCompositeEvaluate(base.TestEvaluatorBase):
    EVALUATOR = composite.CompositeEvaluator

--- a/aodh/tests/unit/test_evaluator.py
+++ b/aodh/tests/unit/test_evaluator.py
@ -18,11 +18,14 @@ import fixtures
 import time
 from unittest import mock

+from observabilityclient import prometheus_client
 from oslo_config import fixture as fixture_config
 from stevedore import extension

 from aodh import evaluator
 from aodh import service
+
+from aodh.evaluator import prometheus
 from aodh.tests import base as tests_base


@ -190,3 +193,59 @@ class TestAlarmEvaluationService(tests_base.BaseTestCase):
        target = svc.partition_coordinator.extract_my_subset
        self.assertEqual(0, target.call_count)
        self.assertEqual(0, self.threshold_eval.evaluate.call_count)
+
+
+class TestPrometheusEvaluator(tests_base.BaseTestCase):
+    """Unit tests for ``PrometheusEvaluator.evaluate_rule``."""
+
+    def setUp(self):
+        super(TestPrometheusEvaluator, self).setUp()
+        conf = service.prepare_service(argv=[], config_files=[])
+        self.CONF = self.useFixture(fixture_config.Config(conf)).conf
+
+    def test_rule_evaluation(self):
+        """Check state, trend and outside count for several thresholds."""
+        # Six metrics whose 'value' tuples are (0, 10) ... (5, 15).
+        metric_list = [
+            prometheus_client.PrometheusMetric({'metric': 'mtr',
+                                                'value': (position, sample)})
+            for position, sample in enumerate((10, 15, 20, 25, 30, 15))
+        ]
+
+        def rule_for(limit):
+            # Only the threshold differs between the evaluated rules.
+            return {'query': 'mtr', 'threshold': limit,
+                    'comparison_operator': 'gt'}
+
+        with mock.patch.object(prometheus.PrometheusEvaluator,
+                               '_set_obsclient', return_value=None):
+            # mock Prometheus client
+            ev = prometheus.PrometheusEvaluator(self.CONF)
+            ev._get_metric_data = mock.Mock(return_value=metric_list)
+
+            # test transfer to alarm state
+            state, _, _, outside, _ = ev.evaluate_rule(rule_for(9))
+            self.assertEqual('alarm', state)
+            self.assertEqual(6, outside)
+
+            # test transfer to ok state
+            state, _, _, outside, _ = ev.evaluate_rule(rule_for(31))
+            self.assertEqual('ok', state)
+            self.assertEqual(0, outside)
+
+            # test trending to alarm state
+            _, trend, _, outside, _ = ev.evaluate_rule(rule_for(14))
+            self.assertEqual('alarm', trend)
+            self.assertEqual(5, outside)
+
+            # test trending to ok state
+            _, trend, _, outside, _ = ev.evaluate_rule(rule_for(20))
+            self.assertEqual('ok', trend)
+            self.assertEqual(2, outside)
--- a/requirements.txt
+++ b/requirements.txt
@ -36,6 +36,7 @@ cachetools>=1.1.6
 cotyledon>=1.7.3
 keystoneauth1>=2.1
 debtcollector>=1.2.0  # Apache-2.0
+python-observabilityclient>=0.0.4
 python-octaviaclient>=1.8.0
 python-dateutil>=2.8.2  # BSD
 python-heatclient>=1.17.0
--- a/setup.cfg
+++ b/setup.cfg
@ -57,6 +57,7 @@ aodh.alarm.rule =
    event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule
    composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule
    loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule
+    prometheus = aodh.api.controllers.v2.alarm_rules.prometheus:PrometheusRule

 aodh.evaluator =
    gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator
@ -64,6 +65,7 @@ aodh.evaluator =
    gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
    composite = aodh.evaluator.composite:CompositeEvaluator
    loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator
+    prometheus = aodh.evaluator.prometheus:PrometheusEvaluator

 aodh.notifier =
    log = aodh.notifier.log:LogAlarmNotifier