Merge "Add Prometheus evaluator"

This commit is contained in:
Zuul 2023-11-07 16:02:59 +00:00 committed by Gerrit Code Review
commit 197440c3c8
10 changed files with 226 additions and 25 deletions

View File

@ -41,7 +41,8 @@ class CompositeRule(wtypes.UserType):
threshold_plugins = None
def __init__(self):
threshold_rules = ('gnocchi_resources_threshold',
threshold_rules = ('prometheus',
'gnocchi_resources_threshold',
'gnocchi_aggregation_by_metrics_threshold',
'gnocchi_aggregation_by_resources_threshold')
CompositeRule.threshold_plugins = named.NamedExtensionManager(

View File

@ -0,0 +1,46 @@
#
# Copyright 2023 Red Hat, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log
import wsme
from wsme import types as wtypes
from aodh.api.controllers.v2 import base
LOG = log.getLogger(__name__)
class PrometheusRule(base.AlarmRule):
    """API definition of a ``prometheus`` alarm rule.

    Declares three fields: the comparison operator, the threshold and the
    Prometheus query string. ``threshold`` and ``query`` are mandatory.
    """

    comparison_operator = base.AdvEnum('comparison_operator', str,
                                       'lt', 'le', 'eq', 'ne', 'ge', 'gt',
                                       default='eq')
    "The comparison against the alarm threshold"

    threshold = wsme.wsattr(float, mandatory=True)
    "The threshold of the alarm"

    query = wsme.wsattr(wtypes.text, mandatory=True)
    "The Prometheus query"

    @staticmethod
    def validate(rule):
        """Return ``rule`` unchanged; the query itself is not checked."""
        # TODO(mmagr): validate Prometheus query maybe?
        return rule

    def as_dict(self):
        """Return ``as_dict_from_keys`` applied to the three rule fields."""
        exported_keys = ['comparison_operator', 'threshold', 'query']
        return self.as_dict_from_keys(exported_keys)

View File

@ -116,7 +116,8 @@ class CompositeEvaluator(evaluator.Evaluator):
@property
def threshold_evaluators(self):
if not self._threshold_evaluators:
threshold_types = ('gnocchi_resources_threshold',
threshold_types = ('prometheus',
'gnocchi_resources_threshold',
'gnocchi_aggregation_by_metrics_threshold',
'gnocchi_aggregation_by_resources_threshold')
self._threshold_evaluators = stevedore.NamedExtensionManager(

View File

@ -0,0 +1,78 @@
#
# Copyright 2023 Red Hat, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_config import cfg
from oslo_log import log
from observabilityclient import client
from aodh.evaluator import threshold
from aodh import keystone_client
LOG = log.getLogger(__name__)
# Configuration options defined by this module. The value of
# ``prometheus_disable_rbac`` is read in PrometheusBase.__init__ and
# forwarded as ``disable_rbac`` on every query.
OPTS = [
    cfg.BoolOpt('prometheus_disable_rbac',
                default=False,
                help='Disable RBAC for Prometheus evaluator.'),
]
class PrometheusBase(threshold.ThresholdEvaluator):
    """Threshold evaluator base that reads its data through a client.

    On construction it builds an ``observabilityclient`` client and stores
    it in ``self._prom``; ``_get_metric_data`` runs queries through it.
    """

    def __init__(self, conf):
        """Initialise the evaluator.

        :param conf: configuration object. ``prometheus_disable_rbac`` and
                     the ``service_credentials`` group are read from it.
        """
        super().__init__(conf)
        self._set_obsclient(conf)
        self._no_rbac = conf.prometheus_disable_rbac

    def _set_obsclient(self, conf):
        """Create the client from a keystone session and keep it in _prom.

        :param conf: configuration object providing the session settings
                     and ``service_credentials.interface`` /
                     ``service_credentials.region_name``.
        """
        session = keystone_client.get_session(conf)
        opts = {'interface': conf.service_credentials.interface,
                'region_name': conf.service_credentials.region_name}
        self._prom = client.Client('1', session, adapter_options=opts)

    def _get_metric_data(self, query):
        """Run ``query`` through the client and return its result.

        :param query: query string handed to the client as-is.
        :returns: whatever ``self._prom.query.query`` returns.
        """
        # Pass the query as a logging argument so the message is only
        # interpolated when the debug record is actually emitted.
        LOG.debug('Querying Prometheus instance on: %s', query)
        return self._prom.query.query(query, disable_rbac=self._no_rbac)
class PrometheusEvaluator(PrometheusBase):
    """Evaluate ``prometheus`` alarm rules against query results."""

    def _sanitize(self, metric_data):
        """Convert fetched metrics into a list of floats.

        :param metric_data: iterable of objects exposing a ``value``
                            attribute.
        :returns: list holding ``float(m.value)`` for every item, in order.
        """
        sanitized = [float(m.value) for m in metric_data]
        # Logging arguments defer interpolation until the record is emitted.
        LOG.debug('Sanited Prometheus metric data: %s to statistics: %s',
                  metric_data, sanitized)
        return sanitized

    def evaluate_rule(self, alarm_rule):
        """Evaluate alarm rule.

        :param alarm_rule: mapping with at least a ``query`` key; it is
                           also handed to ``_process_statistics``.
        :returns: state, trending state, statistics, number of samples
                  outside threshold and reason
        :raises threshold.InsufficientDataError: when the query produced no
                                                 datapoints.
        """
        query = alarm_rule['query']
        metrics = self._get_metric_data(query)
        if not metrics:
            LOG.warning('Empty result fetched from Prometheus for query %s',
                        query)
        # A falsy result (e.g. None) is normalised to an empty list so that
        # _sanitize never iterates a non-iterable and the insufficient-data
        # path below is taken instead of a TypeError.
        statistics = self._sanitize(metrics or [])
        if not statistics:
            raise threshold.InsufficientDataError('datapoints are unknown',
                                                  statistics)
        return self._process_statistics(alarm_rule, statistics)

    def _unknown_reason_data(self, alarm, statistics):
        """Build the reason data used when the alarm state is unknown.

        :param alarm: alarm being evaluated; only used in the log message.
        :param statistics: list of datapoints, possibly empty.
        :returns: result of ``_reason_data`` for disposition ``'unknown'``,
                  the number of datapoints and the latest one (or None).
        """
        LOG.warning('Transfering alarm %s on unknown reason', alarm)
        last = statistics[-1] if statistics else None
        return self._reason_data('unknown', len(statistics), last)

View File

@ -96,19 +96,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
' %(disposition)s threshold, most recent: %(most_recent)s'
% dict(reason_data, state=state), reason_data)
def evaluate_rule(self, alarm_rule):
"""Evaluate alarm rule.
:returns: state, trending state and statistics.
"""
start, end = self._bound_duration(alarm_rule)
statistics = self._statistics(alarm_rule, start, end)
statistics = self._sanitize(alarm_rule, statistics)
sufficient = len(statistics) >= alarm_rule['evaluation_periods']
if not sufficient:
raise InsufficientDataError(
'%d datapoints are unknown' % alarm_rule['evaluation_periods'],
statistics)
def _process_statistics(self, alarm_rule, statistics):
def _compare(value):
op = COMPARATORS[alarm_rule['comparison_operator']]
@ -129,6 +117,31 @@ class ThresholdEvaluator(evaluator.Evaluator):
trending_state = evaluator.ALARM if compared[-1] else evaluator.OK
return None, trending_state, statistics, number_outside, None
def evaluate_rule(self, alarm_rule):
    """Evaluate alarm rule.

    Fetches statistics for the rule's bounded duration, sanitizes them and
    hands them to ``_process_statistics``.

    :returns: the value returned by ``_process_statistics``.
    :raises InsufficientDataError: when fewer datapoints than
                                   ``evaluation_periods`` are available.
    """
    window_start, window_end = self._bound_duration(alarm_rule)
    raw_statistics = self._statistics(alarm_rule, window_start, window_end)
    statistics = self._sanitize(alarm_rule, raw_statistics)

    required = alarm_rule['evaluation_periods']
    if len(statistics) < required:
        raise InsufficientDataError(
            '%d datapoints are unknown' % required,
            statistics)

    return self._process_statistics(alarm_rule, statistics)
def _unknown_reason_data(self, alarm, statistics):
    """Build the reason data used when the alarm state is unknown.

    :param alarm: alarm being evaluated; its rule's ``evaluation_periods``
                  is reported as the expected datapoint count.
    :param statistics: datapoints that were actually obtained.
    :returns: result of ``_reason_data`` for disposition ``'unknown'``.
    """
    LOG.warning(f'Expecting {alarm.rule["evaluation_periods"]} datapoints'
                f' but only get {len(statistics)}')
    # The reason deliberately differs from the log message: its format is
    # kept stable because third-party software may depend on the old one.
    most_recent = statistics[-1] if statistics else None
    return self._reason_data('unknown',
                             alarm.rule['evaluation_periods'],
                             most_recent)
def _transition_alarm(self, alarm, state, trending_state, statistics,
outside_count, unknown_reason):
unknown = alarm.state == evaluator.UNKNOWN
@ -143,16 +156,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
return
if state == evaluator.UNKNOWN and not unknown:
LOG.warning('Expecting %(expected)d datapoints but only get '
'%(actual)d'
% {'expected': alarm.rule['evaluation_periods'],
'actual': len(statistics)})
# Reason is not same as log message because we want to keep
# consistent since thirdparty software may depend on old format.
last = None if not statistics else statistics[-1]
reason_data = self._reason_data('unknown',
alarm.rule['evaluation_periods'],
last)
reason_data = self._unknown_reason_data(alarm, statistics)
self._refresh(alarm, state, unknown_reason, reason_data)
elif state and (alarm.state != state or continuous):

View File

@ -23,6 +23,7 @@ import aodh.evaluator
import aodh.evaluator.event
import aodh.evaluator.gnocchi
import aodh.evaluator.loadbalancer
import aodh.evaluator.prometheus
import aodh.evaluator.threshold
import aodh.event
import aodh.keystone_client
@ -38,6 +39,7 @@ def list_opts():
itertools.chain(
aodh.evaluator.OPTS,
aodh.evaluator.event.OPTS,
aodh.evaluator.prometheus.OPTS,
aodh.evaluator.threshold.OPTS,
aodh.evaluator.loadbalancer.OPTS,
aodh.notifier.rest.OPTS,

View File

@ -15,6 +15,7 @@
from unittest import mock
import fixtures
import os
from oslo_utils import timeutils
from oslo_utils import uuidutils
@ -25,6 +26,12 @@ from aodh.tests import constants
from aodh.tests.unit.evaluator import base
# NOTE(mmagr): Override the PrometheusEvaluator settings to avoid
# complaints during init.
os.environ['PROMETHEUS_HOST'] = '127.0.0.1'
os.environ['PROMETHEUS_PORT'] = '666'
class BaseCompositeEvaluate(base.TestEvaluatorBase):
EVALUATOR = composite.CompositeEvaluator

View File

@ -18,11 +18,14 @@ import fixtures
import time
from unittest import mock
from observabilityclient import prometheus_client
from oslo_config import fixture as fixture_config
from stevedore import extension
from aodh import evaluator
from aodh import service
from aodh.evaluator import prometheus
from aodh.tests import base as tests_base
@ -190,3 +193,59 @@ class TestAlarmEvaluationService(tests_base.BaseTestCase):
target = svc.partition_coordinator.extract_my_subset
self.assertEqual(0, target.call_count)
self.assertEqual(0, self.threshold_eval.evaluate.call_count)
class TestPrometheusEvaluator(tests_base.BaseTestCase):
    """Unit tests for PrometheusEvaluator.evaluate_rule."""

    def setUp(self):
        super().setUp()
        conf = service.prepare_service(argv=[], config_files=[])
        self.CONF = self.useFixture(fixture_config.Config(conf)).conf

    def test_rule_evaluation(self):
        # Six samples whose timestamps are simply their positions 0..5.
        sample_values = (10, 15, 20, 25, 30, 15)
        metric_list = [
            prometheus_client.PrometheusMetric({'metric': 'mtr',
                                                'value': (stamp, value)})
            for stamp, value in enumerate(sample_values)
        ]

        with mock.patch.object(prometheus.PrometheusEvaluator,
                               '_set_obsclient', return_value=None):
            # mock Prometheus client
            ev = prometheus.PrometheusEvaluator(self.CONF)
            ev._get_metric_data = mock.Mock(return_value=metric_list)

            def evaluate_with(limit):
                # Every case uses the same query and operator; only the
                # threshold varies.
                return ev.evaluate_rule({'query': 'mtr',
                                         'threshold': limit,
                                         'comparison_operator': 'gt'})

            # test transfer to alarm state
            state, trend, stats, outside, reason = evaluate_with(9)
            self.assertEqual('alarm', state)
            self.assertEqual(6, outside)

            # test transfer to ok state
            state, trend, stats, outside, reason = evaluate_with(31)
            self.assertEqual('ok', state)
            self.assertEqual(0, outside)

            # test trending to alarm state
            state, trend, stats, outside, reason = evaluate_with(14)
            self.assertEqual('alarm', trend)
            self.assertEqual(5, outside)

            # test trending to ok state
            state, trend, stats, outside, reason = evaluate_with(20)
            self.assertEqual('ok', trend)
            self.assertEqual(2, outside)

View File

@ -36,6 +36,7 @@ cachetools>=1.1.6
cotyledon>=1.7.3
keystoneauth1>=2.1
debtcollector>=1.2.0 # Apache-2.0
python-observabilityclient>=0.0.4
python-octaviaclient>=1.8.0
python-dateutil>=2.8.2 # BSD
python-heatclient>=1.17.0

View File

@ -57,6 +57,7 @@ aodh.alarm.rule =
event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule
composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule
loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule
prometheus = aodh.api.controllers.v2.alarm_rules.prometheus:PrometheusRule
aodh.evaluator =
gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator
@ -64,6 +65,7 @@ aodh.evaluator =
gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
composite = aodh.evaluator.composite:CompositeEvaluator
loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator
prometheus = aodh.evaluator.prometheus:PrometheusEvaluator
aodh.notifier =
log = aodh.notifier.log:LogAlarmNotifier