Allow extending the evaluator lookback window

Sometimes the alarm state flaps because we often just miss the last
datapoint. This can be solved by increasing the resources dedicated to the
metric ingestion chain or, for less critical scenarios, by allowing a bigger
lookback window.

This change allows extending the lookback window with the new
configuration option 'additional_ingestion_lag'.

Change-Id: If2aca73aea95c0c6d08afa5fbb89b949099507db
Closes-bug: #1540298
Closes-bug: #1506911
This commit is contained in:
Mehdi Abaakouk 2016-02-08 08:57:44 +01:00 committed by Mehdi Abaakouk
parent 06204adac4
commit b3874c47f1
4 changed files with 73 additions and 5 deletions

View File

@ -19,6 +19,7 @@ import operator
import six
from ceilometerclient import client as ceiloclient
from oslo_config import cfg
from oslo_log import log
from oslo_utils import timeutils
@ -38,11 +39,20 @@ COMPARATORS = {
'ne': operator.ne,
}
# Configuration options registered by the threshold evaluator module.
OPTS = [
# Non-negative number of seconds (min=0, default=0) that is added to the
# statistics query window computed in _bound_duration; the default of 0
# leaves the window unchanged.
cfg.IntOpt('additional_ingestion_lag',
min=0,
default=0,
help='The number of seconds to extend the evaluation windows '
'to compensate the reporting/ingestion lag.')
]
class ThresholdEvaluator(evaluator.Evaluator):
# the sliding evaluation window is extended to allow
# for reporting/ingestion lag
# the reporting/ingestion lag this can be increased
# with 'additional_ingestion_lag' seconds if needed.
look_back = 1
def __init__(self, conf):
@ -63,17 +73,17 @@ class ThresholdEvaluator(evaluator.Evaluator):
return self._cm_client
@classmethod
def _bound_duration(cls, rule):
def _bound_duration(self, rule):
"""Bound the duration of the statistics query."""
now = timeutils.utcnow()
# when exclusion of weak datapoints is enabled, we extend
# the look-back period so as to allow a clearer sample count
# trend to be established
look_back = (cls.look_back if not rule.get('exclude_outliers')
look_back = (self.look_back if not rule.get('exclude_outliers')
else rule['evaluation_periods'])
window = ((rule.get('period', None) or rule['granularity'])
* (rule['evaluation_periods'] + look_back))
* (rule['evaluation_periods'] + look_back) +
self.conf.additional_ingestion_lag)
start = now - datetime.timedelta(seconds=window)
LOG.debug('query stats from %(start)s to '
'%(now)s', {'start': start, 'now': now})

View File

@ -35,6 +35,7 @@ def list_opts():
itertools.chain(
aodh.evaluator.OPTS,
aodh.evaluator.event.OPTS,
aodh.evaluator.threshold.OPTS,
aodh.notifier.rest.OPTS,
aodh.queue.OPTS,
aodh.service.OPTS)),

View File

@ -193,6 +193,56 @@ class TestEvaluate(base.TestEvaluatorBase):
in zip(self.alarms, reasons, reason_datas)]
self.assertEqual(expected, self.notifier.notify.call_args_list)
@mock.patch.object(timeutils, 'utcnow')
def test_lag_configuration(self, mock_utcnow):
# Check that 'additional_ingestion_lag' moves the start of the statistics
# query window further back in time by the configured number of seconds.
# The clock is frozen so the expected timestamps are deterministic.
mock_utcnow.return_value = datetime.datetime(2012, 7, 2, 10, 45)
self.api_client.statistics.list.side_effect = []
# First pass: the option is left at its default.
self._set_all_alarms('ok')
self._evaluate_all_alarms()
# Second pass: same alarms, with the window extended by 42 seconds.
self._set_all_alarms('ok')
self.conf.set_override("additional_ingestion_lag", 42)
self._evaluate_all_alarms()
# The first two calls come from the first pass; the last two come from
# the second pass and have a 'ge' timestamp that is 42 seconds earlier
# (10:39:00 -> 10:38:18 and 10:20:00 -> 10:19:18).
self.assertEqual([
mock.call(
meter_name='cpu_util', period=60,
q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
{'value': 'my_instance', 'op': 'eq',
'field': 'resource_id'},
{'value': '2012-07-02T10:45:00', 'op': 'le',
'field': 'timestamp'},
{'value': '2012-07-02T10:39:00', 'op': 'ge',
'field': 'timestamp'}]),
mock.call(
meter_name='cpu_util', period=300,
q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
{'value': 'my_group', 'op': 'eq',
'field': 'metadata.user_metadata.AS'},
{'value': '2012-07-02T10:45:00', 'op': 'le',
'field': 'timestamp'},
{'value': '2012-07-02T10:20:00', 'op': 'ge',
'field': 'timestamp'}]),
mock.call(
meter_name='cpu_util', period=60,
q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
{'value': 'my_instance', 'op': 'eq',
'field': 'resource_id'},
{'value': '2012-07-02T10:45:00', 'op': 'le',
'field': 'timestamp'},
{'value': '2012-07-02T10:38:18', 'op': 'ge',
'field': 'timestamp'}]),
mock.call(
meter_name='cpu_util', period=300,
q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
{'value': 'my_group', 'op': 'eq',
'field': 'metadata.user_metadata.AS'},
{'value': '2012-07-02T10:45:00', 'op': 'le',
'field': 'timestamp'},
{'value': '2012-07-02T10:19:18', 'op': 'ge',
'field': 'timestamp'}])],
self.api_client.statistics.list.mock_calls)
def test_simple_alarm_clear(self):
self._set_all_alarms('alarm')
avgs = [self._get_stat('avg', self.alarms[0].rule['threshold'] - v)

View File

@ -0,0 +1,7 @@
---
features:
- Allow extending the alarm evaluation window to compensate for the
reporting/ingestion lag.
A new option, additional_ingestion_lag, is introduced; it defaults to 0.
It represents the number of seconds by which the window is extended.