monasca-agent/monagent/collector/checks/check.py

607 lines
24 KiB
Python

"""Base class for Checks.
If you are writing your own checks you should subclass the AgentCheck class.
The Check class is being deprecated so don't write new checks with it.
"""
# This file uses 'print' as a function rather than a statement, a la Python3
from __future__ import print_function
import logging
import os
import pprint
import re
import time
import traceback

import yaml

import monagent.common.aggregator
import monagent.common.check_status
import monagent.common.config
import monagent.common.exceptions
import monagent.common.keystone
import monagent.common.util
log = logging.getLogger(__name__)
# TODO: convert all checks to the new interface (AgentCheck), then remove the Check class.
# Open question: is the LaconicFilter on logs used elsewhere?
# =============================================================================
# DEPRECATED
# ------------------------------
# If you are writing your own check, you should inherit from AgentCheck
# and not this class. This class will be removed in a future version
# of the agent.
# =============================================================================
class Check(object):
    """(Abstract) class for all checks with the ability to:

    * store 1 (and only 1) sample for gauges per metric/dimensions combination
    * compute rates for counters
    * only log error messages once (instead of each time they occur)

    DEPRECATED: new checks should inherit from AgentCheck instead.
    """

    def __init__(self, logger, agent_config=None):
        """Create empty sample storage and install the laconic log filter.

        :param logger: Logger used by the check; a LaconicFilter is added to it.
        :param agent_config: (optional) Agent configuration, stored unchanged.
        """
        # Layout of the sample store:
        #   {metric_name: {(dimensions_key, device_name): samples}}
        # dimensions_key is tuple(sorted(dimensions.items())); a tuple is used
        # because dicts and lists are not hashable. Samples stored without
        # dimensions or device name use the key ((), None).
        # samples is a sequence of (timestamp, value, hostname, device_name).
        self.agent_config = agent_config
        self._sample_store = {}
        self._counters = {}  # metric_name: bool
        self.logger = logger
        try:
            self.logger.addFilter(monagent.common.util.LaconicFilter())
        except Exception:
            # Best effort: the check keeps working without the filter.
            self.logger.exception("Trying to install laconic log filter and failed")

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name prefix.b.c

        :param metric: The metric name to normalize
        :param prefix: (optional) Prefix joined to the name with a dot
        :return: The normalized name, prefixed when a prefix is given
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)
        if prefix is not None:
            return prefix + "." + name
        return name

    @staticmethod
    def normalize_device_name(device_name):
        """Strip, lowercase and replace spaces with underscores in a device name."""
        return device_name.strip().lower().replace(' ', '_')

    def counter(self, metric):
        """Treats the metric as a counter, i.e. computes its per second derivative

        WARNING: Resets previous values associated with this metric.
        """
        self._counters[metric] = True
        self._sample_store[metric] = {}

    def is_counter(self, metric):
        """Is this metric a counter?"""
        return metric in self._counters

    def gauge(self, metric):
        """Treats the metric as a gauge, i.e. keep the data as is

        WARNING: Resets previous values associated with this metric.
        """
        self._sample_store[metric] = {}

    def is_metric(self, metric):
        """Has this metric been declared (as a gauge or a counter)?"""
        return metric in self._sample_store

    def is_gauge(self, metric):
        """Is this metric declared and not a counter?"""
        return self.is_metric(metric) and not self.is_counter(metric)

    def get_metric_names(self):
        """Get all metric names."""
        return self._sample_store.keys()

    def save_gauge(self, metric, value, timestamp=None,
                   dimensions=None, hostname=None, device_name=None):
        """Save a gauge value, declaring the metric as a gauge first if needed."""
        if not self.is_gauge(metric):
            self.gauge(metric)
        self.save_sample(metric, value, timestamp, dimensions, hostname, device_name)

    def save_sample(self, metric, value, timestamp=None,
                    dimensions=None, hostname=None, device_name=None):
        """Save a simple sample, evict old values if needed.

        :param metric: Name of a metric already declared via gauge()/counter()
        :param value: Sample value; converted with cast_metric_val
        :param timestamp: (optional) Epoch timestamp, defaults to time.time()
        :param dimensions: (optional) Dictionary of dimensions
        :param hostname: (optional) Hostname stored with the sample
        :param device_name: (optional) Device name, part of the storage key
        :raises CheckException: undefined metric or non-dict dimensions
        :raises NaN: the value could not be cast
        """
        if dimensions is None:
            dimensions = {}
        if timestamp is None:
            timestamp = time.time()
        if metric not in self._sample_store:
            raise monagent.common.exceptions.CheckException(
                "Saving a sample for an undefined metric: %s" % metric)
        try:
            value = monagent.common.util.cast_metric_val(value)
        except ValueError as ve:
            raise monagent.common.exceptions.NaN(ve)

        # Sort and validate dimensions
        if not isinstance(dimensions, dict):
            raise monagent.common.exceptions.CheckException("Dimensions must be a dictionary")

        # Data eviction rules
        key = (tuple(sorted(dimensions.items())), device_name)
        sample = (timestamp, value, hostname, device_name)
        store = self._sample_store[metric]
        if self.is_gauge(metric):
            # Gauges keep only the latest sample.
            store[key] = (sample, )
        elif self.is_counter(metric):
            if store.get(key) is None:
                store[key] = [sample]
            else:
                # Counters keep the previous sample so a rate can be computed.
                store[key] = store[key][-1:] + [sample]
        else:
            raise monagent.common.exceptions.CheckException(
                "%s must be either gauge or counter, skipping sample at %s" %
                (metric, time.ctime(timestamp)))

        if self.is_gauge(metric):
            # store[metric][dimensions] = (ts, val) - only 1 value allowed
            assert len(store[key]) == 1, store
        elif self.is_counter(metric):
            assert len(store[key]) in (1, 2), store

    @classmethod
    def _rate(cls, sample1, sample2):
        """Simple rate between two (timestamp, value, hostname, device_name) samples.

        :return: (sample2 timestamp, per-second rate, sample2 hostname,
                  sample2 device_name)
        :raises Infinity: both samples share the same timestamp
        :raises UnknownValue: the value decreased between the samples
        :raises NaN: any other failure while computing the rate
        """
        try:
            interval = sample2[0] - sample1[0]
            if interval == 0:
                raise monagent.common.exceptions.Infinity()
            delta = sample2[1] - sample1[1]
            if delta < 0:
                raise monagent.common.exceptions.UnknownValue()
            # float() keeps Python 2 from truncating when both operands are ints.
            return (sample2[0], float(delta) / interval, sample2[2], sample2[3])
        except monagent.common.exceptions.Infinity:
            raise
        except monagent.common.exceptions.UnknownValue:
            raise
        except Exception as e:
            raise monagent.common.exceptions.NaN(e)

    def get_sample_with_timestamp(self, metric, dimensions=None, device_name=None, expire=True):
        """Get (timestamp-epoch-style, value, hostname, device_name).

        For counters the value is the rate between the last two samples.

        :param expire: When true, drop all but the newest counter sample
                       once a rate has been computed.
        :raises UnknownValue: unknown metric, no sample stored for this
                              dimensions/device combination, or too few
                              samples to compute a rate
        """
        if dimensions is None:
            dimensions = {}
        # Get the proper dimensions
        key = (tuple(sorted(dimensions.items())), device_name)

        # Never seen this metric
        if metric not in self._sample_store:
            raise monagent.common.exceptions.UnknownValue()

        # Never seen this dimensions/device combination: report it the same
        # way as any other missing value instead of leaking a KeyError.
        samples = self._sample_store[metric].get(key)
        if not samples:
            raise monagent.common.exceptions.UnknownValue()

        if self.is_counter(metric):
            # Not enough value to compute rate
            if len(samples) < 2:
                raise monagent.common.exceptions.UnknownValue()
            res = self._rate(samples[-2], samples[-1])
            if expire:
                del samples[:-1]
            return res

        return samples[-1]

    def get_sample(self, metric, dimensions=None, device_name=None, expire=True):
        """Return the last value for that metric."""
        x = self.get_sample_with_timestamp(metric, dimensions, device_name, expire)
        assert isinstance(x, tuple) and len(x) == 4, x
        return x[1]

    def get_samples_with_timestamps(self, expire=True):
        """Return all samples {metric: sample tuple} for non-tagged metrics.

        Only samples stored without dimensions and device name are looked
        up; metrics with no such sample are left out.
        """
        values = {}
        for m in self._sample_store:
            try:
                values[m] = self.get_sample_with_timestamp(m, expire=expire)
            except Exception:
                # Best effort: skip metrics that have nothing to report.
                pass
        return values

    def get_samples(self, expire=True):
        """Return all values {metric: value} for non-tagged metrics.

        Only samples stored without dimensions and device name are looked
        up; metrics with no such sample are left out.
        """
        values = {}
        for m in self._sample_store:
            try:
                # Discard the timestamp
                values[m] = self.get_sample_with_timestamp(m, expire=expire)[1]
            except Exception:
                # Best effort: skip metrics that have nothing to report.
                pass
        return values

    def get_metrics(self, expire=True, prettyprint=False):
        """Get all metrics, including the ones that are tagged.

        This is the preferred method to retrieve metrics

        @return the list of samples
        @rtype [(metric_name, timestamp, value,
                 {"dimensions": {"name1": "key1", "name2": "key2"}}), ...]
        """
        metrics = []
        for m in self._sample_store:
            try:
                for key in self._sample_store[m]:
                    dimensions_list, device_name = key
                    dimensions = dict(dimensions_list)
                    try:
                        ts, val, hostname, device_name = self.get_sample_with_timestamp(
                            m, dimensions, device_name, expire)
                    except monagent.common.exceptions.UnknownValue:
                        continue
                    attributes = {}
                    if dimensions_list:
                        attributes['dimensions'] = dimensions
                    if hostname:
                        attributes['host_name'] = hostname
                    if device_name:
                        attributes['device_name'] = device_name
                    metrics.append((m, int(ts), val, attributes))
            except Exception:
                # Still best effort (one bad metric must not hide the others),
                # but leave a trace instead of discarding the error silently.
                self.logger.debug("Skipping metric %s while collecting samples",
                                  m, exc_info=True)
        if prettyprint:
            print("Metrics: {}".format(metrics))
        return metrics
class AgentCheck(object):
    """Base class for checks: subclass it and implement check()."""

    # Keystone client shared by every check; (re)assigned on the class each
    # time an AgentCheck is instantiated.
    keystone = None

    def __init__(self, name, init_config, agent_config, instances=None):
        """Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agent_config: The global configuration for the agent. Must
                             contain an 'Api' section with 'keystone_url',
                             'username', 'password' and 'project_name';
                             a KeyError is raised otherwise.
        :param instances: A list of configuration objects for each instance.
        """
        self.name = name
        self.init_config = init_config
        self.agent_config = agent_config
        self.hostname = monagent.common.util.get_hostname(agent_config)
        self.log = logging.getLogger('%s.%s' % (__name__, name))
        self.aggregator = monagent.common.aggregator.MetricsAggregator(
            self.hostname,
            recent_point_threshold=agent_config.get('recent_point_threshold', None))
        self.events = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        api_config = self.agent_config['Api']
        AgentCheck.keystone = monagent.common.keystone.Keystone(api_config['keystone_url'],
                                                                api_config['username'],
                                                                api_config['password'],
                                                                api_config['project_name'])

    def instance_count(self):
        """Return the number of instances that are configured for this check."""
        return len(self.instances)

    def gauge(self, metric, value, dimensions=None,
              hostname=None, device_name=None, timestamp=None):
        """Record the value of a gauge, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, dimensions, hostname, device_name, timestamp)

    def increment(self, metric, value=1, dimensions=None, hostname=None, device_name=None):
        """Increment a counter with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, dimensions, hostname, device_name)

    def decrement(self, metric, value=-1, dimensions=None, hostname=None, device_name=None):
        """Decrement a counter with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, dimensions, hostname, device_name)

    def rate(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Submit a point for a metric that will be calculated as a rate on flush.

        Values will persist across each call to `check` if there are not
        enough points to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, dimensions, hostname, device_name)

    def histogram(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Sample a histogram value, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, dimensions, hostname, device_name)

    def set(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Sample a set value, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, dimensions, hostname, device_name)

    def event(self, event):
        """Save an event.

        When the event carries no 'api_key', the agent configuration's
        'api_key' is filled in (the event dictionary is modified in place).

        :param event: The event payload as a dictionary. Has the following
        structure:
            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "api_key": string, the api key of the account to associate the event with,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
                    Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "dimensions": (optional) a dictionary of dimensions to associate with this event
            }
        """
        if event.get('api_key') is None:
            event['api_key'] = self.agent_config['api_key']
        self.events.append(event)

    def has_events(self):
        """Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self, prettyprint=False):
        """Get all metrics, including the ones that are tagged.

        The aggregator is flushed exactly once, so the metrics printed when
        prettyprint is set are the same ones that are returned.

        :param prettyprint: When true, also print each metric to stdout.
        @return the list of samples
        @rtype list of Measurement objects from monagent.common.metrics
        """
        metrics = self.aggregator.flush()
        if prettyprint:
            for metric in metrics:
                print(" Timestamp: {}".format(metric.timestamp))
                print(" Name: {}".format(metric.name))
                print(" Value: {}".format(metric.value))
                print(" Dimensions: ", end='')
                line = 0
                for name in metric.dimensions:
                    # Indent every dimension after the first one.
                    if line != 0:
                        print(" " * 13, end='')
                    print("{0}={1}".format(name, metric.dimensions[name]))
                    line += 1
                print("-" * 24)
        return metrics

    def get_events(self):
        """Return a list of the events saved by the check, if any

        The saved events are cleared by this call.

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        events = self.events
        self.events = []
        return events

    def has_warnings(self):
        """Check whether the instance run created any warnings."""
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """Add a warning message that will be printed in the info page

        :param warning_message: String. Warning message to be displayed
        """
        self.warnings.append(warning_message)

    def get_library_info(self):
        """Return the library versions, computing and caching them on first use.

        :return: The value of get_library_versions(), or None when the
                 check does not implement it.
        """
        if self.library_versions is None:
            try:
                self.library_versions = self.get_library_versions()
            except NotImplementedError:
                # The check does not report library versions.
                pass
        return self.library_versions

    def get_library_versions(self):
        """Should return a string that shows which version
        of the needed libraries are used
        """
        raise NotImplementedError

    def get_warnings(self):
        """Return the list of warnings messages to be displayed in the info page.

        The saved warnings are cleared by this call.
        """
        warnings = self.warnings
        self.warnings = []
        return warnings

    def run(self):
        """Run all instances.

        Each instance dictionary gets the shared keystone client under the
        'keystone' key before check() is called. A failing instance is
        logged and reported as an error status; it does not stop the
        remaining instances.

        :return: A list with one InstanceStatus per configured instance.
        """
        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                instance['keystone'] = AgentCheck.keystone
                self.check(instance)
                if self.has_warnings():
                    instance_status = monagent.common.check_status.InstanceStatus(
                        i,
                        monagent.common.check_status.STATUS_WARNING,
                        warnings=self.get_warnings())
                else:
                    instance_status = monagent.common.check_status.InstanceStatus(
                        i,
                        monagent.common.check_status.STATUS_OK)
            except Exception as e:
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = monagent.common.check_status.InstanceStatus(
                    i,
                    monagent.common.check_status.STATUS_ERROR,
                    error=e,
                    tb=traceback.format_exc())
            instance_statuses.append(instance_status)
        return instance_statuses

    def check(self, instance):
        """Overridden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        """
        raise NotImplementedError()

    @staticmethod
    def stop():
        """To be executed when the agent is being stopped to clean resources."""
        pass

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None, check_name=None):
        """A method used for testing your check without running the agent.

        :param path_to_yaml: (optional) Path of the YAML config; when given,
                             it overrides both yaml_text and check_name
                             (the name is taken from the file's base name).
        :param agentConfig: (optional) Agent configuration passed to the check
        :param yaml_text: (optional) YAML config given directly as text
        :param check_name: (optional) Name of the check
        :return: (check instance, list of configured instances)
        """
        if hasattr(yaml, 'CLoader'):
            Loader = yaml.CLoader
        else:
            Loader = yaml.Loader
        if path_to_yaml:
            check_name = os.path.basename(path_to_yaml).split('.')[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception('Unable to open yaml config: %s' % path_to_yaml)
            # Close the file even when reading fails.
            with f:
                yaml_text = f.read()
        # NOTE(security): yaml.load with a full Loader can construct arbitrary
        # Python objects; only feed it trusted configuration.
        config = yaml.load(yaml_text, Loader=Loader)
        check = cls(check_name, config.get('init_config') or {}, agentConfig or {})
        return check, config.get('instances', [])

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name prefix.b.c

        :param metric The metric name to normalize
        :param prefix A prefix to add to the normalized name, default None
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)
        if prefix is not None:
            return prefix + "." + name
        return name

    @staticmethod
    def read_config(instance, key, message=None, cast=None):
        """Read a mandatory value from an instance configuration.

        :param instance: The instance configuration dictionary
        :param key: The key to read
        :param message: (optional) Error message used when the key is missing
        :param cast: (optional) Callable applied to the value before returning
        :raises Exception: the key is missing or its value is None
        """
        val = instance.get(key)
        if val is None:
            message = message or 'Must provide `%s` value in instance config' % key
            raise Exception(message)
        if cast is None:
            return val
        return cast(val)
def run_check(name, path=None):
    """Run a single check once against its configuration and print the results.

    :param name: The name of the check; also the base name of the default
                 configuration file ('<name>.yaml' in the conf.d directory).
    :param path: (optional) Explicit path of the configuration file to use.
    :raises Exception: the configuration cannot be opened or it defines
                       no instances.
    """
    # Imported inside the function rather than at module level.
    import tests.common

    # Read the config file
    confd_path = path or os.path.join(
        monagent.common.config.get_confd_path(monagent.common.util.get_os()),
        '%s.yaml' % name)
    try:
        f = open(confd_path)
    except IOError:
        raise Exception('Unable to open configuration at %s' % confd_path)
    # Close the file even when reading fails.
    with f:
        config_str = f.read()

    # Run the check
    check, instances = tests.common.get_check(name, config_str)
    if not instances:
        raise Exception('YAML configuration returned no instances.')
    for instance in instances:
        check.check(instance)
        if check.has_events():
            print("Events:\n")
            pprint.pprint(check.get_events(), indent=4)
        print("Metrics:\n")
        pprint.pprint(check.get_metrics(), indent=4)