607 lines
24 KiB
Python
607 lines
24 KiB
Python
"""Base class for Checks.
|
|
|
|
If you are writing your own checks you should subclass the AgentCheck class.
|
|
The Check class is being deprecated so don't write new checks with it.
|
|
"""
|
|
# This file uses 'print' as a function rather than a statement, a la Python3
|
|
from __future__ import print_function
|
|
|
|
import logging
|
|
import os
|
|
import pprint
|
|
import re
|
|
import time
|
|
import traceback
|
|
|
|
import yaml
|
|
|
|
import monagent.common.aggregator
|
|
import monagent.common.config
|
|
import monagent.common.exceptions
|
|
import monagent.common.keystone
|
|
import monagent.common.util
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# TODO: convert all checks to the new interface, then remove this class.
# Is the LaconicFilter on logs used elsewhere?
# =============================================================================
# DEPRECATED
# ------------------------------
# If you are writing your own check, you should inherit from AgentCheck
# and not this class. This class will be removed in a future version
# of the agent.
# =============================================================================
|
|
class Check(object):
    """(Abstract) class for all checks with the ability to:

    * store 1 (and only 1) sample for gauges per metric/tag combination
    * compute rates for counters
    * only log error messages once (instead of each time they occur)

    DEPRECATED: new checks should subclass AgentCheck instead.
    """

    def __init__(self, logger, agent_config=None):
        """Set up an empty sample store.

        :param logger: Logger used by the check; a LaconicFilter is attached
                       to it on a best-effort basis.
        :param agent_config: (optional) The agent configuration, stored as is.
        """
        # Layout of the sample store:
        #   metric_name -> {(sorted dimension items, device_name): samples}
        # Dimensions are stored as a tuple of items since dicts are not
        # hashable. Each sample is (timestamp, value, hostname, device_name).
        # Gauges hold a 1-tuple of samples, counters a list of 1 or 2 samples.
        self.agent_config = agent_config
        self._sample_store = {}
        self._counters = {}  # metric_name: bool
        self.logger = logger
        try:
            self.logger.addFilter(monagent.common.util.LaconicFilter())
        except Exception:
            self.logger.exception("Trying to install laconic log filter and failed")

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name

        prefix.b.c

        :param metric: The raw metric name
        :param prefix: (optional) Prefix joined to the name with a '.'
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is None:
            return name
        return prefix + "." + name

    @staticmethod
    def normalize_device_name(device_name):
        """Strip, lower-case and replace spaces with '_' in a device name."""
        return device_name.strip().lower().replace(' ', '_')

    def counter(self, metric):
        """Treats the metric as a counter, i.e. computes its per second derivative

        ACHTUNG: Resets previous values associated with this metric.
        """
        self._counters[metric] = True
        self._sample_store[metric] = {}

    def is_counter(self, metric):
        """Is this metric a counter?"""
        return metric in self._counters

    def gauge(self, metric):
        """Treats the metric as a gauge, i.e. keep the data as is

        ACHTUNG: Resets previous values associated with this metric.
        """
        # Drop any counter flag as well: otherwise a metric once declared as a
        # counter stays a counter and only has its samples wiped here.
        self._counters.pop(metric, None)
        self._sample_store[metric] = {}

    def is_metric(self, metric):
        """Has this metric been declared (as a gauge or a counter)?"""
        return metric in self._sample_store

    def is_gauge(self, metric):
        """Is this metric declared and not a counter?"""
        return self.is_metric(metric) and not self.is_counter(metric)

    def get_metric_names(self):
        """Get all metric names."""
        return self._sample_store.keys()

    def save_gauge(self, metric, value, timestamp=None,
                   dimensions=None, hostname=None, device_name=None):
        """Save a gauge value, declaring the metric as a gauge if needed."""
        if not self.is_gauge(metric):
            self.gauge(metric)
        self.save_sample(metric, value, timestamp, dimensions, hostname, device_name)

    def save_sample(self, metric, value, timestamp=None,
                    dimensions=None, hostname=None, device_name=None):
        """Save a simple sample, evict old values if needed.

        Gauges keep only the latest sample per (dimensions, device_name);
        counters keep the previous and the latest sample.

        :raises CheckException: if the metric was never declared or the
                                dimensions are not a dictionary
        :raises NaN: if the value cannot be cast to a metric value
        """
        if dimensions is None:
            dimensions = {}
        if timestamp is None:
            timestamp = time.time()
        if metric not in self._sample_store:
            raise monagent.common.exceptions.CheckException("Saving a sample for an undefined metric: %s" % metric)
        try:
            value = monagent.common.util.cast_metric_val(value)
        except ValueError as ve:
            raise monagent.common.exceptions.NaN(ve)

        # Validate dimensions (None was already replaced by {} above)
        if not isinstance(dimensions, dict):
            raise monagent.common.exceptions.CheckException("Dimensions must be a dictionary")

        # Data eviction rules
        key = (tuple(sorted(dimensions.items())), device_name)
        sample = (timestamp, value, hostname, device_name)
        series = self._sample_store[metric]
        if self.is_gauge(metric):
            series[key] = (sample, )
        elif self.is_counter(metric):
            if series.get(key) is None:
                series[key] = [sample]
            else:
                # Keep only the most recent previous sample plus the new one
                series[key] = series[key][-1:] + [sample]
        else:
            raise monagent.common.exceptions.CheckException("%s must be either gauge or counter, skipping sample at %s" %
                                                            (metric, time.ctime(timestamp)))

        if self.is_gauge(metric):
            # store[metric][dimensions] = (ts, val) - only 1 value allowed
            assert len(series[key]) == 1, series
        elif self.is_counter(metric):
            assert len(series[key]) in (1, 2), series

    @classmethod
    def _rate(cls, sample1, sample2):
        """Simple rate between two (timestamp, value, hostname, device_name) samples.

        :returns: (timestamp of sample2, per-second rate, hostname, device_name)
        :raises Infinity: if both samples carry the same timestamp
        :raises UnknownValue: if the value went down between the samples
        :raises NaN: on any other failure while computing the rate
        """
        try:
            interval = sample2[0] - sample1[0]
            if interval == 0:
                raise monagent.common.exceptions.Infinity()

            delta = sample2[1] - sample1[1]
            if delta < 0:
                raise monagent.common.exceptions.UnknownValue()

            # float() forces true division: without it Python 2 truncates the
            # rate when both the delta and the interval are integers.
            return (sample2[0], float(delta) / interval, sample2[2], sample2[3])
        except monagent.common.exceptions.Infinity:
            raise
        except monagent.common.exceptions.UnknownValue:
            raise
        except Exception as e:
            raise monagent.common.exceptions.NaN(e)

    def get_sample_with_timestamp(self, metric, dimensions=None, device_name=None, expire=True):
        """Get (timestamp-epoch-style, value, hostname, device_name).

        For counters the value is the rate between the last two samples; when
        `expire` is true only the latest sample is kept afterwards.

        :raises UnknownValue: if the metric was never declared or a counter
                              has fewer than two samples
        :raises KeyError: if the metric is declared but nothing was saved for
                          this dimensions/device_name combination
        """
        if dimensions is None:
            dimensions = {}

        # Get the proper dimensions
        key = (tuple(sorted(dimensions.items())), device_name)

        # Never seen this metric
        if metric not in self._sample_store:
            raise monagent.common.exceptions.UnknownValue()

        samples = self._sample_store[metric][key]

        if self.is_counter(metric):
            # Not enough value to compute rate
            if len(samples) < 2:
                raise monagent.common.exceptions.UnknownValue()
            res = self._rate(samples[-2], samples[-1])
            if expire:
                del samples[:-1]
            return res

        if self.is_gauge(metric) and len(samples) >= 1:
            return samples[-1]

        raise monagent.common.exceptions.UnknownValue()

    def get_sample(self, metric, dimensions=None, device_name=None, expire=True):
        """Return the last value for that metric."""
        x = self.get_sample_with_timestamp(metric, dimensions, device_name, expire)
        assert isinstance(x, tuple) and len(x) == 4, x
        return x[1]

    def get_samples_with_timestamps(self, expire=True):
        """Return all values {metric: (ts, value)} for non-tagged metrics.

        Metrics whose sample cannot be retrieved are left out (best effort).
        """
        values = {}
        for m in self._sample_store:
            try:
                values[m] = self.get_sample_with_timestamp(m, expire=expire)
            except Exception:
                pass
        return values

    def get_samples(self, expire=True):
        """Return all values {metric: value} for non-tagged metrics.

        Metrics whose sample cannot be retrieved are left out (best effort).
        """
        values = {}
        for m in self._sample_store:
            try:
                # Discard the timestamp
                values[m] = self.get_sample_with_timestamp(m, expire=expire)[1]
            except Exception:
                pass
        return values

    def get_metrics(self, expire=True, prettyprint=False):
        """Get all metrics, including the ones that are tagged.

        This is the preferred method to retrieve metrics

        @return the list of samples
        @rtype [(metric_name, timestamp, value,
                 {"dimensions": {"name1": "key1", "name2": "key2"}}), ...]
        """
        metrics = []
        for m in self._sample_store:
            for key in list(self._sample_store[m]):
                # Failures are handled per series so that one bad series
                # (e.g. a zero-interval rate) does not drop the others.
                try:
                    dimensions_list, device_name = key
                    dimensions = dict(dimensions_list)
                    ts, val, hostname, device_name = self.get_sample_with_timestamp(
                        m, dimensions, device_name, expire)
                    attributes = {}
                    if dimensions_list:
                        attributes['dimensions'] = dimensions
                    if hostname:
                        attributes['host_name'] = hostname
                    if device_name:
                        attributes['device_name'] = device_name
                    metrics.append((m, int(ts), val, attributes))
                except monagent.common.exceptions.UnknownValue:
                    continue
                except Exception:
                    self.logger.debug("Skipping sample for metric %s", m, exc_info=True)
        if prettyprint:
            print("Metrics: {}".format(metrics))
        return metrics
|
|
|
|
|
|
class AgentCheck(object):
    """Base class for agent checks; subclasses implement `check(instance)`.

    Metrics are submitted through a MetricsAggregator; events and warnings
    are buffered on the instance until they are collected.
    """

    # Keystone client shared through the class; replaced every time an
    # AgentCheck is constructed and handed to each instance in run().
    keystone = None

    def __init__(self, name, init_config, agent_config, instances=None):
        """Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agent_config: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        self.name = name
        self.init_config = init_config
        self.agent_config = agent_config
        self.hostname = monagent.common.util.get_hostname(agent_config)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.aggregator = monagent.common.aggregator.MetricsAggregator(
            self.hostname,
            recent_point_threshold=agent_config.get('recent_point_threshold', None))

        self.events = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None

        api_config = self.agent_config['Api']
        AgentCheck.keystone = monagent.common.keystone.Keystone(api_config['keystone_url'],
                                                                api_config['username'],
                                                                api_config['password'],
                                                                api_config['project_name'])

    def instance_count(self):
        """Return the number of instances that are configured for this check."""
        return len(self.instances)

    def gauge(self, metric, value, dimensions=None,
              hostname=None, device_name=None, timestamp=None):
        """Record the value of a gauge, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, dimensions, hostname, device_name, timestamp)

    def increment(self, metric, value=1, dimensions=None, hostname=None, device_name=None):
        """Increment a counter with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, dimensions, hostname, device_name)

    def decrement(self, metric, value=-1, dimensions=None, hostname=None, device_name=None):
        """Decrement a counter with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, dimensions, hostname, device_name)

    def rate(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Submit a point for a metric that will be calculated as a rate on flush.

        Values will persist across each call to `check` if there is not enough
        point to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, dimensions, hostname, device_name)

    def histogram(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Sample a histogram value, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, dimensions, hostname, device_name)

    def set(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Sample a set value, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, dimensions, hostname, device_name)

    def event(self, event):
        """Save an event.

        :param event: The event payload as a dictionary. Has the following
        structure:

            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "api_key": string, the api key of the account to associate the event with,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
                    Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "dimensions": (optional) a dictionary of dimensions to associate with this event
            }

        A missing (or None) "api_key" is filled in from the agent config.
        """
        if event.get('api_key') is None:
            event['api_key'] = self.agent_config['api_key']
        self.events.append(event)

    def has_events(self):
        """Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self, prettyprint=False):
        """Get all metrics, including the ones that are tagged.

        :param prettyprint: When true, also print every metric to stdout.

        @return the list of samples
        @rtype list of Measurement objects from monagent.common.metrics
        """
        # Flush exactly once: the printed metrics must be the ones returned.
        # (A second flush would return whatever the aggregator yields next,
        # presumably nothing -- TODO confirm against MetricsAggregator.)
        metrics = self.aggregator.flush()
        if prettyprint:
            for metric in metrics:
                print(" Timestamp: {}".format(metric.timestamp))
                print(" Name: {}".format(metric.name))
                print(" Value: {}".format(metric.value))
                print(" Dimensions: ", end='')
                for line, name in enumerate(metric.dimensions):
                    if line != 0:
                        # Align continuation lines under the first dimension
                        print(" " * 13, end='')
                    print("{0}={1}".format(name, metric.dimensions[name]))
                if not metric.dimensions:
                    # Terminate the "Dimensions:" line when there is nothing to list
                    print()
                print("-" * 24)
        return metrics

    def get_events(self):
        """Return a list of the events saved by the check, if any

        The internal event buffer is emptied by this call.

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        events = self.events
        self.events = []
        return events

    def has_warnings(self):
        """Check whether the instance run created any warnings."""
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """Add a warning message that will be printed in the info page

        :param warning_message: String. Warning message to be displayed
        """
        self.warnings.append(warning_message)

    def get_library_info(self):
        """Return the library versions string, computing and caching it on first use.

        Returns None when the check does not implement get_library_versions().
        """
        if self.library_versions is None:
            try:
                self.library_versions = self.get_library_versions()
            except NotImplementedError:
                pass
        # Return the value on the first call too, not only once it is cached.
        return self.library_versions

    def get_library_versions(self):
        """Should return a string that shows which version

        of the needed libraries are used
        """
        raise NotImplementedError

    def get_warnings(self):
        """Return the list of warnings messages to be displayed in the info page.

        The internal warning buffer is emptied by this call.
        """
        warnings = self.warnings
        self.warnings = []
        return warnings

    def run(self):
        """Run all instances.

        @return one InstanceStatus per configured instance, in order
        """
        # Imported here because this module does not import check_status at
        # the top; attribute access on monagent.common would otherwise only
        # work if some other module had already imported it.
        from monagent.common import check_status

        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                instance['keystone'] = AgentCheck.keystone
                self.check(instance)
                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(i,
                                                                  check_status.STATUS_WARNING,
                                                                  warnings=self.get_warnings())
                else:
                    instance_status = check_status.InstanceStatus(i,
                                                                  check_status.STATUS_OK)
            except Exception as e:
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = check_status.InstanceStatus(i,
                                                              check_status.STATUS_ERROR,
                                                              error=e,
                                                              tb=traceback.format_exc())
            instance_statuses.append(instance_status)
        return instance_statuses

    def check(self, instance):
        """Overriden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        """
        raise NotImplementedError()

    @staticmethod
    def stop():
        """To be executed when the agent is being stopped to clean ressources."""
        pass

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None, check_name=None):
        """A method used for testing your check without running the agent.

        :param path_to_yaml: (optional) Path of the YAML config; when given it
                             overrides both `yaml_text` and `check_name`
        :param agentConfig: (optional) The agent configuration dictionary
        :param yaml_text: (optional) The YAML config as a string
        :param check_name: (optional) The name given to the check
        :returns: (check, instances) tuple
        """
        if hasattr(yaml, 'CLoader'):
            Loader = yaml.CLoader
        else:
            Loader = yaml.Loader

        if path_to_yaml:
            check_name = os.path.basename(path_to_yaml).split('.')[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception('Unable to open yaml config: %s' % path_to_yaml)
            # Close the file even if reading it fails
            with f:
                yaml_text = f.read()

        # NOTE(review): yaml.load with a full (non-safe) Loader can construct
        # arbitrary Python objects; only feed it trusted configuration files.
        config = yaml.load(yaml_text, Loader=Loader)
        check = cls(check_name, config.get('init_config') or {}, agentConfig or {})

        return check, config.get('instances', [])

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name prefix.b.c

        :param metric The metric name to normalize
        :param prefix A prefix to to add to the normalized name, default None
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is None:
            return name
        return prefix + "." + name

    @staticmethod
    def read_config(instance, key, message=None, cast=None):
        """Read a mandatory value from an instance config.

        :param instance: The instance configuration dictionary
        :param key: The key to read
        :param message: (optional) Error message used when the value is missing
        :param cast: (optional) Callable applied to the value before returning it
        :raises Exception: if the key is missing or its value is None
        """
        val = instance.get(key)
        if val is None:
            message = message or 'Must provide `%s` value in instance config' % key
            raise Exception(message)

        if cast is None:
            return val
        return cast(val)
|
|
|
|
|
|
def run_check(name, path=None):
    """Run a single check once and print its events and metrics.

    :param name: The name of the check; also used to locate `<name>.yaml`
                 in the conf.d directory when `path` is not given
    :param path: (optional) Explicit path of the YAML configuration file
    :raises Exception: if the configuration cannot be opened or defines no
                       instances
    """
    import tests.common

    # Read the config file
    confd_path = path or os.path.join(monagent.common.config.get_confd_path(monagent.common.util.get_os()),
                                      '%s.yaml' % name)

    try:
        f = open(confd_path)
    except IOError:
        raise Exception('Unable to open configuration at %s' % confd_path)

    # Close the file even if reading it fails
    with f:
        config_str = f.read()

    # Run the check
    check, instances = tests.common.get_check(name, config_str)
    if not instances:
        raise Exception('YAML configuration returned no instances.')
    for instance in instances:
        check.check(instance)
        if check.has_events():
            print("Events:\n")
            pprint.pprint(check.get_events(), indent=4)
        print("Metrics:\n")
        pprint.pprint(check.get_metrics(), indent=4)
|