# (C) Copyright 2016 Hewlett Packard Enterprise Development LP
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from copy import deepcopy
import errno
import fcntl
import json
import os
import time

from monasca_agent.collector import checks

OK = 0
FAIL = 1

# name used for metrics reported directly by this module
PLUGIN_METRIC_NAME = 'monasca.json_plugin.status'

# Assumes metrics file written every 60 seconds
DEFAULT_STALE_AGE = 60 * 4  # These are too old to report

# Valid attributes of a metric
METRIC_KEYS = ['name', 'metric', 'timestamp', 'value', 'dimensions',
               'value_meta']


def _now():
    """Return the current time in seconds; isolated so unit tests can stub it."""
    return time.time()


class JsonPlugin(checks.AgentCheck):
    """Read measurements from JSON-formatted files.

    This plugin reads measurements from JSON-formatted files. The format
    of the file is shown in the following example:

        {
            "stale_age": 300,
            "replace_timestamps": false,
            "measurements": [
                {
                    "metric": "a_metric",
                    "dimensions": {"dim1": "val1"},
                    "value": 30.0,
                    "timestamp": 1474644040
                },
                {
                    "metric": "second_metric",
                    "dimensions": {"dim2": "val2"},
                    "value": 22.4,
                    "timestamp": 1474644040
                }
            ]
        }

    In effect, the file contains a header and a list of measurements. The
    header has the following fields:

    stale_age:
        A time in seconds. If the timestamp of a measurement is older
        than this, this plugin reports a json_plugin.check metric with a
        value of 1. The value_meta contains the name of the JSON file
        that is aged. This header is optional. It defaults to 4 minutes.

    replace_timestamps:
        A boolean. If set, the next time the plugin is called, it will
        send all the measurements with a timestamp equal to the current
        time (ignoring the timestamp in the measurements list). The
        stale_age value is ignored with this setting. This header is
        optional. It defaults to false.

    measurements:
        This is a list of the measurements, formatted in the same way
        that measurements are presented to the Monasca API. However, if
        replace_timestamps is set, the timestamp key can be omitted
        (since it is set to current time).

    An alternate format for the file is that the header is omitted, i.e.,
    if the first item in the file is a list, it is assumed this is the
    measurement list and the header values are defaulted.
    """

    def __init__(self, name, init_config, agent_config, instances=None,
                 logger=None):
        super(JsonPlugin, self).__init__(name, init_config, agent_config,
                                         instances)
        self.log = logger or self.log
        # Maps file path -> last error message ('' when the file is healthy)
        self.plugin_failures = {}
        # Refreshed at the start of every check() run via _now()
        self.now = time.time()
        # Maps file path -> metrics already sent, used to suppress duplicates
        self.posted_metrics = {}

    def _plugin_failed(self, file_name, msg):
        """Record and log a failure for the given file.

        :param file_name: path of the JSON file (or directory) that failed
        :param msg: human-readable description of the failure
        """
        self.plugin_failures[file_name] = msg
        # logging's warn() is a deprecated alias of warning()
        self.log.warning('%s: %s' % (file_name, msg))

    def _plugin_check_metric(self):
        """Generate a metric reporting the status of this plugin.

        :returns: a metric dict with value OK, or FAIL with the error
                  messages joined into value_meta['msg']
        """
        plugin_metric = dict(metric=PLUGIN_METRIC_NAME,
                             dimensions={},
                             value=OK,
                             timestamp=self.now)
        # If there were any failures, put the path
        # and error message into value_meta.
        errors = []
        for path, err in self.plugin_failures.items():
            if err:
                errors.append('%s: %s' % (path, err))
        msg = ''
        if errors:
            msg = ', '.join(errors)
        if msg:
            if len(msg) > 1024:  # keep well below length limit
                # Keep the first 1021 characters plus an ellipsis so the
                # result is exactly 1024 characters long.
                msg = msg[:1021] + '...'
            plugin_metric.update(dict(value_meta=dict(msg=msg),
                                      value=FAIL))
        return plugin_metric

    @staticmethod
    def _take_shared_lock(fd):
        """Take shared lock on a file descriptor.

        Assuming the writer of the JSON file also takes a lock, this
        function locks a file descriptor so that we can read the file
        without worrying that the content is changing as we read.

        Raises IOError if lock cannot be taken after a number of attempts.

        :param fd: the file descriptor of the file being read
        """
        max_retries = 5
        delay = 0.02
        attempts = 0
        while True:
            attempts += 1
            try:
                fcntl.flock(fd, fcntl.LOCK_SH | fcntl.LOCK_NB)
                break
            except IOError as err:
                # Non-blocking flock reports EWOULDBLOCK/EACCES when the
                # writer holds the lock; back off and retry a few times.
                if (err.errno not in [errno.EWOULDBLOCK, errno.EACCES] or
                        attempts > max_retries):
                    raise
                time.sleep(delay * attempts)

    def _load_measurements_from_file(self, file_name):
        """Read, parse and de-duplicate one JSON metrics file.

        Parse/IO failures are recorded via _plugin_failed() (surfaced
        through the plugin status metric) rather than raised, so one bad
        file cannot break the whole check.

        :param file_name: path of the JSON file to read
        :returns: list of metrics ready to be posted
        """
        handling = {}
        file_data = {'measurements': []}
        try:
            with open(file_name, 'r') as f:
                self._take_shared_lock(f)
                f.seek(0)
                file_data = json.load(f)
        except (ValueError, TypeError) as e:
            self._plugin_failed(file_name, 'failed parsing json: %s' % e)
        except Exception as e:  # noqa
            self._plugin_failed(file_name, 'loading error: %s' % e)
        try:
            if isinstance(file_data, list):
                # Headerless variant: the file is just the measurement list.
                metrics = file_data
                handling['stale_age'] = DEFAULT_STALE_AGE
                handling['replace_timestamps'] = False
            else:
                metrics = file_data.get('measurements', [])
                handling['stale_age'] = file_data.get('stale_age',
                                                      DEFAULT_STALE_AGE)
                handling['replace_timestamps'] = file_data.get(
                    'replace_timestamps', False)
        except Exception as e:  # noqa
            self._plugin_failed(file_name,
                                'unable to process file contents: %s' % e)
            metrics = []
        metrics = self._filter_metrics(metrics, file_name)
        return self._remove_duplicate_metrics(handling, metrics, file_name)

    def _filter_metrics(self, metrics, file_name):
        """Remove invalid metrics from the metric list.

        Validate and clean up so the metric is suitable for passing to
        AgentCheck.gauge(). The metric might be invalid (e.g., value_meta
        too long), but that's not our concern here.

        :param metrics: list of metric dicts parsed from the file
        :param file_name: path of the JSON file (used for error reporting)
        :returns: list of valid, normalized metric dicts
        """
        invalid_metric = None
        valid_metrics = []
        for metric in metrics:
            if not isinstance(metric, dict):
                invalid_metric = metric  # not a dict
                continue
            if any(key not in METRIC_KEYS for key in metric):
                invalid_metric = metric  # spurious attribute
                continue
            if 'name' not in metric and 'metric' not in metric:
                invalid_metric = metric  # missing name
                continue
            if 'value' not in metric:
                invalid_metric = metric  # missing value
                continue
            if 'name' in metric:
                # API uses 'name'; AgentCheck uses 'metric'
                metric['metric'] = metric.pop('name')
            if not metric.get('dimensions', None):
                metric['dimensions'] = {}
            valid_metrics.append(metric)
        if invalid_metric is not None:
            # Only report one invalid metric per file
            self._plugin_failed(file_name,
                                'invalid metric found: %s' % invalid_metric)
        return valid_metrics

    def _remove_duplicate_metrics(self, handling, metrics, file_name):
        """Remove metrics if we've already reported them.

        We track the metrics we've posted to the Monasca Agent. This
        allows us to discard duplicate metrics. The most common cause of
        duplicates is that the agent runs more often than the update
        period of the JSON file.

        We also discard metrics that seem stale. This can occur when the
        program creating the metrics file has died, so the JSON file does
        not update with new metrics.

        :param handling: options for how measurements are handled
        :param metrics: The metrics we found in the JSON file
        :param file_name: the path of the JSON file
        :returns: A list of metrics that should be posted
        """
        # Set timestamp if asked
        if handling['replace_timestamps']:
            for metric in metrics:
                metric['timestamp'] = self.now
            # Since we've set the timestamp, these are unique (not
            # duplicate) so no further processing is required
            return metrics

        # Remove metrics we've already posted. Also remove stale metrics.
        if file_name not in self.posted_metrics:
            self.posted_metrics[file_name] = []
        stale_metrics = False
        # Iterate a copy because we mutate 'metrics' while looping
        for metric in deepcopy(metrics):
            if ((self.now - metric.get('timestamp', 0)) >
                    handling.get('stale_age')):
                metrics.remove(metric)  # too old
                stale_metrics = True
            elif metric in self.posted_metrics[file_name]:
                metrics.remove(metric)  # already sent to Monasca
            else:
                # New metric; not stale.
                self.posted_metrics[file_name].append(metric)
        # Purge really old metrics from posted
        for metric in list(self.posted_metrics[file_name]):
            if ((self.now - metric.get('timestamp', 0)) >=
                    handling.get('stale_age') * 2):
                self.posted_metrics[file_name].remove(metric)
        if stale_metrics:
            self._plugin_failed(file_name,
                                'Metrics are older than %s seconds;'
                                ' file not updating?' %
                                handling.get('stale_age'))
        return metrics

    def _get_metrics(self):
        """Gather the metrics from all configured JSON files.

        :returns: combined list of metrics from every file
        """
        reported = []
        for file_name in self.metrics_files:
            metrics = self._load_measurements_from_file(file_name)
            for metric in metrics:
                reported.append(metric)
        return reported

    def _load_instance_config(self, instance):
        """Work out which JSON files to read from the instance config.

        Supports either 'metrics_dir' (read every *.json file in the
        directory) or a single 'metrics_file' path.

        :param instance: the instance configuration dict
        """
        self.metrics_files = []
        self.metrics_dir = instance.get('metrics_dir', '')
        if self.metrics_dir:
            self.plugin_failures[self.metrics_dir] = ''
            try:
                file_names = os.listdir(self.metrics_dir)
                for name in [os.path.join(self.metrics_dir, name) for name
                             in file_names]:
                    # .json extension protects from reading .swp and similar
                    if os.path.isfile(name) and name.lower().endswith('.json'):
                        self.metrics_files.append(name)
            except OSError as err:
                self._plugin_failed(self.metrics_dir,
                                    'Error processing: %s' % err)
        else:
            metric_file = instance.get('metrics_file', '')
            if metric_file:
                self.metrics_files = [metric_file]
        self.log.debug('Using metrics files %s' %
                       ','.join(self.metrics_files))
        # Clear any failure recorded on a previous run for these files
        for file_name in self.metrics_files:
            self.plugin_failures[file_name] = ''

    def check(self, instance):
        """Entry point called by the agent: read files and post metrics.

        :param instance: the instance configuration dict
        """
        self._load_instance_config(instance)
        all_metrics = []
        self.now = _now()

        # Load measurements from files
        metrics = self._get_metrics()
        all_metrics.extend(metrics)

        # Add this plugin status
        all_metrics.append(self._plugin_check_metric())

        for metric in all_metrics:
            # apply any instance dimensions that may be configured,
            # overriding any dimension with same key that check has set.
            metric['dimensions'] = self._set_dimensions(metric['dimensions'],
                                                        instance)
            self.log.debug('Posting metric: %s' % metric)
            try:
                self.gauge(**metric)
            except Exception as e:  # noqa
                self.log.exception('Exception while reporting metric: %s' % e)