# monasca-agent/monasca_agent/collector/checks_d/json_plugin.py
# (C) Copyright 2016 Hewlett Packard Enterprise Development LP
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from copy import deepcopy
import errno
import fcntl
import json
import os
import time
from monasca_agent.collector import checks
# Values of the plugin status metric: OK when every file was processed
# cleanly, FAIL when any file produced an error (see _plugin_check_metric).
OK = 0
FAIL = 1
# name used for metrics reported directly by this module
PLUGIN_METRIC_NAME = 'monasca.json_plugin.status'
# Assumes metrics file written every 60 seconds
DEFAULT_STALE_AGE = 60 * 4  # These are too old to report
# Valid attributes of a metric; anything else marks the metric invalid
METRIC_KEYS = ['name', 'metric', 'timestamp', 'value', 'dimensions',
               'value_meta']
def _now():
"""Makes unit testing easier"""
return time.time()
class JsonPlugin(checks.AgentCheck):
    """Read measurements from JSON-formatted files

    This plugin reads measurements from JSON-formatted files.
    The format of the file is shown in the following example:

        {
            "stale_age": 300,
            "replace_timestamps": false,
            "measurements": [
                {
                    "metric": "a_metric",
                    "dimensions": {"dim1": "val1"},
                    "value": 30.0,
                    "timestamp": 1474644040
                },
                {
                    "metric": "second_metric",
                    "dimensions": {"dim2": "val2"},
                    "value": 22.4,
                    "timestamp": 1474644040
                }
            ]
        }

    In effect, the file contains a header and a list of measurements. The
    header has the following fields:

    stale_age:
        A time in seconds. If the timestamp of a measurement is
        older than this, this plugin reports the plugin status metric
        (monasca.json_plugin.status) with a value of 1. The value_meta
        contains the name of the JSON file that is aged.
        This header is optional. It defaults to 4 minutes.

    replace_timestamps:
        A boolean. If set, the next time the plugin is called, it will
        send all the measurements with a timestamp equal to the current
        time (ignoring the timestamp in the measurements list). The
        stale_age value is ignored with this setting.
        This header is optional. It defaults to false.

    measurements:
        This is a list of the measurements, formatted in the same way
        that measurements are presented to the Monasca API. However,
        if replace_timestamps is set, the timestamp key can be omitted
        (since it is set to current time).

    An alternate format for the file is that the header is omitted, i.e.,
    if the first item in the file is a list, it is assumed this is the
    measurement list and the header values are defaulted.
    """
def __init__(self, name, init_config, agent_config, instances=None,
             logger=None):
    """Set up the check.

    :param name: name of the check
    :param init_config: agent init-time configuration
    :param agent_config: agent global configuration
    :param instances: list of instance configurations
    :param logger: optional logger; defaults to the AgentCheck logger
    """
    super(JsonPlugin, self).__init__(name, init_config, agent_config,
                                     instances)
    self.log = logger or self.log
    # Maps file path -> last error message ('' when healthy); feeds the
    # plugin status metric built by _plugin_check_metric().
    self.plugin_failures = {}
    # Use _now() rather than time.time() directly, consistent with
    # check(), so unit tests can stub the clock.
    self.now = _now()
    # Maps file path -> measurements already posted, for de-duplication.
    self.posted_metrics = {}
def _plugin_failed(self, file_name, msg):
self.plugin_failures[file_name] = msg
self.log.warn('%s: %s' % (file_name, msg))
def _plugin_check_metric(self):
    """Generate a metric reporting the status of this plugin.

    :returns: a measurement dict for PLUGIN_METRIC_NAME with value OK
        when no file produced an error, or FAIL with the accumulated
        error messages in value_meta otherwise.
    """
    plugin_metric = dict(metric=PLUGIN_METRIC_NAME,
                         dimensions={},
                         value=OK,
                         timestamp=self.now)
    # If there were any failures, put the path
    # and error message into value_meta.
    errors = ['%s: %s' % (path, err)
              for path, err in self.plugin_failures.items() if err]
    msg = ', '.join(errors)
    if msg:
        if len(msg) > 1024:  # keep below the value_meta length limit
            # Keep the leading 1021 characters and mark the truncation.
            # (Was msg[:-1021], which dropped the wrong end and could
            # still exceed the limit for long messages.)
            msg = msg[:1021] + '...'
        plugin_metric.update(dict(value_meta=dict(msg=msg),
                                  value=FAIL))
    return plugin_metric
@staticmethod
def _take_shared_lock(fd):
"""Take shared lock on a file descriptor
Assuming the writer of the JSON file also takes a lock, this
function locks a file descriptor so that we can read the file
without worrying that the content is changing as we read.
Raises IOError if lock cannot be taken after a number of attempts.
:param fd: the file descriptor of the file being read
"""
max_retries = 5
delay = 0.02
attempts = 0
while True:
attempts += 1
try:
fcntl.flock(fd, fcntl.LOCK_SH | fcntl.LOCK_NB)
break
except IOError as err:
if (err.errno not in [errno.EWOULDBLOCK, errno.EACCES] or
attempts > max_retries):
raise
time.sleep(delay * attempts)
def _load_measurements_from_file(self, file_name):
    """Read, validate and de-duplicate measurements from one JSON file.

    :param file_name: path of the JSON file
    :returns: list of measurement dicts ready for posting
    """
    # Default the handling options up front so a malformed file (e.g. a
    # JSON scalar, which makes the .get() calls below raise) cannot
    # leave them unset and crash _remove_duplicate_metrics with a
    # KeyError later.
    handling = {'stale_age': DEFAULT_STALE_AGE,
                'replace_timestamps': False}
    file_data = {'measurements': []}
    try:
        with open(file_name, 'r') as f:
            # Lock so we don't read while the writer is mid-update.
            self._take_shared_lock(f)
            f.seek(0)
            file_data = json.load(f)
    except (ValueError, TypeError) as e:
        self._plugin_failed(file_name,
                            'failed parsing json: %s' % e)
    except Exception as e:  # noqa
        self._plugin_failed(file_name,
                            'loading error: %s' % e)
    try:
        if isinstance(file_data, list):
            # Headerless format: the file is just the measurement list;
            # the handling defaults above apply.
            metrics = file_data
        else:
            metrics = file_data.get('measurements', [])
            handling['stale_age'] = file_data.get('stale_age',
                                                  DEFAULT_STALE_AGE)
            handling['replace_timestamps'] = file_data.get(
                'replace_timestamps', False)
    except Exception as e:  # noqa
        self._plugin_failed(file_name,
                            'unable to process file contents: %s' % e)
        metrics = []
    metrics = self._filter_metrics(metrics, file_name)
    return self._remove_duplicate_metrics(handling, metrics, file_name)
def _filter_metrics(self, metrics, file_name):
    """Remove invalid metrics from the metric list

    Validate and clean up so the metric is suitable for passing to
    AgentCheck.gauge(). The metric might be invalid (e.g., value_meta too
    long), but that's not our concern here.

    :param metrics: list of candidate measurement dicts
    :param file_name: path of the JSON file (used in error reports)
    :returns: the valid measurements, normalized to use 'metric' (not
        'name') and to always carry a dimensions dict
    """
    invalid_metric = None
    valid_metrics = []
    for metric in metrics:
        if not isinstance(metric, dict):
            invalid_metric = metric  # not a dict
            continue
        # Fixed: previously a spurious key only 'continue'd the inner
        # key loop, so such metrics were flagged yet still accepted.
        if any(key not in METRIC_KEYS for key in metric):
            invalid_metric = metric  # spurious attribute
            continue
        if 'name' not in metric and 'metric' not in metric:
            invalid_metric = metric  # missing name
            continue
        if 'value' not in metric:
            invalid_metric = metric  # missing value
            continue
        if 'name' in metric:
            # API uses 'name'; AgentCheck uses 'metric'
            metric['metric'] = metric.pop('name')
        if not metric.get('dimensions', None):
            metric['dimensions'] = {}
        valid_metrics.append(metric)
    if invalid_metric is not None:
        # Only report one invalid metric per file. Report the offending
        # metric (previously this logged the last metric examined).
        self._plugin_failed(file_name,
                            'invalid metric found: %s' % invalid_metric)
    return valid_metrics
def _remove_duplicate_metrics(self, handling, metrics, file_name):
    """Remove metrics if we've already reported them

    We track the metrics we've posted to the Monasca Agent. This allows us
    to discard duplicate metrics. The most common cause of duplicates is
    that the agent runs more often than the update period of the JSON file.

    We also discard metrics that seem stale. This can occur when the
    program creating the metrics file has died, so the JSON file
    does not update with new metrics.

    :param handling: options for how measurements are handled
        (keys: 'replace_timestamps', 'stale_age')
    :param metrics: The metrics we found in the JSON file
    :param file_name: the path of the JSON file
    :returns: A list of metrics that should be posted
    """
    # Set timestamp if asked
    if handling['replace_timestamps']:
        for metric in metrics:
            metric['timestamp'] = self.now
        # Since we've set the timestamp, these are unique (not duplicate)
        # so no further processing is required
        return metrics
    # Remove metrics we've already posted. Also remove stale metrics.
    if file_name not in self.posted_metrics:
        self.posted_metrics[file_name] = []
    stale_metrics = False
    # Iterate over a deep copy so we can remove items from the real
    # list while iterating; note posted_metrics stores the copies,
    # not the dicts that are returned to the caller.
    for metric in deepcopy(metrics):
        if ((self.now - metric.get('timestamp', 0)) >
                handling.get('stale_age')):
            metrics.remove(metric)  # too old
            stale_metrics = True
        elif metric in self.posted_metrics[file_name]:
            metrics.remove(metric)  # already sent to Monasca
        else:
            # New metric; not stale.
            self.posted_metrics[file_name].append(metric)
    # Purge really old metrics from posted, so the per-file history
    # cannot grow without bound. Iterate a shallow copy of the list
    # since we remove from it.
    for metric in list(self.posted_metrics[file_name]):
        if ((self.now - metric.get('timestamp', 0)) >=
                handling.get('stale_age') * 2):
            self.posted_metrics[file_name].remove(metric)
    if stale_metrics:
        # Reported via the plugin status metric's value_meta.
        self._plugin_failed(file_name, 'Metrics are older than %s seconds;'
                            ' file not updating?' %
                            handling.get('stale_age'))
    return metrics
def _get_metrics(self):
reported = []
for file_name in self.metrics_files:
metrics = self._load_measurements_from_file(file_name)
for metric in metrics:
reported.append(metric)
return reported
def _load_instance_config(self, instance):
self.metrics_files = []
self.metrics_dir = instance.get('metrics_dir', '')
if self.metrics_dir:
self.plugin_failures[self.metrics_dir] = ''
try:
file_names = os.listdir(self.metrics_dir)
for name in [os.path.join(self.metrics_dir, name) for
name in file_names]:
# .json extension protects from reading .swp and similar
if os.path.isfile(name) and name.lower().endswith('.json'):
self.metrics_files.append(name)
except OSError as err:
self._plugin_failed(self.metrics_dir,
'Error processing: %s' % err)
else:
metric_file = instance.get('metrics_file', '')
if metric_file:
self.metrics_files = [metric_file]
self.log.debug('Using metrics files %s' % ','.join(self.metrics_files))
for file_name in self.metrics_files:
self.plugin_failures[file_name] = ''
def check(self, instance):
    """Collect measurements from the configured JSON files and post them.

    :param instance: the instance configuration dict
    """
    self._load_instance_config(instance)
    self.now = _now()
    # Measurements loaded from the files, plus this plugin's own
    # status metric.
    all_metrics = self._get_metrics()
    all_metrics.append(self._plugin_check_metric())
    for measurement in all_metrics:
        # apply any instance dimensions that may be configured,
        # overriding any dimension with same key that check has set.
        measurement['dimensions'] = self._set_dimensions(
            measurement['dimensions'], instance)
        self.log.debug('Posting metric: %s' % measurement)
        try:
            self.gauge(**measurement)
        except Exception as e:  # noqa
            self.log.exception('Exception while reporting metric: %s' % e)