monasca-agent/monasca_agent/collector/checks/services_checks.py

# (C) Copyright 2015 Hewlett Packard Enterprise Development Company LP
import collections
import Queue
import threading
import time

import monasca_agent.collector.checks
import monasca_agent.collector.checks.libs.thread_pool

DEFAULT_TIMEOUT = 180
DEFAULT_SIZE_POOL = 6
MAX_LOOP_ITERATIONS = 1000
MAX_ALLOWED_THREADS = 200
FAILURE = "FAILURE"
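
# Status and EventType below reuse the same two-field namedtuple as a
# lightweight enum: Status carries the state labels, EventType the event
# names published when a service changes state.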
up_down = collections.namedtuple('up_down', ['UP', 'DOWN'])
Status = up_down('UP', 'DOWN')
EventType = up_down("servicecheck.state_change.up", "servicecheck.state_change.down")


class ServicesCheck(monasca_agent.collector.checks.AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'

    """Service checks inherit from this class.

    This class should never be instantiated directly.

    Work flow:
        The main agent loop will call the check method for each instance on
        each iteration of the loop.
        The check method will make an asynchronous call to the _process
        method on one of the threads initiated in the thread pool created in
        this class's constructor.
        The _process method will call the _check method of the inheriting
        class, which will perform the actual check.

        The _check method must return a tuple whose first element is either
        Status.UP or Status.DOWN. The second element is a short error message
        that will be displayed when the service goes down.
    """

    def __init__(self, name, init_config, agent_config, instances):
        monasca_agent.collector.checks.AgentCheck.__init__(
            self, name, init_config, agent_config, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the
        # 'threads_count' parameter in the init_config of the check.
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))
        self.timeout = int(self.agent_config.get('timeout', DEFAULT_TIMEOUT))

        self.pool = monasca_agent.collector.checks.libs.thread_pool.Pool(self.pool_size)

        self.resultsq = Queue.Queue()
        self.jobs_status = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        if threading.activeCount() > MAX_ALLOWED_THREADS:
            exception = "Thread number ({0}) exceeds maximum ({1}). Skipping this check.".format(
                threading.activeCount(), MAX_ALLOWED_THREADS)
            if self.pool_size >= MAX_ALLOWED_THREADS:
                exception += " threads_count is set too high in the {0} plugin config.".format(self.name)
            else:
                exception += " Another plugin may have threads_count set too high."
            raise Exception(exception)
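
        # Before scheduling new work, drain any results the worker threads
        # have queued and restart the pool if a previous job has exceeded its
        # timeout.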
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.info("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        name = instance.get('name', None)

        try:
            return_value = self._check(instance)
            if not return_value:
                del self.jobs_status[name]
                return
            status, msg = return_value
            result = (status, msg, name, instance)
            # We put the results in the result queue
            self.resultsq.put(result)
        except Exception:
            self.log.exception('Failure in ServiceCheck {0}'.format(name))
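            # Queue a sentinel tuple so _process_results can count worker
            # failures and restart the pool if too many of them pile up.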
            result = (FAILURE, FAILURE, FAILURE, FAILURE)
            self.resultsq.put(result)

    def _process_results(self):
        for i in range(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, name, queue_instance = self.resultsq.get_nowait()
            except Queue.Empty:
                break
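
            # A FAILURE sentinel means the worker thread itself raised; if
            # enough workers have failed this way, recreate the pool.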
            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()
                continue

            event = None

            if name not in self.statuses:
                self.statuses[name] = []

            self.statuses[name].append(status)
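
            # 'window' is how many recent results are kept per instance and
            # 'threshold' is how many of them must be DOWN before the state
            # flips, e.g. window=5, threshold=3 alerts after 3 failures out
            # of the last 5 runs.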
            window = int(queue_instance.get('window', 1))

            if window > 256:
                self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                window = 256

            threshold = queue_instance.get('threshold', 1)

            if len(self.statuses[name]) > window:
                self.statuses[name].pop(0)

            nb_failures = self.statuses[name].count(Status.DOWN)

            if nb_failures >= threshold:
                if self.notified.get(name, Status.UP) != Status.DOWN:
                    event = self._create_status_event(status, msg, queue_instance)
                    self.notified[name] = Status.DOWN
            else:
                if self.notified.get(name, Status.UP) != Status.UP:
                    event = self._create_status_event(status, msg, queue_instance)
                    self.notified[name] = Status.UP

            if event is not None:
                self.events.append(event)

            # The job is finished here, this instance can be re-processed
            del self.jobs_status[name]

    def _check(self, instance):
        """This method must be implemented by inheriting classes."""
        raise NotImplementedError

    def _clean(self):
        now = time.time()
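        # Worker threads cannot be interrupted from the outside, so a job
        # that has run past the timeout is handled by restarting the pool.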
        for name, start_time in self.jobs_status.items():
            if now - start_time > self.timeout:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
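

# ---------------------------------------------------------------------------
# Illustrative sketch: a minimal subclass showing the _check contract
# described in the class docstring above. The class name and the
# 'host'/'port' instance keys are hypothetical examples; real plugins define
# their own configuration keys.
# ---------------------------------------------------------------------------
import socket  # only needed for the sketch below


class ExampleTCPCheck(ServicesCheck):

    def _check(self, instance):
        host = instance.get('host', 'localhost')  # hypothetical instance key
        port = int(instance.get('port', 80))      # hypothetical instance key
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        try:
            sock.connect((host, port))
        except (socket.timeout, socket.error) as exc:
            # Second tuple element is the short message shown when the
            # service goes down.
            return Status.DOWN, str(exc)
        finally:
            sock.close()
        return Status.UP, ''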