monasca-agent/monasca_agent/collector/checks/services_checks.py

# (C) Copyright 2015 Hewlett Packard Enterprise Development Company LP
import collections
import Queue
import threading
import time

import monasca_agent.collector.checks
import monasca_agent.collector.checks.libs.thread_pool

DEFAULT_TIMEOUT = 180
DEFAULT_SIZE_POOL = 6
MAX_LOOP_ITERATIONS = 1000
MAX_ALLOWED_THREADS = 200
FAILURE = "FAILURE"
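
# Status and EventType below reuse the same two-field namedtuple as a
# lightweight enum: Status carries the state labels, EventType the event
# names published when a service changes state.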
up_down = collections.namedtuple('up_down', ['UP', 'DOWN'])
Status = up_down('UP', 'DOWN')
EventType = up_down("servicecheck.state_change.up", "servicecheck.state_change.down")


class ServicesCheck(monasca_agent.collector.checks.AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'

    """Service checks inherit from this class.

    This class should never be instantiated directly.

    Work flow:
        The main agent loop will call the check method for each instance on
        each iteration of the loop.
        The check method will make an asynchronous call to the _process
        method on one of the threads initiated in the thread pool created in
        this class's constructor.
        The _process method will call the _check method of the inheriting
        class, which will perform the actual check.

        The _check method must return a tuple whose first element is either
        Status.UP or Status.DOWN. The second element is a short error message
        that will be displayed when the service goes down.
    """

    def __init__(self, name, init_config, agent_config, instances):
        monasca_agent.collector.checks.AgentCheck.__init__(
            self, name, init_config, agent_config, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the
        # 'threads_count' parameter in the init_config of the check.
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))
        self.timeout = int(self.agent_config.get('timeout', DEFAULT_TIMEOUT))

        self.pool = monasca_agent.collector.checks.libs.thread_pool.Pool(self.pool_size)

        self.resultsq = Queue.Queue()
        self.jobs_status = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        if threading.activeCount() > MAX_ALLOWED_THREADS:
            exception = "Thread number ({0}) exceeds maximum ({1}). Skipping this check.".format(
                threading.activeCount(), MAX_ALLOWED_THREADS)
            if self.pool_size >= MAX_ALLOWED_THREADS:
                exception += " threads_count is set too high in the {0} plugin config.".format(self.name)
            else:
                exception += " Another plugin may have threads_count set too high."
            raise Exception(exception)
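
        # Before scheduling new work, drain any results the worker threads
        # have queued and restart the pool if a previous job has exceeded its
        # timeout.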
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.info("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        name = instance.get('name', None)

        try:
            return_value = self._check(instance)
            if not return_value:
                del self.jobs_status[name]
                return
            status, msg = return_value
            result = (status, msg, name, instance)
            # We put the results in the result queue
            self.resultsq.put(result)
        except Exception:
            self.log.exception('Failure in ServiceCheck {0}'.format(name))
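            # Queue a sentinel tuple so _process_results can count worker
            # failures and restart the pool if too many of them pile up.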
            result = (FAILURE, FAILURE, FAILURE, FAILURE)
            self.resultsq.put(result)

    def _process_results(self):
        for i in range(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, name, queue_instance = self.resultsq.get_nowait()
            except Queue.Empty:
                break
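
            # A FAILURE sentinel means the worker thread itself raised; if
            # enough workers have failed this way, recreate the pool.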
            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()
                continue

            event = None

            if name not in self.statuses:
                self.statuses[name] = []

            self.statuses[name].append(status)
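
            # 'window' is how many recent results are kept per instance and
            # 'threshold' is how many of them must be DOWN before the state
            # flips, e.g. window=5, threshold=3 alerts after 3 failures out
            # of the last 5 runs.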
            window = int(queue_instance.get('window', 1))

            if window > 256:
                self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                window = 256

            threshold = queue_instance.get('threshold', 1)

            if len(self.statuses[name]) > window:
                self.statuses[name].pop(0)

            nb_failures = self.statuses[name].count(Status.DOWN)

            if nb_failures >= threshold:
                if self.notified.get(name, Status.UP) != Status.DOWN:
                    event = self._create_status_event(status, msg, queue_instance)
                    self.notified[name] = Status.DOWN
            else:
                if self.notified.get(name, Status.UP) != Status.UP:
                    event = self._create_status_event(status, msg, queue_instance)
                    self.notified[name] = Status.UP

            if event is not None:
                self.events.append(event)

            # The job is finished here, this instance can be re-processed
            del self.jobs_status[name]

    def _check(self, instance):
        """This method must be implemented by inheriting classes."""
        raise NotImplementedError

    def _clean(self):
        now = time.time()
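        # Worker threads cannot be interrupted from the outside, so a job
        # that has run past the timeout is handled by restarting the pool.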
        for name, start_time in self.jobs_status.items():
            if now - start_time > self.timeout:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
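

# ---------------------------------------------------------------------------
# Illustrative sketch: a minimal subclass showing the _check contract
# described in the class docstring above. The class name and the
# 'host'/'port' instance keys are hypothetical examples; real plugins define
# their own configuration keys.
# ---------------------------------------------------------------------------
import socket  # only needed for the sketch below


class ExampleTCPCheck(ServicesCheck):

    def _check(self, instance):
        host = instance.get('host', 'localhost')  # hypothetical instance key
        port = int(instance.get('port', 80))      # hypothetical instance key
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        try:
            sock.connect((host, port))
        except (socket.timeout, socket.error) as exc:
            # Second tuple element is the short message shown when the
            # service goes down.
            return Status.DOWN, str(exc)
        finally:
            sock.close()
        return Status.UP, ''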