189 lines
6.8 KiB
Python
189 lines
6.8 KiB
Python
# (C) Copyright 2015 Hewlett Packard Enterprise Development Company LP
|
|
|
|
import collections
|
|
import Queue
|
|
import threading
|
|
import time
|
|
|
|
import monasca_agent.collector.checks
|
|
import monasca_agent.collector.checks.libs.thread_pool
|
|
|
|
|
|
# Seconds a scheduled check may run before it is treated as stuck; used when
# the agent config does not provide 'timeout'.
DEFAULT_TIMEOUT = 180
# Upper bound on the default worker pool size (used when 'threads_count' is not configured).
DEFAULT_SIZE_POOL = 6
# Maximum number of queued results consumed by one pass over the result queue.
MAX_LOOP_ITERATIONS = 1000
# A check is refused while the process has more active threads than this.
MAX_ALLOWED_THREADS = 200
# Marker stored in every slot of a result tuple when a check raised an exception.
FAILURE = "FAILURE"

# Two-field tuple type holding one value per service state.
up_down = collections.namedtuple('up_down', ['UP', 'DOWN'])
# Status values a check reports.
Status = up_down(UP='UP', DOWN='DOWN')
# Event type names, one per service state.
EventType = up_down(
    UP="servicecheck.state_change.up",
    DOWN="servicecheck.state_change.down",
)
|
|
|
|
|
|
class ServicesCheck(monasca_agent.collector.checks.AgentCheck):
    """Base class for service checks executed in a pool of worker threads.

    This class should never be directly instantiated.

    Work flow:
        The main agent loop will call the check method for each instance for
        each iteration of the loop.
        The check method will make an asynchronous call to the _process method in
        one of the threads of the thread pool, which is started lazily on the
        first call to check.
        The _process method will call the _check method of the inherited class
        which will perform the actual check.

        The _check method must return a tuple whose first element is either
        Status.UP or Status.DOWN.
        The second element is a short error message that will be displayed
        when the service turns down.
    """

    SOURCE_TYPE_NAME = 'servicecheck'

    def __init__(self, name, init_config, agent_config, instances):
        """Initialise the bookkeeping; the thread pool itself is not started here."""
        monasca_agent.collector.checks.AgentCheck.__init__(self, name, init_config, agent_config, instances)

        # Most recent statuses reported for each instance name (bounded by 'window')
        self.statuses = {}
        # State (Status.UP / Status.DOWN) of the last event emitted per instance name
        self.notified = {}
        # FAILURE results seen so far; reset when it triggers a pool restart
        self.nb_failures = 0
        self.pool_started = False

    def stop(self):
        """Stop the thread pool and mark it as not started."""
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        """Create the thread pool, the result queue and the running-jobs table."""
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))
        self.timeout = int(self.agent_config.get('timeout', DEFAULT_TIMEOUT))

        self.pool = monasca_agent.collector.checks.libs.thread_pool.Pool(self.pool_size)

        self.resultsq = Queue.Queue()
        # Maps instance name -> time.time() at which its job was scheduled
        self.jobs_status = {}
        self.pool_started = True

    def stop_pool(self):
        """Terminate and join the thread pool, if started, and forget running jobs."""
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        """Stop the current pool and start a fresh one."""
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        """Collect finished results, then schedule `instance` on the thread pool.

        The instance is skipped when a job with the same name is still running.
        An instance without a 'name' is logged as an error and ignored.

        :raises Exception: when the number of active threads in the process
            exceeds MAX_ALLOWED_THREADS.
        """
        if not self.pool_started:
            self.start_pool()

        active_threads = threading.activeCount()
        if active_threads > MAX_ALLOWED_THREADS:
            message = "Thread number ({0}) exceeds maximum ({1}). Skipping this check.".format(active_threads,
                                                                                              MAX_ALLOWED_THREADS)
            if self.pool_size >= MAX_ALLOWED_THREADS:
                message += " threads_count is set too high in the {0} plugin config.".format(self.name)
            else:
                message += " Another plugin may have threads_count set too high."
            raise Exception(message)

        self._process_results()
        self._clean()

        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name in self.jobs_status:
            self.log.info("Instance: %s skipped because it's already running.", name)
            return

        # A given instance should be processed one at a time
        self.jobs_status[name] = time.time()
        self.pool.apply_async(self._process, args=(instance,))

    def _process(self, instance):
        """Run `_check` for `instance` (called from a pool thread) and queue the outcome.

        A falsy return value from `_check` releases the instance without
        queueing anything. An exception is logged, releases the instance and
        queues a FAILURE result.
        """
        name = instance.get('name', None)

        try:
            return_value = self._check(instance)
            if not return_value:
                # pop() rather than del: the table may have been reset by a
                # pool restart while this job was running.
                self.jobs_status.pop(name, None)
                return
            status, msg = return_value
            # We put the results in the result queue
            self.resultsq.put((status, msg, name, instance))
        except Exception:
            self.log.exception('Failure in ServiceCheck {0}'.format(name))
            # Release the instance; otherwise it would be reported as
            # "already running" until the timeout forces a pool restart.
            self.jobs_status.pop(name, None)
            self.resultsq.put((FAILURE, FAILURE, FAILURE, FAILURE))

    def _process_results(self):
        """Drain the result queue, update status windows and emit state-change events.

        At most MAX_LOOP_ITERATIONS results are consumed per call.
        """
        for _ in range(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, name, queue_instance = self.resultsq.get_nowait()
            except Queue.Empty:
                break

            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    # restart_pool() installs a new, empty result queue
                    self.restart_pool()
                continue

            event = None

            history = self.statuses.setdefault(name, [])
            history.append(status)

            window = int(queue_instance.get('window', 1))
            if window > 256:
                self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                window = 256

            # Converted like 'window' so a value given as a string in the
            # configuration still compares numerically.
            threshold = int(queue_instance.get('threshold', 1))

            if len(history) > window:
                history.pop(0)

            nb_failures = history.count(Status.DOWN)

            # Only emit an event when the notified state actually changes
            if nb_failures >= threshold:
                if self.notified.get(name, Status.UP) != Status.DOWN:
                    event = self._create_status_event(status, msg, queue_instance)
                    self.notified[name] = Status.DOWN
            else:
                if self.notified.get(name, Status.UP) != Status.UP:
                    event = self._create_status_event(status, msg, queue_instance)
                    self.notified[name] = Status.UP

            if event is not None:
                self.events.append(event)

            # The job is finished here, this instance can be re processed.
            # pop() rather than del: the entry is gone if the pool was
            # restarted after the job had been scheduled.
            self.jobs_status.pop(name, None)

    def _check(self, instance):
        """Perform the actual check; must be implemented by inherited classes.

        Expected to return a (status, message) tuple, where status is
        Status.UP or Status.DOWN. A falsy return value makes `_process`
        discard the run without recording a result.
        """
        raise NotImplementedError

    def _clean(self):
        """Restart the pool once if any running job has exceeded the timeout."""
        now = time.time()
        # Work on a snapshot: worker threads remove entries concurrently and
        # restart_pool() clears and replaces the table.
        start_times = list(self.jobs_status.values())
        if any(now - start_time > self.timeout for start_time in start_times):
            self.log.critical("Restarting Pool. One check is stuck.")
            self.restart_pool()
|