# monasca-agent/monasca_agent/collector/checks/services_checks.py
#
# 149 lines
# 5.3 KiB
# Python

# (C) Copyright 2015-2017 Hewlett Packard Enterprise Development Company LP
import collections
from concurrent import futures
import Queue
import threading
import eventlet
import multiprocessing
import monasca_agent.collector.checks
# Fallback for the agent_config 'timeout' value handed to the eventlet
# Timeout that wraps each individual check.
DEFAULT_TIMEOUT = 180

# Upper bound on the default pool size, used only when
# multiprocessing.cpu_count() raises NotImplementedError.
DEFAULT_SIZE_POOL = 6

# NOTE(review): not referenced anywhere in the visible part of this module.
MAX_LOOP_ITERATIONS = 1000

# start_pool() logs an error when the number of live threads exceeds this.
MAX_ALLOWED_THREADS = 200

# Status recorded when a check times out or raises instead of returning.
FAILURE = "FAILURE"

# Two-slot record reused for both the status values and the event type names.
up_down = collections.namedtuple('up_down', 'UP DOWN')

Status = up_down(UP='UP', DOWN='DOWN')
EventType = up_down(
    UP="servicecheck.state_change.up",
    DOWN="servicecheck.state_change.down",
)
class ServicesCheck(monasca_agent.collector.checks.AgentCheck):
    """Base class for service checks; it should never be instantiated directly.

    Work flow:
        The main agent loop will call the check function for each instance
        for each iteration of the loop.
        The check method will make an asynchronous call to the _process
        method through the thread pool executor, which is created lazily by
        start_pool.
        The _process method will call the _check method of the inherited
        class which will perform the actual check.

        The _check method must return a tuple which first element is either
        Status.UP or Status.DOWN.
        The second element is a short error message that will be displayed
        when the service turns down.
    """

    SOURCE_TYPE_NAME = 'servicecheck'

    def __init__(self, name, init_config, agent_config, instances):
        """Initialise the bookkeeping state and derive pool size and timeout.

        The arguments are forwarded unchanged to AgentCheck.__init__.
        """
        monasca_agent.collector.checks.AgentCheck.__init__(
            self, name, init_config, agent_config, instances)

        # Recent statuses per instance name, trimmed to the instance 'window'.
        self.statuses = {}
        # Last UP/DOWN state recorded per instance name.
        self.notified = {}
        self.resultsq = Queue.Queue()
        self.nb_failures = 0
        self.pool = None
        # Futures of submitted checks keyed by instance name. Created here so
        # the attribute exists even before the pool is first started.
        self.running_jobs = {}

        # The default pool size is the smaller of the number of instances and
        # the CPU count plus one; DEFAULT_SIZE_POOL replaces the CPU based
        # bound when the CPU count cannot be determined. It can also be
        # overridden by the 'threads_count' parameter in the init_config of
        # the check.
        try:
            default_size = min(self.instance_count(),
                               multiprocessing.cpu_count() + 1)
        except NotImplementedError:
            default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))
        self.timeout = int(self.agent_config.get('timeout', DEFAULT_TIMEOUT))

    def start_pool(self):
        """Create the thread pool executor unless one already exists."""
        if self.pool is not None:
            return

        self.log.info("Starting Thread Pool Exceutor")
        self.pool = futures.ThreadPoolExecutor(max_workers=self.pool_size)

        # Read the count once so the logged value is the one that was compared.
        thread_count = threading.active_count()
        if thread_count > MAX_ALLOWED_THREADS:
            self.log.error('Thread count (%d) exceeds maximum (%d)' % (thread_count,
                                                                       MAX_ALLOWED_THREADS))
        # A fresh pool starts with no submitted jobs.
        self.running_jobs = {}

    def stop_pool(self):
        """Shut the executor down, waiting for submitted work to finish."""
        self.log.info("Stopping Thread Pool")
        if self.pool:
            self.pool.shutdown(wait=True)
            self.pool = None

    def restart_pool(self):
        """Stop the current executor, if any, and start a new one."""
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        """Submit the check for ``instance`` unless it is still running.

        :param instance: instance configuration mapping; it must provide a
                         'name' entry, otherwise an error is logged and
                         nothing is submitted.
        """
        self.start_pool()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        previous = self.running_jobs.get(name)
        if previous is None or previous.done():
            # A given instance should be processed one at a time
            self.running_jobs[name] = self.pool.submit(self._process, instance)
        else:
            self.log.info("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        """Run ``_check`` for one instance under a timeout and record the result.

        A timeout or any other exception is logged and recorded with the
        FAILURE status. A falsy return value from ``_check`` records nothing.
        """
        name = instance.get('name', None)
        try:
            with eventlet.timeout.Timeout(self.timeout):
                return_value = self._check(instance)
            if not return_value:
                return
            status, msg = return_value
            self._process_result(status, msg, name, instance)
        except eventlet.Timeout:
            msg = 'ServiceCheck {0} timed out'.format(name)
            self.log.error(msg)
            self._process_result(FAILURE, msg, name, instance)
        except Exception:
            msg = 'Failure in ServiceCheck {0}'.format(name)
            self.log.exception(msg)
            self._process_result(FAILURE, msg, name, instance)
        finally:
            # The entry may be absent: the worker can get here before check()
            # has stored the future. pop() avoids a KeyError that would
            # replace the real outcome of this call.
            self.running_jobs.pop(name, None)

    def _process_result(self, status, msg, name, queue_instance):
        """Append ``status`` to the instance history and update its state.

        :param status: Status.UP, Status.DOWN or FAILURE.
        :param msg: message that accompanies the status (not used here).
        :param name: instance name used as key in the bookkeeping dicts.
        :param queue_instance: instance configuration; the optional 'window'
                               (history length, capped at 256) and
                               'threshold' (number of DOWN entries needed to
                               record DOWN) are read from it, both
                               defaulting to 1.
        """
        history = self.statuses.setdefault(name, [])
        history.append(status)

        window = int(queue_instance.get('window', 1))
        if window > 256:
            self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
            window = 256

        # Converted like 'window' so that a value supplied as a string still
        # compares numerically.
        threshold = int(queue_instance.get('threshold', 1))

        if len(history) > window:
            history.pop(0)

        # NOTE(review): only Status.DOWN entries are counted; FAILURE entries
        # are kept in the history but do not count towards the threshold -
        # confirm this is intended.
        nb_failures = history.count(Status.DOWN)

        new_state = Status.DOWN if nb_failures >= threshold else Status.UP
        # Only write on an actual change, so an instance that has never left
        # UP gets no entry in self.notified.
        if self.notified.get(name, Status.UP) != new_state:
            self.notified[name] = new_state

    def _check(self, instance):
        """Perform the actual check; must be implemented by inherited classes.

        Implementations return a (status, message) tuple.
        """
        raise NotImplementedError