# stdlib
|
|
from datetime import datetime
|
|
from datetime import timedelta
|
|
import logging
|
|
from operator import attrgetter
|
|
import sys
|
|
import time
|
|
|
|
# project
|
|
import monasca_agent.common.check_status as check_status
|
|
import monasca_agent.common.metrics as metrics
|
|
import monasca_agent.common.util as util
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
FLUSH_LOGGING_PERIOD = 20
|
|
FLUSH_LOGGING_INITIAL = 5
|
|
|
|
|
|
class Transaction(object):
    """A unit of queued work that is delivered to an endpoint with retry support.

    Tracks a unique id (assigned once by the TransactionManager), the number of
    failed flush attempts, and the earliest time at which the next flush attempt
    should occur.  Subclasses implement flush() to actually deliver the payload.
    """

    def __init__(self):
        self._id = None
        self._error_count = 0
        self._next_flush = datetime.now()
        self._size = None  # lazily computed and cached by get_size()

    def get_id(self):
        return self._id

    def set_id(self, new_id):
        # An id may only be assigned once (by TransactionManager.append).
        assert self._id is None
        self._id = new_id

    def inc_error_count(self):
        self._error_count += 1

    def get_error_count(self):
        return self._error_count

    def get_size(self):
        """Return the transaction's size in bytes, computing it on first use.

        Subclasses customize the measurement via __sizeof__ (e.g.
        MetricTransaction sizes its payload).
        """
        if self._size is None:
            self._size = sys.getsizeof(self)

        return self._size

    def get_next_flush(self):
        return self._next_flush

    def compute_next_flush(self, max_delay):
        """Schedule the next replay attempt after a failure.

        Backs off linearly with the error count (20s per recorded error) but
        never waits longer than max_delay (a timedelta).  The scheduled time is
        truncated to whole seconds.
        """
        # Transactions are replayed, try to send them faster for newer transactions
        # Send them every MAX_WAIT_FOR_REPLAY at most
        td = timedelta(seconds=self._error_count * 20)
        if td > max_delay:
            td = max_delay

        newdate = datetime.now() + td
        self._next_flush = newdate.replace(microsecond=0)

    def time_to_flush(self, now=None):
        """Return True when the transaction is due for flushing.

        Bug fix: the previous signature used ``now=datetime.now()`` as the
        default, which Python evaluates only once at class-definition time --
        every later call without an explicit ``now`` compared against that
        stale timestamp.  A None sentinel re-reads the clock per call.
        """
        if now is None:
            now = datetime.now()
        return self._next_flush < now

    def flush(self):
        raise NotImplementedError("To be implemented in a subclass")
|
|
|
|
|
|
class MetricTransaction(Transaction):
    """A transaction whose payload is a batch of metrics posted to endpoints.

    The application, transaction manager and endpoint list are shared
    class-level state, configured once at forwarder startup.
    """

    _application = None
    _trManager = None
    _endpoints = []

    @classmethod
    def set_application(cls, app):
        cls._application = app

    @classmethod
    def set_tr_manager(cls, manager):
        cls._trManager = manager

    @classmethod
    def get_tr_manager(cls):
        return cls._trManager

    @classmethod
    def set_endpoints(cls, endpoint):
        # todo we only have one endpoint option, generalize it better
        # the immediate use case for two endpoints could be our own monitoring
        # boxes, they could send to the main api and mini-mon api
        cls._endpoints.append(endpoint)

    def __init__(self, data, headers):
        self._data = data
        self._headers = headers

        # The payload must be assigned before the base initializer runs
        # (size is computed in Transaction's init).
        Transaction.__init__(self)

        # Hand the transaction to the manager so it gets queued for flushing.
        self._trManager.append(self)
        log.debug("Created transaction %d" % self.get_id())

    def __sizeof__(self):
        # Size the payload rather than the wrapper object.
        return sys.getsizeof(self._data)

    def flush(self):
        """Post the payload to every configured endpoint, reporting the
        outcome to the transaction manager and kicking off the next flush."""
        delivered = True
        try:
            for target in self._endpoints:
                target.post_metrics(self._data)
        except Exception:
            delivered = False
            log.exception('Error flushing metrics to remote endpoints')

        if delivered:
            self._trManager.tr_success(self)
        else:
            self._trManager.tr_error(self)

        self._trManager.flush_next()
|
|
|
|
|
|
class TransactionManager(util.Dimensions):
    """Holds any transaction derived object list and make sure they
    are all commited, without exceeding parameters (throttling, memory consumption)

    Queue accounting (_total_count/_total_size) is maintained incrementally in
    append() and tr_success() rather than recomputed.  A flush cycle is marked
    in-progress by a non-None _trs_to_flush; flush_next() drains that list one
    transaction at a time, throttled by _THROTTLING_DELAY.
    """

    def __init__(self, max_wait_for_replay, max_queue_size, throttling_delay, agent_config):
        # max_wait_for_replay: timedelta cap on a failed transaction's retry backoff.
        # max_queue_size: byte budget for the queue before old transactions are dropped.
        # throttling_delay: minimum timedelta between two consecutive flushes.
        super(TransactionManager, self).__init__(agent_config)
        self._MAX_WAIT_FOR_REPLAY = max_wait_for_replay
        self._MAX_QUEUE_SIZE = max_queue_size
        self._THROTTLING_DELAY = throttling_delay

        self._flush_without_ioloop = False  # useful for tests

        self._transactions = []  # List of all non commited transactions
        self._total_count = 0  # Maintain size/count not to recompute it everytime
        self._total_size = 0  # Running byte total of queued transactions
        self._flush_count = 0
        self._transactions_received = 0
        self._transactions_flushed = 0

        # Global counter to assign a number to each transaction: we may have an issue
        # if this overlaps
        self._counter = 0

        self._trs_to_flush = None  # Current transactions being flushed
        self._last_flush = datetime.now()  # Last flush (for throttling)

        # Track an initial status message.
        check_status.ForwarderStatus().persist()

    def get_transactions(self):
        return self._transactions

    def print_queue_stats(self):
        # Debug-level snapshot of the queue's count and size accounting.
        log.debug("Queue size: at %s, %s transaction(s), %s KB" %
                  (time.time(), self._total_count, (self._total_size / 1024)))

    def get_tr_id(self):
        # Monotonically increasing id; never reset for the process lifetime.
        self._counter += 1
        return self._counter

    def append(self, tr):
        """Assign an id to tr and enqueue it, evicting old entries if the
        queue's byte budget would be exceeded."""

        # Give the transaction an id
        tr.set_id(self.get_tr_id())

        # Check the size
        tr_size = tr.get_size()

        log.debug("New transaction to add, total size of queue would be: %s KB" %
                  ((self._total_size + tr_size) / 1024))

        if (self._total_size + tr_size) > self._MAX_QUEUE_SIZE:
            log.warn("Queue is too big, removing old transactions...")
            # Evict transactions whose next flush is furthest in the future
            # first (reverse-sorted by _next_flush), until tr fits.
            new_trs = sorted(self._transactions, key=attrgetter('_next_flush'), reverse=True)
            for tr2 in new_trs:
                if (self._total_size + tr_size) > self._MAX_QUEUE_SIZE:
                    self._transactions.remove(tr2)
                    self._total_count -= 1
                    self._total_size = self._total_size - tr2.get_size()
                    log.warn("Removed transaction %s from queue" % tr2.get_id())

        # Done
        self._transactions.append(tr)
        self._total_count += 1
        self._transactions_received += 1
        self._total_size = self._total_size + tr_size

        log.debug("Transaction %s added" % (tr.get_id()))
        self.print_queue_stats()

    def flush(self):
        """Start a flush cycle for all transactions that are due.

        No-op if a cycle is already in progress.  Also emits a
        monasca.emit_time_sec measurement (reported on the NEXT cycle, since
        creating a MetricTransaction re-enters append()) and persists
        forwarder status at the end of every call.
        """

        if self._trs_to_flush is not None:
            log.debug("A flush is already in progress, not doing anything")
            return

        to_flush = []
        # Do we have something to do ?
        now = datetime.now()
        for tr in self._transactions:
            if tr.time_to_flush(now):
                to_flush.append(tr)

        count = len(to_flush)
        # Log at info level for the first FLUSH_LOGGING_INITIAL flushes, then
        # only every FLUSH_LOGGING_PERIOD-th flush; debug otherwise.
        should_log = self._flush_count + 1 <= FLUSH_LOGGING_INITIAL or \
            (self._flush_count + 1) % FLUSH_LOGGING_PERIOD == 0
        if count > 0:
            if should_log:
                log.info("Flushing %s transaction%s during flush #%s" %
                         (count, util.plural(count), str(self._flush_count + 1)))
            else:
                log.debug("Flushing %s transaction%s during flush #%s" %
                          (count, util.plural(count), str(self._flush_count + 1)))

            timer = util.Timer()
            self._trs_to_flush = to_flush
            self.flush_next()
            # The emit time is reported on the next run.
            dimensions = self._set_dimensions({'component': 'monasca-agent'})
            emit_measurement = metrics.Measurement('monasca.emit_time_sec', time.time(), timer.step(), dimensions)
            # NOTE: constructing the transaction is enough; its __init__
            # appends it to this manager's queue.
            MetricTransaction([emit_measurement], headers={'Content-Type': 'application/json'})
        else:
            if should_log:
                log.info("No transaction to flush during flush #%s" % str(self._flush_count + 1))
            else:
                log.debug("No transaction to flush during flush #%s" % str(self._flush_count + 1))

        if self._flush_count + 1 == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." %
                     FLUSH_LOGGING_PERIOD)

        self._flush_count += 1

        check_status.ForwarderStatus(queue_length=self._total_count,
                                     queue_size=self._total_size,
                                     flush_count=self._flush_count,
                                     transactions_received=self._transactions_received,
                                     transactions_flushed=self._transactions_flushed).persist()

    def flush_next(self):
        """Flush one pending transaction from the in-progress cycle.

        Respects the throttling delay: if the last flush was too recent, the
        call is re-scheduled on the tornado ioloop (or sleeps, in tests).
        When the pending list is exhausted, the cycle is marked done by
        resetting _trs_to_flush to None.
        """

        if len(self._trs_to_flush) > 0:

            td = self._last_flush + self._THROTTLING_DELAY - datetime.now()
            # Python 2.7 has this built in, python < 2.7 don't...
            if hasattr(td, 'total_seconds'):
                delay = td.total_seconds()
            else:
                delay = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6

            if delay <= 0:
                tr = self._trs_to_flush.pop()
                self._last_flush = datetime.now()
                log.debug("Flushing transaction %d" % tr.get_id())
                try:
                    # A successful tr.flush() calls back into tr_success() and
                    # flush_next(), continuing the cycle.
                    tr.flush()
                except Exception as e:
                    log.exception(e)
                    self.tr_error(tr)
                    self.flush_next()
            else:
                # Wait a little bit more
                tornado_ioloop = util.get_tornado_ioloop()
                # NOTE(review): relies on the private _running attribute of
                # tornado's IOLoop.
                if tornado_ioloop._running:
                    tornado_ioloop.add_timeout(time.time() + delay, lambda: self.flush_next())
                elif self._flush_without_ioloop:
                    # Tornado is no started (ie, unittests), do it manually: BLOCKING
                    time.sleep(delay)
                    self.flush_next()
        else:
            self._trs_to_flush = None

    def tr_error(self, tr):
        # Record the failure and push the transaction's next flush time out
        # (linear backoff capped at _MAX_WAIT_FOR_REPLAY); it stays queued.
        tr.inc_error_count()
        tr.compute_next_flush(self._MAX_WAIT_FOR_REPLAY)
        log.warn(
            "Transaction %d in error (%s error%s), it will be replayed after %s" %
            (tr.get_id(),
             tr.get_error_count(),
             util.plural(tr.get_error_count()),
             tr.get_next_flush()))

    def tr_success(self, tr):
        # Remove the committed transaction and update the running accounting.
        log.debug("Transaction %d completed" % tr.get_id())
        self._transactions.remove(tr)
        self._total_count += -1
        self._total_size += - tr.get_size()
        self._transactions_flushed += 1
        self.print_queue_stats()
|