monasca-agent/monasca_agent/collector/daemon.py

334 lines
11 KiB
Python

#!/usr/bin/env python
# Core modules
import glob
import logging
import os
import signal
import sys
import time
# Custom modules
import checks.collector
import checks.services_checks as status_checks
import jmxfetch
import monasca_agent.common.check_status
import monasca_agent.common.config as cfg
import monasca_agent.common.daemon
import monasca_agent.common.emitter
import monasca_agent.common.util as util
# set up logging before importing any other components
util.initialize_logging('collector')
os.umask(0o22)

# Check we're not using an old version of Python. We need 2.4 or above
# because some modules (like subprocess) were only introduced in 2.4.
# The (major, minor) pair is compared as a tuple: looking at the minor
# number alone would also reject Python 3.0-3.3.
if sys.version_info[:2] < (2, 4):
    sys.stderr.write("Monasca Agent requires python 2.4 or later.\n")
    sys.exit(2)

# Constants
PID_NAME = "monasca-agent"
RESTART_INTERVAL = 4 * 24 * 60 * 60  # Defaults to 4 days (in seconds)
START_COMMANDS = ['start', 'restart', 'foreground']

# Globals
log = logging.getLogger('collector')


# todo the collector has daemon code but is always run in foreground mode
# from the supervisor, is there a reason for the daemon code then?
class CollectorDaemon(monasca_agent.common.daemon.Daemon):
    """Daemon that runs the collector main loop.

    Installs signal handlers, builds a ``checks.collector.Collector`` and
    runs it every ``check_freq`` seconds until asked to stop or until the
    auto-restart interval has elapsed.
    """

    def __init__(self, pidfile, autorestart, start_event=True):
        """Initialise the daemon state.

        :param pidfile: pid file path, forwarded to the base ``Daemon``.
        :param autorestart: forwarded to the base ``Daemon`` as the
            ``autorestart`` keyword.
        :param start_event: stored on the instance as ``start_event``; it
            is not read anywhere inside this class.
        """
        monasca_agent.common.daemon.Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event

    def _handle_sigterm(self, signum, frame):
        """Signal handler: end the run loop and stop JMXFetch and the collector."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False
        if jmxfetch.JMXFetch.is_running():
            jmxfetch.JMXFetch.stop()
        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Signal handler: shut down like SIGTERM, then exit for a restart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def info(self, verbose=None):
        """Print the latest persisted collector status and return the result.

        The root logger is raised to ERROR first so that only the status
        output is shown.
        """
        logging.getLogger().setLevel(logging.ERROR)
        return monasca_agent.common.check_status.CollectorStatus.print_latest_status(verbose=verbose)

    def _profiling_requested(self, config):
        """Return True when the ``profile`` option is the string 'yes' (any case)."""
        value = config.get('profile', False)
        try:
            return bool(value) and value.lower() == 'yes'
        except AttributeError:
            # A truthy non-string value (e.g. True) must not crash the loop.
            return False

    def _start_profiler(self):
        """Create and enable a cProfile profiler; return None on failure."""
        try:
            import cProfile
            profiler = cProfile.Profile()
            profiler.enable()
        except Exception:
            log.warning("Cannot enable profiler", exc_info=True)
            return None
        log.debug("Agent profiling is enabled")
        return profiler

    def _report_profile(self, profiler):
        """Disable ``profiler`` and write its cumulative stats to the debug log."""
        try:
            profiler.disable()
            import pstats
            try:
                from cStringIO import StringIO  # Python 2
            except ImportError:
                from io import StringIO  # Python 3
            stream = StringIO()
            stats = pstats.Stats(profiler, stream=stream).sort_stats("cumulative")
            stats.print_stats()
            log.debug(stream.getvalue())
        except Exception:
            log.warning("Cannot disable profiler", exc_info=True)

    def run(self, config):
        """Main loop of the collector.

        :param config: mapping read for ``check_freq`` (required), and the
            optional ``restart_interval`` and ``profile`` keys.

        Does not return normally: it ends the process through
        ``sys.exit`` once the loop stops.
        """
        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)
        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)
        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # Save the agent start-up stats.
        monasca_agent.common.check_status.CollectorStatus().persist()

        # Load the checks_d checks
        checksd = util.load_check_directory()
        self.collector = checks.collector.Collector(config, monasca_agent.common.emitter.http_emitter, checksd)
        check_frequency = int(config['check_freq'])

        # Initialize the auto-restarter
        self.restart_interval = int(config.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        # Run the main loop.
        while self.run_forever:
            # Enable the profiler if requested; None means "not profiling".
            profiler = None
            if self._profiling_requested(config):
                profiler = self._start_profiler()

            # Do the work.
            self.collector.run()

            # Disable the profiler and send its stats to the debug log.
            if profiler is not None:
                self._report_profile(profiler)

            # Check if we should restart.
            # NOTE(review): self.autorestart is not assigned in this class;
            # presumably the base Daemon sets it from the constructor keyword.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for the next loop if we will continue,
            # otherwise just exit quickly.
            if self.run_forever:
                time.sleep(check_frequency)

        # Now clean-up. Best effort: a failure here must not block the exit.
        try:
            monasca_agent.common.check_status.CollectorStatus.remove_latest_status()
        except Exception:
            log.debug("Could not remove the latest collector status", exc_info=True)

        # Explicitly kill the process, because it might be running
        # as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _should_restart(self):
        """Return True once more than ``restart_interval`` seconds have passed since start."""
        return time.time() - self.agent_start > self.restart_interval

    def _do_restart(self):
        """Stop the collector and exit with the supervisor's restart status."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(monasca_agent.common.daemon.AgentSupervisor.RESTART_EXIT_STATUS)


def main():
    """Command-line entry point of the collector.

    Parses the command given as the first positional argument and
    dispatches to the matching daemon, check, config-validation or JMX
    action.

    :returns: an exit status: 2 for a usage error, 3 for an unknown
        command, 0/1 for ``configcheck`` (all valid / some invalid), the
        value of ``CollectorDaemon.info`` for ``info``, and 0 otherwise.
    """
    options, args = util.get_parsed_args()
    config = cfg.Config()
    collector_config = config.get_config(['Main', 'Api', 'Logging'])
    # todo autorestart isn't used remove
    autorestart = collector_config.get('autorestart', False)

    # NOTE(review): 'configtest' is handled below as an alias of
    # 'configcheck' but is not listed here, so it is rejected as unknown.
    COMMANDS = [
        'start',
        'stop',
        'restart',
        'foreground',
        'status',
        'info',
        'check',
        'check_all',
        'configcheck',
        'jmx',
    ]

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    pid_file = util.PidFile(PID_NAME)

    if options.clean:
        pid_file.clean()

    agent = CollectorDaemon(pid_file.get_path(), autorestart)

    if command in START_COMMANDS:
        log.info('Agent version %s', config.get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                # run() requires the collector configuration.
                agent.run(collector_config)

            def parent_func():
                agent.start_event = False

            monasca_agent.common.daemon.AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.run(collector_config)

    elif 'check' == command:
        # The check name is mandatory; fail with a usage message instead of
        # an IndexError when it is missing.
        if len(args) < 2:
            sys.stderr.write("Usage: %s check <check_name>\n" % sys.argv[0])
            return 2
        check_name = args[1]
        checksd = util.load_check_directory()
        for check in checksd['initialized_checks']:
            if check.name == check_name:
                run_check(check)

    elif 'check_all' == command:
        print("Loading check directory...")
        checksd = util.load_check_directory()
        print("...directory loaded.\n")
        for check in checksd['initialized_checks']:
            run_check(check)

    elif 'configcheck' == command or 'configtest' == command:
        all_valid = True
        paths = util.Paths()
        for conf_path in glob.glob(os.path.join(paths.get_confd_path(), "*.yaml")):
            basename = os.path.basename(conf_path)
            try:
                config.check_yaml(conf_path)
            except Exception as e:
                all_valid = False
                print("%s contains errors:\n    %s" % (basename, e))
            else:
                print("%s is valid" % basename)
        if all_valid:
            print("All yaml files passed. You can now run the Monitoring agent.")
            return 0
        else:
            print("Fix the invalid yaml files above in order to start the Monitoring agent. "
                  "A useful external tool for yaml parsing can be found at "
                  "http://yaml-online-parser.appspot.com/")
            return 1

    elif 'jmx' == command:
        if len(args) < 2 or args[1] not in jmxfetch.JMX_LIST_COMMANDS:
            print("#" * 80)
            print("JMX tool to be used to help configure your JMX checks.")
            print("See http://docs.datadoghq.com/integrations/java/ for more information")
            print("#" * 80)
            print("\n")
            print("You have to specify one of the following commands:")
            # Separate loop names keep `command` intact; items() works on
            # both Python 2 and 3.
            for jmx_name, desc in jmxfetch.JMX_LIST_COMMANDS.items():
                print("      - %s [OPTIONAL: LIST OF CHECKS]: %s" % (jmx_name, desc))
            print("Example: sudo /etc/init.d/monasca-agent jmx list_matching_attributes tomcat jmx solr")
            print("\n")
        else:
            jmx_command = args[1]
            checks_list = args[2:]
            paths = util.Paths()
            confd_path = paths.get_confd_path()
            # Start JMXFetch if needed
            should_run = jmxfetch.JMXFetch.init(confd_path,
                                                config,
                                                15,
                                                jmx_command,
                                                checks_list,
                                                reporter="console")
            if not should_run:
                print("Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_path)
                print("Have you enabled any JMX checks ?")

    return 0


def run_check(check):
    """Run a single check twice and print the metrics it collected.

    The check is executed two times, one second apart, so that rate
    metrics have two samples to work with. Checks that are instances of
    ``status_checks.ServicesCheck`` get one more second for their worker
    threads before their pool is stopped.

    :param check: an initialized check exposing ``name``, ``run()`` and
        ``get_metrics()`` (plus ``stop_pool()`` for services checks).
    """
    uses_thread_pool = isinstance(check, status_checks.ServicesCheck)
    banner = "#" * 80

    print(banner)
    print("Check name: '{0}'\n".format(check.name))

    # First sample, a one second pause, then the second sample.
    check.run()
    time.sleep(1)
    check.run()

    if uses_thread_pool:
        # Give the asynchronous workers a second to finish before stopping.
        time.sleep(1)
        check.stop_pool()

    print("Metrics: ")
    check.get_metrics(prettyprint=True)
    print(banner + "\n\n")


if __name__ == '__main__':
    # Script entry point: run main() and use its return value as the
    # process exit status.
    try:
        exit_code = main()
    except Exception:
        # Make a best-effort attempt to record the failure; a logging
        # problem must not hide the original error, which is re-raised.
        try:
            log.exception("Uncaught error running the Agent")
        except Exception:
            pass
        raise
    else:
        sys.exit(exit_code)