[bradm] initial nrpe checks

This commit is contained in:
root 2014-10-29 22:30:35 -05:00
parent 1f582de7f5
commit 497ca6215f
9 changed files with 486 additions and 0 deletions

View File

@ -153,3 +153,14 @@ options:
The CPU core multiplier to use when configuring worker processes for The CPU core multiplier to use when configuring worker processes for
Glance. By default, the number of workers for each daemon is set to Glance. By default, the number of workers for each daemon is set to
twice the number of CPU cores a service unit has. twice the number of CPU cores a service unit has.
nagios_context:
default: "juju"
type: string
description: |
Used by the nrpe-external-master subordinate charm.
A string that will be prepended to instance name to set the host name
in nagios. So for instance the hostname would be something like:
juju-myservice-0
If you're running multiple environments with the same services in them
this allows you to differentiate between them.

View File

@ -0,0 +1,72 @@
#!/usr/bin/python
#
# Copyright 2012, 2013 Canonical Ltd.
#
# Author: Paul Collins <paul.collins@canonical.com>
#
# Based on http://www.eurion.net/python-snippets/snippet/Upstart%20service%20status.html
#
import sys
import dbus
class Upstart(object):
    """Small client for querying Upstart jobs over the system D-Bus."""

    def __init__(self):
        # Connect to the system bus and keep a proxy for the Upstart manager
        # object; every other lookup goes through it.
        self._bus = dbus.SystemBus()
        self._upstart = self._proxy('/com/ubuntu/Upstart')

    def _proxy(self, object_path):
        """Return the D-Bus proxy for an object path owned by Upstart."""
        return self._bus.get_object('com.ubuntu.Upstart', object_path)

    def get_job(self, job_name):
        """Return the proxy object for the job called ``job_name``."""
        job_path = self._upstart.GetJobByName(
            job_name, dbus_interface='com.ubuntu.Upstart0_6')
        return self._proxy(job_path)

    def get_properties(self, job):
        """Return all Instance properties of the instance that
        ``GetInstance([])`` resolves to for ``job``."""
        instance_path = job.GetInstance(
            [], dbus_interface='com.ubuntu.Upstart0_6.Job')
        return self.get_job_instance_properties(self._proxy(instance_path))

    def get_job_instances(self, job_name):
        """Return proxy objects for every instance of the named job."""
        job = self.get_job(job_name)
        instance_paths = job.GetAllInstances(
            [], dbus_interface='com.ubuntu.Upstart0_6.Job')
        return [self._proxy(instance_path) for instance_path in instance_paths]

    def get_job_instance_properties(self, job):
        """Return all Instance properties of an instance proxy object."""
        return job.GetAll('com.ubuntu.Upstart0_6.Instance',
                          dbus_interface=dbus.PROPERTIES_IFACE)
# Nagios plugin entry point: report whether the Upstart job named by the
# first command-line argument is running.  An "OK" line is paired with exit
# status 0 and a "CRITICAL" line with exit status 2.
#
# print() is called with a single pre-formatted string so the output is the
# same whether this runs under Python 2 or Python 3.
if len(sys.argv) < 2:
    # Without a job name there is nothing to check; report UNKNOWN (3)
    # instead of dying with an IndexError traceback.
    print('UNKNOWN: usage: %s <upstart job name>' % sys.argv[0])
    sys.exit(3)

job_name = sys.argv[1]

try:
    upstart = Upstart()
    try:
        # First try the job as a single-instance job.
        job = upstart.get_job(job_name)
        props = upstart.get_properties(job)
        if props['state'] == 'running':
            print('OK: %s is running' % job_name)
            sys.exit(0)
        else:
            print('CRITICAL: %s is not running' % job_name)
            sys.exit(2)
    except dbus.DBusException:
        # The single-instance lookup failed; fall back to enumerating every
        # instance of the job and checking each one.
        instances = upstart.get_job_instances(job_name)
        propses = [upstart.get_job_instance_properties(instance)
                   for instance in instances]
        states = dict((props['name'], props['state']) for props in propses)
        not_running = [name for name in states if states[name] != 'running']
        if not_running:
            # Join on the separator string: lists have no join() method, so
            # the previous not_running.join(', ') raised AttributeError
            # exactly when a failure needed reporting.
            print('CRITICAL: %d instances of %s not running: %s' %
                  (len(not_running), job_name, ', '.join(not_running)))
            sys.exit(2)
        else:
            # NOTE(review): a job with zero instances also lands here and is
            # reported as OK -- confirm that this is the intended result.
            print('OK: %d instances of %s running' % (len(states), job_name))
            sys.exit(0)
except dbus.DBusException:
    print("CRITICAL: failed to get properties of '%s' from upstart" % job_name)
    sys.exit(2)

View File

@ -0,0 +1,218 @@
"""Compatibility with the nrpe-external-master charm"""
# Copyright 2012 Canonical Ltd.
#
# Authors:
# Matthew Wedgwood <matthew.wedgwood@canonical.com>
import subprocess
import pwd
import grp
import os
import re
import shlex
import yaml
from charmhelpers.core.hookenv import (
config,
local_unit,
log,
relation_ids,
relation_set,
)
from charmhelpers.core.host import service
# This module adds compatibility with the nrpe-external-master and plain nrpe
# subordinate charms. To use it in your charm:
#
# 1. Update metadata.yaml
#
# provides:
# (...)
# nrpe-external-master:
# interface: nrpe-external-master
# scope: container
#
# and/or
#
# provides:
# (...)
# local-monitors:
# interface: local-monitors
# scope: container
#
# 2. Add the following to config.yaml
#
# nagios_context:
# default: "juju"
# type: string
# description: |
# Used by the nrpe subordinate charms.
# A string that will be prepended to instance name to set the host name
# in nagios. So for instance the hostname would be something like:
# juju-myservice-0
# If you're running multiple environments with the same services in them
# this allows you to differentiate between them.
#
# 3. Add custom checks (Nagios plugins) to files/nrpe-external-master
#
# 4. Update your hooks.py with something like this:
#
# from charmhelpers.contrib.charmsupport.nrpe import NRPE
# (...)
# def update_nrpe_config():
# nrpe_compat = NRPE()
# nrpe_compat.add_check(
# shortname = "myservice",
# description = "Check MyService",
# check_cmd = "check_http -w 2 -c 10 http://localhost"
# )
# nrpe_compat.add_check(
# "myservice_other",
# "Check for widget failures",
# check_cmd = "/srv/myapp/scripts/widget_check"
# )
# nrpe_compat.write()
#
# def config_changed():
# (...)
# update_nrpe_config()
#
# def nrpe_external_master_relation_changed():
# update_nrpe_config()
#
# def local_monitors_relation_changed():
# update_nrpe_config()
#
# 5. ln -s hooks.py nrpe-external-master-relation-changed
# ln -s hooks.py local-monitors-relation-changed
class CheckException(Exception):
    """Raised when a Check is given an invalid shortname."""
class Check(object):
    """One NRPE check and the Nagios service definition exported for it.

    A check has a ``shortname`` (validated against ``shortname_re``), an NRPE
    command name derived from it (``check_<shortname>``), a free-text
    ``description`` and the resolved command line ``check_cmd``.
    """

    # Only anchored at the end; re.match() anchors the start.
    shortname_re = '[A-Za-z0-9-_]+$'
    # Rendered with str.format(), hence the doubled braces around the
    # literal "define service { ... }" block.
    service_template = ("""
#---------------------------------------------------
# This file is Juju managed
#---------------------------------------------------
define service {{
use active-service
host_name {nagios_hostname}
service_description {nagios_hostname}[{shortname}] """
                        """{description}
check_command check_nrpe!{command}
servicegroups {nagios_servicegroup}
}}
""")

    def __init__(self, shortname, description, check_cmd):
        """Validate ``shortname`` and resolve ``check_cmd`` to a full path.

        Raises CheckException if ``shortname`` does not match
        ``shortname_re``.
        """
        super(Check, self).__init__()
        # XXX: could be better to calculate this from the service name
        if not re.match(self.shortname_re, shortname):
            raise CheckException("shortname must match {}".format(
                Check.shortname_re))
        self.shortname = shortname
        self.command = "check_{}".format(shortname)
        # Note: a set of invalid characters is defined by the
        # Nagios server config
        # The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()=
        self.description = description
        self.check_cmd = self._locate_cmd(check_cmd)

    def _locate_cmd(self, check_cmd):
        """Return ``check_cmd`` with its executable resolved to a full path.

        The executable is looked up relative to '/', the charm's
        files/nrpe-external-master directory and /usr/lib/nagios/plugins, in
        that order.  Returns '' (and logs) if it cannot be found or if
        ``check_cmd`` is empty.
        """
        search_path = (
            '/',
            os.path.join(os.environ['CHARM_DIR'],
                         'files/nrpe-external-master'),
            '/usr/lib/nagios/plugins',
        )
        parts = shlex.split(check_cmd)
        if not parts:
            # An empty command line has no executable to look up.
            log('Check command not found: {}'.format(check_cmd))
            return ''
        executable, arguments = parts[0], parts[1:]
        for path in search_path:
            candidate = os.path.join(path, executable)
            if os.path.exists(candidate):
                # NOTE: arguments are re-joined with plain spaces, so any
                # quoting present in the original check_cmd is not kept.
                return " ".join([candidate] + arguments)
        log('Check command not found: {}'.format(executable))
        return ''

    def write(self, nagios_context, hostname):
        """Write the NRPE command file and, if the export directory exists,
        the Nagios service definition for this check."""
        nrpe_check_file = '/etc/nagios/nrpe.d/{}.cfg'.format(
            self.command)
        with open(nrpe_check_file, 'w') as nrpe_check_config:
            nrpe_check_config.write("# check {}\n".format(self.shortname))
            nrpe_check_config.write("command[{}]={}\n".format(
                self.command, self.check_cmd))

        if not os.path.exists(NRPE.nagios_exportdir):
            log('Not writing service config as {} is not accessible'.format(
                NRPE.nagios_exportdir))
        else:
            self.write_service_config(nagios_context, hostname)

    def write_service_config(self, nagios_context, hostname):
        """Replace any exported service definition for this check with one
        rendered for ``hostname`` in service group ``nagios_context``."""
        # Remove previously exported definitions for this command.  A plain
        # suffix test is used instead of an unescaped regular expression
        # built from the command name, which treated '.' as a wildcard and
        # matched the name anywhere inside the file name.
        stale_suffix = '_{}.cfg'.format(self.command)
        for f in os.listdir(NRPE.nagios_exportdir):
            if f.endswith(stale_suffix):
                os.remove(os.path.join(NRPE.nagios_exportdir, f))

        templ_vars = {
            'nagios_hostname': hostname,
            'nagios_servicegroup': nagios_context,
            'description': self.description,
            'shortname': self.shortname,
            'command': self.command,
        }
        nrpe_service_text = Check.service_template.format(**templ_vars)
        nrpe_service_file = '{}/service__{}_{}.cfg'.format(
            NRPE.nagios_exportdir, hostname, self.command)
        with open(nrpe_service_file, 'w') as nrpe_service_config:
            nrpe_service_config.write(str(nrpe_service_text))

    def run(self):
        """Execute the check command and return its exit status.

        Returns None (after logging) when no command was located.
        """
        if not self.check_cmd:
            log('No command to run for check {}'.format(self.shortname))
            return None
        # check_cmd is a single "executable arg ..." string; without a shell
        # subprocess would treat the whole string as the program name, so it
        # is split back into an argument list first.
        return subprocess.call(shlex.split(self.check_cmd))
class NRPE(object):
    """Collects Check objects and writes their NRPE/Nagios configuration."""

    nagios_logdir = '/var/log/nagios'
    nagios_exportdir = '/var/lib/nagios/export'
    nrpe_confdir = '/etc/nagios/nrpe.d'

    def __init__(self):
        """Read the charm config and derive the Nagios host name
        (``<nagios_context>-<unit name with '/' replaced by '-'>``)."""
        super(NRPE, self).__init__()
        self.config = config()
        self.nagios_context = self.config['nagios_context']
        self.unit_name = local_unit().replace('/', '-')
        self.hostname = "{}-{}".format(self.nagios_context, self.unit_name)
        self.checks = []

    def add_check(self, *args, **kwargs):
        """Register a new Check; arguments are passed straight to Check()."""
        self.checks.append(Check(*args, **kwargs))

    def write(self):
        """Write all registered checks, restart nagios-nrpe-server and
        publish the monitors on every local-monitors relation.

        Does nothing (other than logging) when the 'nagios' user or group
        does not exist.
        """
        try:
            nagios_uid = pwd.getpwnam('nagios').pw_uid
            nagios_gid = grp.getgrnam('nagios').gr_gid
        except KeyError:
            # getpwnam()/getgrnam() raise KeyError for a missing entry.  A
            # bare "except:" here would also swallow SystemExit and
            # KeyboardInterrupt.
            log("Nagios user not set up, nrpe checks not updated")
            return

        if not os.path.exists(NRPE.nagios_logdir):
            os.mkdir(NRPE.nagios_logdir)
            os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid)

        # nrpe_monitors is nested inside monitors, so filling it in the loop
        # below also fills the structure that is published afterwards.
        nrpe_monitors = {}
        monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}}
        for nrpecheck in self.checks:
            nrpecheck.write(self.nagios_context, self.hostname)
            nrpe_monitors[nrpecheck.shortname] = {
                "command": nrpecheck.command,
            }

        service('restart', 'nagios-nrpe-server')

        for rid in relation_ids("local-monitors"):
            relation_set(relation_id=rid, monitors=yaml.dump(monitors))

View File

@ -0,0 +1,156 @@
'''
Functions for managing volumes in juju units. One volume is supported per unit.
Subordinates may have their own storage, provided it is on its own partition.
Configuration stanzas:
volume-ephemeral:
type: boolean
default: true
description: >
If false, a volume is mounted as specified in "volume-map"
If true, ephemeral storage will be used, meaning that log data
will only exist as long as the machine. YOU HAVE BEEN WARNED.
volume-map:
type: string
default: {}
description: >
YAML map of units to device names, e.g:
"{ rsyslog/0: /dev/vdb, rsyslog/1: /dev/vdb }"
Service units will raise a configure-error if volume-ephemeral
is 'false' and no volume-map value is set. Use 'juju set' to set a
value and 'juju resolved' to complete configuration.
Usage:
from charmsupport.volumes import configure_volume, VolumeConfigurationError
from charmsupport.hookenv import log, ERROR
def pre_mount_hook():
stop_service('myservice')
def post_mount_hook():
start_service('myservice')
if __name__ == '__main__':
try:
configure_volume(before_change=pre_mount_hook,
after_change=post_mount_hook)
except VolumeConfigurationError:
log('Storage could not be configured', ERROR)
'''
# XXX: Known limitations
# - fstab is neither consulted nor updated
import os
from charmhelpers.core import hookenv
from charmhelpers.core import host
import yaml
MOUNT_BASE = '/srv/juju/volumes'
class VolumeConfigurationError(Exception):
    '''Signals missing or invalid volume configuration, or a failed
    mount/unmount of the configured volume.'''
def get_config():
    '''Gather and sanity-check volume configuration data.

    Reads the "volume-ephemeral" and "volume-map" charm options and returns
    a dict with the keys "ephemeral" (bool), "device" (the volume-map entry
    for this unit, or None) and "mountpoint" (a per-unit directory under
    MOUNT_BASE).  Returns None if any problem was found; each problem is
    logged.
    '''
    volume_config = {}
    config = hookenv.config()
    errors = False

    volume_config['ephemeral'] = config.get('volume-ephemeral') in (
        True, 'True', 'true', 'Yes', 'yes')

    try:
        # "or" also covers an option that is present but set to None.
        volume_map = yaml.safe_load(config.get('volume-map') or '{}')
    except yaml.YAMLError as e:
        hookenv.log("Error parsing YAML volume-map: {}".format(e),
                    hookenv.ERROR)
        errors = True
        # Keep the name bound so the checks below do not raise NameError.
        volume_map = {}

    if volume_map is None:
        # probably an empty string
        volume_map = {}
    elif not isinstance(volume_map, dict):
        hookenv.log("Volume-map should be a dictionary, not {}".format(
            type(volume_map)))
        errors = True
        # A non-dict value has no .get(); fall back to an empty map so the
        # remaining checks still run and None is returned at the end.
        volume_map = {}

    volume_config['device'] = volume_map.get(os.environ['JUJU_UNIT_NAME'])
    if volume_config['device'] and volume_config['ephemeral']:
        # asked for ephemeral storage but also defined a volume ID
        hookenv.log('A volume is defined for this unit, but ephemeral '
                    'storage was requested', hookenv.ERROR)
        errors = True
    elif not volume_config['device'] and not volume_config['ephemeral']:
        # asked for permanent storage but did not define volume ID
        hookenv.log('Persistent storage was requested, but there is no '
                    'volume defined for this unit.', hookenv.ERROR)
        errors = True

    unit_mount_name = hookenv.local_unit().replace('/', '-')
    volume_config['mountpoint'] = os.path.join(MOUNT_BASE, unit_mount_name)

    if errors:
        return None
    return volume_config
def mount_volume(config):
    '''Mount config['device'] on config['mountpoint'].

    The mount point directory is created if it does not exist, and anything
    already mounted there is unmounted first.  Raises
    VolumeConfigurationError if the mount point exists but is not a
    directory, or if mounting fails.
    '''
    mountpoint = config['mountpoint']

    if not os.path.exists(mountpoint):
        host.mkdir(mountpoint)
    elif not os.path.isdir(mountpoint):
        hookenv.log('Not a directory: {}'.format(mountpoint))
        raise VolumeConfigurationError()

    if os.path.ismount(mountpoint):
        unmount_volume(config)

    mounted = host.mount(config['device'], mountpoint, persist=True)
    if not mounted:
        raise VolumeConfigurationError()
def unmount_volume(config):
    '''Unmount config['mountpoint'] if something is mounted there.

    Raises VolumeConfigurationError if the unmount fails.
    '''
    mountpoint = config['mountpoint']
    if not os.path.ismount(mountpoint):
        # Nothing mounted, nothing to do.
        return
    unmounted = host.umount(mountpoint, persist=True)
    if not unmounted:
        raise VolumeConfigurationError()
def managed_mounts():
    '''List of all mounted managed volumes.

    Returns the entries of host.mounts() whose first element starts with
    MOUNT_BASE.  A real list is built so the result matches the description
    on Python 3 as well, where filter() would return a one-shot iterator.
    '''
    return [mount for mount in host.mounts()
            if mount[0].startswith(MOUNT_BASE)]
def configure_volume(before_change=lambda: None, after_change=lambda: None):
    '''Apply the charm's volume configuration to this unit.

    Returns "ephemeral" when ephemeral storage is configured, otherwise the
    mount point of the persistent volume.  before_change and after_change
    are optional callables invoked around any unmount/mount that is
    actually performed.

    Raises VolumeConfigurationError if the configuration cannot be read or
    a mount/unmount fails.
    '''
    config = get_config()
    if not config:
        hookenv.log('Failed to read volume configuration', hookenv.CRITICAL)
        raise VolumeConfigurationError()

    mountpoint = config['mountpoint']

    if config['ephemeral']:
        # Ephemeral storage: make sure no managed volume stays mounted.
        if os.path.ismount(mountpoint):
            before_change()
            unmount_volume(config)
            after_change()
        return 'ephemeral'

    # Persistent storage from here on.
    if not os.path.ismount(mountpoint):
        before_change()
        mount_volume(config)
        after_change()
    elif dict(managed_mounts()).get(mountpoint) != config['device']:
        # Something else is mounted on our mount point: swap it for the
        # configured device.
        before_change()
        unmount_volume(config)
        mount_volume(config)
        after_change()
    return mountpoint

View File

@ -73,6 +73,8 @@ from charmhelpers.contrib.openstack.ip import (
from charmhelpers.contrib.openstack.context import ADDRESS_TYPES from charmhelpers.contrib.openstack.context import ADDRESS_TYPES
from charmhelpers.contrib.charmsupport.nrpe import NRPE
from subprocess import ( from subprocess import (
check_call, check_call,
call, ) call, )
@ -297,6 +299,8 @@ def config_changed():
open_port(9292) open_port(9292)
configure_https() configure_https()
update_nrpe_config()
# Pickup and changes due to network reference architecture # Pickup and changes due to network reference architecture
# configuration # configuration
[keystone_joined(rid) for rid in relation_ids('identity-service')] [keystone_joined(rid) for rid in relation_ids('identity-service')]
@ -334,6 +338,7 @@ def cluster_changed():
def upgrade_charm(): def upgrade_charm():
apt_install(filter_installed_packages(PACKAGES), fatal=True) apt_install(filter_installed_packages(PACKAGES), fatal=True)
configure_https() configure_https()
update_nrpe_config()
CONFIGS.write_all() CONFIGS.write_all()
@ -446,6 +451,25 @@ def amqp_changed():
return return
CONFIGS.write(GLANCE_API_CONF) CONFIGS.write(GLANCE_API_CONF)
@hooks.hook('nrpe-external-master-relation-joined',
            'nrpe-external-master-relation-changed')
def update_nrpe_config():
    """Register Upstart-based NRPE checks for the glance services and write
    the resulting NRPE configuration."""
    nrpe = NRPE()
    # The check_upstart_job plugin imports dbus.
    apt_install('python-dbus')

    for service_name in ('glance-api', 'glance-registry'):
        nrpe.add_check(
            shortname=service_name,
            description='{} process'.format(service_name),
            check_cmd='check_upstart_job {}'.format(service_name),
        )
    nrpe.write()
if __name__ == '__main__': if __name__ == '__main__':
try: try:
hooks.execute(sys.argv) hooks.execute(sys.argv)

View File

@ -0,0 +1 @@
glance_relations.py

View File

@ -0,0 +1 @@
glance_relations.py

View File

@ -9,6 +9,9 @@ description: |
categories: categories:
- miscellaneous - miscellaneous
provides: provides:
nrpe-external-master:
interface: nrpe-external-master
scope: container
image-service: image-service:
interface: glance interface: glance
requires: requires: