# Copyright 2014-2015 Canonical Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compatibility with the nrpe-external-master charm"""
# Copyright 2012 Canonical Ltd.
#
# Authors:
#  Matthew Wedgwood

import subprocess
import pwd
import grp
import os
import glob
import shutil
import re
import shlex

import yaml

from charmhelpers.core.hookenv import (
    config,
    hook_name,
    local_unit,
    log,
    relation_ids,
    relation_set,
    relations_of_type,
)

from charmhelpers.core.host import service
from charmhelpers.core import host

# This module adds compatibility with the nrpe-external-master and plain nrpe
# subordinate charms. To use it in your charm:
#
# 1. Update metadata.yaml
#
#   provides:
#     (...)
#     nrpe-external-master:
#       interface: nrpe-external-master
#       scope: container
#
#   and/or
#
#   provides:
#     (...)
#     local-monitors:
#       interface: local-monitors
#       scope: container
#
#
# 2. Add the following to config.yaml
#
#    nagios_context:
#      default: "juju"
#      type: string
#      description: |
#        Used by the nrpe subordinate charms.
#        A string that will be prepended to instance name to set the host name
#        in nagios. So for instance the hostname would be something like:
#            juju-myservice-0
#        If you're running multiple environments with the same services in them
#        this allows you to differentiate between them.
#    nagios_servicegroups:
#      default: ""
#      type: string
#      description: |
#        A comma-separated list of nagios servicegroups.
#        If left empty, the nagios_context will be used as the servicegroup
#
# 3.
#    Add custom checks (Nagios plugins) to files/nrpe-external-master
#
# 4. Update your hooks.py with something like this:
#
#    from charmsupport.nrpe import NRPE
#    (...)
#    def update_nrpe_config():
#        nrpe_compat = NRPE()
#        nrpe_compat.add_check(
#            shortname = "myservice",
#            description = "Check MyService",
#            check_cmd = "check_http -w 2 -c 10 http://localhost"
#            )
#        nrpe_compat.add_check(
#            "myservice_other",
#            "Check for widget failures",
#            check_cmd = "/srv/myapp/scripts/widget_check"
#            )
#        nrpe_compat.write()
#
#    def config_changed():
#        (...)
#        update_nrpe_config()
#
#    def nrpe_external_master_relation_changed():
#        update_nrpe_config()
#
#    def local_monitors_relation_changed():
#        update_nrpe_config()
#
# 4.a If your charm is a subordinate charm set primary=False
#
#    from charmsupport.nrpe import NRPE
#    (...)
#    def update_nrpe_config():
#        nrpe_compat = NRPE(primary=False)
#
# 5. ln -s hooks.py nrpe-external-master-relation-changed
#    ln -s hooks.py local-monitors-relation-changed


class CheckException(Exception):
    """Raised by Check when the given shortname fails validation."""
    pass


class Check(object):
    """One NRPE check: its nrpe.d command file and exported service file."""

    # Letters, digits, '-', '_' and '.' only.  The '-' that follows the
    # '0-9' range is a literal hyphen, not the start of another range.
    shortname_re = '[A-Za-z0-9-_.]+$'

    # Rendered with str.format(); '{{' / '}}' produce literal braces.
    service_template = ("""
#---------------------------------------------------
# This file is Juju managed
#---------------------------------------------------
define service {{
    use                             active-service
    host_name                       {nagios_hostname}
    service_description             {nagios_hostname}[{shortname}] """
                        """{description}
    check_command                   check_nrpe!{command}
    servicegroups                   {nagios_servicegroup}
}}
""")

    def __init__(self, shortname, description, check_cmd):
        """
        Validate the shortname and resolve the plugin path.

        :param str shortname: Identifier matching ``shortname_re``
        :param str description: Text appended to the service description
        :param str check_cmd: Plugin name (or path) followed by its arguments
        :raises CheckException: if shortname does not match ``shortname_re``
        """
        super(Check, self).__init__()
        # XXX: could be better to calculate this from the service name
        if not re.match(self.shortname_re, shortname):
            raise CheckException("shortname must match {}".format(
                Check.shortname_re))
        self.shortname = shortname
        self.command = "check_{}".format(shortname)
        # Note: a set of invalid characters is defined by the
        # Nagios server config
        # The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()=
        self.description = description
        self.check_cmd = self._locate_cmd(check_cmd)

    def _get_check_filename(self):
        """Return the path of this check's file under NRPE.nrpe_confdir."""
        return os.path.join(NRPE.nrpe_confdir, '{}.cfg'.format(self.command))

    def _get_service_filename(self, hostname):
        """Return the path of the exported service file for hostname."""
        return os.path.join(NRPE.nagios_exportdir,
                            'service__{}_{}.cfg'.format(hostname, self.command))

    def _locate_cmd(self, check_cmd):
        """
        Find the plugin named by check_cmd in the Nagios plugin directories.

        :param str check_cmd: Plugin name (or path) followed by its arguments
        :returns: Located plugin path followed by the space-joined arguments,
                  or '' when the plugin is not found in any search directory
        """
        search_path = (
            '/usr/lib/nagios/plugins',
            '/usr/local/lib/nagios/plugins',
        )
        parts = shlex.split(check_cmd)
        if not parts:
            # Nothing to look up; indexing parts[0] below would raise.
            log('Check command not found: {}'.format(check_cmd))
            return ''
        for path in search_path:
            candidate = os.path.join(path, parts[0])
            if os.path.exists(candidate):
                command = candidate
                if len(parts) > 1:
                    command += " " + " ".join(parts[1:])
                return command
        log('Check command not found: {}'.format(parts[0]))
        return ''

    def _remove_service_files(self):
        """Delete every exported service file that belongs to this check."""
        if not os.path.exists(NRPE.nagios_exportdir):
            return
        suffix = '_{}.cfg'.format(self.command)
        for f in os.listdir(NRPE.nagios_exportdir):
            if f.endswith(suffix):
                os.remove(os.path.join(NRPE.nagios_exportdir, f))

    def remove(self, hostname):
        """
        Remove this check's nrpe.d file and its exported service files.

        :param str hostname: Accepted for interface compatibility; not used
        """
        nrpe_check_file = self._get_check_filename()
        if os.path.exists(nrpe_check_file):
            os.remove(nrpe_check_file)
        self._remove_service_files()

    def write(self, nagios_context, hostname, nagios_servicegroups):
        """
        Write the nrpe.d command file and, if possible, the service file.

        :param str nagios_context: Passed through to write_service_config
        :param str hostname: Nagios host name used in the service definition
        :param str nagios_servicegroups: Servicegroups for the service; when
            non-empty they are also recorded in the command file header
        """
        nrpe_check_file = self._get_check_filename()
        with open(nrpe_check_file, 'w') as nrpe_check_config:
            nrpe_check_config.write("# check {}\n".format(self.shortname))
            if nagios_servicegroups:
                nrpe_check_config.write(
                    "# The following header was added automatically by juju\n")
                nrpe_check_config.write(
                    "# Modifying it will affect nagios monitoring and alerting\n")
                nrpe_check_config.write(
                    "# servicegroups: {}\n".format(nagios_servicegroups))
            nrpe_check_config.write("command[{}]={}\n".format(
                self.command, self.check_cmd))

        if not os.path.exists(NRPE.nagios_exportdir):
            log('Not writing service config as {} is not accessible'.format(
                NRPE.nagios_exportdir))
        else:
            self.write_service_config(nagios_context, hostname,
                                      nagios_servicegroups)

    def write_service_config(self, nagios_context, hostname,
                             nagios_servicegroups):
        """
        Replace this check's exported service file with a fresh rendering.

        :param str nagios_context: Accepted for interface compatibility;
            not used in the rendered template
        :param str hostname: Nagios host name used in the definition and
            in the file name
        :param str nagios_servicegroups: Value of the servicegroups directive
        """
        self._remove_service_files()

        templ_vars = {
            'nagios_hostname': hostname,
            'nagios_servicegroup': nagios_servicegroups,
            'description': self.description,
            'shortname': self.shortname,
            'command': self.command,
        }
        nrpe_service_text = Check.service_template.format(**templ_vars)
        nrpe_service_file = self._get_service_filename(hostname)
        with open(nrpe_service_file, 'w') as nrpe_service_config:
            nrpe_service_config.write(str(nrpe_service_text))

    def run(self):
        """Run the located check command once; the exit status is discarded."""
        # check_cmd is a single string ("<plugin> <args...>").  Without a
        # shell, subprocess would treat that whole string as the program
        # name, so split it into an argument list first.
        argv = shlex.split(self.check_cmd)
        if not argv:
            log('No command to run for check {}'.format(self.shortname))
            return
        subprocess.call(argv)


class NRPE(object):
    """Collects Check objects and publishes them to the nrpe subordinate."""

    nagios_logdir = '/var/log/nagios'
    nagios_exportdir = '/var/lib/nagios/export'
    nrpe_confdir = '/etc/nagios/nrpe.d'
    homedir = '/var/lib/nagios'  # home dir provided by nagios-nrpe-server

    def __init__(self, hostname=None, primary=True):
        """
        Read charm config and work out the Nagios host name.

        :param str hostname: Explicit Nagios host name; when omitted the
            name comes from the relation or from nagios_context + unit name
        :param bool primary: Value published as 'primary' on every
            nrpe-external-master relation
        """
        super(NRPE, self).__init__()
        self.config = config()
        self.primary = primary
        self.nagios_context = self.config['nagios_context']
        if 'nagios_servicegroups' in self.config and self.config['nagios_servicegroups']:
            self.nagios_servicegroups = self.config['nagios_servicegroups']
        else:
            self.nagios_servicegroups = self.nagios_context
        self.unit_name = local_unit().replace('/', '-')
        if hostname:
            self.hostname = hostname
        else:
            nagios_hostname = get_nagios_hostname()
            if nagios_hostname:
                self.hostname = nagios_hostname
            else:
                self.hostname = "{}-{}".format(self.nagios_context, self.unit_name)
        self.checks = []
        # Iff in an nrpe-external-master relation hook, set primary status
        relation = relation_ids('nrpe-external-master')
        if relation:
            log("Setting charm primary status {}".format(primary))
            for rid in relation:
                relation_set(relation_id=rid,
                             relation_settings={'primary': self.primary})

    def add_check(self, *args, **kwargs):
        """Create a Check from the given arguments and queue it for write()."""
        self.checks.append(Check(*args, **kwargs))

    def remove_check(self, *args, **kwargs):
        """
        Remove the files of the check named by the 'shortname' keyword.

        :raises ValueError: if 'shortname' is not passed as a keyword
        """
        if kwargs.get('shortname') is None:
            raise ValueError('shortname of check must be specified')

        # Use sensible defaults if they're not specified - these are not
        # actually used during removal, but they're required for constructing
        # the Check object; check_disk is chosen because it's part of the
        # nagios-plugins-basic package.
        if kwargs.get('check_cmd') is None:
            kwargs['check_cmd'] = 'check_disk'
        if kwargs.get('description') is None:
            kwargs['description'] = ''

        check = Check(*args, **kwargs)
        check.remove(self.hostname)

    def write(self):
        """Write all queued checks, restart nrpe and publish the monitors."""
        try:
            nagios_uid = pwd.getpwnam('nagios').pw_uid
            nagios_gid = grp.getgrnam('nagios').gr_gid
        except Exception:
            log("Nagios user not set up, nrpe checks not updated")
            return

        if not os.path.exists(NRPE.nagios_logdir):
            os.mkdir(NRPE.nagios_logdir)
            os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid)

        nrpe_monitors = {}
        monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}}
        for nrpecheck in self.checks:
            nrpecheck.write(self.nagios_context, self.hostname,
                            self.nagios_servicegroups)
            nrpe_monitors[nrpecheck.shortname] = {
                "command": nrpecheck.command,
            }

        # update-status hooks are configured to fire every 5 minutes by
        # default. When nagios-nrpe-server is restarted, the nagios server
        # reports checks failing causing unnecessary alerts. Let's not restart
        # on update-status hooks.
        if hook_name() != 'update-status':
            service('restart', 'nagios-nrpe-server')

        monitor_ids = relation_ids("local-monitors") + \
            relation_ids("nrpe-external-master")
        for rid in monitor_ids:
            relation_set(relation_id=rid, monitors=yaml.dump(monitors))


def get_nagios_hostcontext(relation_name='nrpe-external-master'):
    """
    Query relation with nrpe subordinate, return the nagios_host_context

    :param str relation_name: Name of relation nrpe sub joined to
    :returns: The first 'nagios_host_context' value found, or None when no
              related unit provides one
    """
    for rel in relations_of_type(relation_name):
        if 'nagios_host_context' in rel:
            return rel['nagios_host_context']


def get_nagios_hostname(relation_name='nrpe-external-master'):
    """
    Query relation with nrpe subordinate, return the nagios_hostname

    :param str relation_name: Name of relation nrpe sub joined to
    :returns: The first 'nagios_hostname' value found, or None when no
              related unit provides one
    """
    for rel in relations_of_type(relation_name):
        if 'nagios_hostname' in rel:
            return rel['nagios_hostname']


def get_nagios_unit_name(relation_name='nrpe-external-master'):
    """
    Return the nagios unit name prepended with host_context if needed

    :param str relation_name: Name of relation nrpe sub joined to
    :returns: "<host_context>:<unit>" when a host context is available,
              otherwise the plain local unit name
    """
    host_context = get_nagios_hostcontext(relation_name)
    if host_context:
        unit = "%s:%s" % (host_context, local_unit())
    else:
        unit = local_unit()
    return unit


def add_init_service_checks(nrpe, services, unit_name, immediate_check=True):
    """
    Add checks for each service in list

    :param NRPE nrpe: NRPE object to add check to
    :param list services: List of services to check
    :param str unit_name: Unit name to use in check description
    :param bool immediate_check: For sysv init, run the service check
                                 immediately
    """
    for svc in services:
        # Don't add a check for these services from neutron-gateway
        if svc in ['ext-port', 'os-charm-phy-nic-mtu']:
            continue

        upstart_init = '/etc/init/%s.conf' % svc
        sysv_init = '/etc/init.d/%s' % svc

        if host.init_is_systemd():
            nrpe.add_check(
                shortname=svc,
                description='process check {%s}' % unit_name,
                check_cmd='check_systemd.py %s' % svc
            )
        elif os.path.exists(upstart_init):
            nrpe.add_check(
                shortname=svc,
                description='process check {%s}' % unit_name,
                check_cmd='check_upstart_job %s' % svc
            )
        elif os.path.exists(sysv_init):
            # sysv services are polled from cron; the nrpe check only reads
            # the status file that the cron job writes.
            cronpath = '/etc/cron.d/nagios-service-check-%s' % svc
            checkpath = '%s/service-check-%s.txt' % (nrpe.homedir, svc)
            croncmd = (
                '/usr/local/lib/nagios/plugins/check_exit_status.pl '
                '-e -s /etc/init.d/%s status' % svc
            )
            cron_file = '*/5 * * * * root %s > %s\n' % (croncmd, checkpath)
            with open(cronpath, 'w') as cron_handle:
                cron_handle.write(cron_file)
            nrpe.add_check(
                shortname=svc,
                description='service check {%s}' % unit_name,
                check_cmd='check_status_file.py -f %s' % checkpath,
            )
            # if /var/lib/nagios doesn't exist open(checkpath, 'w') will fail
            # (LP: #1670223).
            if immediate_check and os.path.isdir(nrpe.homedir):
                with open(checkpath, 'w') as status_file:
                    subprocess.call(
                        croncmd.split(),
                        stdout=status_file,
                        stderr=subprocess.STDOUT
                    )
                os.chmod(checkpath, 0o644)


def copy_nrpe_checks(nrpe_files_dir=None):
    """
    Copy the nrpe checks into place

    :param str nrpe_files_dir: Directory holding the check_* files; when not
        given, the charmhelpers openstack files directory under $CHARM_DIR
        is used
    """
    NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
    if not nrpe_files_dir:
        # Only consult CHARM_DIR when it is actually needed, so an explicit
        # directory works even if the variable is unset.
        nrpe_files_dir = os.path.join(
            os.getenv('CHARM_DIR'),
            'hooks',
            'charmhelpers',
            'contrib',
            'openstack',
            'files')

    if not os.path.exists(NAGIOS_PLUGINS):
        os.makedirs(NAGIOS_PLUGINS)
    for fname in glob.glob(os.path.join(nrpe_files_dir, "check_*")):
        if os.path.isfile(fname):
            shutil.copy2(fname,
                         os.path.join(NAGIOS_PLUGINS, os.path.basename(fname)))


def add_haproxy_checks(nrpe, unit_name):
    """
    Add the HAProxy server and queue depth checks

    :param NRPE nrpe: NRPE object to add check to
    :param str unit_name: Unit name to use in check description
    """
    nrpe.add_check(
        shortname='haproxy_servers',
        description='Check HAProxy {%s}' % unit_name,
        check_cmd='check_haproxy.sh')
    nrpe.add_check(
        shortname='haproxy_queue',
        description='Check HAProxy queue depth {%s}' % unit_name,
        check_cmd='check_haproxy_queue_depth.sh')