From 4b0617cd24740a74efba200b73b144668cb01e6b Mon Sep 17 00:00:00 2001 From: Olivier Bourdon Date: Mon, 12 Sep 2016 21:25:36 +0200 Subject: [PATCH] Implement alarm-manager alarm-manager is responsible for watching a configurable location within the filesystem where the user can put a YAML file which defines alarms. When a change or creation is detected, the YAML file is checked for proper contents and if verification is successful, LUA code is generated as well as LUA configuration files. Hindsight will pick up those changes after a certain period of time and provide proper alarming to the platform. Change-Id: I7b2b98f379c49bdbf23177a038bdca9433d1c6e5 --- docker/alarm-manager/Dockerfile.j2 | 20 + docker/alarm-manager/alarm-manager.py | 603 ++++++++++++++++++ .../config-files/config/alarm-manager.ini | 22 + .../templates/lua_alarming_template.j2 | 41 ++ docker/alarm-manager/requirements.txt | 9 + docker/hindsight/Dockerfile.j2 | 5 +- docker/hindsight/bootstrap-hindsight.sh | 31 + .../alarm_manager_lua_config_template.cfg.j2 | 11 + service/files/alarms.yaml | 79 +++ service/stacklight-collector.yaml | 46 +- tox.ini | 12 +- 11 files changed, 869 insertions(+), 10 deletions(-) create mode 100644 docker/alarm-manager/Dockerfile.j2 create mode 100755 docker/alarm-manager/alarm-manager.py create mode 100644 docker/alarm-manager/config-files/config/alarm-manager.ini create mode 100644 docker/alarm-manager/config-files/templates/lua_alarming_template.j2 create mode 100644 docker/alarm-manager/requirements.txt create mode 100755 docker/hindsight/bootstrap-hindsight.sh create mode 100644 service/files/alarm_manager_lua_config_template.cfg.j2 create mode 100644 service/files/alarms.yaml diff --git a/docker/alarm-manager/Dockerfile.j2 b/docker/alarm-manager/Dockerfile.j2 new file mode 100644 index 0000000..72aa7e8 --- /dev/null +++ b/docker/alarm-manager/Dockerfile.j2 @@ -0,0 +1,20 @@ +FROM {{ namespace }}/base:{{ tag }} +MAINTAINER {{ maintainer }} + +# Install alarm-manager and 
dependencies +COPY alarm-manager.py /opt/ccp/bin/ +COPY requirements.txt /tmp/requirements.txt +COPY config-files /etc/alarm-manager/ + +RUN apt-get install -y --no-install-recommends patch gcc python-dev \ + && apt-get clean \ + && pip install --no-cache-dir -r /tmp/requirements.txt \ + && useradd --user-group alarm-manager \ + && usermod -a -G microservices alarm-manager \ + && chown -R alarm-manager: /etc/alarm-manager \ + && chmod 755 /opt/ccp/bin/alarm-manager.py \ + && rm -f /tmp/requirements.txt \ + && apt-get -y purge patch gcc python-dev \ + && apt-get -y autoremove + +USER alarm-manager diff --git a/docker/alarm-manager/alarm-manager.py b/docker/alarm-manager/alarm-manager.py new file mode 100755 index 0000000..cd1e1d9 --- /dev/null +++ b/docker/alarm-manager/alarm-manager.py @@ -0,0 +1,603 @@ +#!/usr/bin/env python +# +# Copyright 2016 Mirantis, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
#

# Global imports
# --------------
import argparse
import hashlib
import jinja2
import logging
import logging.config
import os
import pyinotify
import re
import sys
import yaml

# Best practice code for logging
# ------------------------------
try:  # Python 2.7+
    from logging import NullHandler
except ImportError:
    class NullHandler(logging.Handler):
        """Fallback no-op handler for interpreters lacking NullHandler."""

        def emit(self, record):
            pass

# Global variables initialization
# -------------------------------

dflt_cfg_dir = os.path.join(
    '/etc', 'alarm-manager')
dflt_config = os.path.join(
    dflt_cfg_dir, 'config', 'alarm-manager.ini')
dflt_template = os.path.join(
    dflt_cfg_dir, 'templates', 'lua_alarming_template.j2')
dflt_cfg_template = os.path.join(
    dflt_cfg_dir,
    'templates', 'alarm_manager_lua_config_template.cfg.j2')
dflt_dest_dir = os.path.join(
    '/opt', 'ccp', 'lua', 'modules', 'stacklight_alarms')
dflt_cfg_dest_dir = os.path.join(
    '/var', 'lib', 'hindsight', 'load', 'analysis')
dflt_alarm_file = 'alarms.yaml'

# Logging initialization
# ----------------------


def logger_init(cfg_file):
    """
    Configure and return the root logger.

    When cfg_file names a usable logging configuration file (fileConfig
    format) that configuration is applied. In every other case a
    hardcoded handler writing to stdout is installed on the root logger,
    unless the root logger already owns at least one handler.

    @param cfg_file: Path of the logging configuration file, or None to
        go straight to the stdout fallback.
    @type cfg_file: String or None.
    @return: The root logger.
    """
    log = logging.getLogger()
    log.setLevel(logging.DEBUG)
    failure = None
    if cfg_file is not None:
        log.debug('Looking for log configuration file: %s' % cfg_file)
        try:
            # logging.config is a submodule that "import logging" alone
            # does not load, hence the explicit import at top of file.
            # Loggers created at import time by other modules are kept
            # enabled; fileConfig would otherwise switch them off.
            logging.config.fileConfig(
                cfg_file, disable_existing_loggers=False)
            return log
        except Exception as exc:
            # Whatever is wrong with the file (missing, unreadable,
            # malformed) must never leave the program without logging
            failure = exc
    # Only add handler if not already done
    if not log.handlers:
        # Hardcoded default logging configuration if no/bad config file
        console_handler = logging.StreamHandler(sys.stdout)
        fmt_str = "[%(asctime)s.%(msecs)03d %(name)s %(levelname)s] " \
                  "%(message)s"
        console_handler.setFormatter(
            logging.Formatter(fmt_str, "%Y-%m-%d %H:%M:%S"))
        log.addHandler(console_handler)
    log.setLevel(logging.DEBUG)
    if failure is not None:
        log.debug('Logging configuration not loaded: %s' % failure)
    log.debug('Defaulting to stdout')
    return log


log = logger_init(None)

# Class for keeping configuration parameters
# ------------------------------------------


class AlarmConfig(object):
    """
    Class used to store parameters

    Plain holder for everything one YAML to LUA generation run needs,
    plus the digest of the last YAML content seen.
    """
    def __init__(self, code_dest_dir, config_dest_dir,
                 source_file, template, config_template):
        """
        @param code_dest_dir: Directory receiving generated LUA code.
        @type code_dest_dir: String.
        @param config_dest_dir: Directory receiving generated LUA
            configuration files.
        @type config_dest_dir: String.
        @param source_file: Path of the YAML alarms definition file.
        @type source_file: String.
        @param template: Template rendering the LUA code.
        @type template: Object providing a render() method.
        @param config_template: Template rendering the LUA configuration.
        @type config_template: Object providing a render() method.
        """
        self._code_dest_dir = code_dest_dir
        self._config_dest_dir = config_dest_dir
        self._source_file = source_file
        self._template = template
        self._config_template = config_template
        # Digest of the last YAML content seen; None until a first run
        self._sha256 = None

# Class for processing inotify events
# ------------------------------------


class InotifyEventsHandler(pyinotify.ProcessEvent):
    """
    Class used to process inotify events.
    """
    def my_init(self, cfg, name, out=None):
        """
        Receives the keyword arguments given to the constructor.

        @param cfg: configuration to use for generation callback.
        @type cfg: AlarmConfig.
        @param name: File name to be watched.
        @type name: String.
        @param out: Logger where events will be written.
        @type out: Object providing a valid logging object interface.
        """
        self._out = log if out is None else out
        self._cfg = cfg
        self._name = name

    def process_default(self, event):
        """
        Writes event string representation to logging object provided to
        my_init() and regenerates the LUA files when the event concerns
        the watched file name.

        @param event: Event to be processed. Can be of any type of events but
                      IN_Q_OVERFLOW events (see method process_IN_Q_OVERFLOW).
        @type event: Event instance
        """
        self._out.debug(
            'Received event %s'
            % str(event))
        # File name on which inotify event has been triggered does
        # not match => return right away
        if event.name != self._name:
            self._out.debug(
                'Ignoring event %s (path does not match %s)'
                % (str(event), self._name))
            return
        self._out.info('File %s has been updated' % event.name)
        # Callback function called with proper parameters; failures are
        # reported on the logger given to my_init(), like everything else
        if not yaml_alarms_2_lua_and_hindsight_cfg_files(self._cfg):
            self._out.error('Error converting YAML alarms into LUA code')

# Check alarm entry for field existence and type
# TODO: see if we can use similar methods from
# fuel-ccp which uses jsonschema to validate types.
# -------------------------------------------------


def check_alarm_entry_field(alarm, field, ftype):
    """
    Check that an alarm entry has a field, optionally of a given type.

    Every failure is reported on the module logger.

    @param alarm: Alarm definition to inspect.
    @type alarm: Dictionary.
    @param field: Name of the field that must be present.
    @type field: String.
    @param ftype: Type the field value must be an instance of, or None
        to skip the type check.
    @type ftype: Type or None.
    @return: True if the field exists (and has the requested type),
        False otherwise.
    """
    if not isinstance(alarm, dict):
        log.error('Error parsing file: alarm entry is not a mapping '
                  '(%s): %s' % (type(alarm).__name__, alarm))
        return False
    # Field lookup
    if field not in alarm:
        # Adjacent literals are merged before "%" is applied, whereas
        # "+" would only format the last literal
        log.error('Error parsing file: alarm entry does '
                  'not have a %s field: %s'
                  % (field, alarm))
        return False
    # Do we need to check for proper type too ?
    if ftype is not None:
        value = alarm[field]
        if not isinstance(value, ftype):
            log.error('Error parsing file: alarm entry field %s '
                      'is not of type %s: found %s [%s]'
                      % (field, ftype.__name__,
                         type(value).__name__, alarm))
            return False
    return True


def _validate_alarm_group(c_alarms, known_names, ctx):
    """
    Check one alarms group: each entry must be a list of defined names.

    @param c_alarms: Logical name to list of alarm names.
    @type c_alarms: Dictionary.
    @param known_names: Names of all alarms defined in the file.
    @type known_names: Set of strings.
    @param ctx: Location suffix appended to error messages.
    @type ctx: String.
    @return: True if the group is valid, False otherwise.
    """
    for agroup in c_alarms:
        members = c_alarms[agroup]
        # Must be a list
        if not isinstance(members, list):
            log.error('Error parsing file: alarm entry for %s '
                      'is not a list (%s)%s'
                      % (agroup, type(members).__name__, ctx))
            return False
        for aname in members:
            # Each member of list must be a string
            if not isinstance(aname, str):
                log.error('Error parsing file: alarm entry for %s '
                          'is not a list of strings (%s) [%s]%s'
                          % (agroup, type(aname).__name__, aname, ctx))
                return False
            # ... naming an alarm that has been defined
            if aname not in known_names:
                log.error(
                    ('Error parsing file: alarm with name %s is not '
                     'defined but is referenced in alarm group %s')
                    % (aname, agroup))
                return False
    return True

# YAML alarms structure validation
#
# TODO: see if we can use similar methods from
# fuel-ccp which uses jsonschema to validate types.
#
# TODO: do not return false right away
# when processing lists so that most errors
# are reported at once allowing for faster
# achievement of correctness
# -------------------------------------------------


def validate_yaml(alarms_yaml):
    """
    Validate the structure of parsed YAML alarms definitions.

    Expects an 'alarms' list of mappings, each with a string 'name', and
    a 'node_cluster_alarms' mapping whose values each hold an 'alarms'
    mapping from a logical name to a list of defined alarm names.
    Stops at the first error found, reports it on the module logger and
    never raises.

    @param alarms_yaml: Parsed content of the alarms definition file.
    @type alarms_yaml: Dictionary.
    @return: True if the structure is valid, False otherwise.
    """
    log.info('Validating YAML alarms structure')
    # Location suffix for messages, updated while walking the groups
    ctx = ''
    try:
        if not isinstance(alarms_yaml, dict):
            log.error('Error parsing file: top level entry is not a '
                      'mapping (%s)' % type(alarms_yaml).__name__)
            return False
        log.debug('Retrieving all alarms')
        # Try to retrieve alarms definitions
        # and check for overall validity
        alarms = alarms_yaml['alarms']
        if alarms is None:
            log.error('Error parsing file: empty alarm list')
            return False
        # alarms entry should be a list
        if not isinstance(alarms, list):
            log.error('Error parsing file: alarms entry is not a list (%s)'
                      % type(alarms).__name__)
            return False
        # Keep the complete set of alarm names
        known_names = set()
        for alarm in alarms:
            if not check_alarm_entry_field(alarm, 'name', str):
                return False
            # TODO do we need to add some more checks here ?
            known_names.add(alarm['name'])
        log.debug('Found %d alarms' % len(alarms))
        # Try to retrieve alarms groups definitions
        # and check overall validity
        log.debug('Retrieving alarms groups')
        cluster_alarms = alarms_yaml['node_cluster_alarms']
        if not isinstance(cluster_alarms, dict):
            log.error('Error parsing file: node_cluster_alarms entry is '
                      'not a mapping (%s)' % type(cluster_alarms).__name__)
            return False
        for ckey in cluster_alarms:
            log.debug('Parsing alarms group %s' % ckey)
            ctx = ' under node_cluster_alarms[%s]' % ckey
            cluster = cluster_alarms[ckey]
            if not isinstance(cluster, dict):
                log.error('Error parsing file: entry is not a mapping '
                          '(%s)%s' % (type(cluster).__name__, ctx))
                return False
            # Are there some alarm key defined
            # (if not, the next line throws KeyError)
            c_alarms = cluster['alarms']
            if c_alarms is None:
                log.error('Error parsing file: empty alarm list%s' % ctx)
                return False
            if not isinstance(c_alarms, dict):
                log.error('Error parsing file: alarms entry is not a '
                          'mapping (%s)%s' % (type(c_alarms).__name__, ctx))
                return False
            log.debug('Found %d alarms in group %s' % (len(c_alarms), ckey))
            # Now check validity of alarm entries and that every
            # referenced alarm has been defined
            if not _validate_alarm_group(c_alarms, known_names, ctx):
                return False
    except KeyError as e:
        log.error('Error parsing file: can not find %s key%s' % (e, ctx))
        return False
    except Exception as e:
        # Last resort so that a malformed file can never stop the watcher
        log.error('Error parsing file: unknown exception %s %s'
                  % (type(e), str(e)))
        return False
    return True

# Retrieve alarm by its name within list
# --------------------------------------


def find_alarm_by_name(aname, alarms):
    """
    Return the first alarm whose 'name' field equals aname.

    @param aname: Alarm name to look for.
    @type aname: String.
    @param alarms: Alarm definitions, each providing a 'name' key.
    @type alarms: List of dictionaries.
    @return: The matching alarm, or None if there is none.
    """
    return next(
        (alarm for alarm in alarms if alarm['name'] == aname), None)


# Check file
for content changes and returns boolean +# True => content has changed +# False = content is unchanged +# +# File path can be altered using string substitutions +# so as to adapt to Hindsight current running state +# when file are moved around once taken into account +# --------------------------------------------------- + + +def content_changed(file_fullpath, file_content, replace=None): + log.debug( + 'Checking file %s for changes' + % file_fullpath) + fullpath = file_fullpath + # Do we need to replace some parts of the path + if replace is not None: + for k in replace.keys(): + fullpath = fullpath.replace(k, replace[k]) + log.debug( + 'Checking file path %s for changes' + % fullpath) + # File does not exist => needs to be created therefore + # content has changed + if not os.path.isfile(fullpath): + log.debug( + 'File %s does not exist' + % fullpath) + return True + # Read the file content + with open(fullpath, 'r') as in_fd: + try: + old_content = in_fd.read() + # Compare former content to new one + if old_content == file_content: + log.debug( + 'File %s content has not changed' + % fullpath) + return False + except Exception as e: + log.error( + 'Error reading %s got exception: %s' + % (fullpath, e)) + return True + log.debug( + 'File %s content has changed' + % fullpath) + return True + +# Convert YAML file containing alarms into lua code +# and create Hindsight configuration files +# ------------------------------------------------- + + +def yaml_alarms_2_lua_and_hindsight_cfg_files( + alarm_config): + (lua_code_dest_dir, + lua_config_dest_dir, + yaml_file, + template, + cfg_template) = (alarm_config._code_dest_dir, + alarm_config._config_dest_dir, + alarm_config._source_file, + alarm_config._template, + alarm_config._config_template) + log.info( + 'Converting alarm YAML file %s to LUA code in %s and configs in %s' + % (yaml_file, lua_code_dest_dir, lua_config_dest_dir)) + try: + if os.stat(yaml_file).st_size == 0: + log.error('File %s will not be parsed: 
size = 0' % yaml_file) + return False + # Open file and retrieve YAML structure if correctly formed + with open(yaml_file, 'r') as in_fd: + try: + alarms_defs = in_fd.read() + sha256sum = hashlib.sha256(alarms_defs).hexdigest() + if sha256sum == alarm_config._sha256: + log.warning('No change detected in file: %s' % yaml_file) + return True + alarm_config._sha256 = sha256sum + alarms_yaml = yaml.load(alarms_defs) + except yaml.YAMLError as exc: + log.error('Error parsing file: %s' % exc) + return False + # Check overall validity of alarms definitions + if not validate_yaml(alarms_yaml): + log.error('Error validating alarms definitions') + return False + # Now retrieve the information for config and code files generation + cluster_alarms = alarms_yaml['node_cluster_alarms'] + for afd_cluster_name in cluster_alarms: + for key in cluster_alarms[afd_cluster_name]['alarms'].keys(): + # Key can not contain dash or other non letter/numbers + if not re.match('^[A-Za-z0-9]*$', key): + log.error('Alarm group name can only contain letters ' + + 'and digits: %s' + % key) + return False + # Build list of associated alarms + alarms = [] + for aname in cluster_alarms[afd_cluster_name]['alarms'][key]: + alarms.append( + find_alarm_by_name( + aname, alarms_yaml['alarms'])) + # Write LUA code file + afd_file = 'afd_node_%s_%s_alarms' % (afd_cluster_name, key) + lua_code_dest_file = os.path.join( + lua_code_dest_dir, "%s.lua" % afd_file) + lua_code = template.render(alarms=alarms) + updated_lua_code = False + # Check if the generated code has changed + if not content_changed(lua_code_dest_file, lua_code): + log.info('Unchanged LUA file %s' % lua_code_dest_file) + else: + # LUA code changes should force re-generation of config + # file so as to force Hindsight to take changes into + # account + updated_lua_code = True + log.info('Writing LUA file: %s' % lua_code_dest_file) + # Produce LUA code file corresponding to alarm + with open(lua_code_dest_file, 'w') as out_fd: + try: + 
out_fd.write(lua_code) + except Exception as e: + log.error('Error writing %s: got exception: %s' + % (lua_code_dest_file, e)) + return False + # Write LUA config file + afd_file = 'afd_node_%s_%s_alarms' % (afd_cluster_name, key) + lua_config_dest_file = os.path.join( + lua_config_dest_dir, "%s.cfg" % afd_file) + lua_config = cfg_template.render( + afd_file=afd_file, + afd_cluster_name=afd_cluster_name, + afd_logical_name=key + ) + # Check if the generated config has changed + # of if we need to force config writing due to + # changes in LUA code above + # + # Note that config is written into .../load/... + # and moved to .../run/... by Hindsight + if ( + not content_changed( + lua_config_dest_file, + lua_config, + {'/load/': '/run/'}) and + not updated_lua_code): + log.info('Unchanged config file %s' % lua_config_dest_file) + else: + log.info('Writing config file: %s' % lua_config_dest_file) + with open(lua_config_dest_file, 'w') as out_fd: + try: + out_fd.write(lua_config) + except Exception as e: + log.error('Error writing %s: got exception: %s' + % (lua_config_dest_file, e)) + return False + except Exception as e: + log.error('Error got exception: %s' % e) + return False + return True + +# Command line argument parsing +# ----------------------------- + + +def cmd_line_args_parser(): + parser = argparse.ArgumentParser( + description="""Alarm manager watches for new alarms definitions + in specified directory and applies them TBC ... 
+ """ + ) + parser.add_argument( + '-c', '--config', + help='log level and format configuration file (default %s)' + % dflt_config, + default=dflt_config, + dest='config' + ) + parser.add_argument( + '-d', '--code-destdir', + help='destination path for LUA plugins code files ' + + '(default %s)' % dflt_dest_dir, + default=dflt_dest_dir, + dest='code_dest_dir' + ) + parser.add_argument( + '-D', '--config-destdir', + help='destination path for LUA plugins configuration ' + + 'files (default %s)' % dflt_cfg_dest_dir, + default=dflt_cfg_dest_dir, + dest='config_dest_dir' + ) + parser.add_argument( + '-t', '--template', + help='LUA template file (default %s)' % dflt_template, + default=dflt_template, + dest='template' + ) + parser.add_argument( + '-T', '--config-template', + help='LUA plugins configuration template file (default %s)' % + (dflt_cfg_template), + default=dflt_cfg_template, + dest='cfg_template' + ) + parser.add_argument( + '-w', '--watch-path', + help='path to watch for changes (default %s)' % + (dflt_cfg_dir), + default=dflt_cfg_dir, + dest='watch_path' + ) + parser.add_argument( + '-x', '--exit', + help='exit program without watching filesystem changes', + action='store_const', + const=True, default=False, + dest='exit' + ) + args = parser.parse_args() + log = logger_init(args.config) + log.info('Watch path: %s\n\tConfig: %s\n\tTemplate: %s' + % (args.watch_path, args.config, args.template)) + + if ( + not os.path.isdir(args.watch_path) or + not os.access(args.watch_path, os.R_OK)): + log.error("{} not a directory or is not readable" + .format(args.watch_path)) + sys.exit(1) + + if ( + not os.path.isdir(args.code_dest_dir) or + not os.access(args.code_dest_dir, os.W_OK)): + log.error("{} not a directory or is not writable" + .format(args.code_dest_dir)) + sys.exit(1) + + if ( + not os.path.isdir(args.config_dest_dir) or + not os.access(args.config_dest_dir, os.W_OK)): + log.error("{} not a directory or is not writable" + .format(args.config_dest_dir)) + 
sys.exit(1) + + if ( + not os.path.isfile(args.template) or + not os.access(args.template, os.R_OK)): + log.error("{} not a file or is not readable".format(args.template)) + sys.exit(1) + + if ( + not os.path.isfile(args.cfg_template) or + not os.access(args.cfg_template, os.R_OK)): + log.error("{} not a file or is not readable".format(args.cfg_template)) + sys.exit(1) + + src = os.path.join(args.watch_path, dflt_alarm_file) + log.info('Looking for existing readable file: %s' % src) + if os.access(src, os.R_OK): + log.info('Using LUA template %s and LUA config template %s' + % (args.template, args.cfg_template)) + j2_env = jinja2.Environment( + loader=jinja2.FileSystemLoader( + os.path.dirname( + args.template)), + trim_blocks=True) + template = j2_env.get_template( + os.path.basename( + args.template)) + j2_cfg_env = jinja2.Environment( + loader=jinja2.FileSystemLoader( + os.path.dirname( + args.cfg_template)), + trim_blocks=True) + cfg_template = j2_cfg_env.get_template( + os.path.basename( + args.cfg_template)) + alarm_cfg = AlarmConfig( + args.code_dest_dir, + args.config_dest_dir, + src, + template, + cfg_template) + if not yaml_alarms_2_lua_and_hindsight_cfg_files( + alarm_cfg + ): + log.error('Error converting YAML alarms into LUA code') + + # Asked to leave right away or continue watching inotify events ? 
+ if args.exit: + sys.exit(0) + + # watch manager instance + wm = pyinotify.WatchManager() + # notifier instance and init + notifier = pyinotify.Notifier( + wm, + default_proc_fun=InotifyEventsHandler( + cfg=alarm_cfg, + name=dflt_alarm_file)) + # What mask to apply + mask = pyinotify.IN_CLOSE_WRITE + log.debug('Start monitoring of %s' % args.watch_path) + # Do not recursively dive into path + # Do not add watches on newly created subdir in path + # Do not do globbing on path name + wm.add_watch(args.watch_path, + mask, rec=False, + auto_add=False, + do_glob=False) + # Loop forever (until sigint signal get caught) + notifier.loop(callback=None) + +if __name__ == '__main__': + cmd_line_args_parser() diff --git a/docker/alarm-manager/config-files/config/alarm-manager.ini b/docker/alarm-manager/config-files/config/alarm-manager.ini new file mode 100644 index 0000000..60dd6c9 --- /dev/null +++ b/docker/alarm-manager/config-files/config/alarm-manager.ini @@ -0,0 +1,22 @@ +[loggers] +keys=root + +[handlers] +keys=stream_handler + +[formatters] +keys=formatter + +[logger_root] +level=DEBUG +handlers=stream_handler + +[handler_stream_handler] +class=StreamHandler +level=DEBUG +formatter=formatter +args=(sys.stdout,) + +[formatter_formatter] +format=%(asctime)s.%(msecs)03d - %(name)s - %(levelname)s - %(message)s +datefmt=%Y-%m-%d %H:%M:%S diff --git a/docker/alarm-manager/config-files/templates/lua_alarming_template.j2 b/docker/alarm-manager/config-files/templates/lua_alarming_template.j2 new file mode 100644 index 0000000..075b1c8 --- /dev/null +++ b/docker/alarm-manager/config-files/templates/lua_alarming_template.j2 @@ -0,0 +1,41 @@ +local M = {} +setfenv(1, M) -- Remove external access to contain everything in the module + +local alarms = { +{% for alarm in alarms %} + { +{% for fkey in alarm.keys()|sort() %} +{% if fkey != "trigger" %} + ['{{ fkey }}'] = '{{ alarm[fkey] }}', +{% endif %} +{% endfor %} +{% if alarm.trigger is defined %} + ['trigger'] = { +{% if 
alarm.trigger.logical_operator is defined %} + ['logical_operator'] = '{{ alarm.trigger.logical_operator }}', +{% endif %} + ['rules'] = { +{% for rule in alarm.trigger.rules %} + { +{% for fkey in rule.keys()|sort() %} +{% if fkey != "fields" %} + ['{{ fkey }}'] = '{{ rule[fkey] }}', +{% endif %} +{% endfor %} +{% if rule.fields is defined %} + ['fields'] = { +{% for fkey in rule.fields.keys() %} + ['{{ fkey }}'] = '{{ rule.fields[fkey] }}' +{% endfor %} + }, +{% endif %} + }, +{% endfor %} + }, + }, +{% endif %} + }, +{% endfor %} +} + +return alarms diff --git a/docker/alarm-manager/requirements.txt b/docker/alarm-manager/requirements.txt new file mode 100644 index 0000000..9de53a9 --- /dev/null +++ b/docker/alarm-manager/requirements.txt @@ -0,0 +1,9 @@ +pyinotify==0.9.6 +# Mandatory utilities for use in CCP +dumb-init # init system for containers +python-etcd +netifaces +six==1.10.0 # MIT +# Common utilities +PyYAML==3.12 # BSD License (3 clause) +Jinja2==2.8 # MIT diff --git a/docker/hindsight/Dockerfile.j2 b/docker/hindsight/Dockerfile.j2 index 1b6d880..acb9582 100644 --- a/docker/hindsight/Dockerfile.j2 +++ b/docker/hindsight/Dockerfile.j2 @@ -5,6 +5,7 @@ MAINTAINER {{ maintainer }} COPY sources.mos.list /etc/apt/sources.list.d/ COPY mos.pref /etc/apt/preferences.d/ +COPY bootstrap-hindsight.sh /opt/ccp/bin/ RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 1FA22B08 \ && apt-get update \ @@ -19,10 +20,10 @@ ADD output/*.lua /var/lib/hindsight/run/output/ ADD input/*.lua /var/lib/hindsight/run/input/ ADD analysis/*.lua /var/lib/hindsight/run/analysis/ ADD modules/*.lua /opt/ccp/lua/modules/stacklight/ -ADD modules_alarms/afd_node_default_cpu_alarms.lua /opt/ccp/lua/modules/stacklight_alarms/ RUN useradd --user-group hindsight \ && usermod -a -G microservices hindsight \ - && chown -R hindsight: /var/lib/hindsight /etc/hindsight + && chown -R hindsight: /var/lib/hindsight /etc/hindsight \ + && tar cf - -C /var/lib hindsight | tar xf - -C 
/opt/ccp USER hindsight diff --git a/docker/hindsight/bootstrap-hindsight.sh b/docker/hindsight/bootstrap-hindsight.sh new file mode 100755 index 0000000..e479463 --- /dev/null +++ b/docker/hindsight/bootstrap-hindsight.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# This script is used for bootstrapping +# Hindsight with proper directories contents +# when using emptydir Kubernetes volumes +# As these are created empty +# Hindsight will not start properly +# as files will be missing +# Therefore the need to run this script +# with the proper destination directory +# as its command line parameter + +set -e + +if [ $# -ne 1 ]; then + echo "Usage: $0 directory" + exit 1 +fi + +if [ ! -d "$1" ]; then + echo "Error: $1 does not exist or is not a directory" + exit 1 +fi + +SRC=/opt/ccp/hindsight +if [ ! -d "$SRC" ]; then + echo "Error: $SRC does not exist or is not a directory" + exit 1 +fi + +tar cf - -C $SRC . | tar xf - -C $1 --strip-components=1 diff --git a/service/files/alarm_manager_lua_config_template.cfg.j2 b/service/files/alarm_manager_lua_config_template.cfg.j2 new file mode 100644 index 0000000..427f801 --- /dev/null +++ b/service/files/alarm_manager_lua_config_template.cfg.j2 @@ -0,0 +1,11 @@ +filename = "afd.lua" +-- log_level = 7 +message_matcher = "TRUE" +ticker_interval = 10 +{% raw %} +afd_type = "node" +afd_file = "{{ afd_file }}" +afd_cluster_name = "{{ afd_cluster_name }}" +afd_logical_name = "{{ afd_logical_name }}" +{% endraw %} +hostname = "{{ CCP_ALERT_MANAGER_NODE_NAME }}" diff --git a/service/files/alarms.yaml b/service/files/alarms.yaml new file mode 100644 index 0000000..f57e488 --- /dev/null +++ b/service/files/alarms.yaml @@ -0,0 +1,79 @@ +alarms: + - name: 'root-fs-warning' + description: 'The root filesystem free space is low' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: 'intel.procfs.filesystem.space_percent_free' + fields: + filesystem: 'rootfs' + relational_operator: '<' + threshold: 10 + window: 60 + periods: 0 + 
function: min + - name: 'root-fs-critical' + description: 'The root filesystem free space is too low' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: 'intel.procfs.filesystem.space_percent_free' + fields: + filesystem: 'rootfs' + relational_operator: '<' + threshold: 5 + window: 60 + periods: 0 + function: min + - name: 'cpu-critical' + description: 'The CPU usage is too high' + severity: 'critical' + trigger: + logical_operator: 'or' + rules: + - metric: 'intel.procfs.cpu.idle_percentage' + fields: + cpuID: 'all' + relational_operator: '<=' + threshold: '5' + window: '120' + periods: '0' + function: 'avg' + - metric: 'intel.procfs.cpu.iowait_percentage' + fields: + cpuID: 'all' + relational_operator: '>=' + threshold: '35' + window: '120' + periods: '0' + function: 'avg' + - name: 'cpu-warning' + description: 'The CPU usage is high' + severity: 'warning' + trigger: + logical_operator: 'or' + rules: + - metric: 'intel.procfs.cpu.idle_percentage' + fields: + cpuID: 'all' + relational_operator: '<=' + threshold: '15' + window: '120' + periods: '0' + function: 'avg' + - metric: 'intel.procfs.cpu.iowait_percentage' + fields: + cpuID: 'all' + relational_operator: '>=' + threshold: '25' + window: '120' + periods: '0' + function: 'avg' + +node_cluster_alarms: + system: + alarms: + rootfs: ['root-fs-critical', 'root-fs-warning'] + cpu: ['cpu-critical', 'cpu-warning'] diff --git a/service/stacklight-collector.yaml b/service/stacklight-collector.yaml index 07286bc..5867f55 100644 --- a/service/stacklight-collector.yaml +++ b/service/stacklight-collector.yaml @@ -7,6 +7,10 @@ service: probes: readiness: "true" liveness: "true" + pre: + - name: service-bootstrap + type: local + command: /opt/ccp/bin/bootstrap-hindsight.sh /var/lib/hindsight daemon: command: /usr/bin/hindsight /etc/hindsight/hindsight.cfg files: @@ -15,11 +19,13 @@ service: - prune-input.cfg - influxdb-tcp.cfg - kubelet-stats.cfg - - afd-node-default-cpu-alarms.cfg volumes: - - name: 
hindsight-output + - name: hindsight type: empty-dir - path: /var/lib/hindsight/output + path: /var/lib/hindsight + - name: stacklight-alarms + type: empty-dir + path: /opt/ccp/lua/modules/stacklight_alarms env: - name: CCP_HINDSIGHT_NODE_NAME valueFrom: @@ -50,6 +56,28 @@ service: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: alarm-manager + image: alarm-manager + probes: + readiness: "true" + liveness: "true" + daemon: + command: /opt/ccp/bin/alarm-manager.py -w /etc/alarm-manager + files: + - alarms.yaml + - lua-cfg-template.j2 + volumes: + - name: hindsight + type: empty-dir + path: /var/lib/hindsight + - name: stacklight-alarms + type: empty-dir + path: /opt/ccp/lua/modules/stacklight_alarms + env: + - name: CCP_ALERT_MANAGER_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName files: hindsight.cfg: path: /etc/hindsight/hindsight.cfg @@ -71,10 +99,6 @@ files: path: /var/lib/hindsight/run/input/kubelet_stats.cfg content: hindsight_kubelet_stats.cfg.j2 perm: "0600" - afd-node-default-cpu-alarms.cfg: - path: /var/lib/hindsight/run/analysis/afd_node_default_cpu_alarms.cfg - content: hindsight_afd_node_default_cpu_alarms.cfg.j2 - perm: "0600" snap.conf: path: /etc/snap/snap.conf content: snap.conf.j2 @@ -83,3 +107,11 @@ files: path: /etc/snap/auto/task.json content: snap-task.json.j2 perm: "0600" + alarms.yaml: + path: /etc/alarm-manager/alarms.yaml + content: alarms.yaml + perm: "0600" + lua-cfg-template.j2: + path: /etc/alarm-manager/templates/alarm_manager_lua_config_template.cfg.j2 + content: alarm_manager_lua_config_template.cfg.j2 + perm: "0600" diff --git a/tox.ini b/tox.ini index 3c1338c..bb67163 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,11 @@ [tox] minversion = 1.6 -envlist = linters,bashate +envlist = linters,bashate,py34,py27,pep8 skipsdist = True +[testenv:pep8] +commands = flake8 {posargs} + [testenv:venv] commands = {posargs} @@ -15,3 +18,10 @@ commands = deps = bashate>=0.2 whitelist_externals = bash commands = bash -c "find 
{toxinidir} -type f -name '*.sh' -not -path '*/.tox/*' -print0 | xargs -0 bashate -v" + +[flake8] +# E123, E125 skipped as they are invalid PEP-8. +show-source = True +ignore = E123,E125,H102 +builtins = _ +exclude=.venv,.git,.tox,dist,doc,*openstack/common*,*lib/python*,*egg,build