Implement alarm-manager

alarm-manager is responsible for watching a configurable
location within the filesystem where the user can put
a YAML file which defines alarms. When a change or creation
is detected, the YAML file is checked for proper contents
and, if verification is successful, Lua code is generated
as well as Lua configuration files. Hindsight will pick
up those changes after a certain period of time and
provide proper alarming to the platform.

Change-Id: I7b2b98f379c49bdbf23177a038bdca9433d1c6e5
This commit is contained in:
Olivier Bourdon 2016-09-12 21:25:36 +02:00 committed by Olivier Bourdon
parent e1e524dfc5
commit 4b0617cd24
11 changed files with 869 additions and 10 deletions

View File

@ -0,0 +1,20 @@
FROM {{ namespace }}/base:{{ tag }}
MAINTAINER {{ maintainer }}

# Install alarm-manager and dependencies
COPY alarm-manager.py /opt/ccp/bin/
COPY requirements.txt /tmp/requirements.txt
COPY config-files /etc/alarm-manager/

# patch, gcc and python-dev are only needed while pip builds the Python
# requirements, so they are installed and purged within the same layer.
# The package index is refreshed in this very RUN so the install does not
# depend on whatever (possibly stale or missing) lists the base image ships,
# and the apt caches are dropped last so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends patch gcc python-dev \
    && pip install --no-cache-dir -r /tmp/requirements.txt \
    && useradd --user-group alarm-manager \
    && usermod -a -G microservices alarm-manager \
    && chown -R alarm-manager: /etc/alarm-manager \
    && chmod 755 /opt/ccp/bin/alarm-manager.py \
    && rm -f /tmp/requirements.txt \
    && apt-get -y purge patch gcc python-dev \
    && apt-get -y autoremove \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

USER alarm-manager

View File

@ -0,0 +1,603 @@
#!/usr/bin/env python
#
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Global imports
# --------------
import argparse
import hashlib
import jinja2
import logging
import os
import pyinotify
import re
import sys
import yaml
# Best practice code for logging
# ------------------------------
try:
    # Available in the standard library starting with Python 2.7
    from logging import NullHandler
except ImportError:
    class NullHandler(logging.Handler):
        """Fallback handler that silently discards every record."""

        def emit(self, record):
            # Nothing to do: records are dropped on purpose
            pass
# Global variables initialization
# -------------------------------
# Directory holding the alarm-manager configuration; also the default
# location watched for the alarms definition file
dflt_cfg_dir = os.path.join('/etc', 'alarm-manager')

# Default logging configuration file
dflt_config = os.path.join(dflt_cfg_dir, 'config', 'alarm-manager.ini')

# Default Jinja2 templates: generated LUA code and its configuration file
dflt_template = os.path.join(
    dflt_cfg_dir, 'templates', 'lua_alarming_template.j2')
dflt_cfg_template = os.path.join(
    dflt_cfg_dir, 'templates', 'alarm_manager_lua_config_template.cfg.j2')

# Default destination directories for generated code and configuration
dflt_dest_dir = os.path.join(
    '/opt', 'ccp', 'lua', 'modules', 'stacklight_alarms')
dflt_cfg_dest_dir = os.path.join(
    '/var', 'lib', 'hindsight', 'load', 'analysis')

# Name of the alarms definition file looked up in the watched directory
dflt_alarm_file = 'alarms.yaml'
# Logging initialization
# ----------------------
def logger_init(cfg_file):
    """Initialize and return the root logger.

    The logging configuration is read from ``cfg_file`` (fileConfig
    format). When the file can not be used (``None``, missing, malformed)
    the root logger falls back to a DEBUG level console handler writing
    to stdout.

    @param cfg_file: Path to the logging configuration file, or None.
    @type cfg_file: String or None.
    @return: The root logger.
    """
    # ``import logging`` alone does not load the ``logging.config``
    # submodule: without this import fileConfig() below raises
    # AttributeError, which the broad except would swallow, and the
    # configuration file would never be taken into account.
    import logging.config

    log = logging.getLogger()
    log.setLevel(logging.DEBUG)
    try:
        log.debug('Looking for log configuration file: %s' % cfg_file)
        # Default logging configuration file
        logging.config.fileConfig(cfg_file)
    except Exception:
        # Only add handler if not already done (this function is called
        # more than once during the program lifetime)
        if not log.handlers:
            # Hardcoded default logging configuration if no/bad config file
            console_handler = logging.StreamHandler(sys.stdout)
            fmt_str = "[%(asctime)s.%(msecs)03d %(name)s %(levelname)s] " \
                      "%(message)s"
            console_handler.setFormatter(
                logging.Formatter(fmt_str, "%Y-%m-%d %H:%M:%S"))
            log.addHandler(console_handler)
        log.setLevel(logging.DEBUG)
        log.debug('Defaulting to stdout')
    return log


log = logger_init(None)
# Class for keeping configuration parameters
# ------------------------------------------
class AlarmConfig:
    """
    Plain container for the parameters of a generation run.
    """

    def __init__(self, code_dest_dir, config_dest_dir,
                 source_file, template, config_template):
        # Checksum of the last processed source file content,
        # None as long as no file has been processed
        self._sha256 = None
        # Alarms definition file
        self._source_file = source_file
        # Destination directories: generated code / generated configuration
        self._code_dest_dir, self._config_dest_dir = (
            code_dest_dir, config_dest_dir)
        # Templates: generated code / generated configuration
        self._template, self._config_template = template, config_template
# Class for processing inotify events
# ------------------------------------
class InotifyEventsHandler(pyinotify.ProcessEvent):
    """
    Class used to process inotify events: regenerates the LUA code and
    configuration files each time the watched alarms file is reported.
    """

    def my_init(self, cfg, name, out=None):
        """
        Store the handler parameters (given as keyword arguments when the
        handler is instantiated).

        @param cfg: configuration to use for generation callback.
        @type cfg: AlarmConfig.
        @param name: File name to be watched.
        @type name: String.
        @param out: Logger where events will be written, defaults to the
                    module level logger.
        @type out: Object providing a valid logging object interface.
        """
        if out is None:
            out = log
        self._out = out
        self._cfg = cfg
        self._name = name

    def process_default(self, event):
        """
        Log the received event and, when it concerns the watched file,
        trigger the generation of LUA code and configuration files.

        @param event: Event to be processed.
        @type event: Event instance
        """
        self._out.debug(
            'Received event %s'
            % str(event))
        # File name on which inotify event has been triggered does
        # not match => return right away
        if event.name != self._name:
            self._out.debug(
                'Ignoring event %s (path does not match %s)'
                % (str(event), self._name))
            return
        self._out.info('File %s has been updated' % event.name)
        # Callback function called with proper parameters
        if not yaml_alarms_2_lua_and_hindsight_cfg_files(self._cfg):
            # Report through the logger handed to my_init(), like every
            # other message of this handler
            self._out.error('Error converting YAML alarms into LUA code')
# Check alarm entry for field existence and type
# TODO: see if we can use similar methods from
# fuel-ccp which uses jsonschema to validate types.
# -------------------------------------------------
def check_alarm_entry_field(alarm, field, ftype):
    """Check that an alarm entry holds a field, optionally of a given type.

    Every failure is logged as an error.

    @param alarm: Alarm entry to check.
    @type alarm: Dictionary.
    @param field: Name of the field which must be present.
    @type field: String.
    @param ftype: Expected type of the field value, None to skip the
                  type check.
    @type ftype: Type or None.
    @return: True if the entry is valid, False otherwise.
    """
    try:
        # An alarm entry must be a mapping to hold named fields
        if not isinstance(alarm, dict):
            log.error('Error parsing file: alarm entry is not a '
                      'dictionary (%s): %s'
                      % (type(alarm).__name__, alarm))
            return False
        # Field lookup
        if field not in alarm:
            log.error('Error parsing file: alarm entry does '
                      'not have a %s field: %s'
                      % (field, alarm))
            return False
        # Do we need to check for proper type too ?
        if ftype is not None:
            vfield = alarm[field]
            if not isinstance(vfield, ftype):
                # The message is one single literal (implicit
                # concatenation) so that % applies to all placeholders:
                # with '+' only the last fragment would be formatted.
                log.error('Error parsing file: alarm entry field %s '
                          'is not of type %s: found %s [%s]'
                          % (field, ftype.__name__,
                             type(vfield).__name__, alarm))
                return False
    except Exception as e:
        log.error('Error checking for %s: %s' % (field, e))
        return False
    return True
# YAML alarms structure validation
#
# TODO: see if we can use similar methods from
# fuel-ccp which uses jsonschema to validate types.
#
# TODO: do not return false right away
# when processing lists so that most errors
# are reported at once allowing for faster
# achievement of correctness
# -------------------------------------------------
def validate_yaml(alarms_yaml):
    """Validate the structure of the parsed alarms definitions.

    Checks performed:
      - 'alarms' is a non empty-valued list whose entries all have a
        string 'name' field,
      - every group under 'node_cluster_alarms' has an 'alarms' mapping
        whose values are lists of strings,
      - every alarm name referenced by a group is defined in 'alarms'.

    The first error found is logged and ends the validation.

    @param alarms_yaml: Parsed content of the alarms YAML file.
    @type alarms_yaml: Dictionary.
    @return: True if the structure is valid, False otherwise.
    """
    log.info('Validating YAML alarms structure')
    ctx = ''
    try:
        if not isinstance(alarms_yaml, dict):
            log.error('Error parsing file: top level entry is not a '
                      'dictionary (%s)' % type(alarms_yaml).__name__)
            return False
        log.debug('Retrieving all alarms')
        # Try to retrieve alarms definitions
        # and check for overall validity
        alarms = alarms_yaml['alarms']
        if alarms is None:
            log.error('Error parsing file: empty alarm list')
            return False
        # alarms entry should be a list
        if not isinstance(alarms, list):
            log.error('Error parsing file: alarms entry is not a list (%s)'
                      % type(alarms).__name__)
            return False
        # Keep the complete list of alarm names
        anames = []
        # Checking all alarms
        for alarm in alarms:
            if not check_alarm_entry_field(alarm, 'name', str):
                return False
            # TODO do we need to add some more checks here ?
            anames.append(alarm['name'])
        log.debug('Found %d alarms' % len(anames))
        defined_names = set(anames)
        # Try to retrieve alarms groups definitions
        # and check overall validity
        log.debug('Retrieving alarms groups')
        cluster_alarms = alarms_yaml['node_cluster_alarms']
        for ckey in cluster_alarms.keys():
            log.debug('Parsing alarms group %s' % ckey)
            ctx = ' under node_cluster_alarms[%s]' % ckey
            # Are there some alarm key defined
            # (if not, the next line throws exception)
            c_alarms = cluster_alarms[ckey]['alarms']
            if c_alarms is None:
                log.error('Error parsing file: empty alarm list%s' % ctx)
                return False
            # Now check validity of alarm entries
            akeys = c_alarms.keys()
            log.debug('Found %d alarms in group %s' % (len(akeys), ckey))
            for k in akeys:
                # Must be a list
                v = c_alarms[k]
                if not isinstance(v, list):
                    # Single literal: % must apply to all placeholders
                    log.error('Error parsing file: alarm entry for %s '
                              'is not a list (%s)%s'
                              % (k, type(v).__name__, ctx))
                    return False
                # Each member of list must be a string
                for s in v:
                    if not isinstance(s, str):
                        log.error('Error parsing file: alarm entry for %s '
                                  'is not a list of strings (%s) [%s]%s'
                                  % (k, type(s).__name__, s, ctx))
                        return False
            # Now check that all alarms referenced in this group have
            # been defined. Done within the groups loop so that every
            # group gets checked, not only the last one.
            for agroup in akeys:
                for aname in c_alarms[agroup]:
                    if aname not in defined_names:
                        log.error(
                            ('Error parsing file: alarm with name %s is '
                             'not defined but is referenced in alarm '
                             'group %s')
                            % (aname, agroup))
                        return False
    except KeyError as e:
        log.error('Error parsing file: can not find %s key%s' % (e, ctx))
        return False
    except Exception as e:
        log.error('Error parsing file: unknown exception %s %s'
                  % (type(e), str(e)))
        return False
    return True
# Retrieve alarm by its name within list
# --------------------------------------
def find_alarm_by_name(aname, alarms):
    """Return the first entry of alarms named aname, None if not found."""
    matching = (entry for entry in alarms if entry['name'] == aname)
    return next(matching, None)
# Check file for content changes and returns boolean
# True => content has changed
# False = content is unchanged
#
# File path can be altered using string substitutions
# so as to adapt to Hindsight current running state
# when file are moved around once taken into account
# ---------------------------------------------------
def content_changed(file_fullpath, file_content, replace=None):
    """Tell whether a file needs to be (re)written with file_content.

    @param file_fullpath: Path of the file to check.
    @type file_fullpath: String.
    @param file_content: Content the file is expected to hold.
    @type file_content: String.
    @param replace: Optional substitutions (old => new) applied to the
                    path before the file is looked up.
    @type replace: Dictionary or None.
    @return: True if the file is missing, unreadable or holds a different
             content, False if its content equals file_content.
    """
    log.debug('Checking file %s for changes' % file_fullpath)
    fullpath = file_fullpath
    # Apply the requested path substitutions, if any
    if replace is not None:
        for old, new in replace.items():
            fullpath = fullpath.replace(old, new)
        log.debug('Checking file path %s for changes' % fullpath)
    # A missing file has to be created: report it as changed
    if not os.path.isfile(fullpath):
        log.debug('File %s does not exist' % fullpath)
        return True
    # Compare the current content of the file to the expected one
    with open(fullpath, 'r') as in_fd:
        try:
            same = in_fd.read() == file_content
        except Exception as e:
            log.error('Error reading %s got exception: %s' % (fullpath, e))
            return True
    if same:
        log.debug('File %s content has not changed' % fullpath)
        return False
    log.debug('File %s content has changed' % fullpath)
    return True
# Convert YAML file containing alarms into lua code
# and create Hindsight configuration files
# -------------------------------------------------
def _write_generated_file(dest_file, content):
    """Write content into dest_file, return False (and log) on failure."""
    try:
        with open(dest_file, 'w') as out_fd:
            out_fd.write(content)
    except Exception as e:
        log.error('Error writing %s: got exception: %s' % (dest_file, e))
        return False
    return True


def yaml_alarms_2_lua_and_hindsight_cfg_files(
        alarm_config):
    """Generate LUA code and Hindsight configuration files from alarms.

    The alarms YAML file is read, validated, and for each alarm group of
    each entry of 'node_cluster_alarms' one LUA code file and one
    configuration file are rendered from the templates. Files are only
    written when their content changes.

    @param alarm_config: Parameters of the generation (source file,
                         destination directories, templates). Its _sha256
                         attribute is updated after a successful run.
    @type alarm_config: AlarmConfig.
    @return: True on success (or when the source file is unchanged since
             the last successful run), False otherwise.
    """
    lua_code_dest_dir = alarm_config._code_dest_dir
    lua_config_dest_dir = alarm_config._config_dest_dir
    yaml_file = alarm_config._source_file
    template = alarm_config._template
    cfg_template = alarm_config._config_template
    log.info(
        'Converting alarm YAML file %s to LUA code in %s and configs in %s'
        % (yaml_file, lua_code_dest_dir, lua_config_dest_dir))
    try:
        if os.stat(yaml_file).st_size == 0:
            log.error('File %s will not be parsed: size = 0' % yaml_file)
            return False
        # Read raw bytes: hashlib only accepts bytes on Python 3 and the
        # YAML parser handles bytes as well as text
        with open(yaml_file, 'rb') as in_fd:
            alarms_defs = in_fd.read()
        sha256sum = hashlib.sha256(alarms_defs).hexdigest()
        if sha256sum == alarm_config._sha256:
            log.warning('No change detected in file: %s' % yaml_file)
            return True
        # Retrieve YAML structure if correctly formed.
        # NOTE: safe_load() deliberately replaces load() here: the file is
        # provided by the user and load() can instantiate arbitrary Python
        # objects. Alarms definitions only need plain YAML types.
        try:
            alarms_yaml = yaml.safe_load(alarms_defs)
        except yaml.YAMLError as exc:
            log.error('Error parsing file: %s' % exc)
            return False
        # Check overall validity of alarms definitions
        if not validate_yaml(alarms_yaml):
            log.error('Error validating alarms definitions')
            return False
        # Now retrieve the information for config and code files generation
        cluster_alarms = alarms_yaml['node_cluster_alarms']
        for afd_cluster_name in cluster_alarms:
            group_alarms = cluster_alarms[afd_cluster_name]['alarms']
            for key in group_alarms.keys():
                # Key can not contain dash or other non letter/numbers
                if not re.match('^[A-Za-z0-9]*$', key):
                    log.error('Alarm group name can only contain letters '
                              'and digits: %s'
                              % key)
                    return False
                # Build list of associated alarms
                alarms = [
                    find_alarm_by_name(aname, alarms_yaml['alarms'])
                    for aname in group_alarms[key]]
                # Base name shared by the code and the config files
                afd_file = 'afd_node_%s_%s_alarms' % (afd_cluster_name, key)
                # Write LUA code file
                lua_code_dest_file = os.path.join(
                    lua_code_dest_dir, "%s.lua" % afd_file)
                lua_code = template.render(alarms=alarms)
                updated_lua_code = False
                # Check if the generated code has changed
                if not content_changed(lua_code_dest_file, lua_code):
                    log.info('Unchanged LUA file %s' % lua_code_dest_file)
                else:
                    # LUA code changes should force re-generation of config
                    # file so as to force Hindsight to take changes into
                    # account
                    updated_lua_code = True
                    log.info('Writing LUA file: %s' % lua_code_dest_file)
                    # Produce LUA code file corresponding to alarm
                    if not _write_generated_file(
                            lua_code_dest_file, lua_code):
                        return False
                # Write LUA config file
                lua_config_dest_file = os.path.join(
                    lua_config_dest_dir, "%s.cfg" % afd_file)
                lua_config = cfg_template.render(
                    afd_file=afd_file,
                    afd_cluster_name=afd_cluster_name,
                    afd_logical_name=key
                )
                # Check if the generated config has changed
                # or if we need to force config writing due to
                # changes in LUA code above
                #
                # Note that config is written into .../load/...
                # and moved to .../run/... by Hindsight
                config_changed = content_changed(
                    lua_config_dest_file,
                    lua_config,
                    {'/load/': '/run/'})
                if not config_changed and not updated_lua_code:
                    log.info('Unchanged config file %s'
                             % lua_config_dest_file)
                else:
                    log.info('Writing config file: %s'
                             % lua_config_dest_file)
                    if not _write_generated_file(
                            lua_config_dest_file, lua_config):
                        return False
        # Remember the checksum only once everything has been generated,
        # so that a failed run is retried even if the file is unchanged
        alarm_config._sha256 = sha256sum
    except Exception as e:
        log.error('Error got exception: %s' % e)
        return False
    return True
# Command line argument parsing
# -----------------------------
def _load_j2_template(template_path):
    """Return the Jinja2 template stored in file template_path."""
    j2_env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(os.path.dirname(template_path)),
        trim_blocks=True)
    return j2_env.get_template(os.path.basename(template_path))


def cmd_line_args_parser(argv=None):
    """Parse the command line and run the alarm manager.

    After the arguments have been checked, the alarms file found in the
    watched directory (if any) is converted once, then, unless the exit
    option is given, the watched directory is monitored forever and the
    conversion is run again each time the alarms file is written.

    The program exits with status 1 when a directory or template given
    on the command line can not be used.

    @param argv: Arguments to parse, defaults to the command line ones.
    @type argv: List of strings or None.
    """
    parser = argparse.ArgumentParser(
        description="""Alarm manager watches for new alarms definitions
        in specified directory and applies them TBC ...
        """
    )
    parser.add_argument(
        '-c', '--config',
        help='log level and format configuration file (default %s)'
        % dflt_config,
        default=dflt_config,
        dest='config'
    )
    parser.add_argument(
        '-d', '--code-destdir',
        help='destination path for LUA plugins code files '
        '(default %s)' % dflt_dest_dir,
        default=dflt_dest_dir,
        dest='code_dest_dir'
    )
    parser.add_argument(
        '-D', '--config-destdir',
        help='destination path for LUA plugins configuration '
        'files (default %s)' % dflt_cfg_dest_dir,
        default=dflt_cfg_dest_dir,
        dest='config_dest_dir'
    )
    parser.add_argument(
        '-t', '--template',
        help='LUA template file (default %s)' % dflt_template,
        default=dflt_template,
        dest='template'
    )
    parser.add_argument(
        '-T', '--config-template',
        help='LUA plugins configuration template file (default %s)'
        % dflt_cfg_template,
        default=dflt_cfg_template,
        dest='cfg_template'
    )
    parser.add_argument(
        '-w', '--watch-path',
        help='path to watch for changes (default %s)' % dflt_cfg_dir,
        default=dflt_cfg_dir,
        dest='watch_path'
    )
    parser.add_argument(
        '-x', '--exit',
        help='exit program without watching filesystem changes',
        action='store_true',
        dest='exit'
    )
    args = parser.parse_args(argv)
    log = logger_init(args.config)
    log.info('Watch path: %s\n\tConfig: %s\n\tTemplate: %s'
             % (args.watch_path, args.config, args.template))
    # Watched directory must be readable
    if not (os.path.isdir(args.watch_path) and
            os.access(args.watch_path, os.R_OK)):
        log.error("{} not a directory or is not readable"
                  .format(args.watch_path))
        sys.exit(1)
    # Destination directories must be writable
    for dest_dir in (args.code_dest_dir, args.config_dest_dir):
        if not (os.path.isdir(dest_dir) and os.access(dest_dir, os.W_OK)):
            log.error("{} not a directory or is not writable"
                      .format(dest_dir))
            sys.exit(1)
    # Templates must be readable files
    for template_file in (args.template, args.cfg_template):
        if not (os.path.isfile(template_file) and
                os.access(template_file, os.R_OK)):
            log.error("{} not a file or is not readable"
                      .format(template_file))
            sys.exit(1)
    src = os.path.join(args.watch_path, dflt_alarm_file)
    log.info('Using LUA template %s and LUA config template %s'
             % (args.template, args.cfg_template))
    # The configuration is needed by the inotify handler below, so it is
    # built whether or not the alarms file already exists
    alarm_cfg = AlarmConfig(
        args.code_dest_dir,
        args.config_dest_dir,
        src,
        _load_j2_template(args.template),
        _load_j2_template(args.cfg_template))
    log.info('Looking for existing readable file: %s' % src)
    if os.access(src, os.R_OK):
        if not yaml_alarms_2_lua_and_hindsight_cfg_files(alarm_cfg):
            log.error('Error converting YAML alarms into LUA code')
    # Asked to leave right away or continue watching inotify events ?
    if args.exit:
        sys.exit(0)
    # watch manager instance
    wm = pyinotify.WatchManager()
    # notifier instance and init
    notifier = pyinotify.Notifier(
        wm,
        default_proc_fun=InotifyEventsHandler(
            cfg=alarm_cfg,
            name=dflt_alarm_file))
    # What mask to apply
    mask = pyinotify.IN_CLOSE_WRITE
    log.debug('Start monitoring of %s' % args.watch_path)
    # Do not recursively dive into path
    # Do not add watches on newly created subdir in path
    # Do not do globbing on path name
    wm.add_watch(args.watch_path,
                 mask, rec=False,
                 auto_add=False,
                 do_glob=False)
    # Loop forever (until sigint signal get caught)
    notifier.loop(callback=None)


if __name__ == '__main__':
    cmd_line_args_parser()

View File

@ -0,0 +1,22 @@
# Python logging configuration, in the format read by
# logging.config.fileConfig().
# Only the root logger is configured.
[loggers]
keys=root
[handlers]
keys=stream_handler
[formatters]
keys=formatter
# Root logger: DEBUG level, records sent to the stream handler below
[logger_root]
level=DEBUG
handlers=stream_handler
# Stream handler writing to standard output
[handler_stream_handler]
class=StreamHandler
level=DEBUG
formatter=formatter
args=(sys.stdout,)
# Record layout: timestamp with milliseconds, logger name, level, message
[formatter_formatter]
format=%(asctime)s.%(msecs)03d - %(name)s - %(levelname)s - %(message)s
datefmt=%Y-%m-%d %H:%M:%S

View File

@ -0,0 +1,41 @@
{# Renders the list of alarm definitions given as "alarms" into a Lua #}
{# module returning one table entry per alarm. Keys are sorted so that #}
{# the same definitions always render to the same text. #}
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
local alarms = {
{% for alarm in alarms %}
  {
{% for fkey in alarm.keys()|sort() %}
{% if fkey != "trigger" %}
    ['{{ fkey }}'] = '{{ alarm[fkey] }}',
{% endif %}
{% endfor %}
{% if alarm.trigger is defined %}
    ['trigger'] = {
{% if alarm.trigger.logical_operator is defined %}
      ['logical_operator'] = '{{ alarm.trigger.logical_operator }}',
{% endif %}
      ['rules'] = {
{% for rule in alarm.trigger.rules %}
        {
{% for fkey in rule.keys()|sort() %}
{% if fkey != "fields" %}
          ['{{ fkey }}'] = '{{ rule[fkey] }}',
{% endif %}
{% endfor %}
{% if rule.fields is defined %}
          ['fields'] = {
{# Each entry ends with a comma: without a separator a rule with #}
{# more than one field renders an invalid Lua table. #}
{% for fkey in rule.fields.keys()|sort() %}
            ['{{ fkey }}'] = '{{ rule.fields[fkey] }}',
{% endfor %}
          },
{% endif %}
        },
{% endfor %}
      },
    },
{% endif %}
  },
{% endfor %}
}
return alarms

View File

@ -0,0 +1,9 @@
pyinotify==0.9.6
# Mandatory utilities for use in CCP
dumb-init # init system for containers
python-etcd
netifaces
six==1.10.0 # MIT
# Common utilities
PyYAML==3.12 # BSD License (3 clause)
Jinja2==2.8 # MIT

View File

@ -5,6 +5,7 @@ MAINTAINER {{ maintainer }}
COPY sources.mos.list /etc/apt/sources.list.d/
COPY mos.pref /etc/apt/preferences.d/
COPY bootstrap-hindsight.sh /opt/ccp/bin/
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 1FA22B08 \
&& apt-get update \
@ -19,10 +20,10 @@ ADD output/*.lua /var/lib/hindsight/run/output/
ADD input/*.lua /var/lib/hindsight/run/input/
ADD analysis/*.lua /var/lib/hindsight/run/analysis/
ADD modules/*.lua /opt/ccp/lua/modules/stacklight/
ADD modules_alarms/afd_node_default_cpu_alarms.lua /opt/ccp/lua/modules/stacklight_alarms/
RUN useradd --user-group hindsight \
&& usermod -a -G microservices hindsight \
&& chown -R hindsight: /var/lib/hindsight /etc/hindsight
&& chown -R hindsight: /var/lib/hindsight /etc/hindsight \
&& tar cf - -C /var/lib hindsight | tar xf - -C /opt/ccp
USER hindsight

View File

@ -0,0 +1,31 @@
#!/bin/bash
# This script is used for bootstrapping
# Hindsight with proper directories contents
# when using emptydir Kubernetes volumes
# As these are created empty
# Hindsight will not start properly
# as files will be missing
# Therefore the need to run this script
# with the proper destination directory
# as its command line parameter
#
# Usage: bootstrap-hindsight.sh directory
# Exits with a non zero status if the arguments are wrong, if a
# directory is missing, or if the copy fails.
set -e
# Make the copy pipeline below fail when the producing tar fails
set -o pipefail

if [ $# -ne 1 ]; then
    echo "Usage: $0 directory" >&2
    exit 1
fi
if [ ! -d "$1" ]; then
    echo "Error: $1 does not exist or is not a directory" >&2
    exit 1
fi

# Reference copy of the Hindsight directories
SRC=/opt/ccp/hindsight
if [ ! -d "$SRC" ]; then
    echo "Error: $SRC does not exist or is not a directory" >&2
    exit 1
fi

# Paths are quoted so that directories with spaces are handled properly
tar cf - -C "$SRC" . | tar xf - -C "$1" --strip-components=1

View File

@ -0,0 +1,11 @@
-- Template of the Hindsight configuration file generated for each alarm
-- group; it runs the afd.lua plugin.
-- NOTE(review): this file looks like it is templated twice: the hostname
-- placeholder is outside of the raw section, while the afd_ placeholders
-- inside of it are the ones filled by alarm-manager - TODO confirm.
filename = "afd.lua"
-- log_level = 7
message_matcher = "TRUE"
ticker_interval = 10
{% raw %}
-- Values below are given by alarm-manager when it renders this template
afd_type = "node"
afd_file = "{{ afd_file }}"
afd_cluster_name = "{{ afd_cluster_name }}"
afd_logical_name = "{{ afd_logical_name }}"
{% endraw %}
hostname = "{{ CCP_ALERT_MANAGER_NODE_NAME }}"

79
service/files/alarms.yaml Normal file
View File

@ -0,0 +1,79 @@
alarms:
- name: 'root-fs-warning'
description: 'The root filesystem free space is low'
severity: 'warning'
enabled: 'true'
trigger:
rules:
- metric: 'intel.procfs.filesystem.space_percent_free'
fields:
filesystem: 'rootfs'
relational_operator: '<'
threshold: 10
window: 60
periods: 0
function: min
- name: 'root-fs-critical'
description: 'The root filesystem free space is too low'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: 'intel.procfs.filesystem.space_percent_free'
fields:
filesystem: 'rootfs'
relational_operator: '<'
threshold: 5
window: 60
periods: 0
function: min
- name: 'cpu-critical'
description: 'The CPU usage is too high'
severity: 'critical'
trigger:
logical_operator: 'or'
rules:
- metric: 'intel.procfs.cpu.idle_percentage'
fields:
cpuID: 'all'
relational_operator: '<='
threshold: '5'
window: '120'
periods: '0'
function: 'avg'
- metric: 'intel.procfs.cpu.iowait_percentage'
fields:
cpuID: 'all'
relational_operator: '>='
threshold: '35'
window: '120'
periods: '0'
function: 'avg'
- name: 'cpu-warning'
description: 'The CPU usage is high'
severity: 'warning'
trigger:
logical_operator: 'or'
rules:
- metric: 'intel.procfs.cpu.idle_percentage'
fields:
cpuID: 'all'
relational_operator: '<='
threshold: '15'
window: '120'
periods: '0'
function: 'avg'
- metric: 'intel.procfs.cpu.iowait_percentage'
fields:
cpuID: 'all'
relational_operator: '>='
threshold: '25'
window: '120'
periods: '0'
function: 'avg'
node_cluster_alarms:
system:
alarms:
rootfs: ['root-fs-critical', 'root-fs-warning']
cpu: ['cpu-critical', 'cpu-warning']

View File

@ -7,6 +7,10 @@ service:
probes:
readiness: "true"
liveness: "true"
pre:
- name: service-bootstrap
type: local
command: /opt/ccp/bin/bootstrap-hindsight.sh /var/lib/hindsight
daemon:
command: /usr/bin/hindsight /etc/hindsight/hindsight.cfg
files:
@ -15,11 +19,13 @@ service:
- prune-input.cfg
- influxdb-tcp.cfg
- kubelet-stats.cfg
- afd-node-default-cpu-alarms.cfg
volumes:
- name: hindsight-output
- name: hindsight
type: empty-dir
path: /var/lib/hindsight/output
path: /var/lib/hindsight
- name: stacklight-alarms
type: empty-dir
path: /opt/ccp/lua/modules/stacklight_alarms
env:
- name: CCP_HINDSIGHT_NODE_NAME
valueFrom:
@ -50,6 +56,28 @@ service:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: alarm-manager
image: alarm-manager
probes:
readiness: "true"
liveness: "true"
daemon:
command: /opt/ccp/bin/alarm-manager.py -w /etc/alarm-manager
files:
- alarms.yaml
- lua-cfg-template.j2
volumes:
- name: hindsight
type: empty-dir
path: /var/lib/hindsight
- name: stacklight-alarms
type: empty-dir
path: /opt/ccp/lua/modules/stacklight_alarms
env:
- name: CCP_ALERT_MANAGER_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
files:
hindsight.cfg:
path: /etc/hindsight/hindsight.cfg
@ -71,10 +99,6 @@ files:
path: /var/lib/hindsight/run/input/kubelet_stats.cfg
content: hindsight_kubelet_stats.cfg.j2
perm: "0600"
afd-node-default-cpu-alarms.cfg:
path: /var/lib/hindsight/run/analysis/afd_node_default_cpu_alarms.cfg
content: hindsight_afd_node_default_cpu_alarms.cfg.j2
perm: "0600"
snap.conf:
path: /etc/snap/snap.conf
content: snap.conf.j2
@ -83,3 +107,11 @@ files:
path: /etc/snap/auto/task.json
content: snap-task.json.j2
perm: "0600"
alarms.yaml:
path: /etc/alarm-manager/alarms.yaml
content: alarms.yaml
perm: "0600"
lua-cfg-template.j2:
path: /etc/alarm-manager/templates/alarm_manager_lua_config_template.cfg.j2
content: alarm_manager_lua_config_template.cfg.j2
perm: "0600"

12
tox.ini
View File

@ -1,8 +1,11 @@
[tox]
minversion = 1.6
envlist = linters,bashate
envlist = linters,bashate,py34,py27,pep8
skipsdist = True
[testenv:pep8]
commands = flake8 {posargs}
[testenv:venv]
commands = {posargs}
@ -15,3 +18,10 @@ commands =
deps = bashate>=0.2
whitelist_externals = bash
commands = bash -c "find {toxinidir} -type f -name '*.sh' -not -path '*/.tox/*' -print0 | xargs -0 bashate -v"
[flake8]
# E123, E125 skipped as they are invalid PEP-8.
show-source = True
ignore = E123,E125,H102
builtins = _
exclude=.venv,.git,.tox,dist,doc,*openstack/common*,*lib/python*,*egg,build