Add Lua code for alarming

This commit adds Lua code for generating AFD (Anomaly and
Fault Detection) metrics based on the evaluation of alarms.
The Lua code was copied from the lma_collector Fuel plugin
[*], with changes to accommodate Hindsight and the versions
of lua_sandbox and lua_sandbox_extensions we rely on.

In the future we plan to move this Lua code into its own Git
repository. And the Hindsight Dockerfile will install the
Lua code in the image using Debian packages.

The afd_node_default_cpu_alarms.lua and
hindsight_afd_node_default_cpu_alarms.cfg.j2 files will be
removed. Instead the operator will configure alarms through
a YAML file, and we will use a sidecar container for
generating Lua tables including alarm definitions and
corresponding plugin configuration files.

[*] https://github.com/openstack/fuel-plugin-lma-collector/

Change-Id: If182c3a6453f7bf8b72f03af56a14ace109eaa68
This commit is contained in:
Éric Lemoine 2016-09-12 12:22:14 +00:00
parent 8149af754f
commit ed5934cd36
15 changed files with 1520 additions and 0 deletions

View File

@ -19,7 +19,9 @@ RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 1FA22B08 \
ADD output/*.lua /var/lib/hindsight/run/output/
ADD input/*.lua /var/lib/hindsight/run/input/
ADD analysis/*.lua /var/lib/hindsight/run/analysis/
ADD modules/*.lua /opt/ccp/lua/modules/stacklight/
ADD modules_alarms/afd_node_default_cpu_alarms.lua /opt/ccp/lua/modules/stacklight_alarms/
RUN useradd --user-group hindsight \
&& usermod -a -G microservices hindsight \

View File

@ -0,0 +1,120 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local string = require 'string'
local message = require 'stacklight.message'
local afd = require 'stacklight.afd'
local afd_annotation = require 'stacklight.afd_annotation'
-- Kind of AFD metric produced by this plugin: 'node' or 'service'
local afd_type = read_config('afd_type') or error('afd_type must be specified!')

-- For each AFD kind: message type, name of the status metric and name of the
-- field identifying the entity
local afd_settings = {
    node = {'afd_node_metric', 'node_status', 'node_role'},
    service = {'afd_service_metric', 'service_status', 'service'},
}
local settings = afd_settings[afd_type] or error('invalid afd_type value')
local msg_type = settings[1]
local msg_field_name = settings[2]
local afd_entity = settings[3]

-- ie: controller for node AFD / rabbitmq for service AFD
local afd_entity_value = read_config('afd_cluster_name')
if not afd_entity_value then
    error('afd_cluster_name must be specified!')
end

-- ie: cpu for node AFD / queue for service AFD
local msg_field_source = read_config('afd_logical_name')
if not msg_field_source then
    error('afd_logical_name must be specified!')
end

local hostname = read_config('hostname') or error('hostname must be specified')

-- name of the Lua module (under stacklight_alarms) holding the alarm definitions
local afd_file = read_config('afd_file') or error('afd_file must be specified')
local all_alarms = require('stacklight_alarms.' .. afd_file)

local A = require 'stacklight.afd_alarms'
A.load_alarms(all_alarms)
-- Feed the datapoint carried by the current message to the loaded alarms.
-- Returns 0 on success, or -1 and an error message when the values cannot
-- be read or when a field required by a rule is absent from the message.
function process_message()
    local name = read_message('Fields[name]')
    local timestamp = read_message('Timestamp')

    local value, err_msg = message.read_values()
    if not value then
        return -1, err_msg
    end

    -- collect the value of every field the alarms need for this metric
    local fields = {}
    for _, key in ipairs(A.get_metric_fields(name)) do
        local found = read_message(string.format('Fields[%s]', key))
        if not found then
            return -1, "Cannot find Fields[" .. key .. "] for the metric " .. name
        end
        fields[key] = found
    end

    A.add_value(timestamp, name, value, fields)
    return 0
end
-- Evaluate the alarms and, when at least one of them was due, inject an AFD
-- metric followed by an annotation. The first call only records the start
-- time of the alarms.
--
-- Message example:
-- msg = {
--     Type = 'afd_node_metric',
--     Payload = '{"alarms":[...]}',
--     Fields = {
--         name = 'node_status',
--         value = 0,
--         hostname = 'node1',
--         source = 'cpu',
--         node_role = 'controller',
--         dimensions = {'node_role', 'source', 'hostname'},
--     }
-- }
function timer_event(ns)
    if not A.is_started() then
        A.set_start_time(ns)
        return
    end

    local state, alarms = A.evaluate(ns)
    if not state then
        -- no alarm produced a state on this tick
        return
    end

    for _, alarm in ipairs(alarms) do
        local alert = alarm.alert
        afd.add_to_alarms(
            alarm.state,
            alert['function'],
            alert.metric,
            alert.fields,
            {}, -- tags
            alert.operator,
            alert.value,
            alert.threshold,
            alert.window,
            alert.periods,
            alert.message)
    end

    local msg = afd.inject_afd_metric(
        msg_type, afd_entity, afd_entity_value, msg_field_name,
        state, hostname, msg_field_source)
    if msg then
        afd_annotation.inject_afd_annotation(msg)
    end
end

View File

@ -0,0 +1,181 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local cjson = require 'cjson'
local string = require 'string'
local table = require 'table'
local utils = require 'stacklight.utils'
local constants = require 'stacklight.constants'
local read_message = read_message
local assert = assert
local ipairs = ipairs
local pcall = pcall
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Return the value stored under `name` in the Fields table of a message
local function read_field(msg, name)
    local fields = msg.Fields
    return fields[name]
end
-- Return the status of an AFD message (its 'value' field)
function read_status(msg)
    local status = read_field(msg, 'value')
    return status
end
-- Return the 'source' field of an AFD message
function read_source(msg)
    local source = read_field(msg, 'source')
    return source
end
-- Return the 'hostname' field of an AFD message
function read_hostname(msg)
    local hostname = read_field(msg, 'hostname')
    return hostname
end
-- Return the 'alarms' entry of the JSON payload of a message.
-- Returns nil when the payload cannot be decoded, when the decoded value
-- cannot be indexed (for instance a number) or when it has no 'alarms' entry.
function extract_alarms(msg)
    local raw = msg.Payload
    -- the lookup is done under pcall too because a payload that decodes to a
    -- non-table value would otherwise raise when indexed
    local ok, found = pcall(function()
        return cjson.decode(raw).alarms
    end)
    if not ok or not found then
        return nil
    end
    return found
end
-- Build a human-readable message from an alarm table, for instance:
-- "CPU load too high (WARNING, rule='last(load_midterm)>=5', current=7.00)"
-- The metric is suffixed with its fields (name="value" pairs) when the alarm
-- has some, and the host is appended when alarm.hostname is set.
function get_alarm_for_human(alarm)
    local metric = alarm.metric
    if #(alarm.fields) > 0 then
        local parts = {}
        for i, field in ipairs(alarm.fields) do
            parts[i] = field.name .. '="' .. field.value .. '"'
        end
        metric = string.format('%s[%s]', alarm.metric, table.concat(parts, ','))
    end

    local host = alarm.hostname and string.format(', host=%s', alarm.hostname) or ''

    local template = "%s (%s, rule='%s(%s)%s%s', current=%.2f%s)"
    return string.format(
        template,
        alarm.message,
        alarm.severity,
        alarm['function'],
        metric,
        alarm.operator,
        alarm.threshold,
        alarm.value,
        host
    )
end
-- Return the list of human-readable messages for a list of alarms.
-- Alarms tagged with dependency_level == 'hint' are listed last, after an
-- "Other related alarms:" separator line.
function alarms_for_human(alarms)
    local messages, hints = {}, {}
    for _, alarm in ipairs(alarms) do
        local tags = alarm.tags
        local bucket = messages
        if tags and tags.dependency_level == 'hint' then
            bucket = hints
        end
        bucket[#bucket+1] = get_alarm_for_human(alarm)
    end

    if #hints > 0 then
        messages[#messages+1] = "Other related alarms:"
    end
    for _, hint in ipairs(hints) do
        messages[#messages+1] = hint
    end
    return messages
end
-- list of pending alarms, sent when inject_afd_metric is called
local alarms = {}

-- Append an alarm to the list of pending alarms.
-- Raises an assertion error when status has no known label. fields and tags
-- default to empty tables, window and periods default to 0.
function add_to_alarms(status, fn, metric, fields, tags, operator, value, threshold, window, periods, message)
    local label = constants.status_label(status)
    assert(label)

    local entry = {
        severity = label,
        ['function'] = fn,
        metric = metric,
        fields = fields or {},
        tags = tags or {},
        operator = operator,
        value = value,
        threshold = threshold,
        window = window or 0,
        periods = periods or 0,
        message = message
    }
    alarms[#alarms+1] = entry
end
-- Return the list of pending alarms (the list itself, not a copy)
function get_alarms()
    local pending = alarms
    return pending
end
-- Drop all pending alarms by starting over with a new empty list
function reset_alarms()
    alarms = {}
end
-- Inject an AFD metric message carrying the pending alarms as JSON payload.
-- The pending alarms are reset once encoded. Returns the injected message on
-- success, nothing when the alarms cannot be encoded, or nil and an error
-- message when the injection fails.
function inject_afd_metric(msg_type, msg_tag_name, msg_tag_value, metric_name,
                           value, hostname, source)
    -- default payload written by hand because cjson encodes empty tables as
    -- objects instead of arrays
    local payload = '{"alarms":[]}'
    if #alarms > 0 then
        payload = utils.safe_json_encode({alarms=alarms})
        reset_alarms()
        if not payload then
            return
        end
    end

    local fields = {
        name = metric_name,
        value = value,
        hostname = hostname,
        source = source,
        dimensions = {msg_tag_name, 'hostname', 'source'},
    }
    fields[msg_tag_name] = msg_tag_value

    local msg = {
        Type = msg_type,
        Payload = payload,
        Fields = fields
    }

    local err_code, err_msg = utils.safe_inject_message(msg)
    if err_code ~= 0 then
        return nil, err_msg
    end
    return msg
end
-- possible results of a rule evaluation
MATCH, NO_MATCH, NO_DATA, MISSING_DATA = 1, 2, 3, 4
return M

View File

@ -0,0 +1,224 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local assert = assert
local ipairs = ipairs
local pairs = pairs
local string = string
local setmetatable = setmetatable
local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local afd = require 'stacklight.afd'
local Rule = require 'stacklight.afd_rule'
-- Map the lower-case severity names accepted in alarm definitions (severity
-- and no_data_policy) to the status values of stacklight.constants
local SEVERITIES = {
warning = constants.WARN,
critical = constants.CRIT,
down = constants.DOWN,
unknown = constants.UNKW,
okay = constants.OKAY,
}
local Alarm = {}
Alarm.__index = Alarm
setfenv(1, Alarm) -- Remove external access to contain everything in the module
-- Build an Alarm object from an alarm definition table.
--
-- The definition provides name, description, severity, an optional
-- no_data_policy ('skip' or a severity name, 'unknown' when absent) and a
-- trigger holding the rules and an optional logical_operator ('or' when
-- absent). Raises an error when the severity or the no_data_policy isn't a
-- known name.
function Alarm.new(alarm)
    local a = {}
    setmetatable(a, Alarm)
    a._metrics_list = nil
    a.name = alarm.name
    a.description = alarm.description

    if alarm.trigger.logical_operator then
        a.logical_operator = string.lower(alarm.trigger.logical_operator)
    else
        a.logical_operator = 'or'
    end

    a.severity_str = string.upper(alarm.severity)
    a.severity = SEVERITIES[string.lower(alarm.severity)]
    assert(a.severity ~= nil, 'unknown alarm severity')

    a.skip_when_no_data = false
    if alarm.no_data_policy then
        if string.lower(alarm.no_data_policy) == 'skip' then
            a.skip_when_no_data = true
        else
            a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_policy)]
        end
    else
        a.no_data_severity = constants.UNKW
    end
    assert(a.skip_when_no_data or a.no_data_severity ~= nil,
           'unknown no_data_policy')

    a.rules = {}
    -- The longest rule wait is tracked in seconds and converted to
    -- nanoseconds once at the end; comparing seconds against a value already
    -- in nanoseconds would ignore every rule after the first one.
    local max_wait = 0
    for _, rule in ipairs(alarm.trigger.rules) do
        local r = Rule.new(rule)
        a.rules[#a.rules+1] = r
        local wait = r.window * r.periods
        if wait > max_wait then
            max_wait = wait
        end
    end
    a.initial_wait = max_wait * 1e9
    a.start_time_ns = 0

    return a
end
-- Return the list of distinct metric names used by the rules of the alarm.
-- The list is computed on the first call and cached in self._metrics_list.
function Alarm:get_metrics()
    if not self._metrics_list then
        local metrics = {}
        for _, rule in ipairs(self.rules) do
            -- look the name up in the list being built so that a metric
            -- shared by several rules is listed once
            if not table_utils.item_find(rule.metric, metrics) then
                metrics[#metrics+1] = rule.metric
            end
        end
        self._metrics_list = metrics
    end
    return self._metrics_list
end
-- Return the names of the fields used by the rules evaluating metric_name:
-- the keys of each rule's fields followed by its group_by entries.
-- The list can contain duplicate names.
function Alarm:get_metric_fields(metric_name)
    local names = {}
    for _, rule in ipairs(self.rules) do
        if rule.metric == metric_name then
            for field_name in pairs(rule.fields) do
                names[#names+1] = field_name
            end
            for _, group in ipairs(rule.group_by) do
                names[#names+1] = group
            end
        end
    end
    return names
end
-- Return true when one of the rules of the alarm uses the metric
function Alarm:has_metric(metric)
    local known_metrics = self:get_metrics()
    return table_utils.item_find(metric, known_metrics)
end
-- Dispatch a datapoint to every rule of the alarm evaluating the metric
function Alarm:add_value(ts, metric, value, fields)
    for _, rule in pairs(self.rules) do
        if rule.metric == metric then
            rule:add_value(ts, value, fields)
        end
    end
end
-- Convert a map of fields into a list of {name=..., value=...} tables:
-- {foo="bar"} --> {{name="foo", value="bar"}}
-- A missing map gives an empty list.
local function convert_field_list(fields)
    local named_fields = {}
    if fields then
        for field_name, field_value in pairs(fields) do
            named_fields[#named_fields+1] = {name=field_name, value=field_value}
        end
    end
    return named_fields
end
-- Evaluate all the rules of the alarm.
--
-- return: state of alarm and a list of alarm details.
-- The state is nil when data is missing and the alarm is configured to skip
-- in that case. The list is empty unless the state is neither nil nor OKAY:
--
--   {
--     {
--       severity = <severity label of the alarm>,
--       function = <rule function>, metric = <rule metric>,
--       operator = <relational operator>, threshold = <rule threshold>,
--       window = <rule window>, periods = <rule periods>,
--       value = <current value>,
--       fields = <list of {name=..., value=...}>,
--       message = <string>,
--     },
--   }
function Alarm:evaluate(ns)
    local alerts = {}
    local match_count = 0
    local has_unknown = false
    -- kept across iterations, like the rule loop has always done
    local text

    for _, rule in ipairs(self.rules) do
        local outcome, contexts = rule:evaluate(ns)
        if outcome == afd.MATCH then
            match_count = match_count + 1
            text = self.description
        elseif outcome == afd.MISSING_DATA then
            text = 'No datapoint have been received over the last ' .. rule.observation_window .. ' seconds'
            has_unknown = true
        elseif outcome == afd.NO_DATA then
            text = 'No datapoint have been received ever'
            has_unknown = true
        end

        for _, context in ipairs(contexts) do
            alerts[#alerts+1] = {
                severity = self.severity_str,
                ['function'] = rule.fct,
                metric = rule.metric,
                operator = rule.relational_operator,
                threshold = rule.threshold,
                window = rule.window,
                periods = rule.periods,
                value = context.value,
                fields = convert_field_list(context.fields),
                message = text
            }
        end
    end

    -- state to report when at least one rule lacks data
    local function no_data_state()
        if self.skip_when_no_data then
            return nil
        end
        return self.no_data_severity
    end

    local state = constants.OKAY
    local operator = self.logical_operator
    if operator == 'and' then
        -- missing data wins over matches, otherwise every rule must match
        if has_unknown then
            state = no_data_state()
        elseif match_count == #self.rules then
            state = self.severity
        end
    elseif operator == 'or' then
        -- a single match is enough, missing data only counts without match
        if match_count > 0 then
            state = self.severity
        elseif has_unknown then
            state = no_data_state()
        end
    end

    if state == nil or state == constants.OKAY then
        return state, {}
    end
    return state, alerts
end
-- Record the time (in nanoseconds) from which the initial wait is counted
function Alarm:set_start_time(ns)
    self.start_time_ns = ns
end
-- Return true once the initial wait of the alarm has elapsed since its start
-- time, false otherwise
function Alarm:is_evaluation_time(ns)
    local elapsed = ns - self.start_time_ns
    return elapsed >= self.initial_wait
end
return Alarm

View File

@ -0,0 +1,118 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local pairs = pairs
local ipairs = ipairs
local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local Alarm = require 'stacklight.afd_alarm'
local all_alarms = {}
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Return the list of distinct field names required by the loaded alarms for
-- the metric
function get_metric_fields(metric_name)
    local required = {}
    for _, alarm in pairs(all_alarms) do
        local alarm_fields = alarm:get_metric_fields(metric_name) or {}
        for _, field in pairs(alarm_fields) do
            if not table_utils.item_find(field, required) then
                required[#required+1] = field
            end
        end
    end
    return required
end
-- Return the list of loaded alarms having a rule on the metric
function get_interested_alarms(metric)
    local selected = {}
    for _, candidate in pairs(all_alarms) do
        if candidate:has_metric(metric) then
            selected[#selected+1] = candidate
        end
    end
    return selected
end
-- Hand a datapoint over to every alarm interested in the metric
function add_value(ts, metric, value, fields)
    for _, alarm in ipairs(get_interested_alarms(metric)) do
        alarm:add_value(ts, metric, value, fields)
    end
end
-- Forget all the loaded alarms by starting over with a new empty list
function reset_alarms()
    all_alarms = {}
end
-- Evaluate the loaded alarms whose initial wait has elapsed.
--
-- Returns the resulting state (nil when no alarm was due or when every
-- evaluated alarm was skipped) and the list of {state=..., alert=...} tables
-- built from the alerts of the evaluated alarms. The evaluation stops at the
-- first alarm raising a state other than OKAY or UNKW.
function evaluate(ns)
    local global_state
    local all_alerts = {}
    for _, alarm in pairs(all_alarms) do
        if alarm:is_evaluation_time(ns) then
            local state, alerts = alarm:evaluate(ns)
            global_state = constants.max_status(state, global_state)
            for _, a in ipairs(alerts) do
                all_alerts[#all_alerts+1] = { state=state, alert=a }
            end
            -- raise the first triggered alarm except for OKAY/UNKW states;
            -- a nil state (skipped alarm) isn't a triggered alarm and must
            -- not prevent the following alarms from being evaluated
            if global_state ~= nil and
                    global_state ~= constants.UNKW and
                    global_state ~= constants.OKAY then
                break
            end
        end
    end
    return global_state, all_alerts
end
-- Return the list of loaded alarms (the list itself, not a copy)
function get_alarms()
    local loaded = all_alarms
    return loaded
end
-- Return the first loaded alarm with the given name, nothing when there is
-- no such alarm
function get_alarm(alarm_name)
    for _, candidate in ipairs(all_alarms) do
        local found = candidate.name == alarm_name
        if found then
            return candidate
        end
    end
end
-- Build an Alarm from its definition and append it to the loaded alarms
function load_alarm(alarm)
    local built = Alarm.new(alarm)
    local position = #all_alarms + 1
    all_alarms[position] = built
end
-- Load every alarm definition of the list, in order
function load_alarms(alarms)
    for _, definition in ipairs(alarms) do
        load_alarm(definition)
    end
end
-- true once set_start_time has been called
local started = false

-- Set the start time of every loaded alarm and flag the module as started
function set_start_time(ns)
    for _, loaded in ipairs(all_alarms) do
        loaded:set_start_time(ns)
    end
    started = true
end
-- Return whether set_start_time has been called
function is_started()
    local flag = started
    return flag
end
return M

View File

@ -0,0 +1,99 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local string = require 'string'
local table = require 'table'
local utils = require 'stacklight.utils'
local consts = require 'stacklight.constants'
local afd = require 'stacklight.afd'
local M = {}
setfenv(1, M)
-- last status and alarm text seen for each source, indexed by source
local statuses = {}
-- Message table reused for every annotation: inject_afd_annotation fills in
-- the title, tags, text, source and hostname fields before injecting it
local annotation_msg = {
Type = 'metric',
Fields = {
name = 'annotation',
dimensions = {'source', 'hostname'},
value_fields = {'title', 'tags', 'text'},
title = nil,
tags = nil,
text = nil,
source = nil,
hostname = nil,
}
}
-- Inject an annotation message when the status carried by an AFD message
-- differs from the last status seen for the same source.
--
-- Returns -1 when the message has no source, no status or no decodable
-- alarms, 0 when there is nothing to report (first status is OKAY or the
-- status hasn't changed), and otherwise the result of
-- utils.safe_inject_message().
function inject_afd_annotation(msg)
    local previous
    local text
    -- local on purpose: an undeclared name would be written to the module
    -- environment and survive across calls
    local title

    local source = afd.read_source(msg)
    local status = afd.read_status(msg)
    local hostname = afd.read_hostname(msg)
    local alarms = afd.extract_alarms(msg)

    if not source or not status or not alarms then
        return -1
    end

    if not statuses[source] then
        statuses[source] = {}
    end
    previous = statuses[source]

    text = table.concat(afd.alarms_for_human(alarms), '<br />')

    -- build the title
    if not previous.status and status == consts.OKAY then
        -- don't send an annotation when we detect a new cluster which is OKAY
        return 0
    elseif not previous.status then
        title = string.format('General status is %s',
                              consts.status_label(status))
    elseif previous.status ~= status then
        title = string.format('General status %s -> %s',
                              consts.status_label(previous.status),
                              consts.status_label(status))
    -- TODO(pasquier-s): generate an annotation when the set of alarms has
    -- changed. the following code generated an annotation whenever at least
    -- one value associated to an alarm was changing. This led to way too
    -- many annotations with alarms monitoring the CPU usage for instance.
    -- elseif previous.text ~= text then
    --     title = string.format('General status remains %s',
    --                           consts.status_label(status))
    else
        -- nothing has changed since the last message
        return 0
    end

    annotation_msg.Fields.title = title
    annotation_msg.Fields.tags = source
    annotation_msg.Fields.text = text
    annotation_msg.Fields.source = source
    annotation_msg.Fields.hostname = hostname

    -- store the last status and alarm text for future messages
    previous.status = status
    previous.text = text

    return utils.safe_inject_message(annotation_msg)
end
return M

View File

@ -0,0 +1,279 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local circular_buffer = require 'circular_buffer'
local stats = require 'lsb.stats'
local setmetatable = setmetatable
local ipairs = ipairs
local pairs = pairs
local math = require 'math'
local string = string
local table = table
local assert = assert
local type = type
-- StackLight libs
local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local afd = require 'stacklight.afd'
local matching = require 'stacklight.value_matching'
-- window used by Rule.new when the rule definition has no positive window
local MIN_WINDOW = 10
-- number of periods used by Rule.new when the rule definition has no
-- positive periods
local MIN_PERIOD = 1
-- seconds per row of the circular buffers storing the datapoints
local SECONDS_PER_ROW = 5
local Rule = {}
Rule.__index = Rule
setfenv(1, Rule) -- Remove external access to contain everything in the module
-- Build a Rule object from a rule definition table.
--
-- The definition provides metric, function, relational_operator, threshold
-- and optionally window, periods, fields (field name -> matching
-- expression), group_by (list of field names) and value (index to pick in
-- table values).
function Rule.new(rule)
    -- numeric value of raw when it is strictly positive, fallback otherwise
    local function positive_or(raw, fallback)
        if raw and raw + 0 > 0 then
            return raw + 0
        end
        return fallback
    end

    local r = setmetatable({}, Rule)

    r.window = positive_or(rule.window, MIN_WINDOW)
    r.periods = positive_or(rule.periods, MIN_PERIOD)
    r.relational_operator = rule.relational_operator
    r.metric = rule.metric
    r.fields = rule.fields or {}

    -- one matcher per field expression
    r.field_matchers = {}
    for name, expression in pairs(r.fields) do
        r.field_matchers[name] = matching.new(expression)
    end

    r.fct = rule['function']
    r.threshold = rule.threshold + 0
    r.value_index = rule.value or nil -- Can be nil

    -- unique rule id: metric/function/window/periods then the fields sorted
    -- by name
    local id_parts = {r.metric, r.fct, r.window, r.periods}
    for name, expression in table_utils.orderedPairs(r.fields) do
        id_parts[#id_parts+1] = string.format('(%s=%s)', name, expression)
    end
    r.rule_id = table.concat(id_parts, '/')

    r.group_by = rule.group_by or {}

    local span = r.window * r.periods
    r.cbuf_size = math.ceil(span / SECONDS_PER_ROW)
    r.ids_datastore = {}
    r.datastore = {}
    r.observation_window = math.ceil(span)

    return r
end
-- Return the id of the datastore receiving a datapoint with these fields:
-- the rule id, followed by the values of the group_by fields when the rule
-- groups datapoints and fields are provided
function Rule:get_datastore_id(fields)
    local ungrouped = #self.group_by == 0 or fields == nil
    if ungrouped then
        return self.rule_id
    end

    local parts = {}
    parts[#parts + 1] = self.rule_id
    for _, group in ipairs(self.group_by) do
        parts[#parts + 1] = fields[group]
    end
    return table.concat(parts, '/')
end
-- Tell whether a datapoint with these fields is accepted by the rule.
-- Returns true when the rule has no field matcher or when at least one field
-- covered by a matcher matches; returns false as soon as a field covered by
-- a matcher doesn't match. Fields missing from the datapoint are ignored.
function Rule:fields_accepted(fields)
    local candidates = fields or {}
    local has_matchers = false
    local hits = 0

    for name, matcher in pairs(self.field_matchers) do
        has_matchers = true
        local candidate = candidates[name]
        if candidate ~= nil then
            if not matcher:matches(candidate) then
                return false
            end
            hits = hits + 1
        end
    end

    return (not has_matchers) or hits > 0
end
-- Create a one-column circular buffer of cbuf_size rows for the rule.
-- 'min' and 'max' are passed to the column header for these two functions,
-- 'sum' for all the other ones.
function Rule:get_circular_buffer()
    local is_extremum = self.fct == 'min' or self.fct == 'max'
    local header_fct = is_extremum and self.fct or 'sum'

    local buffer = circular_buffer.new(self.cbuf_size, 1, SECONDS_PER_ROW)
    buffer:set_header(1, self.metric, header_fct, header_fct)
    return buffer
end
-- Store a datapoint in the circular buffer of its datastore, creating the
-- datastore on first use. Datapoints whose fields aren't accepted by the
-- rule are dropped. value can be a table, in which case the entry referenced
-- by self.value_index is used; a nil value is ignored.
function Rule:add_value(ts, value, fields)
    if not self:fields_accepted(fields) then
        return
    end

    local sample = value
    if type(sample) == 'table' then
        sample = sample[self.value_index]
    end
    if sample == nil then
        return
    end

    local store_id = self:get_datastore_id(fields)
    if not self.datastore[store_id] then
        local entry = {
            fields = self.fields,
            cbuf = self:get_circular_buffer()
        }
        -- grouped rules remember the fields of the datapoint instead of the
        -- field expressions of the rule
        if #self.group_by > 0 then
            entry.fields = fields
        end
        self.datastore[store_id] = entry
        self:add_datastore(store_id)
    end

    local cbuf = self.datastore[store_id].cbuf
    if self.fct == 'avg' then
        cbuf:add(ts, 1, sample)
    else
        cbuf:set(ts, 1, sample)
    end
end
-- Register a datastore id unless it is already known
function Rule:add_datastore(id)
    if table_utils.item_find(id, self.ids_datastore) then
        return
    end
    self.ids_datastore[#self.ids_datastore+1] = id
end
-- Compare a value with the threshold of the rule using its relational
-- operator
function Rule:compare_threshold(value)
    local operator, limit = self.relational_operator, self.threshold
    return constants.compare_threshold(value, operator, limit)
end
-- Return true when value is neither nil nor NaN (NaN is the only value
-- different from itself)
local function isnumber(value)
    if value == nil then
        return false
    end
    return not (value ~= value)
end
-- functions a rule can apply to its datapoints; Rule:evaluate asserts that
-- self.fct is one of them
local available_functions = {last=true, avg=true, max=true, min=true, sum=true,
variance=true, sd=true, diff=true}
-- Evaluate the rule against the datapoints of all its datastores.
--
-- Returns the result of the evaluation and a list of contexts:
--
-- - afd.MATCH and the list of {value=v, fields=...} of the datastores that
--   match the threshold, plus one {value=-1, fields=...} per datastore with
--   missing data
-- - afd.MISSING_DATA and one {value=-1, fields=...} per datastore for which
--   no datapoint was found within the observation window
-- - afd.NO_MATCH and an empty list
-- - afd.NO_DATA and {{value=-1, fields=<rule fields>}} when the rule has not
--   evaluated any datastore
--
-- MATCH takes precedence over MISSING_DATA, which takes precedence over
-- NO_MATCH.
--
-- There is a drawback with the MISSING_DATA state: it can lead to false
-- positives. For example when the monitored thing (a filesystem for
-- instance) has been renamed or deleted, it is normal not to receive
-- datapoints anymore.
function Rule:evaluate(ns)
local fields = {}
local one_match, one_no_match, one_missing_data = false, false, false
for _, id in ipairs(self.ids_datastore) do
local data = self.datastore[id]
if data then
local cbuf_time = data.cbuf:current_time()
-- if we didn't receive datapoint within the observation window this means
-- we don't receive anymore data and cannot compute the rule.
if ns - cbuf_time > self.observation_window * 1e9 then
one_missing_data = true
fields[#fields+1] = {value = -1, fields = data.fields}
else
assert(available_functions[self.fct])
local result
if self.fct == 'last' then
-- walk back from ns, one row at a time, until a value is found or the
-- observation window is exhausted
local last
local t = ns
while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do
last = data.cbuf:get(t, 1)
t = t - SECONDS_PER_ROW * 1e9
end
if isnumber(last) then
result = last
else
one_missing_data = true
fields[#fields+1] = {value = -1, fields = data.fields}
end
elseif self.fct == 'diff' then
-- difference between the most recent and the oldest values found in the
-- observation window
local first, last
local t = ns
while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do
last = data.cbuf:get(t, 1)
t = t - SECONDS_PER_ROW * 1e9
end
if isnumber(last) then
-- walk forward from the start of the window to find the oldest value
t = ns - self.observation_window * 1e9
while (not isnumber(first)) and t <= ns do
first = data.cbuf:get(t, 1)
t = t + SECONDS_PER_ROW * 1e9
end
end
if not isnumber(last) or not isnumber(first) then
one_missing_data = true
fields[#fields+1] = {value = -1, fields = data.fields}
else
result = last - first
end
else
-- every other function is looked up by name in the lsb.stats module and
-- applied to the values returned by get_range(1)
local values = data.cbuf:get_range(1)
result = stats[self.fct](values)
end
if result then
local m = self:compare_threshold(result)
if m then
one_match = true
fields[#fields+1] = {value=result, fields=data.fields}
else
one_no_match = true
end
end
end
end
end
if one_match then
return afd.MATCH, fields
elseif one_missing_data then
return afd.MISSING_DATA, fields
elseif one_no_match then
return afd.NO_MATCH, {}
else
return afd.NO_DATA, {{value=-1, fields=self.fields}}
end
end
return Rule

View File

@ -0,0 +1,78 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- The status values were chosen to match with the Grafana constraints:
-- OKAY => green
-- WARN & UNKW => orange
-- CRIT & DOWN => red
OKAY, WARN, UNKW, CRIT, DOWN = 0, 1, 2, 3, 4
-- label of each status value
local STATUS_LABELS = {}
STATUS_LABELS[OKAY] = 'OKAY'
STATUS_LABELS[WARN] = 'WARN'
STATUS_LABELS[UNKW] = 'UNKNOWN'
STATUS_LABELS[CRIT] = 'CRITICAL'
STATUS_LABELS[DOWN] = 'DOWN'

-- Return the label of a status value, nil when the value isn't a status
function status_label(v)
    local label = STATUS_LABELS[v]
    return label
end
-- Weight of each status when two of them are compared; the ordering differs
-- from the status values: UNKW is the lightest status
local STATUS_WEIGHTS = {
    [UNKW]=0,
    [OKAY]=1,
    [WARN]=2,
    [CRIT]=3,
    [DOWN]=4
}

-- Return the heavier of two statuses. A missing status loses against the
-- other one, and val2 is returned when both weigh the same.
function max_status(val1, val2)
    if not val1 then
        return val2
    end
    if not val2 then
        return val1
    end
    if STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
        return val1
    end
    return val2
end
-- comparison functions indexed by operator, in symbolic and textual forms
local function is_eq(a, b) return a == b end
local function is_ne(a, b) return a ~= b end
local function is_gte(a, b) return a >= b end
local function is_gt(a, b) return a > b end
local function is_lte(a, b) return a <= b end
local function is_lt(a, b) return a < b end

local THRESHOLD_COMPARATORS = {
    ['=='] = is_eq, eq = is_eq,
    ['!='] = is_ne, ne = is_ne,
    ['>='] = is_gte, gte = is_gte,
    ['>'] = is_gt, gt = is_gt,
    ['<='] = is_lte, lte = is_lte,
    ['<'] = is_lt, lt = is_lt,
}

-- Compare value with threshold using the relational operator op.
-- Returns false when the operator is unknown.
function compare_threshold(value, op, threshold)
    local comparator = THRESHOLD_COMPARATORS[op]
    if not comparator then
        return false
    end
    return comparator(value, threshold)
end
return M

View File

@ -0,0 +1,34 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local l = require 'lpeg'
l.locale(l)
local tonumber = tonumber
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Return a pattern matching patt at the current position or, failing that,
-- after skipping one character and trying again
function anywhere (patt)
    local grammar = {
        patt + 1 * l.V(1)
    }
    return l.P(grammar)
end
-- Pattern matching one whitespace character
sp = l.space
-- Pattern used to match a number: an optional minus sign, one or more
-- digits and an optional fractional part, converted with tonumber.
-- NOTE(review): l.xdigit also accepts the letters a-f and the fractional
-- separator can be a comma; tonumber presumably returns nil for such
-- matches -- confirm this is what callers expect.
Number = l.P"-"^-1 * l.xdigit^1 * (l.S(".,") * l.xdigit^1 )^-1 / tonumber
return M

View File

@ -0,0 +1,83 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local table = require 'table'
local ipairs = ipairs
local pairs = pairs
local type = type
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Return the position (index) of the first occurrence of an item in a list,
-- nothing when the item isn't found or when list isn't a table
function item_pos(item, list)
    if type(list) ~= 'table' then
        return
    end
    for position, candidate in ipairs(list) do
        if candidate == item then
            return position
        end
    end
end
-- Return true if an item is present in the list, false otherwise
function item_find(item, list)
    local position = item_pos(item, list)
    return position ~= nil
end
-- from http://lua-users.org/wiki/SortedIteration
-- Return the sorted list of the keys of a table
function __genOrderedIndex( t )
    local keys = {}
    for key in pairs(t) do
        keys[#keys+1] = key
    end
    table.sort(keys)
    return keys
end
-- Equivalent of the next function, but returns the keys in the alphabetic
-- order. We use a temporary ordered key table that is stored in the
-- table being iterated (under the __orderedIndex key) and removed once the
-- iteration is over.
--
-- t is the table being iterated and state the key returned by the previous
-- call (nil on the first call). Returns the next key and its value, or
-- nothing when there is no more key.
function orderedNext(t, state)
    -- local on purpose: an undeclared name would be written to the module
    -- environment
    local key
    if state == nil then
        -- the first time, generate the index
        t.__orderedIndex = __genOrderedIndex( t )
        key = t.__orderedIndex[1]
    else
        -- fetch the next value
        local index = t.__orderedIndex
        for i = 1, #index do
            if index[i] == state then
                key = index[i+1]
                break
            end
        end
    end

    if key then
        return key, t[key]
    end

    -- no more value to return, cleanup
    t.__orderedIndex = nil
    return
end
-- Equivalent of the pairs() function on tables. Allows to iterate
-- in order:
--   for key, value in orderedPairs(t) do ... end
function orderedPairs(t)
    local iterator, initial_state = orderedNext, nil
    return iterator, t, initial_state
end
return M

View File

@ -0,0 +1,46 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local cjson = require 'cjson'
local inject_message = inject_message
local read_message = read_message
local string = string
local pcall = pcall
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- JSON-encode `v` through cjson.encode inside a protected call, so that an
-- encoding failure (for instance an encoded buffer exceeding the sandbox
-- limit) does not raise an error.
-- Returns the encoded data on success, nothing on failure.
function safe_json_encode(v)
    local succeeded, encoded = pcall(cjson.encode, v)
    if succeeded then
        return encoded
    end
    return
end
-- Run inject_message(msg) inside a protected call.
-- Returns 0 when the injection succeeded, or -1 followed by the error
-- message when inject_message raised an error.
function safe_inject_message(msg)
    local succeeded, failure = pcall(inject_message, msg)
    if succeeded then
        return 0
    end
    return -1, failure
end
return M

View File

@ -0,0 +1,171 @@
-- Copyright 2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local l = require "lpeg"
l.locale(l)
local pcall = pcall
local string = require 'string'
local patterns = require 'stacklight.patterns'
local error = error
local setmetatable = setmetatable
local tonumber = tonumber
local C = l.C
local P = l.P
local S = l.S
local V = l.V
local Ct = l.Ct
local Cc = l.Cc
-- Zero or more repetitions of patterns.sp (defined in stacklight.patterns;
-- presumably a whitespace pattern -- confirm against that module).
local Optional_space = patterns.sp^0
-- One or more patterns.sp followed by the end of the subject (-1): used to
-- detect expressions made of separators only.
local Only_spaces = patterns.sp^1 * -1
-- Wrap `pat` so that it tolerates optional spacing on both sides: the result
-- is Optional_space, then `pat`, then Optional_space again.
local function space(pat)
    local with_leading_space = Optional_space * pat
    return with_leading_space * Optional_space
end
-- Literal tokens of the match-expression language.
-- Relational operators (usable on numbers):
local EQ = P'=='
local NEQ = P'!='
local GT = P'>'
local LT = P'<'
local GTE = P'>='
local LTE = P'<='
-- Pattern operators (strings only); eval_tree applies them with string.find:
local MATCH = P'=~'
local NO_MATCH = P'!~'
-- Logical connectors between sub-expressions:
local OR = P'||'
local AND = P'&&'
-- Normalize the operator text captured by the grammar: an empty capture
-- (no operator written) stands for equality, anything else is returned as is.
local function get_operator(op)
    -- '==' is truthy, so the and/or idiom is safe here.
    return op == '' and '==' or op
end
-- Optional numeric comparison operator. '<=' and '>=' are listed before '>'
-- and '<' so that the ordered choice tries the two-character forms first.
-- The matched text goes through get_operator, so a missing operator is
-- reported as '=='.
local numerical_operator = (EQ + NEQ + LTE + GTE + GT + LT )^-1 / get_operator
-- One numeric term: optional operator (spaces allowed around it), then
-- patterns.Number, then optional trailing spaces.
local sub_numerical_expression = space(numerical_operator) * patterns.Number * Optional_space
-- Whole-subject check: numeric terms joined by runs of '||' or runs of '&&',
-- with nothing left over (-1 only matches at the end of the subject).
local is_plain_numeric = (sub_numerical_expression * ((OR^1 + AND^1) * sub_numerical_expression)^0) * -1
-- String operand: either a non-empty double-quoted text (captured without the
-- quotes) or a non-empty run of characters that are not patterns.sp.
local quoted_string = (P'"' * C((P(1) - (P'"'))^1) * P'"' + C((P(1) - patterns.sp)^1))
-- Optional string operator; a missing operator is reported as '=='.
local string_operator = (EQ + NEQ + MATCH + NO_MATCH)^-1 / get_operator
-- One string term: optional operator, then a string operand, then optional
-- trailing spaces.
local sub_string_expression = space(string_operator) * quoted_string * Optional_space
-- Whole-subject check for string terms, same shape as is_plain_numeric.
local is_plain_string = (sub_string_expression * ((OR^1 + AND^1) * sub_string_expression)^0) * -1
-- Grammar producing the tree consumed by eval_tree for numeric expressions.
-- 'OR' is the start rule and is built on top of 'AND', so '&&' groups more
-- tightly than '||'. Every rule wraps its captures in a table (Ct):
--   SUB -> the captures of one term, operator first
--   AND -> {'and', SUB, AND} when '&&' is present, otherwise {SUB}
--   OR  -> {'or', AND, OR} when '||' is present, otherwise {AND}
-- The trailing -1 requires the whole subject to be consumed.
local numerical_expression = P {
'OR';
AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
SUB = Ct(sub_numerical_expression)
} * -1
-- Same grammar as numerical_expression, with string terms as leaves.
local string_expression = P {
'OR';
AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
SUB = Ct(sub_string_expression)
} * -1
-- Used by MatchExpression.new to choose between tree-based evaluation and
-- simple equality. patterns.anywhere comes from stacklight.patterns;
-- presumably it matches when one of these operators occurs anywhere in the
-- subject -- confirm against that module.
local is_complex = patterns.anywhere(EQ + NEQ + LTE + GTE + GT + LT + MATCH + NO_MATCH + OR + AND)
-- Leaf evaluators, indexed by operator text. Each one receives the value
-- under test and the operand taken from the tree.
local comparators = {
    ['=='] = function(value, matcher) return value == matcher end,
    ['!='] = function(value, matcher) return value ~= matcher end,
    ['>'] = function(value, matcher) return value > matcher end,
    ['<'] = function(value, matcher) return value < matcher end,
    ['>='] = function(value, matcher) return value >= matcher end,
    ['<='] = function(value, matcher) return value <= matcher end,
    -- Lua pattern search; any error raised by string.find counts as no match.
    ['=~'] = function(value, matcher)
        local ok, found = pcall(string.find, value, matcher)
        return ok and found ~= nil
    end,
    -- Negated pattern search; an error raised by string.find yields false too.
    ['!~'] = function(value, matcher)
        local ok, found = pcall(string.find, value, matcher)
        return ok and found == nil
    end,
}

-- Evaluate a parsed expression tree against `value`.
--
-- A tree node is one of:
--   {child}                     -> result of the nested node
--   {'and'|'or', child, ...}    -> logical combination of every child
--   {operator, operand}         -> comparison of `value` with the operand
-- An unknown operator evaluates to false.
local function eval_tree(tree, value)
    local head = tree[1]
    if type(head) == 'table' then
        -- Wrapper node: only the first element is significant.
        return eval_tree(head, value)
    end

    if head == 'and' or head == 'or' then
        local outcome = eval_tree(tree[2], value)
        -- Every remaining child is evaluated, there is no short-circuit.
        for i = 3, #tree do
            local child_outcome = eval_tree(tree[i], value)
            if head == 'or' then
                outcome = outcome or child_outcome
            else
                outcome = outcome and child_outcome
            end
        end
        return outcome
    end

    local compare = comparators[head]
    if compare == nil then
        return false
    end
    return compare(value, tree[2])
end
-- Class-like table: instances get MatchExpression as metatable and resolve
-- their methods through __index.
local MatchExpression = {}
MatchExpression.__index = MatchExpression
-- From here on the chunk's environment is the MatchExpression table, so the
-- functions defined below cannot see regular globals: they may only use the
-- locals declared above (and any bare global assignment would land in
-- MatchExpression).
setfenv(1, MatchExpression) -- Remove external access to contain everything in the module
-- Build a matcher from the textual `expression`.
--
-- When the expression contains no operator it is kept verbatim and matched by
-- simple equality. Otherwise it is parsed, as a numeric expression first and
-- as a string expression second, into a tree stored in the `tree` field.
--
-- Raises 'Expression is empty' for an empty or separators-only expression,
-- and 'Invalid expression: ...' when an expression with operators cannot be
-- parsed.
function MatchExpression.new(expression)
    local self = setmetatable({}, MatchExpression)

    if not is_complex:match(expression) then
        -- No operator at all: plain equality against the raw text.
        if expression == '' or Only_spaces:match(expression) then
            error('Expression is empty')
        end
        self.is_simple_equality_matching = true
    else
        local numeric = is_plain_numeric:match(expression) ~= nil
        self.is_plain_numeric_exp = numeric
        if numeric then
            self.tree = numerical_expression:match(expression)
        elseif is_plain_string:match(expression) ~= nil then
            self.tree = string_expression:match(expression)
        end
        if self.tree == nil then
            error('Invalid expression: ' .. expression)
        end
    end

    self.expression = expression
    return self
end
-- Tell whether `value` satisfies this expression.
--
-- Simple expressions (no operator) match when the value equals the expression
-- text, or when both sides are equal once the textual side is converted to a
-- number. Parsed expressions are evaluated with eval_tree; for purely numeric
-- expressions the value is converted with tonumber() first and a value that
-- is not numeric never matches.
--
-- @param value  the value to test (string or number)
-- @return true when the value matches, false otherwise
function MatchExpression:matches(value)
    if self.is_simple_equality_matching then
        if self.expression == value then
            return true
        end
        -- tonumber() yields nil for non-numeric input; a failed conversion
        -- must never count as a match (otherwise a nil value would "equal"
        -- any non-numeric expression through nil == nil).
        local expected = tonumber(self.expression)
        if expected ~= nil and expected == value then
            return true
        end
        local actual = tonumber(value)
        return actual ~= nil and actual == self.expression
    end
    if self.is_plain_numeric_exp then
        value = tonumber(value)
        if value == nil then
            return false
        end
    end
    return eval_tree(self.tree, value)
end
return MatchExpression

View File

@ -0,0 +1,71 @@
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Build one alarm rule applying the 'avg' function to `metric`, restricted
-- to the entries whose cpuID field is 'all'. Window and periods are the same
-- for every default CPU rule.
local function avg_rule(metric, relational_operator, threshold)
    return {
        metric = metric,
        fields = { cpuID = 'all' },
        relational_operator = relational_operator,
        threshold = threshold,
        window = '120',
        periods = '0',
        ['function'] = 'avg',  -- `function` is a reserved word, keep the brackets
    }
end

-- Default CPU alarm definitions: each alarm triggers when any ('or') of its
-- rules is satisfied. Numeric settings are kept as strings.
local alarms = {
    {
        name = 'cpu-critical',
        description = 'The CPU usage is too high',
        severity = 'critical',
        trigger = {
            logical_operator = 'or',
            rules = {
                avg_rule('intel.procfs.cpu.idle_percentage', '<=', '5'),
                avg_rule('intel.procfs.cpu.iowait_percentage', '>=', '35'),
            },
        },
    },
    {
        name = 'cpu-warning',
        description = 'The CPU usage is high',
        severity = 'warning',
        trigger = {
            logical_operator = 'or',
            rules = {
                avg_rule('intel.procfs.cpu.idle_percentage', '<=', '15'),
                avg_rule('intel.procfs.cpu.iowait_percentage', '>=', '25'),
            },
        },
    },
}
return alarms

View File

@ -0,0 +1,9 @@
-- Hindsight plugin configuration for the default node CPU alarms.
-- This file is a template: the hostname placeholder at the bottom is
-- substituted when the file is rendered.

-- Lua script run by this plugin.
filename = "afd.lua"
-- NOTE(review): 7 is presumably the most verbose (debug) level -- confirm
-- this is intended outside of development.
log_level = 7
-- "TRUE" presumably lets every message through to the plugin -- confirm.
message_matcher = "TRUE"
-- Timer callback period (presumably in seconds -- confirm).
ticker_interval = 10
-- Settings read by afd.lua (semantics defined there, not visible here).
afd_type = "node"
-- Presumably the name of the alarm definitions module to load -- confirm.
afd_file = "afd_node_default_cpu_alarms"
afd_cluster_name = "default"
afd_logical_name = "cpu"
hostname = "{{ CCP_HINDSIGHT_NODE_NAME }}"

View File

@ -15,6 +15,7 @@ service:
- prune-input.cfg
- influxdb-tcp.cfg
- kubelet-stats.cfg
- afd-node-default-cpu-alarms.cfg
volumes:
- name: hindsight-output
type: empty-dir
@ -70,6 +71,10 @@ files:
path: /var/lib/hindsight/run/input/kubelet_stats.cfg
content: hindsight_kubelet_stats.cfg.j2
perm: "0600"
afd-node-default-cpu-alarms.cfg:
path: /var/lib/hindsight/run/analysis/afd_node_default_cpu_alarms.cfg
content: hindsight_afd_node_default_cpu_alarms.cfg.j2
perm: "0600"
snap.conf:
path: /etc/snap/snap.conf
content: snap.conf.j2