Merge "Add Lua code for alarming"

This commit is contained in:
Jenkins 2016-09-16 10:25:27 +00:00 committed by Gerrit Code Review
commit 710e0c01bb
15 changed files with 1520 additions and 0 deletions

View File

@ -19,7 +19,9 @@ RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 1FA22B08 \
ADD output/*.lua /var/lib/hindsight/run/output/
ADD input/*.lua /var/lib/hindsight/run/input/
ADD analysis/*.lua /var/lib/hindsight/run/analysis/
ADD modules/*.lua /opt/ccp/lua/modules/stacklight/
ADD modules_alarms/afd_node_default_cpu_alarms.lua /opt/ccp/lua/modules/stacklight_alarms/
RUN useradd --user-group hindsight \
&& usermod -a -G microservices hindsight \

View File

@ -0,0 +1,120 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local string = require 'string'
local message = require 'stacklight.message'
local afd = require 'stacklight.afd'
local afd_annotation = require 'stacklight.afd_annotation'
-- Kind of AFD handled by this filter instance: 'node' or 'service'.
local afd_type = read_config('afd_type') or error('afd_type must be specified!')

-- For each AFD kind: {message type, status field name, entity dimension}.
local AFD_KINDS = {
    node = {'afd_node_metric', 'node_status', 'node_role'},
    service = {'afd_service_metric', 'service_status', 'service'},
}
local afd_kind = AFD_KINDS[afd_type] or error('invalid afd_type value')
local msg_type, msg_field_name, afd_entity = afd_kind[1], afd_kind[2], afd_kind[3]

-- Value of the entity dimension,
-- ie: controller for node AFD / rabbitmq for service AFD
local afd_entity_value = read_config('afd_cluster_name') or
    error('afd_cluster_name must be specified!')

-- Value of the 'source' field,
-- ie: cpu for node AFD / queue for service AFD
local msg_field_source = read_config('afd_logical_name') or
    error('afd_logical_name must be specified!')

local hostname = read_config('hostname') or error('hostname must be specified')

-- Name of the Lua module (under stacklight_alarms) holding the alarm definitions.
local afd_file = read_config('afd_file') or error('afd_file must be specified')
local all_alarms = require('stacklight_alarms.' .. afd_file)

-- Alarm registry; the definitions are loaded once, when the plugin starts.
local A = require 'stacklight.afd_alarms'
A.load_alarms(all_alarms)
-- Feed the datapoint carried by the current message to the alarm registry.
-- Returns 0 when the datapoint was handed over, or -1 and an error string when
-- the value cannot be read or a field required by the alarms is absent.
function process_message()
    local name = read_message('Fields[name]')
    local timestamp = read_message('Timestamp')

    local value, err_msg = message.read_values()
    if not value then
        return -1, err_msg
    end

    -- gather every field the alarms declare for this metric
    local collected = {}
    for _, field_name in ipairs(A.get_metric_fields(name)) do
        local found = read_message(string.format('Fields[%s]', field_name))
        if not found then
            return -1, "Cannot find Fields[" .. field_name .. "] for the metric " .. name
        end
        collected[field_name] = found
    end

    A.add_value(timestamp, name, value, collected)
    return 0
end
-- Periodic entry point. The first tick only records the start time of the
-- alarms; later ticks evaluate them and, when at least one alarm was due,
-- emit an AFD metric followed by an annotation.
function timer_event(ns)
    if not A.is_started() then
        A.set_start_time(ns)
        return
    end

    local state, alarms = A.evaluate(ns)
    if not state then
        -- no alarm was due for evaluation on this tick
        return
    end

    for _, alarm in ipairs(alarms) do
        local alert = alarm.alert
        afd.add_to_alarms(
            alarm.state,
            alert['function'],
            alert.metric,
            alert.fields,
            {}, -- tags
            alert.operator,
            alert.value,
            alert.threshold,
            alert.window,
            alert.periods,
            alert.message)
    end

    -- Message example:
    -- msg = {
    --     Type = 'afd_node_metric',
    --     Payload = '{"alarms":[...]}',
    --     Fields = {
    --         name = 'node_status',
    --         value = 0,
    --         hostname = 'node1',
    --         source = 'cpu',
    --         node_role = 'controller',
    --         dimensions = {'node_role', 'hostname', 'source'},
    --     }
    -- }
    local msg = afd.inject_afd_metric(
        msg_type, afd_entity, afd_entity_value, msg_field_name,
        state, hostname, msg_field_source)
    if msg then
        afd_annotation.inject_afd_annotation(msg)
    end
end

View File

@ -0,0 +1,181 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local cjson = require 'cjson'
local string = require 'string'
local table = require 'table'
local utils = require 'stacklight.utils'
local constants = require 'stacklight.constants'
local read_message = read_message
local assert = assert
local ipairs = ipairs
local pcall = pcall
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Look up one entry of the Fields table of an AFD message.
local function read_field(msg, name)
    local fields = msg.Fields
    return fields[name]
end

-- Status carried by an AFD message (its 'value' field).
function read_status(msg)
    local status = read_field(msg, 'value')
    return status
end

-- 'source' field of an AFD message.
function read_source(msg)
    local source = read_field(msg, 'source')
    return source
end

-- 'hostname' field of an AFD message.
function read_hostname(msg)
    local hostname = read_field(msg, 'hostname')
    return hostname
end
-- Decode the JSON payload of an AFD message and return its 'alarms' entry.
-- Returns nil when the payload cannot be decoded, when it decodes to a value
-- that cannot be indexed (number, boolean, null) or when it has no 'alarms'.
function extract_alarms(msg)
    local raw = msg.Payload
    -- decode AND index inside the protected call: the payload may be valid
    -- JSON without being an object, in which case indexing it would raise
    local ok, found = pcall(function()
        return cjson.decode(raw).alarms
    end)
    if not ok or not found then
        return nil
    end
    return found
end
-- Render one alarm table as a single human-readable line, for instance:
-- "CPU load too high (WARNING, rule='last(load_midterm)>=5', current=7.00)"
-- When the alarm has fields they are appended to the metric name as
-- metric[name="value",...]; a hostname, when present, is appended at the end.
function get_alarm_for_human(alarm)
    local metric = alarm.metric
    if #(alarm.fields) > 0 then
        local rendered = {}
        for _, f in ipairs(alarm.fields) do
            rendered[#rendered+1] = f.name .. '="' .. f.value .. '"'
        end
        metric = string.format('%s[%s]', alarm.metric, table.concat(rendered, ','))
    end

    local host_suffix = alarm.hostname and string.format(', host=%s', alarm.hostname) or ''

    return string.format(
        "%s (%s, rule='%s(%s)%s%s', current=%.2f%s)",
        alarm.message,
        alarm.severity,
        alarm['function'],
        metric,
        alarm.operator,
        alarm.threshold,
        alarm.value,
        host_suffix
    )
end
-- Render a list of alarms as a list of human-readable lines. Alarms tagged
-- with dependency_level == 'hint' are moved to the end of the list, after an
-- "Other related alarms:" separator line.
function alarms_for_human(alarms)
    local lines, hints = {}, {}
    for _, alarm in ipairs(alarms) do
        local tags = alarm.tags
        local bucket = lines
        if tags and tags.dependency_level == 'hint' then
            bucket = hints
        end
        bucket[#bucket+1] = get_alarm_for_human(alarm)
    end

    if #hints > 0 then
        lines[#lines+1] = "Other related alarms:"
        for _, hint in ipairs(hints) do
            lines[#lines+1] = hint
        end
    end
    return lines
end
local alarms = {}

-- Queue one alarm in the list of pending alarms; the list is sent (and
-- emptied) when inject_afd_metric is called.
-- 'status' must be a status known to constants.status_label; fields and tags
-- default to empty tables, window and periods default to 0.
function add_to_alarms(status, fn, metric, fields, tags, operator, value, threshold, window, periods, message)
    local label = constants.status_label(status)
    assert(label)

    local entry = {
        severity = label,
        ['function'] = fn,
        metric = metric,
        fields = fields or {},
        tags = tags or {},
        operator = operator,
        value = value,
        threshold = threshold,
        window = window or 0,
        periods = periods or 0,
        message = message
    }
    alarms[#alarms+1] = entry
end
-- Return the list of alarms queued by add_to_alarms and not yet sent.
function get_alarms()
return alarms
end
-- Drop every queued alarm by rebinding the module-level list to a new table
-- (a list previously obtained through get_alarms is left untouched).
function reset_alarms()
alarms = {}
end
-- Inject an AFD event into the pipeline. The pending alarms are serialized in
-- the payload and the pending list is emptied.
-- Returns the injected message on success; nothing when the alarms cannot be
-- encoded; nil and an error string when the injection fails.
function inject_afd_metric(msg_type, msg_tag_name, msg_tag_value, metric_name,
                           value, hostname, source)
    -- default payload is spelled out by hand
    -- because cjson encodes empty tables as objects instead of arrays
    local payload = '{"alarms":[]}'
    if #alarms > 0 then
        payload = utils.safe_json_encode({alarms=alarms})
        reset_alarms()
        if not payload then
            return
        end
    end

    local fields = {
        name = metric_name,
        value = value,
        hostname = hostname,
        source = source,
        dimensions = {msg_tag_name, 'hostname', 'source'},
    }
    fields[msg_tag_name] = msg_tag_value

    local msg = {
        Type = msg_type,
        Payload = payload,
        Fields = fields
    }

    local err_code, err_msg = utils.safe_inject_message(msg)
    if err_code == 0 then
        return msg
    end
    return nil, err_msg
end
-- Outcomes of a rule evaluation, exported on the module table; they are the
-- first value returned by Rule:evaluate in stacklight.afd_rule.
-- at least one datastore matched the threshold
MATCH = 1
-- data was available but nothing matched
NO_MATCH = 2
-- no datastore produced a result at all
NO_DATA = 3
-- a datastore stopped receiving datapoints
MISSING_DATA = 4
return M

View File

@ -0,0 +1,224 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local assert = assert
local ipairs = ipairs
local pairs = pairs
local string = string
local setmetatable = setmetatable
local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local afd = require 'stacklight.afd'
local Rule = require 'stacklight.afd_rule'
-- Map the lower-cased severity / no_data_policy names accepted in an alarm
-- definition to the status values of stacklight.constants.
local SEVERITIES = {
warning = constants.WARN,
critical = constants.CRIT,
down = constants.DOWN,
unknown = constants.UNKW,
okay = constants.OKAY,
}
local Alarm = {}
Alarm.__index = Alarm
setfenv(1, Alarm) -- Remove external access to contain everything in the module
-- Build an Alarm object from an alarm definition table.
-- Reads: name, description, severity, the optional no_data_policy ('skip' or
-- a severity name), trigger.logical_operator (defaults to 'or') and
-- trigger.rules (each one turned into a Rule).
-- Raises (assert) when the severity or the no_data_policy is unknown.
function Alarm.new(alarm)
    local a = {}
    setmetatable(a, Alarm)
    a._metrics_list = nil
    a.name = alarm.name
    a.description = alarm.description
    if alarm.trigger.logical_operator then
        a.logical_operator = string.lower(alarm.trigger.logical_operator)
    else
        a.logical_operator = 'or'
    end
    a.severity_str = string.upper(alarm.severity)
    a.severity = SEVERITIES[string.lower(alarm.severity)]
    assert(a.severity ~= nil)

    a.skip_when_no_data = false
    if alarm.no_data_policy then
        if string.lower(alarm.no_data_policy) == 'skip' then
            a.skip_when_no_data = true
        else
            a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_policy)]
        end
    else
        a.no_data_severity = constants.UNKW
    end
    assert(a.skip_when_no_data or a.no_data_severity ~= nil)

    a.rules = {}
    -- longest observation window among the rules, in nanoseconds
    a.initial_wait = 0
    for _, rule in ipairs(alarm.trigger.rules) do
        local r = Rule.new(rule)
        a.rules[#a.rules+1] = r
        -- convert to nanoseconds BEFORE comparing: initial_wait is kept in
        -- nanoseconds, so comparing it with seconds would ignore every rule
        -- after the first one
        local wait = r.window * r.periods * 1e9
        if wait > a.initial_wait then
            a.initial_wait = wait
        end
    end
    a.start_time_ns = 0

    return a
end
-- Return the list of distinct metric names used by the rules of the alarm.
-- The list is computed on the first call and cached in self._metrics_list.
function Alarm:get_metrics()
    if not self._metrics_list then
        -- accumulate in a local list so that the duplicate check looks at the
        -- names already collected
        local metrics = {}
        for _, rule in ipairs(self.rules) do
            if not table_utils.item_find(rule.metric, metrics) then
                metrics[#metrics+1] = rule.metric
            end
        end
        self._metrics_list = metrics
    end
    return self._metrics_list
end
-- List the field names that the rules watching 'metric_name' refer to: the
-- keys of each rule's field filters followed by its group_by names.
-- The same name may appear more than once.
function Alarm:get_metric_fields(metric_name)
    local names = {}
    local function append(name)
        names[#names+1] = name
    end

    for _, rule in ipairs(self.rules) do
        if rule.metric == metric_name then
            for filter_name in pairs(rule.fields) do
                append(filter_name)
            end
            for _, group_name in ipairs(rule.group_by) do
                append(group_name)
            end
        end
    end
    return names
end
-- Return true when one of the alarm's rules uses the given metric name,
-- false otherwise.
function Alarm:has_metric(metric)
return table_utils.item_find(metric, self:get_metrics())
end
-- Hand a datapoint over to every rule of the alarm that watches 'metric';
-- each rule stores it in its own datastore.
function Alarm:add_value(ts, metric, value, fields)
    for _, candidate in pairs(self.rules) do
        if candidate.metric == metric then
            candidate:add_value(ts, value, fields)
        end
    end
end
-- Turn a name -> value map into a list of {name=..., value=...} entries:
-- {foo="bar"} --> {{name="foo", value="bar"}}
-- A nil (or false) argument gives an empty list.
local function convert_field_list(fields)
    local list = {}
    if fields then
        for key, val in pairs(fields) do
            list[#list+1] = {name = key, value = val}
        end
    end
    return list
end
-- Evaluate every rule of the alarm and combine the outcomes with the
-- alarm's logical operator ('and' / 'or').
--
-- return: the state of the alarm and a list of alert details.
--   * state is the alarm severity when the trigger matches, the no-data
--     severity (or nil when the no-data policy is 'skip') when data is
--     missing, and OKAY otherwise;
--   * the list is empty when state is nil or OKAY, otherwise each entry is
--     {
--         severity = <alarm severity string>,
--         ['function'], metric, operator, threshold, window, periods =
--             <taken from the rule>,
--         value = <current value>,
--         fields = <list of {name=, value=} tables>,
--         message = <string>,
--     }
function Alarm:evaluate(ns)
    local alerts = {}
    local matched = 0
    local unknown_seen = false
    local text

    for _, rule in ipairs(self.rules) do
        local outcome, contexts = rule:evaluate(ns)
        if outcome == afd.MATCH then
            matched = matched + 1
            text = self.description
        elseif outcome == afd.MISSING_DATA then
            text = 'No datapoint have been received over the last ' .. rule.observation_window .. ' seconds'
            unknown_seen = true
        elseif outcome == afd.NO_DATA then
            text = 'No datapoint have been received ever'
            unknown_seen = true
        end

        for _, ctx in ipairs(contexts) do
            alerts[#alerts+1] = {
                severity = self.severity_str,
                ['function'] = rule.fct,
                metric = rule.metric,
                operator = rule.relational_operator,
                threshold = rule.threshold,
                window = rule.window,
                periods = rule.periods,
                value = ctx.value,
                fields = convert_field_list(ctx.fields),
                message = text
            }
        end
    end

    -- state reported when data is missing: nil means "skip this alarm"
    local no_data_state
    if not self.skip_when_no_data then
        no_data_state = self.no_data_severity
    end

    local state = constants.OKAY
    local operator = self.logical_operator
    if operator == 'and' then
        if unknown_seen then
            state = no_data_state
        elseif matched == #self.rules then
            state = self.severity
        end
    elseif operator == 'or' then
        if matched > 0 then
            state = self.severity
        elseif unknown_seen then
            state = no_data_state
        end
    end

    if state == nil or state == constants.OKAY then
        alerts = {}
    end
    return state, alerts
end
-- Record the time (in nanoseconds) from which the initial wait of the alarm
-- is measured; see Alarm:is_evaluation_time.
function Alarm:set_start_time(ns)
self.start_time_ns = ns
end
-- Tell whether the alarm may be evaluated at time 'ns': true once at least
-- initial_wait nanoseconds have elapsed since the recorded start time.
function Alarm:is_evaluation_time(ns)
    local elapsed = ns - self.start_time_ns
    return elapsed >= self.initial_wait
end
return Alarm

View File

@ -0,0 +1,118 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local pairs = pairs
local ipairs = ipairs
local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local Alarm = require 'stacklight.afd_alarm'
local all_alarms = {}
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Collect, without duplicates, the field names that the loaded alarms need
-- for the given metric.
function get_metric_fields(metric_name)
    local unique = {}
    for _, alarm in pairs(all_alarms) do
        local candidates = alarm:get_metric_fields(metric_name) or {}
        for _, field_name in pairs(candidates) do
            if not table_utils.item_find(field_name, unique) then
                unique[#unique+1] = field_name
            end
        end
    end
    return unique
end
-- Select the loaded alarms having at least one rule on the given metric.
function get_interested_alarms(metric)
    local selected = {}
    for _, candidate in pairs(all_alarms) do
        local wanted = candidate:has_metric(metric)
        if wanted then
            selected[#selected+1] = candidate
        end
    end
    return selected
end
-- Forward a datapoint to every loaded alarm that watches its metric.
function add_value(ts, metric, value, fields)
    for _, target in ipairs(get_interested_alarms(metric)) do
        target:add_value(ts, metric, value, fields)
    end
end
-- Forget every loaded alarm by rebinding the registry to an empty table.
function reset_alarms()
all_alarms = {}
end
-- Evaluate the loaded alarms that are due at time 'ns'.
-- Returns the combined state (nil when no alarm was due, or when every due
-- alarm was skipped) and the list of {state=..., alert=...} entries gathered
-- from the evaluated alarms.
function evaluate(ns)
    local global_state
    local all_alerts = {}
    -- ipairs, not pairs: the loop stops at the first alarm that raises, so
    -- the alarms have to be visited in the order they were loaded, and pairs
    -- gives no ordering guarantee
    for _, alarm in ipairs(all_alarms) do
        if alarm:is_evaluation_time(ns) then
            local state, alerts = alarm:evaluate(ns)
            global_state = constants.max_status(state, global_state)
            for _, a in ipairs(alerts) do
                all_alerts[#all_alerts+1] = { state=state, alert=a }
            end
            -- raise the first triggered alarm except for OKAY/UNKW states
            if global_state ~= constants.UNKW and global_state ~= constants.OKAY then
                break
            end
        end
    end
    return global_state, all_alerts
end
-- Return the registry itself (not a copy): the list of loaded Alarm objects.
function get_alarms()
return all_alarms
end
-- Find a loaded alarm by name; returns nothing when no alarm has that name.
function get_alarm(alarm_name)
    for _, candidate in ipairs(all_alarms) do
        local same_name = candidate.name == alarm_name
        if same_name then
            return candidate
        end
    end
end
-- Build an Alarm from one definition and append it to the registry.
function load_alarm(alarm)
    local created = Alarm.new(alarm)
    local slot = #all_alarms + 1
    all_alarms[slot] = created
end

-- Load a list of alarm definitions, in order.
function load_alarms(alarms)
    for _, definition in ipairs(alarms) do
        load_alarm(definition)
    end
end
-- Becomes true once set_start_time has been called.
local started = false

-- Give every loaded alarm its start time (in nanoseconds) and mark the
-- registry as started.
function set_start_time(ns)
    for _, loaded in ipairs(all_alarms) do
        loaded:set_start_time(ns)
    end
    started = true
end

-- Tell whether set_start_time has already been called.
function is_started()
    local flag = started
    return flag
end
return M

View File

@ -0,0 +1,99 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local string = require 'string'
local table = require 'table'
local utils = require 'stacklight.utils'
local consts = require 'stacklight.constants'
local afd = require 'stacklight.afd'
local M = {}
setfenv(1, M)
-- Per-source memory of the last annotated status and alarm text; filled by
-- inject_afd_annotation.
local statuses = {}
-- Message table reused for every annotation. The entries set to nil are
-- placeholders listing the fields that inject_afd_annotation assigns before
-- each injection.
local annotation_msg = {
Type = 'metric',
Fields = {
name = 'annotation',
dimensions = {'source', 'hostname'},
value_fields = {'title', 'tags', 'text'},
title = nil,
tags = nil,
text = nil,
source = nil,
hostname = nil,
}
}
-- Emit an annotation when the status carried by an AFD message differs from
-- the last status seen for the same source.
-- Returns -1 when the message lacks a source, a status or decodable alarms;
-- 0 when there is nothing to annotate; otherwise the result of
-- utils.safe_inject_message.
function inject_afd_annotation(msg)
    local source = afd.read_source(msg)
    local status = afd.read_status(msg)
    local hostname = afd.read_hostname(msg)
    local alarms = afd.extract_alarms(msg)

    if not source or not status or not alarms then
        return -1
    end

    if not statuses[source] then
        statuses[source] = {}
    end
    local previous = statuses[source]

    local text = table.concat(afd.alarms_for_human(alarms), '<br />')

    -- build the title
    -- (declared local: the module environment is the module table, so a bare
    -- assignment would export 'title' as a module member)
    local title
    if not previous.status and status == consts.OKAY then
        -- don't send an annotation when we detect a new cluster which is OKAY
        return 0
    elseif not previous.status then
        title = string.format('General status is %s',
                              consts.status_label(status))
    elseif previous.status ~= status then
        title = string.format('General status %s -> %s',
                              consts.status_label(previous.status),
                              consts.status_label(status))
    -- TODO(pasquier-s): generate an annotation when the set of alarms has
    -- changed. the following code generated an annotation whenever at least
    -- one value associated to an alarm was changing. This led to way too
    -- many annotations with alarms monitoring the CPU usage for instance.
    -- elseif previous.text ~= text then
    --     title = string.format('General status remains %s',
    --                           consts.status_label(status))
    else
        -- nothing has changed since the last message
        return 0
    end

    annotation_msg.Fields.title = title
    annotation_msg.Fields.tags = source
    annotation_msg.Fields.text = text
    annotation_msg.Fields.source = source
    annotation_msg.Fields.hostname = hostname

    -- store the last status and alarm text for future messages
    previous.status = status
    previous.text = text

    return utils.safe_inject_message(annotation_msg)
end
return M

View File

@ -0,0 +1,279 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local circular_buffer = require 'circular_buffer'
local stats = require 'lsb.stats'
local setmetatable = setmetatable
local ipairs = ipairs
local pairs = pairs
local math = require 'math'
local string = string
local table = table
local assert = assert
local type = type
-- StackLight libs
local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local afd = require 'stacklight.afd'
local matching = require 'stacklight.value_matching'
-- Defaults used by Rule.new when a rule gives no positive window / periods.
local MIN_WINDOW = 10
local MIN_PERIOD = 1
-- Row duration passed to circular_buffer.new; also the step used by
-- Rule:evaluate when it walks through a buffer.
local SECONDS_PER_ROW = 5
local Rule = {}
Rule.__index = Rule
setfenv(1, Rule) -- Remove external access to contain everything in the module
-- Build a Rule object from a rule definition table.
-- window and periods fall back to MIN_WINDOW / MIN_PERIOD unless a positive
-- value is given; the threshold is coerced to a number; 'fields' and
-- 'group_by' default to empty tables.
function Rule.new(rule)
    local r = setmetatable({}, Rule)

    r.window = MIN_WINDOW
    if rule.window and rule.window + 0 > 0 then
        r.window = rule.window + 0
    end

    r.periods = MIN_PERIOD
    if rule.periods and rule.periods + 0 > 0 then
        r.periods = rule.periods + 0
    end

    r.relational_operator = rule.relational_operator
    r.metric = rule.metric
    r.fields = rule.fields or {}

    -- one matcher per field filter
    r.field_matchers = {}
    for name, expression in pairs(r.fields) do
        r.field_matchers[name] = matching.new(expression)
    end

    r.fct = rule['function']
    r.threshold = rule.threshold + 0
    r.value_index = rule.value or nil -- Can be nil

    -- unique rule id: metric/function/window/periods followed by the field
    -- filters in key order
    local id_parts = {r.metric, r.fct, r.window, r.periods}
    for name, expression in table_utils.orderedPairs(r.fields or {}) do
        id_parts[#id_parts+1] = string.format('(%s=%s)', name, expression)
    end
    r.rule_id = table.concat(id_parts, '/')

    r.group_by = rule.group_by or {}

    local span = r.window * r.periods
    r.cbuf_size = math.ceil(span / SECONDS_PER_ROW)
    r.ids_datastore = {}
    r.datastore = {}
    r.observation_window = math.ceil(span)

    return r
end
-- Compute the datastore key for a datapoint: the rule id alone when the rule
-- has no group_by (or no fields were given), otherwise the rule id followed
-- by the values of the group_by fields, joined with '/'.
function Rule:get_datastore_id(fields)
    if #self.group_by == 0 or fields == nil then
        return self.rule_id
    end

    local parts = {self.rule_id}
    for _, group_name in ipairs(self.group_by) do
        parts[#parts + 1] = fields[group_name]
    end
    return table.concat(parts, '/')
end
-- Tell whether a datapoint with the given fields is relevant for the rule.
-- Returns true when the rule has no field filter, or when at least one
-- filtered field is present and every filtered field that is present
-- satisfies its matcher; false as soon as a present field fails its matcher.
function Rule:fields_accepted(fields)
    if not fields then
        fields = {}
    end
    local matched_fields = 0
    local no_match_on_fields = true
    for name, expression in pairs(self.field_matchers) do
        no_match_on_fields = false
        -- direct lookup instead of scanning every field of the datapoint
        local candidate = fields[name]
        if candidate ~= nil then
            if not expression:matches(candidate) then
                return false
            end
            matched_fields = matched_fields + 1
        end
    end
    return no_match_on_fields or matched_fields > 0
end
-- Create the circular buffer backing one datastore of the rule: cbuf_size
-- rows, one column, SECONDS_PER_ROW per row. The column header carries the
-- metric name and 'min' / 'max' for those functions, 'sum' for all others.
function Rule:get_circular_buffer()
    local aggregation = 'sum'
    if self.fct == 'min' or self.fct == 'max' then
        aggregation = self.fct
    end

    local buffer = circular_buffer.new(self.cbuf_size, 1, SECONDS_PER_ROW)
    buffer:set_header(1, self.metric, aggregation, aggregation)
    return buffer
end
-- Store a datapoint in the circular buffer of its datastore, creating the
-- datastore on first use. Datapoints whose fields are not accepted by the
-- rule, or whose value is nil, are ignored.
-- 'value' can be a table, in which case the entry referenced by
-- self.value_index is used.
function Rule:add_value(ts, value, fields)
    if not self:fields_accepted(fields) then
        return
    end

    if type(value) == 'table' then
        value = value[self.value_index]
    end
    if value == nil then
        return
    end

    local store_id = self:get_datastore_id(fields)
    local store = self.datastore[store_id]
    if not store then
        -- a grouped rule remembers the fields of the datapoint, an ungrouped
        -- one remembers its own field filters
        local store_fields = self.fields
        if #self.group_by > 0 then
            store_fields = fields
        end
        store = {
            fields = store_fields,
            cbuf = self:get_circular_buffer()
        }
        self.datastore[store_id] = store
        self:add_datastore(store_id)
    end

    if self.fct == 'avg' then
        store.cbuf:add(ts, 1, value)
    else
        store.cbuf:set(ts, 1, value)
    end
end
-- Remember a datastore id in the ordered list of known ids, once.
function Rule:add_datastore(id)
    local known = self.ids_datastore
    if table_utils.item_find(id, known) then
        return
    end
    known[#known+1] = id
end
-- Compare a computed value with the threshold of the rule using the rule's
-- relational operator; delegates to constants.compare_threshold.
function Rule:compare_threshold(value)
return constants.compare_threshold(value, self.relational_operator, self.threshold)
end
-- True for any value that is neither nil nor NaN (NaN is the only value
-- that differs from itself).
local function isnumber(value)
    if value == nil then
        return false
    end
    return value == value
end
-- Function names a rule may use; Rule:evaluate asserts that self.fct is one
-- of them before computing a result.
local available_functions = {last=true, avg=true, max=true, min=true, sum=true,
variance=true, sd=true, diff=true}
-- Evaluate the rule against the datapoints stored in each of its datastores.
--
-- Returns two values: an outcome (one of the afd.* constants) and a list of
-- contexts, each context being {value=<number>, fields=<fields table>}:
-- - afd.MATCH: at least one datastore satisfies the threshold. The list has
--   one entry per matching datastore (value = computed result) plus one entry
--   with value = -1 per datastore found without data during the same pass.
-- - afd.MISSING_DATA: nothing matched and at least one datastore has no
--   datapoint left in the observation window; one entry with value = -1 per
--   such datastore.
-- - afd.NO_MATCH: data was available but nothing matched; the list is empty.
-- - afd.NO_DATA: no datastore produced any result (for instance no datapoint
--   was ever received); a single entry {value=-1, fields=self.fields}.
--
-- There is a drawback with the missing-data state: it can lead to a false
-- positive. For example when the monitored thing has been renamed/deleted
-- (a filesystem for instance), it is normal not to receive datapoints anymore.
function Rule:evaluate(ns)
local fields = {}
local one_match, one_no_match, one_missing_data = false, false, false
for _, id in ipairs(self.ids_datastore) do
local data = self.datastore[id]
if data then
local cbuf_time = data.cbuf:current_time()
-- if we didn't receive datapoint within the observation window this means
-- we don't receive anymore data and cannot compute the rule.
if ns - cbuf_time > self.observation_window * 1e9 then
one_missing_data = true
fields[#fields+1] = {value = -1, fields = data.fields}
else
assert(available_functions[self.fct])
local result
if self.fct == 'last' then
-- walk backwards from 'ns', one row at a time, until a usable value is
-- found or the observation window is exhausted
local last
local t = ns
while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do
last = data.cbuf:get(t, 1)
t = t - SECONDS_PER_ROW * 1e9
end
if isnumber(last) then
result = last
else
one_missing_data = true
fields[#fields+1] = {value = -1, fields = data.fields}
end
elseif self.fct == 'diff' then
-- 'last' is the most recent usable value of the window (searched
-- backwards), 'first' the oldest one (searched forwards)
local first, last
local t = ns
while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do
last = data.cbuf:get(t, 1)
t = t - SECONDS_PER_ROW * 1e9
end
if isnumber(last) then
t = ns - self.observation_window * 1e9
while (not isnumber(first)) and t <= ns do
first = data.cbuf:get(t, 1)
t = t + SECONDS_PER_ROW * 1e9
end
end
if not isnumber(last) or not isnumber(first) then
one_missing_data = true
fields[#fields+1] = {value = -1, fields = data.fields}
else
result = last - first
end
else
-- every other function is looked up by name in lsb.stats and applied
-- to the values returned by get_range for column 1
local values = data.cbuf:get_range(1)
result = stats[self.fct](values)
end
-- 'result' stays nil when the datastore was flagged as missing data above
if result then
local m = self:compare_threshold(result)
if m then
one_match = true
fields[#fields+1] = {value=result, fields=data.fields}
else
one_no_match = true
end
end
end
end
end
-- precedence of the outcomes: match, then missing data, then no match
if one_match then
return afd.MATCH, fields
elseif one_missing_data then
return afd.MISSING_DATA, fields
elseif one_no_match then
return afd.NO_MATCH, {}
else
return afd.NO_DATA, {{value=-1, fields=self.fields}}
end
end
return Rule

View File

@ -0,0 +1,78 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Status values, exported on the module table. The numbers were chosen to
-- match with the Grafana constraints:
-- OKAY => green
-- WARN & UNKW => orange
-- CRIT & DOWN => red
OKAY, WARN, UNKW, CRIT, DOWN = 0, 1, 2, 3, 4

-- Display label of each status value.
local STATUS_LABELS = {}
STATUS_LABELS[OKAY] = 'OKAY'
STATUS_LABELS[WARN] = 'WARN'
STATUS_LABELS[UNKW] = 'UNKNOWN'
STATUS_LABELS[CRIT] = 'CRITICAL'
STATUS_LABELS[DOWN] = 'DOWN'

-- Return the label of a status value, nil when the value is not a status.
function status_label(v)
    local label = STATUS_LABELS[v]
    return label
end
-- Ranking used to combine two statuses; UNKW ranks below OKAY here.
local STATUS_WEIGHTS = {
    [UNKW]=0,
    [OKAY]=1,
    [WARN]=2,
    [CRIT]=3,
    [DOWN]=4
}

-- Return whichever status ranks higher in STATUS_WEIGHTS. A missing (nil or
-- false) argument yields the other one; on equal ranks val2 is returned.
function max_status(val1, val2)
    if not val1 then
        return val2
    end
    if not val2 then
        return val1
    end
    if STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
        return val1
    end
    return val2
end
-- Comparison functions keyed by both spellings of each relational operator.
local THRESHOLD_COMPARATORS = {}
do
    local function eq(a, b) return a == b end
    local function ne(a, b) return a ~= b end
    local function gte(a, b) return a >= b end
    local function gt(a, b) return a > b end
    local function lte(a, b) return a <= b end
    local function lt(a, b) return a < b end
    THRESHOLD_COMPARATORS['=='], THRESHOLD_COMPARATORS['eq'] = eq, eq
    THRESHOLD_COMPARATORS['!='], THRESHOLD_COMPARATORS['ne'] = ne, ne
    THRESHOLD_COMPARATORS['>='], THRESHOLD_COMPARATORS['gte'] = gte, gte
    THRESHOLD_COMPARATORS['>'], THRESHOLD_COMPARATORS['gt'] = gt, gt
    THRESHOLD_COMPARATORS['<='], THRESHOLD_COMPARATORS['lte'] = lte, lte
    THRESHOLD_COMPARATORS['<'], THRESHOLD_COMPARATORS['lt'] = lt, lt
end

-- Apply the relational operator 'op' to (value, threshold) and return the
-- boolean result; an unknown operator gives false.
function compare_threshold(value, op, threshold)
    local compare = THRESHOLD_COMPARATORS[op]
    if compare == nil then
        return false
    end
    return compare(value, threshold)
end
return M

View File

@ -0,0 +1,34 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local l = require 'lpeg'
l.locale(l)
local tonumber = tonumber
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Build a grammar that matches 'patt' at the current position or, failing
-- that, skips one character and tries again.
function anywhere (patt)
    local grammar = {
        patt + 1 * l.V(1)
    }
    return l.P(grammar)
end
-- Exported alias of lpeg's locale 'space' class.
sp = l.space
-- Pattern used to match a number: an optional '-', one or more xdigit
-- characters and an optional '.' or ',' followed by more xdigit characters;
-- the matched text is converted with tonumber.
-- NOTE(review): l.xdigit also accepts a-f/A-F and ',' is accepted as a
-- separator, whereas tonumber is called without a base -- confirm that such
-- input is intended to match here.
Number = l.P"-"^-1 * l.xdigit^1 * (l.S(".,") * l.xdigit^1 )^-1 / tonumber
return M

View File

@ -0,0 +1,83 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local table = require 'table'
local ipairs = ipairs
local pairs = pairs
local type = type
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Return the index of the first element of 'list' equal to 'item'.
-- Returns nothing when the item is not found or 'list' is not a table.
function item_pos(item, list)
    if type(list) ~= 'table' then
        return
    end
    for index, candidate in ipairs(list) do
        if candidate == item then
            return index
        end
    end
end
-- return true if an item is present in the list, false otherwise
-- (also false when 'list' is not a table, since item_pos finds nothing then)
function item_find(item, list)
return item_pos(item, list) ~= nil
end
-- from http://lua-users.org/wiki/SortedIteration
-- Return the keys of 't' as a list sorted with table.sort's default order.
function __genOrderedIndex( t )
    local keys = {}
    for k in pairs(t) do
        keys[#keys + 1] = k
    end
    table.sort(keys)
    return keys
end
-- Equivalent of the next function, but returns the keys in sorted order.
-- A temporary ordered key list is stored in the table being iterated (as
-- t.__orderedIndex) and removed once the iteration is over.
-- 'state' is the previously returned key, nil to start the iteration.
-- Returns the next key and its value, or nothing after the last key.
function orderedNext(t, state)
    -- local: the module environment is the module table, so a bare
    -- assignment would export 'key' as a module member
    local key
    if state == nil then
        -- the first time, generate the index
        t.__orderedIndex = __genOrderedIndex( t )
        key = t.__orderedIndex[1]
    else
        -- fetch the next value
        local index = t.__orderedIndex
        for i = 1, #index do
            if index[i] == state then
                key = index[i+1]
                -- keys are unique, no need to look any further
                break
            end
        end
    end

    if key then
        return key, t[key]
    end

    -- no more value to return, cleanup
    t.__orderedIndex = nil
    return
end
-- Return the (iterator, table, initial state) triple expected by a generic
-- for, with orderedNext as the iterator.
function orderedPairs(t)
-- Equivalent of the pairs() function on tables. Allows to iterate
-- in order
return orderedNext, t, nil
end
return M

View File

@ -0,0 +1,46 @@
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local cjson = require 'cjson'
local inject_message = inject_message
local read_message = read_message
local string = string
local pcall = pcall
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- JSON-encode a Lua value without letting an encoding error propagate (for
-- instance when the encoded buffer exceeds the sandbox limit).
-- Returns the encoded data, or nothing when the encoding failed.
function safe_json_encode(v)
    local ok, encoded = pcall(cjson.encode, v)
    if ok then
        return encoded
    end
end
-- Call inject_message() under pcall().
-- Returns 0 on success, or -1 and the error raised by inject_message.
function safe_inject_message(msg)
    local ok, err_msg = pcall(inject_message, msg)
    if ok then
        return 0
    end
    return -1, err_msg
end
return M

View File

@ -0,0 +1,171 @@
-- Copyright 2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local l = require "lpeg"
l.locale(l)
local pcall = pcall
local string = require 'string'
local patterns = require 'stacklight.patterns'
local error = error
local setmetatable = setmetatable
local tonumber = tonumber
local C = l.C
local P = l.P
local S = l.S
local V = l.V
local Ct = l.Ct
local Cc = l.Cc
-- Zero or more repetitions of patterns.sp (presumably a single whitespace
-- character; see stacklight.patterns).
local Optional_space = patterns.sp^0
-- One or more patterns.sp followed by the end of the subject (-1 only
-- succeeds when no character is left), i.e. a subject made only of spaces.
local Only_spaces = patterns.sp^1 * -1
-- Wrap a pattern so that it tolerates optional spaces on both sides.
local function space(pat)
    local left_padded = Optional_space * pat
    return left_padded * Optional_space
end
-- Literal tokens recognized in match expressions.
-- Comparison operators:
local EQ = P'=='
local NEQ = P'!='
local GT = P'>'
local LT = P'<'
local GTE = P'>='
local LTE = P'<='
-- Pattern (string.find based) operators:
local MATCH = P'=~'
local NO_MATCH = P'!~'
-- Logical connectors:
local OR = P'||'
local AND = P'&&'
-- Normalize a parsed operator: an empty match (no operator written in the
-- expression) stands for '=='; anything else is returned as is.
local function get_operator(op)
    if op ~= '' then
        return op
    end
    return '=='
end
-- Optional numeric comparison operator. The matched text (possibly empty) is
-- passed to get_operator, so a missing operator is captured as '=='.
-- LTE/GTE are listed before GT/LT because '+' is an ordered choice: the
-- two-character operators must be tried first.
local numerical_operator = (EQ + NEQ + LTE + GTE + GT + LT )^-1 / get_operator
-- One numeric comparison: [operator] number, with optional spaces around.
local sub_numerical_expression = space(numerical_operator) * patterns.Number * Optional_space
-- Succeeds only when the whole subject (-1 = end of subject) is a chain of
-- numeric comparisons joined by one or more '||' or '&&' tokens.
local is_plain_numeric = (sub_numerical_expression * ((OR^1 + AND^1) * sub_numerical_expression)^0) * -1
-- A string operand: either the non-empty content of a double-quoted string
-- (quotes excluded from the capture) or a run of characters that are not
-- patterns.sp.
local quoted_string = (P'"' * C((P(1) - (P'"'))^1) * P'"' + C((P(1) - patterns.sp)^1))
-- Optional string operator, defaulting to '==' through get_operator.
local string_operator = (EQ + NEQ + MATCH + NO_MATCH)^-1 / get_operator
-- One string comparison: [operator] operand, with optional spaces around.
local sub_string_expression = space(string_operator) * quoted_string * Optional_space
-- String counterpart of is_plain_numeric.
local is_plain_string = (sub_string_expression * ((OR^1 + AND^1) * sub_string_expression)^0) * -1
-- Grammar producing the evaluation tree of a numeric expression. 'OR' is the
-- initial rule and '&&' binds tighter than '||'. Every rule wraps its captures
-- in a table (Ct):
--   SUB -> { operator, operand }
--   AND -> { 'and', SUB, AND } or { SUB }
--   OR  -> { 'or', AND, OR }   or { AND }
-- The trailing -1 requires the whole subject to be consumed.
local numerical_expression = P {
'OR';
AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
SUB = Ct(sub_numerical_expression)
} * -1
-- Same grammar as numerical_expression, built on string comparisons.
local string_expression = P {
'OR';
AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
SUB = Ct(sub_string_expression)
} * -1
-- Used by MatchExpression.new to tell operator-based expressions apart from
-- plain values. NOTE(review): assumes patterns.anywhere() succeeds when its
-- argument matches at any position of the subject -- confirm in
-- stacklight.patterns.
local is_complex = patterns.anywhere(EQ + NEQ + LTE + GTE + GT + LT + MATCH + NO_MATCH + OR + AND)
-- Evaluate a parsed expression tree against a value.
--
-- A node is one of:
--   { <node> }                    wrapper around a single child
--   { 'and' | 'or', <node>, ... } logical combination of its children
--   { <operator>, <operand> }     leaf comparison
-- Returns true or false; a leaf with an unknown operator evaluates to false.
local function eval_tree(tree, value)
    local head = tree[1]

    -- Wrapper node: only the first child is relevant.
    if type(head) == 'table' then
        return eval_tree(head, value)
    end

    -- Logical node: every child is evaluated, then folded with the operator.
    if head == 'and' or head == 'or' then
        local result = eval_tree(tree[2], value)
        for i = 3, #tree do
            local branch = eval_tree(tree[i], value)
            if head == 'or' then
                result = result or branch
            else
                result = result and branch
            end
        end
        return result
    end

    -- Leaf node.
    local operand = tree[2]
    if head == '==' then
        return value == operand
    end
    if head == '!=' then
        return value ~= operand
    end
    if head == '>' then
        return value > operand
    end
    if head == '<' then
        return value < operand
    end
    if head == '>=' then
        return value >= operand
    end
    if head == '<=' then
        return value <= operand
    end
    if head == '=~' or head == '!~' then
        -- string.find is protected: an invalid pattern (or an unusable
        -- value) makes both operators evaluate to false.
        local ok, position = pcall(string.find, value, operand)
        if not ok then
            return false
        end
        if head == '=~' then
            return position ~= nil
        end
        return position == nil
    end

    return false
end
-- Class table: instances created by MatchExpression.new use it as their
-- metatable, and __index makes them fall back to it for method lookup.
local MatchExpression = {}
MatchExpression.__index = MatchExpression
-- From here on the chunk's environment is the class table, so the functions
-- defined below can only reach the locals captured above.
setfenv(1, MatchExpression) -- Remove external access to contain everything in the module
-- Build a matcher from an expression.
--
-- An expression without any operator is kept verbatim and later compared for
-- equality. Otherwise it is parsed either as a numeric expression or as a
-- string expression and the resulting tree is stored in the instance.
--
-- Raises 'Expression is empty' for an empty or blank expression and
-- 'Invalid expression: <expression>' when an operator-based expression
-- cannot be parsed.
function MatchExpression.new(expression)
    local r = {}
    setmetatable(r, MatchExpression)

    if not is_complex:match(expression) then
        -- No operator at all: plain equality matching.
        if expression == '' or Only_spaces:match(expression) then
            error('Expression is empty')
        end
        r.is_simple_equality_matching = true
        r.expression = expression
        return r
    end

    local numeric = is_plain_numeric:match(expression) ~= nil
    r.is_plain_numeric_exp = numeric
    if numeric then
        r.tree = numerical_expression:match(expression)
    elseif is_plain_string:match(expression) ~= nil then
        r.tree = string_expression:match(expression)
    end
    if r.tree == nil then
        error('Invalid expression: ' .. expression)
    end

    r.expression = expression
    return r
end
-- Tell whether a value satisfies the expression.
--
-- Simple expressions match when the value equals the expression, either
-- directly or after converting one side with tonumber(). For numeric
-- expressions the value is converted with tonumber() first and a value that
-- is not a number never matches.
function MatchExpression:matches(value)
    if self.is_simple_equality_matching then
        local expected = self.expression
        return expected == value or
            tonumber(expected) == value or
            tonumber(value) == expected
    end

    local candidate = value
    if self.is_plain_numeric_exp then
        candidate = tonumber(value)
        if candidate == nil then
            return false
        end
    end
    return eval_tree(self.tree, candidate)
end
return MatchExpression

View File

@ -0,0 +1,71 @@
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Default CPU alarms for nodes. Each alarm fires when at least one of its
-- rules is satisfied ('or'): low average idle percentage or high average
-- iowait percentage across all CPUs.
local alarms = {
    {
        name = 'cpu-critical',
        description = 'The CPU usage is too high',
        severity = 'critical',
        trigger = {
            logical_operator = 'or',
            rules = {
                {
                    metric = 'intel.procfs.cpu.idle_percentage',
                    fields = { cpuID = 'all' },
                    relational_operator = '<=',
                    threshold = '5',
                    window = '120',
                    periods = '0',
                    -- 'function' is a reserved word, hence the bracket form
                    ['function'] = 'avg',
                },
                {
                    metric = 'intel.procfs.cpu.iowait_percentage',
                    fields = { cpuID = 'all' },
                    relational_operator = '>=',
                    threshold = '35',
                    window = '120',
                    periods = '0',
                    ['function'] = 'avg',
                },
            },
        },
    },
    {
        name = 'cpu-warning',
        description = 'The CPU usage is high',
        severity = 'warning',
        trigger = {
            logical_operator = 'or',
            rules = {
                {
                    metric = 'intel.procfs.cpu.idle_percentage',
                    fields = { cpuID = 'all' },
                    relational_operator = '<=',
                    threshold = '15',
                    window = '120',
                    periods = '0',
                    ['function'] = 'avg',
                },
                {
                    metric = 'intel.procfs.cpu.iowait_percentage',
                    fields = { cpuID = 'all' },
                    relational_operator = '>=',
                    threshold = '25',
                    window = '120',
                    periods = '0',
                    ['function'] = 'avg',
                },
            },
        },
    },
}
return alarms

View File

@ -0,0 +1,9 @@
-- Hindsight analysis plugin configuration (Jinja2 template) running afd.lua
-- with the default node CPU alarms.
filename = "afd.lua"
log_level = 7
message_matcher = "TRUE"
ticker_interval = 10
-- afd.lua accepts 'node' or 'service' here and raises an error otherwise.
afd_type = "node"
-- NOTE(review): presumably the name of the alarm definition module shipped
-- as afd_node_default_cpu_alarms.lua -- confirm how afd.lua loads it.
afd_file = "afd_node_default_cpu_alarms"
afd_cluster_name = "default"
afd_logical_name = "cpu"
-- Substituted from CCP_HINDSIGHT_NODE_NAME when the template is rendered.
hostname = "{{ CCP_HINDSIGHT_NODE_NAME }}"

View File

@ -15,6 +15,7 @@ service:
- prune-input.cfg
- influxdb-tcp.cfg
- kubelet-stats.cfg
- afd-node-default-cpu-alarms.cfg
volumes:
- name: hindsight-output
type: empty-dir
@ -70,6 +71,10 @@ files:
path: /var/lib/hindsight/run/input/kubelet_stats.cfg
content: hindsight_kubelet_stats.cfg.j2
perm: "0600"
afd-node-default-cpu-alarms.cfg:
path: /var/lib/hindsight/run/analysis/afd_node_default_cpu_alarms.cfg
content: hindsight_afd_node_default_cpu_alarms.cfg.j2
perm: "0600"
snap.conf:
path: /etc/snap/snap.conf
content: snap.conf.j2