Merge "Add Lua code for alarming"
This commit is contained in:
commit
710e0c01bb
|
@ -19,7 +19,9 @@ RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 1FA22B08 \
|
|||
|
||||
ADD output/*.lua /var/lib/hindsight/run/output/
|
||||
ADD input/*.lua /var/lib/hindsight/run/input/
|
||||
ADD analysis/*.lua /var/lib/hindsight/run/analysis/
|
||||
ADD modules/*.lua /opt/ccp/lua/modules/stacklight/
|
||||
ADD modules_alarms/afd_node_default_cpu_alarms.lua /opt/ccp/lua/modules/stacklight_alarms/
|
||||
|
||||
RUN useradd --user-group hindsight \
|
||||
&& usermod -a -G microservices hindsight \
|
||||
|
|
|
@ -0,0 +1,120 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local string = require 'string'
|
||||
|
||||
local message = require 'stacklight.message'
|
||||
local afd = require 'stacklight.afd'
|
||||
local afd_annotation = require 'stacklight.afd_annotation'
|
||||
|
||||
-- node or service
|
||||
local afd_type = read_config('afd_type') or error('afd_type must be specified!')

-- Per-type settings: { message type, status field name, entity field name }.
local afd_profiles = {
    node = {'afd_node_metric', 'node_status', 'node_role'},
    service = {'afd_service_metric', 'service_status', 'service'},
}

local afd_profile = afd_profiles[afd_type]
if not afd_profile then
    error('invalid afd_type value')
end

local msg_type = afd_profile[1]
local msg_field_name = afd_profile[2]
local afd_entity = afd_profile[3]

-- ie: controller for node AFD / rabbitmq for service AFD
local afd_entity_value = read_config('afd_cluster_name') or
    error('afd_cluster_name must be specified!')

-- ie: cpu for node AFD / queue for service AFD
local msg_field_source = read_config('afd_logical_name') or
    error('afd_logical_name must be specified!')

local hostname = read_config('hostname') or error('hostname must be specified')

-- The alarm definitions live in a Lua module named after the 'afd_file'
-- setting; they are loaded once, when the plugin starts.
local afd_file = read_config('afd_file') or error('afd_file must be specified')
local all_alarms = require('stacklight_alarms.' .. afd_file)
local A = require 'stacklight.afd_alarms'
A.load_alarms(all_alarms)
|
||||
|
||||
-- Feed the datapoint carried by the current message to the loaded alarms.
-- Returns 0 on success, or -1 and an error message when the value or one of
-- the fields required by the alarm rules cannot be read from the message.
function process_message()
    local metric_name = read_message('Fields[name]')
    local ts = read_message('Timestamp')

    local value, err_msg = message.read_values()
    if not value then
        return -1, err_msg
    end

    -- collect every field the rules need for this metric
    local wanted = A.get_metric_fields(metric_name)
    local fields = {}
    for _, name in ipairs(wanted) do
        local found = read_message(string.format('Fields[%s]', name))
        if not found then
            return -1, "Cannot find Fields[" .. name .. "] for the metric " .. metric_name
        end
        fields[name] = found
    end

    A.add_value(ts, metric_name, value, fields)
    return 0
end
|
||||
|
||||
-- Periodic callback: the first call only records the start time of the
-- alarms; the following calls evaluate them and, when at least one alarm was
-- due for evaluation, inject the resulting AFD metric and annotation.
function timer_event(ns)
    if not A.is_started() then
        A.set_start_time(ns)
        return
    end

    local state, alarms = A.evaluate(ns)
    if not state then
        -- it was not time to evaluate any alarm
        return
    end

    for _, alarm in ipairs(alarms) do
        local alert = alarm.alert
        afd.add_to_alarms(
            alarm.state,
            alert['function'],
            alert.metric,
            alert.fields,
            {}, -- tags
            alert.operator,
            alert.value,
            alert.threshold,
            alert.window,
            alert.periods,
            alert.message)
    end

    -- Message example:
    --  msg = {
    --      Type = 'afd_node_metric',
    --      Payload = '{"alarms":[...]}',
    --      Fields = {
    --          name = 'node_status',
    --          value = 0,
    --          hostname = 'node1',
    --          source = 'cpu',
    --          node_role = 'controller',
    --          dimensions = {'node_role', 'hostname', 'source'},
    --      }
    --  }
    local msg = afd.inject_afd_metric(
        msg_type, afd_entity, afd_entity_value, msg_field_name,
        state, hostname, msg_field_source)

    if msg then
        afd_annotation.inject_afd_annotation(msg)
    end
end
|
|
@ -0,0 +1,181 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local cjson = require 'cjson'
|
||||
local string = require 'string'
|
||||
local table = require 'table'
|
||||
|
||||
local utils = require 'stacklight.utils'
|
||||
local constants = require 'stacklight.constants'
|
||||
|
||||
local read_message = read_message
|
||||
local assert = assert
|
||||
local ipairs = ipairs
|
||||
local pcall = pcall
|
||||
|
||||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
-- Return the entry called `name` from the Fields table of `msg`.
local function read_field(msg, name)
    local fields = msg.Fields
    return fields[name]
end
|
||||
|
||||
-- Return the 'value' field of an AFD message table.
function read_status(msg)
    return msg.Fields['value']
end
|
||||
|
||||
-- Return the 'source' field of an AFD message table.
function read_source(msg)
    return msg.Fields['source']
end
|
||||
|
||||
-- Return the 'hostname' field of an AFD message table.
function read_hostname(msg)
    return msg.Fields['hostname']
end
|
||||
|
||||
-- Decode the JSON payload of `msg` and return its 'alarms' entry.
-- Returns nil when the payload cannot be decoded or has no 'alarms' entry.
function extract_alarms(msg)
    local decoded, payload = pcall(cjson.decode, msg.Payload)
    if decoded and payload.alarms then
        return payload.alarms
    end
    return nil
end
|
||||
|
||||
-- return a human-readable message from an alarm table
|
||||
-- for instance: "CPU load too high (WARNING, rule='last(load_midterm)>=5', current=7)"
|
||||
-- Build the one-line, human-readable description of an alarm table.
-- The metric is rendered as metric[name="value",...] when the alarm carries
-- fields, and ', host=<hostname>' is appended when alarm.hostname is set.
function get_alarm_for_human(alarm)
    local metric = alarm.metric
    if #(alarm.fields) > 0 then
        local pairs_text = {}
        for i, field in ipairs(alarm.fields) do
            pairs_text[i] = field.name .. '="' .. field.value .. '"'
        end
        metric = string.format('%s[%s]', alarm.metric, table.concat(pairs_text, ','))
    end

    local host = ''
    if alarm.hostname then
        host = string.format(', host=%s', alarm.hostname)
    end

    local template = "%s (%s, rule='%s(%s)%s%s', current=%.2f%s)"
    return string.format(
        template,
        alarm.message,
        alarm.severity,
        alarm['function'],
        metric,
        alarm.operator,
        alarm.threshold,
        alarm.value,
        host
    )
end
|
||||
|
||||
-- Convert a list of alarm tables into a list of human-readable strings.
-- Alarms tagged with dependency_level == 'hint' are moved to the end of the
-- list, after an "Other related alarms:" separator line.
function alarms_for_human(alarms)
    local primary = {}
    local hints = {}

    for _, alarm in ipairs(alarms) do
        local tags = alarm.tags
        local bucket = primary
        if tags and tags.dependency_level and tags.dependency_level == 'hint' then
            bucket = hints
        end
        bucket[#bucket+1] = get_alarm_for_human(alarm)
    end

    if #hints > 0 then
        primary[#primary+1] = "Other related alarms:"
        for _, text in ipairs(hints) do
            primary[#primary+1] = text
        end
    end

    return primary
end
|
||||
|
||||
local alarms = {}
|
||||
|
||||
-- append an alarm to the list of pending alarms
|
||||
-- the list is sent when inject_afd_metric is called
|
||||
-- Queue one alarm description in the module-level list of pending alarms.
-- The severity label is derived from `status`; an unknown status fails the
-- assertion. Missing fields/tags default to empty tables, missing
-- window/periods default to 0.
function add_to_alarms(status, fn, metric, fields, tags, operator, value, threshold, window, periods, message)
    local severity = constants.status_label(status)
    assert(severity)

    local entry = {
        severity = severity,
        ['function'] = fn,
        metric = metric,
        fields = fields or {},
        tags = tags or {},
        operator = operator,
        value = value,
        threshold = threshold,
        window = window or 0,
        periods = periods or 0,
        message = message,
    }
    alarms[#alarms+1] = entry
end
|
||||
|
||||
-- Return the module-level list of pending alarms (the list itself, not a copy).
function get_alarms()
    local pending = alarms
    return pending
end
|
||||
|
||||
-- Drop all pending alarms by replacing the module-level list with a new,
-- empty table.
function reset_alarms()
    alarms = {}
end
|
||||
|
||||
-- inject an AFD event into the Heka pipeline
|
||||
-- Build an AFD metric message from the pending alarms and inject it.
-- The pending alarms are serialized into the payload and then cleared.
-- Returns the injected message table on success, nothing when the alarms
-- cannot be JSON-encoded, or nil plus an error message when the injection
-- fails.
function inject_afd_metric(msg_type, msg_tag_name, msg_tag_value, metric_name,
                           value, hostname, source)
    -- because cjson encodes empty tables as objects instead of arrays
    local payload = '{"alarms":[]}'

    if #alarms > 0 then
        payload = utils.safe_json_encode({alarms=alarms})
        reset_alarms()
        if not payload then
            return
        end
    end

    local fields = {
        name = metric_name,
        value = value,
        hostname = hostname,
        source = source,
        dimensions = {msg_tag_name, 'hostname', 'source'},
    }
    -- set last, so the tag wins if its name collides with a fixed field
    fields[msg_tag_name] = msg_tag_value

    local msg = {
        Type = msg_type,
        Payload = payload,
        Fields = fields,
    }

    local err_code, err_msg = utils.safe_inject_message(msg)
    if err_code == 0 then
        return msg
    end
    return nil, err_msg
end
|
||||
|
||||
-- Result codes exported by the module; afd_alarm compares the first value
-- returned by a rule evaluation against them.
MATCH, NO_MATCH, NO_DATA, MISSING_DATA = 1, 2, 3, 4
|
||||
|
||||
return M
|
|
@ -0,0 +1,224 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local assert = assert
|
||||
local ipairs = ipairs
|
||||
local pairs = pairs
|
||||
local string = string
|
||||
local setmetatable = setmetatable
|
||||
|
||||
local table_utils = require 'stacklight.table_utils'
|
||||
local constants = require 'stacklight.constants'
|
||||
local afd = require 'stacklight.afd'
|
||||
local Rule = require 'stacklight.afd_rule'
|
||||
|
||||
-- Lower-case severity names accepted in alarm definitions (for 'severity'
-- and 'no_data_policy'), mapped to the status constants.
local SEVERITIES = {
    okay = constants.OKAY,
    warning = constants.WARN,
    critical = constants.CRIT,
    down = constants.DOWN,
    unknown = constants.UNKW,
}
|
||||
|
||||
local Alarm = {}
|
||||
Alarm.__index = Alarm
|
||||
|
||||
setfenv(1, Alarm) -- Remove external access to contain everything in the module
|
||||
|
||||
-- Create an Alarm object from an alarm definition table.
--
-- Fields read from the definition:
--   name, description
--   severity                  (required, must be a key of SEVERITIES)
--   no_data_policy            (optional: 'skip' or a severity name,
--                              defaults to the UNKW status)
--   trigger.logical_operator  (optional, defaults to 'or')
--   trigger.rules             (list of rule definitions for Rule.new)
--
-- Assertions fail when the severity or the no-data policy is not recognized.
function Alarm.new(alarm)
    local a = {}
    setmetatable(a, Alarm)
    a._metrics_list = nil
    a.name = alarm.name
    a.description = alarm.description
    if alarm.trigger.logical_operator then
        a.logical_operator = string.lower(alarm.trigger.logical_operator)
    else
        a.logical_operator = 'or'
    end
    a.severity_str = string.upper(alarm.severity)
    a.severity = SEVERITIES[string.lower(alarm.severity)]
    assert(a.severity ~= nil)

    a.skip_when_no_data = false
    if alarm.no_data_policy then
        if string.lower(alarm.no_data_policy) == 'skip' then
            a.skip_when_no_data = true
        else
            a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_policy)]
        end
    else
        a.no_data_severity = constants.UNKW
    end
    assert(a.skip_when_no_data or a.no_data_severity ~= nil)

    a.rules = {}
    -- Longest observation window among the rules, in seconds. The maximum is
    -- tracked in seconds and converted to nanoseconds once at the end, so
    -- that every rule is compared using the same unit.
    local longest_wait = 0
    for _, rule in ipairs(alarm.trigger.rules) do
        local r = Rule.new(rule)
        a.rules[#a.rules+1] = r
        local wait = r.window * r.periods
        if wait > longest_wait then
            longest_wait = wait
        end
    end
    -- delay (in nanoseconds) to wait after the start time before evaluating
    a.initial_wait = longest_wait * 1e9
    a.start_time_ns = 0

    return a
end
|
||||
|
||||
-- return the Set of metrics used by the alarm
|
||||
-- Return the list of distinct metric names used by the rules of the alarm.
-- The list is computed on the first call and cached in self._metrics_list.
function Alarm:get_metrics()
    if not self._metrics_list then
        local metrics = {}
        for _, rule in ipairs(self.rules) do
            -- look the metric up in the list being built, so that a metric
            -- shared by several rules is only recorded once
            if not table_utils.item_find(rule.metric, metrics) then
                metrics[#metrics+1] = rule.metric
            end
        end
        self._metrics_list = metrics
    end
    return self._metrics_list
end
|
||||
|
||||
-- return a list of field names used for the metric
|
||||
-- (can have duplicate names)
|
||||
-- Collect, for every rule bound to `metric_name`, the keys of the rule's
-- field filters followed by its group_by names. Duplicates are not removed.
function Alarm:get_metric_fields(metric_name)
    local names = {}
    for _, rule in ipairs(self.rules) do
        if rule.metric == metric_name then
            for field_name in pairs(rule.fields) do
                names[#names+1] = field_name
            end
            for _, group in ipairs(rule.group_by) do
                names[#names+1] = group
            end
        end
    end
    return names
end
|
||||
|
||||
-- Tell whether `metric` is among the metrics returned by get_metrics();
-- the result is whatever table_utils.item_find returns.
function Alarm:has_metric(metric)
    local known_metrics = self:get_metrics()
    return table_utils.item_find(metric, known_metrics)
end
|
||||
|
||||
-- dispatch datapoint in datastores
|
||||
-- Hand the datapoint over to every rule of the alarm bound to `metric`.
function Alarm:add_value(ts, metric, value, fields)
    for _, rule in pairs(self.rules) do
        if rule.metric == metric then
            rule:add_value(ts, value, fields)
        end
    end
end
|
||||
|
||||
-- convert fields to fields map
|
||||
-- {foo="bar"} --> {name="foo", value="bar"}
|
||||
-- Turn a {name=value} map into a list of {name=..., value=...} tables.
-- A nil (or false) argument yields an empty list.
local function convert_field_list(fields)
    local converted = {}
    if fields then
        for field_name, field_value in pairs(fields) do
            converted[#converted+1] = {name=field_name, value=field_value}
        end
    end
    return converted
end
|
||||
|
||||
-- return: state of alarm and a list of alarm details.
|
||||
--
|
||||
-- with alarm list when state != OKAY:
|
||||
-- {
|
||||
-- {
|
||||
-- value = <current value>,
|
||||
-- fields = <metric fields table>,
|
||||
-- message = <string>,
|
||||
-- },
|
||||
-- }
|
||||
-- Evaluate every rule of the alarm at time `ns` and combine the results
-- according to the alarm's logical operator.
--
-- Returns the resulting state and the list of alert details. The state is
-- nil when data is missing and the alarm skips such cases; the list is
-- emptied when the state is nil or OKAY.
function Alarm:evaluate(ns)
    local matches = 0
    local one_unknown = false
    local all_alerts = {}
    -- msg is declared outside the loop, so it keeps its value from the
    -- previous rule whenever the current rule sets nothing
    local msg

    for _, rule in ipairs(self.rules) do
        local eval, context_list = rule:evaluate(ns)
        if eval == afd.MATCH then
            matches = matches + 1
            msg = self.description
        elseif eval == afd.MISSING_DATA then
            msg = 'No datapoint have been received over the last ' .. rule.observation_window .. ' seconds'
            one_unknown = true
        elseif eval == afd.NO_DATA then
            msg = 'No datapoint have been received ever'
            one_unknown = true
        end

        for _, context in ipairs(context_list) do
            local named_fields = convert_field_list(context.fields)
            all_alerts[#all_alerts+1] = {
                severity = self.severity_str,
                ['function'] = rule.fct,
                metric = rule.metric,
                operator = rule.relational_operator,
                threshold = rule.threshold,
                window = rule.window,
                periods = rule.periods,
                value = context.value,
                fields = named_fields,
                message = msg
            }
        end
    end

    -- state to report when at least one rule lacks data
    local no_data_state = nil
    if not self.skip_when_no_data then
        no_data_state = self.no_data_severity
    end

    local state = constants.OKAY
    local operator = self.logical_operator
    if operator == 'and' then
        if one_unknown then
            state = no_data_state
        elseif #self.rules == matches then
            state = self.severity
        end
    elseif operator == 'or' then
        if matches > 0 then
            state = self.severity
        elseif one_unknown then
            state = no_data_state
        end
    end

    if state == nil or state == constants.OKAY then
        all_alerts = {}
    end
    return state, all_alerts
end
|
||||
|
||||
-- Record `ns` as the start time used by is_evaluation_time().
function Alarm:set_start_time(ns)
    self['start_time_ns'] = ns
end
|
||||
|
||||
-- Return true once at least `initial_wait` has elapsed between the recorded
-- start time and `ns`, false otherwise.
function Alarm:is_evaluation_time(ns)
    local elapsed = ns - self.start_time_ns
    return elapsed >= self.initial_wait
end
|
||||
|
||||
return Alarm
|
|
@ -0,0 +1,118 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local pairs = pairs
|
||||
local ipairs = ipairs
|
||||
local table_utils = require 'stacklight.table_utils'
|
||||
local constants = require 'stacklight.constants'
|
||||
local Alarm = require 'stacklight.afd_alarm'
|
||||
|
||||
local all_alarms = {}
|
||||
|
||||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
-- return a list of field names required for the metric
|
||||
-- Return the de-duplicated list of field names that the loaded alarms need
-- for `metric_name`.
function get_metric_fields(metric_name)
    local required = {}
    for _, alarm in pairs(all_alarms) do
        local candidates = alarm:get_metric_fields(metric_name)
        if candidates then
            for _, candidate in pairs(candidates) do
                local already_listed = table_utils.item_find(candidate, required)
                if not already_listed then
                    required[#required+1] = candidate
                end
            end
        end
    end
    return required
end
|
||||
|
||||
-- return list of alarms interested by a metric
|
||||
-- Return the loaded alarms for which has_metric(metric) is true.
function get_interested_alarms(metric)
    local selected = {}
    for _, candidate in pairs(all_alarms) do
        if candidate:has_metric(metric) then
            selected[#selected+1] = candidate
        end
    end
    return selected
end
|
||||
|
||||
-- Forward the datapoint to every alarm returned by get_interested_alarms().
function add_value(ts, metric, value, fields)
    for _, alarm in ipairs(get_interested_alarms(metric)) do
        alarm:add_value(ts, metric, value, fields)
    end
end
|
||||
|
||||
-- Forget every loaded alarm by replacing the module-level list with a new,
-- empty table.
function reset_alarms()
    all_alarms = {}
end
|
||||
|
||||
-- Evaluate the loaded alarms that are due at time `ns`.
--
-- Returns the global state, combined with constants.max_status, and the list
-- of {state=..., alert=...} entries gathered from the evaluated alarms. The
-- global state stays nil when no alarm was due or when every evaluated alarm
-- returned a nil state.
function evaluate(ns)
    local global_state
    local all_alerts = {}
    for _, alarm in pairs(all_alarms) do
        if alarm:is_evaluation_time(ns) then
            local state, alerts = alarm:evaluate(ns)
            global_state = constants.max_status(state, global_state)
            for _, a in ipairs(alerts) do
                all_alerts[#all_alerts+1] = { state=state, alert=a }
            end
            -- raise the first triggered alarm except for OKAY/UNKW states.
            -- A nil global state means that nothing has triggered so far
            -- (the alarm returned a nil state): keep evaluating the
            -- remaining alarms instead of stopping here.
            if global_state ~= nil
                    and global_state ~= constants.UNKW
                    and global_state ~= constants.OKAY then
                break
            end
        end
    end
    return global_state, all_alerts
end
|
||||
|
||||
-- Return the module-level list of loaded alarms (the list itself, not a copy).
function get_alarms()
    local loaded = all_alarms
    return loaded
end
|
||||
-- Return the first loaded alarm whose name equals `alarm_name`, or nothing
-- when there is no such alarm.
function get_alarm(alarm_name)
    for _, candidate in ipairs(all_alarms) do
        local is_wanted = (candidate.name == alarm_name)
        if is_wanted then
            return candidate
        end
    end
end
|
||||
|
||||
-- Build an Alarm object from the definition and append it to the loaded
-- alarms.
function load_alarm(alarm)
    all_alarms[#all_alarms+1] = Alarm.new(alarm)
end
|
||||
|
||||
-- Load every alarm definition of the list, in order.
function load_alarms(alarms)
    for _, definition in ipairs(alarms) do
        load_alarm(definition)
    end
end
|
||||
|
||||
local started = false
|
||||
-- Set the start time of every loaded alarm to `ns` and mark the module as
-- started.
function set_start_time(ns)
    for _, loaded in ipairs(all_alarms) do
        loaded:set_start_time(ns)
    end
    started = true
end
|
||||
|
||||
-- Return the module-level 'started' flag, which set_start_time() sets to true.
function is_started()
    local flag = started
    return flag
end
|
||||
|
||||
return M
|
|
@ -0,0 +1,99 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local string = require 'string'
|
||||
local table = require 'table'
|
||||
|
||||
local utils = require 'stacklight.utils'
|
||||
local consts = require 'stacklight.constants'
|
||||
local afd = require 'stacklight.afd'
|
||||
|
||||
local M = {}
|
||||
setfenv(1, M)
|
||||
|
||||
local statuses = {}
|
||||
|
||||
-- Message table reused for every injected annotation.
-- Fields.title, Fields.tags, Fields.text, Fields.source and Fields.hostname
-- are not set here: inject_afd_annotation() fills them before each injection.
local annotation_msg = {
    Type = 'metric',
    Fields = {
        name = 'annotation',
        dimensions = {'source', 'hostname'},
        value_fields = {'title', 'tags', 'text'},
    }
}
|
||||
|
||||
-- Inject an annotation message when the status carried by the AFD message
-- `msg` differs from the last status recorded for the same source.
--
-- Returns -1 when the source, status or alarms cannot be read from `msg`,
-- 0 when no annotation is needed, and otherwise whatever
-- utils.safe_inject_message returns.
function inject_afd_annotation(msg)
    local previous
    local text
    -- kept local so that the title does not leak into the module table
    local title

    local source = afd.read_source(msg)
    local status = afd.read_status(msg)
    local hostname = afd.read_hostname(msg)
    local alarms = afd.extract_alarms(msg)

    if not source or not status or not alarms then
        return -1
    end

    if not statuses[source] then
        statuses[source] = {}
    end
    previous = statuses[source]

    text = table.concat(afd.alarms_for_human(alarms), '<br />')

    -- build the title
    if not previous.status and status == consts.OKAY then
        -- don't send an annotation when we detect a new cluster which is OKAY
        return 0
    elseif not previous.status then
        title = string.format('General status is %s',
                              consts.status_label(status))
    elseif previous.status ~= status then
        title = string.format('General status %s -> %s',
                              consts.status_label(previous.status),
                              consts.status_label(status))

    -- TODO(pasquier-s): generate an annotation when the set of alarms has
    -- changed. The following code generated an annotation whenever at least
    -- one value associated with an alarm was changing. This led to way too
    -- many annotations with alarms monitoring the CPU usage for instance.

    -- elseif previous.text ~= text then
    --     title = string.format('General status remains %s',
    --                           consts.status_label(status))
    else
        -- nothing has changed since the last message
        return 0
    end

    annotation_msg.Fields.title = title
    annotation_msg.Fields.tags = source
    annotation_msg.Fields.text = text
    annotation_msg.Fields.source = source
    annotation_msg.Fields.hostname = hostname

    -- store the last status and alarm text for future messages
    previous.status = status
    previous.text = text

    return utils.safe_inject_message(annotation_msg)
end
|
||||
|
||||
return M
|
|
@ -0,0 +1,279 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local circular_buffer = require 'circular_buffer'
|
||||
local stats = require 'lsb.stats'
|
||||
local setmetatable = setmetatable
|
||||
local ipairs = ipairs
|
||||
local pairs = pairs
|
||||
local math = require 'math'
|
||||
local string = string
|
||||
local table = table
|
||||
local assert = assert
|
||||
local type = type
|
||||
|
||||
-- StackLight libs
|
||||
local table_utils = require 'stacklight.table_utils'
|
||||
local constants = require 'stacklight.constants'
|
||||
local afd = require 'stacklight.afd'
|
||||
local matching = require 'stacklight.value_matching'
|
||||
|
||||
local MIN_WINDOW = 10
|
||||
local MIN_PERIOD = 1
|
||||
local SECONDS_PER_ROW = 5
|
||||
|
||||
local Rule = {}
|
||||
Rule.__index = Rule
|
||||
|
||||
setfenv(1, Rule) -- Remove external access to contain everything in the module
|
||||
|
||||
-- Create a Rule object from a rule definition table.
-- 'window' and 'periods' fall back to MIN_WINDOW and MIN_PERIOD when they are
-- missing or not strictly positive; 'threshold' is required and converted to
-- a number.
function Rule.new(rule)
    -- numeric value of `raw` when it is set and strictly positive,
    -- `fallback` otherwise
    local function positive_or(raw, fallback)
        if raw and raw + 0 > 0 then
            return raw + 0
        end
        return fallback
    end

    local r = setmetatable({}, Rule)

    r.window = positive_or(rule.window, MIN_WINDOW)
    r.periods = positive_or(rule.periods, MIN_PERIOD)
    r.relational_operator = rule.relational_operator
    r.metric = rule.metric
    r.fields = rule.fields or {}

    -- build field matching
    r.field_matchers = {}
    for name, expression in pairs(r.fields) do
        r.field_matchers[name] = matching.new(expression)
    end

    r.fct = rule['function']
    r.threshold = rule.threshold + 0
    r.value_index = rule.value or nil -- Can be nil

    -- build unique rule id
    local id_parts = {r.metric, r.fct, r.window, r.periods}
    for name, expression in table_utils.orderedPairs(r.fields) do
        id_parts[#id_parts+1] = string.format('(%s=%s)', name, expression)
    end
    r.rule_id = table.concat(id_parts, '/')

    r.group_by = rule.group_by or {}

    local span = r.window * r.periods
    r.cbuf_size = math.ceil(span / SECONDS_PER_ROW)

    r.ids_datastore = {}
    r.datastore = {}
    r.observation_window = math.ceil(span)

    return r
end
|
||||
|
||||
-- Return the datastore key for a datapoint: the rule id alone when the rule
-- has no group_by (or no fields are given), otherwise the rule id followed
-- by the value of each group_by field, joined with '/'.
function Rule:get_datastore_id(fields)
    if fields == nil or #self.group_by == 0 then
        return self.rule_id
    end

    local parts = {self.rule_id}
    for _, group in ipairs(self.group_by) do
        -- a missing field value appends nothing
        parts[#parts + 1] = fields[group]
    end
    return table.concat(parts, '/')
end
|
||||
|
||||
-- Check the message fields against the rule's field matchers.
-- Returns false as soon as a field that has a matcher does not match.
-- Otherwise returns true when the rule has no matcher at all or when at
-- least one field matched.
function Rule:fields_accepted(fields)
    fields = fields or {}

    local has_matchers = false
    local matched = 0
    for name, expression in pairs(self.field_matchers) do
        has_matchers = true
        local candidate = fields[name]
        -- fields without a value in the message are ignored
        if candidate ~= nil then
            if not expression:matches(candidate) then
                return false
            end
            matched = matched + 1
        end
    end
    return (not has_matchers) or matched > 0
end
|
||||
|
||||
-- Create the single-column circular buffer that stores the datapoints of the
-- rule. The column header uses the rule function for 'min' and 'max', and
-- 'sum' for every other function.
function Rule:get_circular_buffer()
    local aggregation = 'sum'
    if self.fct == 'min' or self.fct == 'max' then
        aggregation = self.fct
    end

    local cbuf = circular_buffer.new(self.cbuf_size, 1, SECONDS_PER_ROW)
    cbuf:set_header(1, self.metric, aggregation, aggregation)
    return cbuf
end
|
||||
|
||||
-- store datapoints in cbuf, create the cbuf if not exists.
|
||||
-- value can be a table where the index to choose is referenced by self.value_index
|
||||
-- Store a datapoint in the circular buffer matching its fields, creating the
-- datastore entry on first use. Datapoints rejected by fields_accepted() and
-- nil values are ignored. When `value` is a table, the entry referenced by
-- self.value_index is used.
function Rule:add_value(ts, value, fields)
    if not self:fields_accepted(fields) then
        return
    end

    if type(value) == 'table' then
        value = value[self.value_index]
    end
    if value == nil then
        return
    end

    local id = self:get_datastore_id(fields)
    if not self.datastore[id] then
        local entry = {
            fields = self.fields,
            cbuf = self:get_circular_buffer()
        }
        -- grouped rules remember the fields of the datapoint itself
        if #self.group_by > 0 then
            entry.fields = fields
        end
        self.datastore[id] = entry
        self:add_datastore(id)
    end

    local cbuf = self.datastore[id].cbuf
    if self.fct == 'avg' then
        cbuf:add(ts, 1, value)
    else
        cbuf:set(ts, 1, value)
    end
end
|
||||
|
||||
-- Append `id` to the list of datastore ids unless table_utils.item_find
-- reports that it is already listed.
function Rule:add_datastore(id)
    local ids = self.ids_datastore
    if table_utils.item_find(id, ids) then
        return
    end
    ids[#ids+1] = id
end
|
||||
|
||||
-- Compare `value` with the rule threshold using the rule's relational
-- operator; the comparison is delegated to constants.compare_threshold.
function Rule:compare_threshold(value)
    local operator, threshold = self.relational_operator, self.threshold
    return constants.compare_threshold(value, operator, threshold)
end
|
||||
|
||||
-- Return true when `value` is neither nil nor NaN (NaN is the only value
-- that is not equal to itself).
local function isnumber(value)
    if value == nil then
        return false
    end
    return value == value
end
|
||||
|
||||
-- Set of function names accepted in a rule.
local available_functions = {
    last = true,
    diff = true,
    avg = true,
    sum = true,
    min = true,
    max = true,
    variance = true,
    sd = true,
}
|
||||
|
||||
-- evaluate the rule against datapoints
|
||||
-- return a list: match (bool or string), context ({value=v, fields=list of field table})
|
||||
--
|
||||
-- examples:
|
||||
-- true, { {value=100, fields={{queue='nova'}, {queue='neutron'}}, ..}
|
||||
-- false, { {value=10, fields={}}, ..}
|
||||
-- with 2 special cases:
|
||||
-- - never receive one datapoint
|
||||
-- 'nodata', {}
|
||||
-- - no more datapoint received for a metric
|
||||
-- 'missing', {value=-1, fields={}}
|
||||
-- There is a drawback with the 'missing' state: it can lead to emitting a false positive
|
||||
-- state. For example when the monitored thing has been renamed/deleted,
|
||||
-- it's normal not to receive datapoints anymore (for example, for a filesystem).
|
||||
function Rule:evaluate(ns)
|
||||
local fields = {}
|
||||
local one_match, one_no_match, one_missing_data = false, false, false
|
||||
for _, id in ipairs(self.ids_datastore) do
|
||||
local data = self.datastore[id]
|
||||
if data then
|
||||
local cbuf_time = data.cbuf:current_time()
|
||||
-- if we didn't receive datapoint within the observation window this means
|
||||
-- we don't receive anymore data and cannot compute the rule.
|
||||
if ns - cbuf_time > self.observation_window * 1e9 then
|
||||
one_missing_data = true
|
||||
fields[#fields+1] = {value = -1, fields = data.fields}
|
||||
else
|
||||
assert(available_functions[self.fct])
|
||||
local result
|
||||
|
||||
if self.fct == 'last' then
|
||||
local last
|
||||
local t = ns
|
||||
while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do
|
||||
last = data.cbuf:get(t, 1)
|
||||
t = t - SECONDS_PER_ROW * 1e9
|
||||
end
|
||||
if isnumber(last) then
|
||||
result = last
|
||||
else
|
||||
one_missing_data = true
|
||||
fields[#fields+1] = {value = -1, fields = data.fields}
|
||||
end
|
||||
elseif self.fct == 'diff' then
|
||||
local first, last
|
||||
|
||||
local t = ns
|
||||
while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do
|
||||
last = data.cbuf:get(t, 1)
|
||||
t = t - SECONDS_PER_ROW * 1e9
|
||||
end
|
||||
|
||||
if isnumber(last) then
|
||||
t = ns - self.observation_window * 1e9
|
||||
while (not isnumber(first)) and t <= ns do
|
||||
first = data.cbuf:get(t, 1)
|
||||
t = t + SECONDS_PER_ROW * 1e9
|
||||
end
|
||||
end
|
||||
|
||||
if not isnumber(last) or not isnumber(first) then
|
||||
one_missing_data = true
|
||||
fields[#fields+1] = {value = -1, fields = data.fields}
|
||||
else
|
||||
result = last - first
|
||||
end
|
||||
else
|
||||
local values = data.cbuf:get_range(1)
|
||||
result = stats[self.fct](values)
|
||||
end
|
||||
|
||||
if result then
|
||||
local m = self:compare_threshold(result)
|
||||
if m then
|
||||
one_match = true
|
||||
fields[#fields+1] = {value=result, fields=data.fields}
|
||||
else
|
||||
one_no_match = true
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
if one_match then
|
||||
return afd.MATCH, fields
|
||||
elseif one_missing_data then
|
||||
return afd.MISSING_DATA, fields
|
||||
elseif one_no_match then
|
||||
return afd.NO_MATCH, {}
|
||||
else
|
||||
return afd.NO_DATA, {{value=-1, fields=self.fields}}
|
||||
end
|
||||
end
|
||||
|
||||
return Rule
|
|
@ -0,0 +1,78 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
-- Status codes. The numeric values were picked to fit the Grafana constraints:
--   OKAY        => green
--   WARN & UNKW => orange
--   CRIT & DOWN => red
OKAY = 0
WARN = 1
UNKW = 2
CRIT = 3
DOWN = 4

-- Human-readable label of each status code.
local STATUS_LABELS = {}
STATUS_LABELS[OKAY] = 'OKAY'
STATUS_LABELS[WARN] = 'WARN'
STATUS_LABELS[UNKW] = 'UNKNOWN'
STATUS_LABELS[CRIT] = 'CRITICAL'
STATUS_LABELS[DOWN] = 'DOWN'

-- Return the label of the status code v, or nil when v is not a known code.
function status_label(v)
    local label = STATUS_LABELS[v]
    return label
end
|
||||
|
||||
-- Relative severity of each status code, used to rank two statuses.
-- UNKW ranks lowest, DOWN ranks highest.
local STATUS_WEIGHTS = {}
STATUS_WEIGHTS[UNKW] = 0
STATUS_WEIGHTS[OKAY] = 1
STATUS_WEIGHTS[WARN] = 2
STATUS_WEIGHTS[CRIT] = 3
STATUS_WEIGHTS[DOWN] = 4

-- Return whichever of val1 and val2 has the highest weight. When one of them
-- is nil (or false) the other one is returned; on a tie val2 is returned.
function max_status(val1, val2)
    if not val1 then
        return val2
    end
    if not val2 then
        return val1
    end
    local weight1, weight2 = STATUS_WEIGHTS[val1], STATUS_WEIGHTS[val2]
    if weight1 > weight2 then
        return val1
    end
    return val2
end
|
||||
|
||||
-- Apply the relational operator op to value and threshold.
--
-- op is either a symbol ('==', '!=', '>=', '>', '<=', '<') or its mnemonic
-- ('eq', 'ne', 'gte', 'gt', 'lte', 'lt'). Returns the boolean result of the
-- comparison, or false when op is not one of the supported operators.
function compare_threshold(value, op, threshold)
    if op == '==' or op == 'eq' then
        return value == threshold
    end
    if op == '!=' or op == 'ne' then
        return value ~= threshold
    end
    if op == '>=' or op == 'gte' then
        return value >= threshold
    end
    if op == '>' or op == 'gt' then
        return value > threshold
    end
    if op == '<=' or op == 'lte' then
        return value <= threshold
    end
    if op == '<' or op == 'lt' then
        return value < threshold
    end
    -- unsupported operator: never matches
    return false
end
|
||||
|
||||
return M
|
|
@ -0,0 +1,34 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local l = require 'lpeg'
|
||||
l.locale(l)
|
||||
|
||||
local tonumber = tonumber
|
||||
|
||||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
-- Build a pattern that looks for patt at any position of the subject: the
-- grammar either matches patt here, or consumes one character and retries.
function anywhere (patt)
    local grammar = { patt + 1 * l.V(1) }
    return l.P(grammar)
end
|
||||
|
||||
-- A single whitespace character
sp = l.space

-- Pattern used to match a number: an optional minus sign, a run of digits and
-- an optional fractional part introduced by '.' or ','. The matched text is
-- converted with tonumber().
-- NOTE(review): the digit class is l.xdigit, so strings made only of
-- hexadecimal letters (e.g. "bad") match as well and tonumber() then yields
-- nil for them -- confirm whether l.digit was intended.
local sign = l.P"-"^-1
local digits = l.xdigit^1
local fraction = (l.S(".,") * digits)^-1
Number = (sign * digits * fraction) / tonumber
|
||||
|
||||
return M
|
|
@ -0,0 +1,83 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local table = require 'table'
|
||||
local ipairs = ipairs
|
||||
local pairs = pairs
|
||||
local type = type
|
||||
|
||||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
-- Return the index of the first occurrence of item in the sequence list.
-- Nothing is returned when item is absent or when list is not a table.
function item_pos(item, list)
    if type(list) ~= 'table' then
        return
    end
    for index, candidate in ipairs(list) do
        if candidate == item then
            return index
        end
    end
end
|
||||
|
||||
-- Tell whether item is present in list: true when item_pos() finds it,
-- false otherwise.
function item_find(item, list)
    local position = item_pos(item, list)
    if position == nil then
        return false
    end
    return true
end
|
||||
|
||||
-- Sorted iteration helpers, adapted from
-- http://lua-users.org/wiki/SortedIteration

-- Return the list of the keys of t, sorted with table.sort() (the keys must
-- therefore be comparable with each other).
function __genOrderedIndex( t )
    local orderedIndex = {}
    for key in pairs(t) do
        table.insert( orderedIndex, key )
    end
    table.sort( orderedIndex )
    return orderedIndex
end

-- Equivalent of the next() function, but returns the keys in sorted order.
-- state is the key returned by the previous call (nil on the first call).
-- A temporary sorted key list is stored in t.__orderedIndex for the duration
-- of the iteration and removed once the last key has been returned.
function orderedNext(t, state)
    -- must be a local: a bare assignment would write into the module table
    local key

    if state == nil then
        -- the first time, generate the index. Any index left behind by an
        -- iteration that was interrupted (break/error) is dropped first,
        -- otherwise "__orderedIndex" would be listed as a regular key.
        t.__orderedIndex = nil
        t.__orderedIndex = __genOrderedIndex( t )
        key = t.__orderedIndex[1]
    else
        -- fetch the key following state
        local index = t.__orderedIndex
        for i = 1, #index do
            if index[i] == state then
                key = index[i+1]
                break
            end
        end
    end

    if key then
        return key, t[key]
    end

    -- no more value to return, cleanup
    t.__orderedIndex = nil
    return
end

-- Equivalent of the pairs() function on tables, iterating in sorted key
-- order.
function orderedPairs(t)
    return orderedNext, t, nil
end
|
||||
|
||||
return M
|
|
@ -0,0 +1,46 @@
|
|||
-- Copyright 2015-2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local cjson = require 'cjson'
|
||||
|
||||
local inject_message = inject_message
|
||||
local read_message = read_message
|
||||
local string = string
|
||||
local pcall = pcall
|
||||
|
||||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
-- JSON-encode v with cjson.encode() inside a pcall(), so that an encoding
-- failure (for instance an encoded buffer exceeding the sandbox limit) does
-- not raise. Returns the encoded string, or nothing when encoding failed.
function safe_json_encode(v)
    local ok, data = pcall(cjson.encode, v)
    if ok then
        return data
    end
    return
end
|
||||
|
||||
-- Call inject_message(msg) inside a pcall().
-- Returns 0 on success, or -1 followed by the error message on failure.
function safe_inject_message(msg)
    local ok, err_msg = pcall(inject_message, msg)
    if ok then
        return 0
    end
    return -1, err_msg
end
|
||||
|
||||
return M
|
|
@ -0,0 +1,171 @@
|
|||
-- Copyright 2016 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
local l = require "lpeg"
|
||||
l.locale(l)
|
||||
local pcall = pcall
|
||||
local string = require 'string'
|
||||
|
||||
local patterns = require 'stacklight.patterns'
|
||||
local error = error
|
||||
local setmetatable = setmetatable
|
||||
local tonumber = tonumber
|
||||
|
||||
local C = l.C
|
||||
local P = l.P
|
||||
local S = l.S
|
||||
local V = l.V
|
||||
local Ct = l.Ct
|
||||
local Cc = l.Cc
|
||||
|
||||
-- Zero or more whitespace characters
local Optional_space = patterns.sp^0
-- A subject made of whitespace only (at least one character, then the end)
local Only_spaces = patterns.sp^1 * P(-1)

-- Wrap pat so that it tolerates optional whitespace on both sides.
local function space(pat)
    local left_padded = Optional_space * pat
    return left_padded * Optional_space
end
|
||||
|
||||
local EQ = P'=='
|
||||
local NEQ = P'!='
|
||||
local GT = P'>'
|
||||
local LT = P'<'
|
||||
local GTE = P'>='
|
||||
local LTE = P'<='
|
||||
local MATCH = P'=~'
|
||||
local NO_MATCH = P'!~'
|
||||
|
||||
local OR = P'||'
|
||||
local AND = P'&&'
|
||||
|
||||
-- Normalize a captured operator: an empty capture (no operator written in
-- the expression) stands for equality.
local function get_operator(op)
    if op ~= '' then
        return op
    end
    return '=='
end
|
||||
|
||||
-- Comparison operators; the two-character forms are tried before '>' and '<'
local comparison = EQ + NEQ + LTE + GTE + GT + LT
-- Logical connector between two sub-expressions
local connector = OR^1 + AND^1

-- Numerical expressions: an optional comparison operator followed by a number
local numerical_operator = comparison^-1 / get_operator
local sub_numerical_expression = space(numerical_operator) * patterns.Number * Optional_space
local is_plain_numeric = (sub_numerical_expression * (connector * sub_numerical_expression)^0) * -1

-- String expressions: an optional operator followed by either a
-- double-quoted string or a run of non-space characters
local double_quote = P'"'
local quoted_string = (double_quote * C((P(1) - double_quote)^1) * double_quote
                       + C((P(1) - patterns.sp)^1))
local string_operator = (EQ + NEQ + MATCH + NO_MATCH)^-1 / get_operator
local sub_string_expression = space(string_operator) * quoted_string * Optional_space
local is_plain_string = (sub_string_expression * (connector * sub_string_expression)^0) * -1

-- Build the grammar turning a whole expression into a tree of nested tables:
-- 'or' nodes contain 'and' nodes, which contain {operator, operand} leaves
-- captured by sub. The grammar must consume the entire subject.
local function expression_grammar(sub)
    return P {
        'OR';
        AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
        OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
        SUB = Ct(sub)
    } * -1
end

local numerical_expression = expression_grammar(sub_numerical_expression)
local string_expression = expression_grammar(sub_string_expression)

-- Matches when the subject contains an operator at any position
local is_complex = patterns.anywhere(comparison + MATCH + NO_MATCH + OR + AND)
|
||||
|
||||
-- Evaluate an expression tree against value and return a boolean.
--
-- A tree is one of:
--   { subtree }                 a wrapper, evaluated as its first element
--   { 'and'|'or', t1, t2, .. }  every subtree is evaluated (no
--                               short-circuit) and the results are combined
--   { operator, operand }       a leaf comparing value with operand
-- A leaf with an unknown operator evaluates to false.
local function eval_tree(tree, value)
    local head = tree[1]

    if type(head) == 'table' then
        return eval_tree(head, value)
    end

    if head == 'and' or head == 'or' then
        local match = eval_tree(tree[2], value)
        for i = 3, #tree do
            local m = eval_tree(tree[i], value)
            if head == 'or' then
                match = match or m
            else
                match = match and m
            end
        end
        return match
    end

    local matcher = tree[2]
    if head == '==' then
        return value == matcher
    end
    if head == '!=' then
        return value ~= matcher
    end
    if head == '>' then
        return value > matcher
    end
    if head == '<' then
        return value < matcher
    end
    if head == '>=' then
        return value >= matcher
    end
    if head == '<=' then
        return value <= matcher
    end
    if head == '=~' then
        -- pcall() guards against invalid patterns and non-string values
        local ok, m = pcall(string.find, value, matcher)
        return ok and m ~= nil
    end
    if head == '!~' then
        local ok, m = pcall(string.find, value, matcher)
        return ok and m == nil
    end

    return false
end
|
||||
|
||||
local MatchExpression = {}
|
||||
MatchExpression.__index = MatchExpression
|
||||
|
||||
setfenv(1, MatchExpression) -- Remove external access to contain everything in the module
|
||||
|
||||
-- Build a MatchExpression from the string expression.
--
-- An expression without any operator is kept as is and later compared by
-- simple equality. An expression with operators is parsed, either as a
-- numerical expression or as a string expression, into r.tree.
-- Raises an error when the expression is empty (or made of spaces only) or
-- when it contains operators but cannot be parsed.
function MatchExpression.new(expression)
    local r = setmetatable({}, MatchExpression)

    if not is_complex:match(expression) then
        if expression == '' or Only_spaces:match(expression) then
            error('Expression is empty')
        end
        r.is_simple_equality_matching = true
    else
        r.is_plain_numeric_exp = is_plain_numeric:match(expression) ~= nil

        if r.is_plain_numeric_exp then
            r.tree = numerical_expression:match(expression)
        elseif is_plain_string:match(expression) ~= nil then
            r.tree = string_expression:match(expression)
        end

        if r.tree == nil then
            error('Invalid expression: ' .. expression)
        end
    end

    r.expression = expression
    return r
end
|
||||
|
||||
-- Tell whether value satisfies the expression.
--
-- For an expression without operator, value matches when it equals the
-- expression, either directly or after converting one side with tonumber().
-- For a numerical expression, value is converted with tonumber() first and
-- never matches when the conversion fails. Otherwise the parsed tree is
-- evaluated against value.
function MatchExpression:matches(value)
    if self.is_simple_equality_matching then
        -- A missing value never matches. Without this guard,
        -- tonumber(self.expression) == value would be nil == nil, i.e. true,
        -- for every non-numeric expression.
        if value == nil then
            return false
        end
        return self.expression == value or
               tonumber(self.expression) == value or
               tonumber(value) == self.expression
    end

    if self.is_plain_numeric_exp then
        value = tonumber(value)
        if value == nil then
            return false
        end
    end

    return eval_tree(self.tree, value)
end
|
||||
|
||||
return MatchExpression
|
|
@ -0,0 +1,71 @@
|
|||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
-- Default CPU alarms for a node. Each alarm fires when at least one of its
-- rules matches (logical_operator = 'or'): the first rule watches the average
-- idle percentage, the second one the average iowait percentage, both for
-- cpuID 'all' over a window of '120'. 'function' is a Lua keyword, hence the
-- bracketed key.
local alarms = {
    {
        name = 'cpu-critical',
        description = 'The CPU usage is too high',
        severity = 'critical',
        trigger = {
            logical_operator = 'or',
            rules = {
                {
                    metric = 'intel.procfs.cpu.idle_percentage',
                    fields = { cpuID = 'all' },
                    relational_operator = '<=',
                    threshold = '5',
                    window = '120',
                    periods = '0',
                    ['function'] = 'avg',
                },
                {
                    metric = 'intel.procfs.cpu.iowait_percentage',
                    fields = { cpuID = 'all' },
                    relational_operator = '>=',
                    threshold = '35',
                    window = '120',
                    periods = '0',
                    ['function'] = 'avg',
                },
            },
        },
    },
    {
        name = 'cpu-warning',
        description = 'The CPU usage is high',
        severity = 'warning',
        trigger = {
            logical_operator = 'or',
            rules = {
                {
                    metric = 'intel.procfs.cpu.idle_percentage',
                    fields = { cpuID = 'all' },
                    relational_operator = '<=',
                    threshold = '15',
                    window = '120',
                    periods = '0',
                    ['function'] = 'avg',
                },
                {
                    metric = 'intel.procfs.cpu.iowait_percentage',
                    fields = { cpuID = 'all' },
                    relational_operator = '>=',
                    threshold = '25',
                    window = '120',
                    periods = '0',
                    ['function'] = 'avg',
                },
            },
        },
    },
}
|
||||
|
||||
return alarms
|
|
@ -0,0 +1,9 @@
|
|||
-- Plugin settings
filename        = "afd.lua"
message_matcher = "TRUE"
ticker_interval = 10
log_level       = 7

-- AFD settings: afd_type is either "node" or "service"; afd_file names the
-- alarm definitions module to load.
afd_type         = "node"
afd_file         = "afd_node_default_cpu_alarms"
afd_cluster_name = "default"
afd_logical_name = "cpu"

-- Filled in when the template is rendered
hostname = "{{ CCP_HINDSIGHT_NODE_NAME }}"
|
|
@ -15,6 +15,7 @@ service:
|
|||
- prune-input.cfg
|
||||
- influxdb-tcp.cfg
|
||||
- kubelet-stats.cfg
|
||||
- afd-node-default-cpu-alarms.cfg
|
||||
volumes:
|
||||
- name: hindsight-output
|
||||
type: empty-dir
|
||||
|
@ -70,6 +71,10 @@ files:
|
|||
path: /var/lib/hindsight/run/input/kubelet_stats.cfg
|
||||
content: hindsight_kubelet_stats.cfg.j2
|
||||
perm: "0600"
|
||||
afd-node-default-cpu-alarms.cfg:
|
||||
path: /var/lib/hindsight/run/analysis/afd_node_default_cpu_alarms.cfg
|
||||
content: hindsight_afd_node_default_cpu_alarms.cfg.j2
|
||||
perm: "0600"
|
||||
snap.conf:
|
||||
path: /etc/snap/snap.conf
|
||||
content: snap.conf.j2
|
||||
|
|
Loading…
Reference in New Issue