From ed5934cd36b67734bb4ee1ebf6c3f046f58e240d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Lemoine?= Date: Mon, 12 Sep 2016 12:22:14 +0000 Subject: [PATCH] Add Lua code for alarming This commit adds Lua code for generating AFD (Anomaly and Fault Detection) metrics based on the evaluation of alarms. The Lua code was copied from the lma_collector Fuel plugin [*], with changes to accommodate Hindsight and the versions of lua_sandbox and lua_sandbox_extensions we rely on. In the future we plan to move this Lua code into its own Git repository. And the Hindsight Dockerfile will install the Lua code in the image using Debian packages. The afd_node_default_cpu_alarms.lua and hindsight_afd_node_default_cpu_alarms.cfg.j2 files will be removed. Instead the operator will configure alarms through a YAML file, and we will use a sidecar container for generating Lua tables including alarm definitions and corresponding plugin configuration files. [*] https://github.com/openstack/fuel-plugin-lma-collector/ Change-Id: If182c3a6453f7bf8b72f03af56a14ace109eaa68 --- docker/hindsight/Dockerfile.j2 | 2 + docker/hindsight/analysis/afd.lua | 120 ++++++++ docker/hindsight/modules/afd.lua | 181 ++++++++++++ docker/hindsight/modules/afd_alarm.lua | 224 ++++++++++++++ docker/hindsight/modules/afd_alarms.lua | 118 ++++++++ docker/hindsight/modules/afd_annotation.lua | 99 +++++++ docker/hindsight/modules/afd_rule.lua | 279 ++++++++++++++++++ docker/hindsight/modules/constants.lua | 78 +++++ docker/hindsight/modules/patterns.lua | 34 +++ docker/hindsight/modules/table_utils.lua | 83 ++++++ docker/hindsight/modules/utils.lua | 46 +++ docker/hindsight/modules/value_matching.lua | 171 +++++++++++ .../afd_node_default_cpu_alarms.lua | 71 +++++ ...ndsight_afd_node_default_cpu_alarms.cfg.j2 | 9 + service/stacklight-collector.yaml | 5 + 15 files changed, 1520 insertions(+) create mode 100644 docker/hindsight/analysis/afd.lua create mode 100644 docker/hindsight/modules/afd.lua create mode 100644 
docker/hindsight/modules/afd_alarm.lua create mode 100644 docker/hindsight/modules/afd_alarms.lua create mode 100644 docker/hindsight/modules/afd_annotation.lua create mode 100644 docker/hindsight/modules/afd_rule.lua create mode 100644 docker/hindsight/modules/constants.lua create mode 100644 docker/hindsight/modules/patterns.lua create mode 100644 docker/hindsight/modules/table_utils.lua create mode 100644 docker/hindsight/modules/utils.lua create mode 100644 docker/hindsight/modules/value_matching.lua create mode 100644 docker/hindsight/modules_alarms/afd_node_default_cpu_alarms.lua create mode 100644 service/files/hindsight_afd_node_default_cpu_alarms.cfg.j2 diff --git a/docker/hindsight/Dockerfile.j2 b/docker/hindsight/Dockerfile.j2 index 1014dc3..c8d0edd 100644 --- a/docker/hindsight/Dockerfile.j2 +++ b/docker/hindsight/Dockerfile.j2 @@ -19,7 +19,9 @@ RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 1FA22B08 \ ADD output/*.lua /var/lib/hindsight/run/output/ ADD input/*.lua /var/lib/hindsight/run/input/ +ADD analysis/*.lua /var/lib/hindsight/run/analysis/ ADD modules/*.lua /opt/ccp/lua/modules/stacklight/ +ADD modules_alarms/afd_node_default_cpu_alarms.lua /opt/ccp/lua/modules/stacklight_alarms/ RUN useradd --user-group hindsight \ && usermod -a -G microservices hindsight \ diff --git a/docker/hindsight/analysis/afd.lua b/docker/hindsight/analysis/afd.lua new file mode 100644 index 0000000..ed9db37 --- /dev/null +++ b/docker/hindsight/analysis/afd.lua @@ -0,0 +1,120 @@ +-- Copyright 2015-2016 Mirantis, Inc. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. 
+-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +local string = require 'string' + +local message = require 'stacklight.message' +local afd = require 'stacklight.afd' +local afd_annotation = require 'stacklight.afd_annotation' + +-- node or service +local afd_type = read_config('afd_type') or error('afd_type must be specified!') +local msg_type +local msg_field_name +local afd_entity + +if afd_type == 'node' then + msg_type = 'afd_node_metric' + msg_field_name = 'node_status' + afd_entity = 'node_role' +elseif afd_type == 'service' then + msg_type = 'afd_service_metric' + msg_field_name = 'service_status' + afd_entity = 'service' +else + error('invalid afd_type value') +end + +-- ie: controller for node AFD / rabbitmq for service AFD +local afd_entity_value = read_config('afd_cluster_name') or + error('afd_cluster_name must be specified!') + +-- ie: cpu for node AFD / queue for service AFD +local msg_field_source = read_config('afd_logical_name') or + error('afd_logical_name must be specified!') + +local hostname = read_config('hostname') or error('hostname must be specified') + +local afd_file = read_config('afd_file') or error('afd_file must be specified') +local all_alarms = require('stacklight_alarms.' .. 
afd_file) +local A = require 'stacklight.afd_alarms' +A.load_alarms(all_alarms) + +function process_message() + + local metric_name = read_message('Fields[name]') + local ts = read_message('Timestamp') + + local value, err_msg = message.read_values() + if not value then + return -1, err_msg + end + -- retrieve field values + local fields = {} + for _, field in ipairs(A.get_metric_fields(metric_name)) do + local field_value = read_message(string.format('Fields[%s]', field)) + if not field_value then + return -1, "Cannot find Fields[" .. field .. "] for the metric " .. metric_name + end + fields[field] = field_value + end + A.add_value(ts, metric_name, value, fields) + return 0 +end + +function timer_event(ns) + if A.is_started() then + local state, alarms = A.evaluate(ns) + if state then -- it was time to evaluate at least one alarm + for _, alarm in ipairs(alarms) do + afd.add_to_alarms( + alarm.state, + alarm.alert['function'], + alarm.alert.metric, + alarm.alert.fields, + {}, -- tags + alarm.alert.operator, + alarm.alert.value, + alarm.alert.threshold, + alarm.alert.window, + alarm.alert.periods, + alarm.alert.message) + end + + -- Message example: + -- msg = { + -- Type = 'afd_node_metric', + -- Payload = '{"alarms":[...]}', + -- Fields = { + -- name = 'node_status', + -- value = 0, + -- hostname = 'node1', + -- source = 'cpu', + -- node_role = 'controller', + -- dimensions = {'node_role', 'source', 'hostname'}, + -- } + -- } + local msg = afd.inject_afd_metric( + msg_type, afd_entity, afd_entity_value, msg_field_name, + state, hostname, msg_field_source) + + if msg then + afd_annotation.inject_afd_annotation(msg) + end + + end + else + A.set_start_time(ns) + end +end diff --git a/docker/hindsight/modules/afd.lua b/docker/hindsight/modules/afd.lua new file mode 100644 index 0000000..1db84ac --- /dev/null +++ b/docker/hindsight/modules/afd.lua @@ -0,0 +1,181 @@ +-- Copyright 2015-2016 Mirantis, Inc. 
+-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +local cjson = require 'cjson' +local string = require 'string' +local table = require 'table' + +local utils = require 'stacklight.utils' +local constants = require 'stacklight.constants' + +local read_message = read_message +local assert = assert +local ipairs = ipairs +local pcall = pcall + +local M = {} +setfenv(1, M) -- Remove external access to contain everything in the module + +local function read_field(msg, name) + return msg.Fields[name] +end + +function read_status(msg) + return read_field(msg, 'value') +end + +function read_source(msg) + return read_field(msg, 'source') +end + +function read_hostname(msg) + return read_field(msg, 'hostname') +end + +function extract_alarms(msg) + local ok, payload = pcall(cjson.decode, msg.Payload) + if not ok or not payload.alarms then + return nil + end + return payload.alarms +end + +-- return a human-readable message from an alarm table +-- for instance: "CPU load too high (WARNING, rule='last(load_midterm)>=5', current=7)" +function get_alarm_for_human(alarm) + local metric + if #(alarm.fields) > 0 then + local fields = {} + for _, field in ipairs(alarm.fields) do + fields[#fields+1] = field.name .. '="' .. field.value .. 
'"' + end + metric = string.format('%s[%s]', alarm.metric, table.concat(fields, ',')) + else + metric = alarm.metric + end + + local host = '' + if alarm.hostname then + host = string.format(', host=%s', alarm.hostname) + end + + return string.format( + "%s (%s, rule='%s(%s)%s%s', current=%.2f%s)", + alarm.message, + alarm.severity, + alarm['function'], + metric, + alarm.operator, + alarm.threshold, + alarm.value, + host + ) +end + +function alarms_for_human(alarms) + local alarm_messages = {} + local hint_messages = {} + + for _, v in ipairs(alarms) do + if v.tags and v.tags.dependency_level and v.tags.dependency_level == 'hint' then + hint_messages[#hint_messages+1] = get_alarm_for_human(v) + else + alarm_messages[#alarm_messages+1] = get_alarm_for_human(v) + end + end + + if #hint_messages > 0 then + alarm_messages[#alarm_messages+1] = "Other related alarms:" + end + for _, v in ipairs(hint_messages) do + alarm_messages[#alarm_messages+1] = v + end + + return alarm_messages +end + +local alarms = {} + +-- append an alarm to the list of pending alarms +-- the list is sent when inject_afd_metric is called +function add_to_alarms(status, fn, metric, fields, tags, operator, value, threshold, window, periods, message) + local severity = constants.status_label(status) + assert(severity) + alarms[#alarms+1] = { + severity=severity, + ['function']=fn, + metric=metric, + fields=fields or {}, + tags=tags or {}, + operator=operator, + value=value, + threshold=threshold, + window=window or 0, + periods=periods or 0, + message=message + } +end + +function get_alarms() + return alarms +end + +function reset_alarms() + alarms = {} +end + +-- inject an AFD event into the Heka pipeline +function inject_afd_metric(msg_type, msg_tag_name, msg_tag_value, metric_name, + value, hostname, source) + local payload + + if #alarms > 0 then + payload = utils.safe_json_encode({alarms=alarms}) + reset_alarms() + if not payload then + return + end + else + -- because cjson encodes empty 
tables as objects instead of arrays + payload = '{"alarms":[]}' + end + + local msg = { + Type = msg_type, + Payload = payload, + Fields = { + name = metric_name, + value = value, + hostname = hostname, + source = source, + dimensions = {msg_tag_name, 'hostname', 'source'}, + } + } + msg.Fields[msg_tag_name] = msg_tag_value + + local err_code, err_msg = utils.safe_inject_message(msg) + + if err_code ~= 0 then + return nil, err_msg + end + + return msg +end + +MATCH = 1 +NO_MATCH = 2 +NO_DATA = 3 +MISSING_DATA = 4 + +return M diff --git a/docker/hindsight/modules/afd_alarm.lua b/docker/hindsight/modules/afd_alarm.lua new file mode 100644 index 0000000..8d36e17 --- /dev/null +++ b/docker/hindsight/modules/afd_alarm.lua @@ -0,0 +1,224 @@ +-- Copyright 2015-2016 Mirantis, Inc. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+
+local assert = assert
+local ipairs = ipairs
+local pairs = pairs
+local string = string
+local setmetatable = setmetatable
+
+local table_utils = require 'stacklight.table_utils'
+local constants = require 'stacklight.constants'
+local afd = require 'stacklight.afd'
+local Rule = require 'stacklight.afd_rule'
+
+local SEVERITIES = { -- lower-case severity / no_data_policy names to status constants
+    warning = constants.WARN,
+    critical = constants.CRIT,
+    down = constants.DOWN,
+    unknown = constants.UNKW,
+    okay = constants.OKAY,
+}
+
+local Alarm = {}
+Alarm.__index = Alarm
+
+setfenv(1, Alarm) -- Remove external access to contain everything in the module
+
+function Alarm.new(alarm) -- build an Alarm object from an alarm definition table
+    local a = {}
+    setmetatable(a, Alarm)
+    a._metrics_list = nil -- filled lazily by Alarm:get_metrics()
+    a.name = alarm.name
+    a.description = alarm.description
+    if alarm.trigger.logical_operator then
+        a.logical_operator = string.lower(alarm.trigger.logical_operator)
+    else
+        a.logical_operator = 'or'
+    end
+    a.severity_str = string.upper(alarm.severity)
+    a.severity = SEVERITIES[string.lower(alarm.severity)]
+    assert(a.severity ~= nil)
+
+    a.skip_when_no_data = false
+    if alarm.no_data_policy then
+        if string.lower(alarm.no_data_policy) == 'skip' then
+            a.skip_when_no_data = true
+        else
+            a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_policy)]
+        end
+    else
+        a.no_data_severity = constants.UNKW
+    end
+    assert(a.skip_when_no_data or a.no_data_severity ~= nil)
+
+    a.rules = {}
+    a.initial_wait = 0 -- nanoseconds; longest window * periods among the rules
+    for _, rule in ipairs(alarm.trigger.rules) do
+        local r = Rule.new(rule)
+        a.rules[#a.rules+1] = r
+        local wait = r.window * r.periods * 1e9 -- seconds to nanoseconds, so both sides of the comparison share a unit
+        if wait > a.initial_wait then
+            a.initial_wait = wait
+        end
+    end
+    a.start_time_ns = 0
+
+    return a
+end
+
+-- return the list of distinct metric names used by the alarm rules (built once, then cached)
+function Alarm:get_metrics()
+    if not self._metrics_list then
+        self._metrics_list = {}
+        for _, rule in ipairs(self.rules) do
+            if not table_utils.item_find(rule.metric, self._metrics_list) then
+                self._metrics_list[#self._metrics_list+1] = rule.metric
+            end
+        end
+    end
+    return self._metrics_list
+end
+
+-- return a list of field names used for the metric
+-- (can have duplicate names)
+function Alarm:get_metric_fields(metric_name)
+    local fields = {}
+    for _, rule in ipairs(self.rules) do
+        if rule.metric == metric_name then
+            for k, _ in pairs(rule.fields) do
+                fields[#fields+1] = k
+            end
+            for _, g in ipairs(rule.group_by) do
+                fields[#fields+1] = g
+            end
+        end
+    end
+    return fields
+end
+
+function Alarm:has_metric(metric) -- true when at least one rule of the alarm uses the metric
+    return table_utils.item_find(metric, self:get_metrics())
+end
+
+-- dispatch datapoint in datastores
+function Alarm:add_value(ts, metric, value, fields)
+    -- only the rules bound to this metric receive the datapoint
+    for id, rule in pairs(self.rules) do
+        if rule.metric == metric then
+            rule:add_value(ts, value, fields)
+        end
+    end
+end
+
+-- convert fields to fields map
+-- {foo="bar"} --> {name="foo", value="bar"}
+local function convert_field_list(fields)
+    local named_fields = {}
+    for name, value in pairs(fields or {}) do
+        named_fields[#named_fields+1] = {name=name, value=value}
+    end
+    return named_fields
+end
+
+-- return: state of alarm and a list of alarm details.
+-- state is nil when data is missing and the no-data policy is 'skip'.
+-- with alarm list when state is neither nil nor OKAY (rule attributes omitted here):
+-- {
+--  {
+--      value = value reported by the rule for this context,
+--      fields = list of {name=..., value=...} tables,
+--      message = string,
+--  },
+-- }
+function Alarm:evaluate(ns)
+    local state = constants.OKAY
+    local matches = 0
+    local all_alerts = {}
+    local function add_alarm(rule, value, message, fields)
+        all_alerts[#all_alerts+1] = {
+            severity = self.severity_str,
+            ['function'] = rule.fct,
+            metric = rule.metric,
+            operator = rule.relational_operator,
+            threshold = rule.threshold,
+            window = rule.window,
+            periods = rule.periods,
+            value = value,
+            fields = fields,
+            message = message
+        }
+    end
+    local one_unknown = false
+    local msg
+
+    for _, rule in ipairs(self.rules) do
+        local eval, context_list = rule:evaluate(ns)
+        if eval == afd.MATCH then
+            matches = matches + 1
+            msg = self.description
+        elseif eval == afd.MISSING_DATA then
+            msg = 'No datapoint have been received over the last ' .. rule.observation_window .. ' seconds'
+            one_unknown = true
+        elseif eval == afd.NO_DATA then
+            msg = 'No datapoint have been received ever'
+            one_unknown = true
+        end
+        for _, context in ipairs(context_list) do
+            add_alarm(rule, context.value, msg,
+                      convert_field_list(context.fields))
+        end
+    end
+
+    if self.logical_operator == 'and' then -- missing data takes precedence over matches
+        if one_unknown then
+            if self.skip_when_no_data then
+                state = nil
+            else
+                state = self.no_data_severity
+            end
+        elseif #self.rules == matches then
+            state = self.severity
+        end
+    elseif self.logical_operator == 'or' then -- a single match takes precedence over missing data
+        if matches > 0 then
+            state = self.severity
+        elseif one_unknown then
+            if self.skip_when_no_data then
+                state = nil
+            else
+                state = self.no_data_severity
+            end
+        end
+    end
+
+    if state == nil or state == constants.OKAY then
+        all_alerts = {}
+    end
+    return state, all_alerts
+end
+
+function Alarm:set_start_time(ns) -- record the reference time (nanoseconds) used by is_evaluation_time
+    self.start_time_ns = ns
+end
+
+function Alarm:is_evaluation_time(ns) -- true once initial_wait nanoseconds have elapsed since the start time
+    local delta = ns - self.start_time_ns
+    if delta >= self.initial_wait then
+        return true
+    end
+    return false
+end
+
+return Alarm
diff --git a/docker/hindsight/modules/afd_alarms.lua b/docker/hindsight/modules/afd_alarms.lua
new file mode 100644
index 0000000..465241d
--- /dev/null
+++ b/docker/hindsight/modules/afd_alarms.lua
@@ -0,0 +1,118 @@
+-- Copyright 2015-2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+ +local pairs = pairs +local ipairs = ipairs +local table_utils = require 'stacklight.table_utils' +local constants = require 'stacklight.constants' +local Alarm = require 'stacklight.afd_alarm' + +local all_alarms = {} + +local M = {} +setfenv(1, M) -- Remove external access to contain everything in the module + +-- return a list of field names required for the metric +function get_metric_fields(metric_name) + local fields = {} + for name, alarm in pairs(all_alarms) do + local mf = alarm:get_metric_fields(metric_name) + if mf then + for _, field in pairs(mf) do + if not table_utils.item_find(field, fields) then + fields[#fields+1] = field + end + end + end + end + return fields +end + +-- return list of alarms interested by a metric +function get_interested_alarms(metric) + local interested_alarms = {} + for _, alarm in pairs(all_alarms) do + if alarm:has_metric(metric) then + + interested_alarms[#interested_alarms+1] = alarm + end + end + return interested_alarms +end + +function add_value(ts, metric, value, fields) + local interested_alarms = get_interested_alarms(metric) + for _, alarm in ipairs (interested_alarms) do + alarm:add_value(ts, metric, value, fields) + end +end + +function reset_alarms() + all_alarms = {} +end + +function evaluate(ns) + local global_state + local all_alerts = {} + for _, alarm in pairs(all_alarms) do + if alarm:is_evaluation_time(ns) then + local state, alerts = alarm:evaluate(ns) + global_state = constants.max_status(state, global_state) + for _, a in ipairs(alerts) do + all_alerts[#all_alerts+1] = { state=state, alert=a } + end + -- raise the first triggered alarm except for OKAY/UNKW states + if global_state ~= constants.UNKW and global_state ~= constants.OKAY then + break + end + end + end + return global_state, all_alerts +end + +function get_alarms() + return all_alarms +end +function get_alarm(alarm_name) + for _, a in ipairs(all_alarms) do + if a.name == alarm_name then + return a + end + end +end + +function 
load_alarm(alarm) + local A = Alarm.new(alarm) + all_alarms[#all_alarms+1] = A +end + +function load_alarms(alarms) + for _, alarm in ipairs(alarms) do + load_alarm(alarm) + end +end + +local started = false +function set_start_time(ns) + for _, alarm in ipairs(all_alarms) do + alarm:set_start_time(ns) + end + started = true +end + +function is_started() + return started +end + +return M diff --git a/docker/hindsight/modules/afd_annotation.lua b/docker/hindsight/modules/afd_annotation.lua new file mode 100644 index 0000000..1ede301 --- /dev/null +++ b/docker/hindsight/modules/afd_annotation.lua @@ -0,0 +1,99 @@ +-- Copyright 2015-2016 Mirantis, Inc. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+
+local string = require 'string'
+local table = require 'table'
+
+local utils = require 'stacklight.utils'
+local consts = require 'stacklight.constants'
+local afd = require 'stacklight.afd'
+
+local M = {}
+setfenv(1, M)
+
+local statuses = {} -- last status and alarm text seen, keyed by source
+
+local annotation_msg = { -- message table reused (and mutated) for every annotation
+    Type = 'metric',
+    Fields = {
+        name = 'annotation',
+        dimensions = {'source', 'hostname'},
+        value_fields = {'title', 'tags', 'text'},
+        title = nil,
+        tags = nil,
+        text = nil,
+        source = nil,
+        hostname = nil,
+    }
+}
+
+function inject_afd_annotation(msg) -- inject an annotation when the status carried by an AFD metric changes
+    local previous
+    local text, title -- title kept local so it does not leak into the module table through setfenv
+
+    local source = afd.read_source(msg)
+    local status = afd.read_status(msg)
+    local hostname = afd.read_hostname(msg)
+    local alarms = afd.extract_alarms(msg)
+
+    if not source or not status or not alarms then
+        return -1 -- msg lacks a source, a status or a decodable alarms payload
+    end
+
+    if not statuses[source] then
+        statuses[source] = {}
+    end
+    previous = statuses[source]
+
+    text = table.concat(afd.alarms_for_human(alarms), '\n') -- NOTE(review): separator literal was damaged in this copy (raw line break); presumably an HTML line-break tag upstream - confirm
+
+    -- build the title
+    if not previous.status and status == consts.OKAY then
+        -- don't send an annotation when we detect a new cluster which is OKAY
+        return 0
+    elseif not previous.status then
+        title = string.format('General status is %s',
+                              consts.status_label(status))
+    elseif previous.status ~= status then
+        title = string.format('General status %s -> %s',
+                              consts.status_label(previous.status),
+                              consts.status_label(status))
+
+    -- TODO(pasquier-s): generate an annotation when the set of alarms has
+    -- changed. the following code generated an annotation whenever at least
+    -- one value associated to an alarm was changing. This led to way too
+    -- many annotations with alarms monitoring the CPU usage for instance.
+
+--    elseif previous.text ~= text then
+--        title = string.format('General status remains %s',
+--                              consts.status_label(status))
+    else
+        -- nothing has changed since the last message
+        return 0
+    end
+
+    annotation_msg.Fields.title = title
+    annotation_msg.Fields.tags = source
+    annotation_msg.Fields.text = text
+    annotation_msg.Fields.source = source
+    annotation_msg.Fields.hostname = hostname
+
+    -- store the last status and alarm text for future messages
+    previous.status = status
+    previous.text = text
+
+    return utils.safe_inject_message(annotation_msg)
+end
+
+return M
diff --git a/docker/hindsight/modules/afd_rule.lua b/docker/hindsight/modules/afd_rule.lua
new file mode 100644
index 0000000..f61a2b0
--- /dev/null
+++ b/docker/hindsight/modules/afd_rule.lua
@@ -0,0 +1,279 @@
+-- Copyright 2015-2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +local circular_buffer = require 'circular_buffer' +local stats = require 'lsb.stats' +local setmetatable = setmetatable +local ipairs = ipairs +local pairs = pairs +local math = require 'math' +local string = string +local table = table +local assert = assert +local type = type + +-- StackLight libs +local table_utils = require 'stacklight.table_utils' +local constants = require 'stacklight.constants' +local afd = require 'stacklight.afd' +local matching = require 'stacklight.value_matching' + +local MIN_WINDOW = 10 +local MIN_PERIOD = 1 +local SECONDS_PER_ROW = 5 + +local Rule = {} +Rule.__index = Rule + +setfenv(1, Rule) -- Remove external access to contain everything in the module + +function Rule.new(rule) + local r = {} + setmetatable(r, Rule) + + local win = MIN_WINDOW + if rule.window and rule.window + 0 > 0 then + win = rule.window + 0 + end + r.window = win + local periods = MIN_PERIOD + if rule.periods and rule.periods + 0 > 0 then + periods = rule.periods + 0 + end + r.periods = periods + r.relational_operator = rule.relational_operator + r.metric = rule.metric + r.fields = rule.fields or {} + + -- build field matching + r.field_matchers = {} + for f, expression in pairs(r.fields) do + r.field_matchers[f] = matching.new(expression) + end + + r.fct = rule['function'] + r.threshold = rule.threshold + 0 + r.value_index = rule.value or nil -- Can be nil + + -- build unique rule id + local arr = {r.metric, r.fct, r.window, r.periods} + for f, v in table_utils.orderedPairs(r.fields or {}) do + arr[#arr+1] = string.format('(%s=%s)', f, v) + 
end + r.rule_id = table.concat(arr, '/') + + r.group_by = rule.group_by or {} + + r.cbuf_size = math.ceil(r.window * r.periods / SECONDS_PER_ROW) + + r.ids_datastore = {} + r.datastore = {} + r.observation_window = math.ceil(r.window * r.periods) + + return r +end + +function Rule:get_datastore_id(fields) + if #self.group_by == 0 or fields == nil then + return self.rule_id + end + + local arr = {} + arr[#arr + 1] = self.rule_id + for _, g in ipairs(self.group_by) do + arr[#arr + 1] = fields[g] + end + return table.concat(arr, '/') +end + +function Rule:fields_accepted(fields) + if not fields then + fields = {} + end + local matched_fields = 0 + local no_match_on_fields = true + for f, expression in pairs(self.field_matchers) do + no_match_on_fields = false + for k, v in pairs(fields) do + if k == f then + if expression:matches(v) then + matched_fields = matched_fields + 1 + else + return false + end + end + end + end + return no_match_on_fields or matched_fields > 0 +end + +function Rule:get_circular_buffer() + local fct + if self.fct == 'min' or self.fct == 'max' then + fct = self.fct + else + fct = 'sum' + end + local cbuf = circular_buffer.new(self.cbuf_size, 1, SECONDS_PER_ROW) + cbuf:set_header(1, self.metric, fct, fct) + return cbuf +end + +-- store datapoints in cbuf, create the cbuf if not exists. 
+-- value can be a table where the index to choose is referenced by self.value_index +function Rule:add_value(ts, value, fields) + if not self:fields_accepted(fields) then + return + end + if type(value) == 'table' then + value = value[self.value_index] + end + if value == nil then + return + end + + local data + local uniq_field_id = self:get_datastore_id(fields) + if not self.datastore[uniq_field_id] then + self.datastore[uniq_field_id] = { + fields = self.fields, + cbuf = self:get_circular_buffer() + } + if #self.group_by > 0 then + self.datastore[uniq_field_id].fields = fields + end + + self:add_datastore(uniq_field_id) + end + data = self.datastore[uniq_field_id] + + if self.fct == 'avg' then + data.cbuf:add(ts, 1, value) + else + data.cbuf:set(ts, 1, value) + end +end + +function Rule:add_datastore(id) + if not table_utils.item_find(id, self.ids_datastore) then + self.ids_datastore[#self.ids_datastore+1] = id + end +end + +function Rule:compare_threshold(value) + return constants.compare_threshold(value, self.relational_operator, self.threshold) +end + +local function isnumber(value) + return value ~= nil and not (value ~= value) +end + +local available_functions = {last=true, avg=true, max=true, min=true, sum=true, + variance=true, sd=true, diff=true} + +-- evaluate the rule against datapoints +-- return a list: match (bool or string), context ({value=v, fields=list of field table}) +-- +-- examples: +-- true, { {value=100, fields={{queue='nova'}, {queue='neutron'}}, ..} +-- false, { {value=10, fields={}}, ..} +-- with 2 special cases: +-- - never receive one datapoint +-- 'nodata', {} +-- - no more datapoint received for a metric +-- 'missing', {value=-1, fields={}} +-- There is a drawback with the 'missing' state and could leads to emit false positive +-- state. For example when the monitored thing has been renamed/deleted, +-- it's normal to don't receive datapoint anymore .. for example a filesystem. 
+function Rule:evaluate(ns) + local fields = {} + local one_match, one_no_match, one_missing_data = false, false, false + for _, id in ipairs(self.ids_datastore) do + local data = self.datastore[id] + if data then + local cbuf_time = data.cbuf:current_time() + -- if we didn't receive datapoint within the observation window this means + -- we don't receive anymore data and cannot compute the rule. + if ns - cbuf_time > self.observation_window * 1e9 then + one_missing_data = true + fields[#fields+1] = {value = -1, fields = data.fields} + else + assert(available_functions[self.fct]) + local result + + if self.fct == 'last' then + local last + local t = ns + while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do + last = data.cbuf:get(t, 1) + t = t - SECONDS_PER_ROW * 1e9 + end + if isnumber(last) then + result = last + else + one_missing_data = true + fields[#fields+1] = {value = -1, fields = data.fields} + end + elseif self.fct == 'diff' then + local first, last + + local t = ns + while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do + last = data.cbuf:get(t, 1) + t = t - SECONDS_PER_ROW * 1e9 + end + + if isnumber(last) then + t = ns - self.observation_window * 1e9 + while (not isnumber(first)) and t <= ns do + first = data.cbuf:get(t, 1) + t = t + SECONDS_PER_ROW * 1e9 + end + end + + if not isnumber(last) or not isnumber(first) then + one_missing_data = true + fields[#fields+1] = {value = -1, fields = data.fields} + else + result = last - first + end + else + local values = data.cbuf:get_range(1) + result = stats[self.fct](values) + end + + if result then + local m = self:compare_threshold(result) + if m then + one_match = true + fields[#fields+1] = {value=result, fields=data.fields} + else + one_no_match = true + end + end + end + end + end + if one_match then + return afd.MATCH, fields + elseif one_missing_data then + return afd.MISSING_DATA, fields + elseif one_no_match then + return afd.NO_MATCH, {} + else + return 
afd.NO_DATA, {{value=-1, fields=self.fields}} + end +end + +return Rule
diff --git a/docker/hindsight/modules/constants.lua b/docker/hindsight/modules/constants.lua
new file mode 100644
index 0000000..50e9bae
--- /dev/null
+++ b/docker/hindsight/modules/constants.lua
@@ -0,0 +1,86 @@
+-- Copyright 2015-2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local M = {}
+setfenv(1, M) -- Remove external access to contain everything in the module
+
+-- The status values were chosen to match with the Grafana constraints:
+-- OKAY => green
+-- WARN & UNKW => orange
+-- CRIT & DOWN => red
+OKAY=0
+WARN=1
+UNKW=2
+CRIT=3
+DOWN=4
+
+local STATUS_LABELS = {
+    [OKAY]='OKAY',
+    [WARN]='WARN',
+    [UNKW]='UNKNOWN',
+    [CRIT]='CRITICAL',
+    [DOWN]='DOWN'
+}
+
+-- Return the textual label of a status value, or nil if the value is not one
+-- of the statuses defined above.
+function status_label(v)
+    return STATUS_LABELS[v]
+end
+
+-- Severity ranking used to compare statuses. It differs from the status
+-- values themselves: UNKW ranks below OKAY.
+local STATUS_WEIGHTS = {
+    [UNKW]=0,
+    [OKAY]=1,
+    [WARN]=2,
+    [CRIT]=3,
+    [DOWN]=4
+}
+
+-- Return the most severe of two statuses according to STATUS_WEIGHTS. A nil
+-- argument is ignored; when both weigh the same, val2 is returned.
+function max_status(val1, val2)
+    if not val1 then
+        return val2
+    elseif not val2 then
+        return val1
+    elseif STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
+        return val1
+    else
+        return val2
+    end
+end
+
+-- Evaluate "value <op> threshold". The operator is either a symbol ('==',
+-- '!=', '>=', '>', '<=', '<') or its mnemonic ('eq', 'ne', 'gte', 'gt',
+-- 'lte', 'lt'). An unknown operator never matches (false is returned).
+function compare_threshold(value, op, threshold)
+    if op == '==' or op == 'eq' then
+        return value == threshold
+    elseif op == '!=' or op == 'ne' then
+        return value ~= threshold
+    elseif op == '>=' or op == 'gte' then
+        return value >= threshold
+    elseif op == '>' or op == 'gt' then
+        return value > threshold
+    elseif op == '<=' or op == 'lte' then
+        return value <= threshold
+    elseif op == '<' or op == 'lt' then
+        return value < threshold
+    end
+    return false
+end
+
+return M
diff --git a/docker/hindsight/modules/patterns.lua b/docker/hindsight/modules/patterns.lua
new file mode 100644
index 0000000..f6580e7
--- /dev/null
+++ b/docker/hindsight/modules/patterns.lua
@@ -0,0 +1,44 @@
+-- Copyright 2015-2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local l = require 'lpeg'
+l.locale(l)
+
+local tonumber = tonumber
+
+local M = {}
+setfenv(1, M) -- Remove external access to contain everything in the module
+
+-- Return a pattern that matches patt anywhere in the subject (it skips one
+-- character at a time until patt matches).
+function anywhere (patt)
+    return l.P {
+        patt + 1 * l.V(1)
+    }
+end
+
+-- Pattern matching a single space character (as defined by the locale)
+sp = l.space
+
+-- Pattern used to match a number: optional minus sign, decimal digits, optional
+-- fractional part and optional exponent. The match is converted by tonumber().
+-- Only decimal digits are accepted: letters such as a-f (xdigit) cannot be
+-- converted by tonumber() and would produce a nil capture.
+-- NOTE(review): ',' is kept as a decimal separator for compatibility, but
+-- tonumber() presumably converts it only under a matching locale; confirm.
+local digits = l.digit^1
+local exponent = l.S"eE" * l.S"+-"^-1 * digits
+Number = l.P"-"^-1 * digits * (l.S(".,") * digits)^-1 * exponent^-1 / tonumber
+
+return M
diff --git a/docker/hindsight/modules/table_utils.lua b/docker/hindsight/modules/table_utils.lua
new file mode 100644
index 0000000..177e457
--- /dev/null
+++ b/docker/hindsight/modules/table_utils.lua
@@ -0,0 +1,83 @@
+-- Copyright 2015-2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local table = require 'table'
+local ipairs = ipairs
+local pairs = pairs
+local type = type
+
+local M = {}
+setfenv(1, M) -- Remove external access to contain everything in the module
+
+-- return the position (index) of an item in a list, nil if not found
+function item_pos(item, list)
+    if type(list) == 'table' then
+        for i, v in ipairs(list) do
+            if v == item then
+                return i
+            end
+        end
+    end
+end
+
+-- return true if an item is present in the list, false otherwise
+function item_find(item, list)
+    return item_pos(item, list) ~= nil
+end
+
+-- from http://lua-users.org/wiki/SortedIteration
+function __genOrderedIndex( t )
+    local orderedIndex = {}
+    for key in pairs(t) do
+        table.insert( orderedIndex, key )
+    end
+    table.sort( orderedIndex )
+    return orderedIndex
+end
+
+function orderedNext(t, state)
+    -- Equivalent of the next function, but returns the keys in sorted order.
+    -- The sorted key list is temporarily stored in the iterated table itself
+    -- (field __orderedIndex) and removed once the iteration is complete.
+    local key -- must be local: a bare assignment would create a module field
+    if state == nil then
+        -- the first time, generate the index
+        t.__orderedIndex = __genOrderedIndex( t )
+        key = t.__orderedIndex[1]
+    else
+        -- fetch the key that follows 'state' in the index
+        for i = 1, #t.__orderedIndex do
+            if t.__orderedIndex[i] == state then
+                key = t.__orderedIndex[i+1]
+                break
+            end
+        end
+    end
+
+    if key then
+        return key, t[key]
+    end
+
+    -- no more value to return, cleanup
+    t.__orderedIndex = nil
+    return
+end
+
+function orderedPairs(t)
+    -- Equivalent of the pairs() function on tables. Allows to iterate
+    -- in order
+    return orderedNext, t, nil
+end
+
+return M
diff --git a/docker/hindsight/modules/utils.lua b/docker/hindsight/modules/utils.lua
new file mode 100644
index 0000000..8a512b8
--- /dev/null
+++ b/docker/hindsight/modules/utils.lua
@@ -0,0 +1,45 @@
+-- Copyright 2015-2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local cjson = require 'cjson'
+
+local inject_message = inject_message
+local read_message = read_message
+local string = string
+local pcall = pcall
+
+local M = {}
+setfenv(1, M) -- Remove external access to contain everything in the module
+
+-- Encode a Lua variable as JSON without raising an exception if the encoding
+-- fails for some reason (for instance, the encoded buffer exceeds the sandbox
+-- limit). Returns the JSON string, or nothing when the encoding failed.
+function safe_json_encode(v)
+    local ok, data = pcall(cjson.encode, v)
+    if ok then
+        return data
+    end
+end
+
+-- Call inject_message() wrapped by pcall(). Returns 0 on success, or -1
+-- followed by the error message when inject_message() raised an error.
+function safe_inject_message(msg)
+    local ok, err_msg = pcall(inject_message, msg)
+    if ok then
+        return 0
+    end
+    return -1, err_msg
+end
+
+return M
diff --git a/docker/hindsight/modules/value_matching.lua b/docker/hindsight/modules/value_matching.lua
new file mode 100644
index 0000000..3b1fd5c
--- /dev/null
+++ b/docker/hindsight/modules/value_matching.lua
@@ -0,0 +1,171 @@
+-- Copyright 2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local l = require "lpeg"
+l.locale(l)
+local pcall = pcall
+local string = require 'string'
+
+local patterns = require 'stacklight.patterns'
+local error = error
+local setmetatable = setmetatable
+local tonumber = tonumber
+
+local C, P, S, V, Ct, Cc = l.C, l.P, l.S, l.V, l.Ct, l.Cc
+
+local Optional_space = patterns.sp^0
+local Only_spaces = patterns.sp^1 * -1
+
+-- Wrap a pattern so that it tolerates optional spaces on both sides
+local function space(pat)
+    return Optional_space * pat * Optional_space
+end
+
+local EQ = P'=='
+local NEQ = P'!='
+local GT = P'>'
+local LT = P'<'
+local GTE = P'>='
+local LTE = P'<='
+local MATCH = P'=~'
+local NO_MATCH = P'!~'
+
+local OR = P'||'
+local AND = P'&&'
+
+-- An expression without explicit operator is an equality test
+local function get_operator(op)
+    return op == '' and '==' or op
+end
+
+local numerical_operator = (EQ + NEQ + LTE + GTE + GT + LT )^-1 / get_operator
+local sub_numerical_expression = space(numerical_operator) * patterns.Number * Optional_space
+local is_plain_numeric = (sub_numerical_expression * ((OR^1 + AND^1) * sub_numerical_expression)^0) * -1
+
+local quoted_string = (P'"' * C((P(1) - (P'"'))^1) * P'"' + C((P(1) - patterns.sp)^1))
+local string_operator = (EQ + NEQ + MATCH + NO_MATCH)^-1 / get_operator
+local sub_string_expression = space(string_operator) * quoted_string * Optional_space
+local is_plain_string = (sub_string_expression * ((OR^1 + AND^1) * sub_string_expression)^0) * -1
+
+local numerical_expression = P {
+    'OR';
+    AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
+    OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
+    SUB = Ct(sub_numerical_expression)
+} * -1
+
+local string_expression = P {
+    'OR';
+    AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
+    OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
+    SUB = Ct(sub_string_expression)
+} * -1
+
+local is_complex = patterns.anywhere(EQ + NEQ + LTE + GTE + GT + LT + MATCH + NO_MATCH + OR + AND)
+
+-- Evaluate a parsed expression tree against a value. A tree is either a leaf
+-- {operator, operand}, a node {'and'|'or', subtree, ...} or a table wrapping a
+-- single subtree. Returns true when the value satisfies the expression.
+local function eval_tree(tree, value)
+    local match = false
+    if type(tree[1]) == 'table' then
+        match = eval_tree(tree[1], value)
+    else
+        local operator = tree[1]
+        if operator == 'and' or operator == 'or' then
+            match = eval_tree(tree[2], value)
+            for i=3, #tree, 1 do
+                local m = eval_tree(tree[i], value)
+                if operator == 'or' then
+                    match = match or m
+                else
+                    match = match and m
+                end
+            end
+        else
+            local matcher = tree[2]
+            if operator == '==' then
+                return value == matcher
+            elseif operator == '!=' then
+                return value ~= matcher
+            elseif operator == '>' then
+                return value > matcher
+            elseif operator == '<' then
+                return value < matcher
+            elseif operator == '>=' then
+                return value >= matcher
+            elseif operator == '<=' then
+                return value <= matcher
+            elseif operator == '=~' then
+                local ok, m = pcall(string.find, value, matcher)
+                return ok and m ~= nil
+            elseif operator == '!~' then
+                local ok, m = pcall(string.find, value, matcher)
+                return ok and m == nil
+            end
+        end
+    end
+    return match
+end
+
+local MatchExpression = {}
+MatchExpression.__index = MatchExpression
+
+setfenv(1, MatchExpression) -- Remove external access to contain everything in the module
+
+-- Build a matcher from an expression string, e.g. '>= 10 && < 20' or 'foo'.
+-- Raises an error when the expression is empty or cannot be parsed.
+function MatchExpression.new(expression)
+    local r = {}
+    setmetatable(r, MatchExpression)
+    if is_complex:match(expression) then
+        r.is_plain_numeric_exp = is_plain_numeric:match(expression) ~= nil
+        if r.is_plain_numeric_exp then
+            r.tree = numerical_expression:match(expression)
+        elseif is_plain_string:match(expression) ~= nil then
+            r.tree = string_expression:match(expression)
+        end
+        if r.tree == nil then
+            error('Invalid expression: ' .. expression)
+        end
+    else
+        if expression == '' or Only_spaces:match(expression) then
+            error('Expression is empty')
+        end
+        r.is_simple_equality_matching = true
+    end
+    r.expression = expression
+
+    return r
+end
+
+-- Return true if the value matches the expression. A nil value never matches
+-- a plain expression (otherwise tonumber('foo') == nil would match it).
+function MatchExpression:matches(value)
+    if self.is_simple_equality_matching then
+        if value == nil then return false end
+        return self.expression == value or
+            tonumber(self.expression) == value or
+            tonumber(value) == self.expression
+    end
+    if self.is_plain_numeric_exp then
+        value = tonumber(value)
+        if value == nil then
+            return false
+        end
+    end
+    return eval_tree(self.tree, value)
+end
+
+return MatchExpression
diff --git a/docker/hindsight/modules_alarms/afd_node_default_cpu_alarms.lua b/docker/hindsight/modules_alarms/afd_node_default_cpu_alarms.lua new file mode 100644 index 0000000..e3775bd --- /dev/null +++ b/docker/hindsight/modules_alarms/afd_node_default_cpu_alarms.lua @@ -0,0 +1,71 @@ +local M = {} +setfenv(1, M) -- Remove external access to contain everything in the module + +local alarms = { + { + ['name'] = 'cpu-critical', + ['description'] = 'The CPU usage is too high', + ['severity'] = 'critical', + ['trigger'] = { + ['logical_operator'] = 'or', + ['rules'] = { + { + ['metric'] = 'intel.procfs.cpu.idle_percentage', + ['fields'] = { + ['cpuID'] = 'all' + }, + ['relational_operator'] = '<=', + ['threshold'] = '5', + ['window'] = '120', + ['periods'] = '0', + ['function'] = 'avg', + }, + { + ['metric'] = 'intel.procfs.cpu.iowait_percentage', + ['fields'] = { + ['cpuID'] = 'all' + }, + ['relational_operator'] = '>=', + ['threshold'] = '35', + ['window'] = '120', + ['periods'] = '0', + ['function'] = 'avg', + }, + }, + }, + }, + { + ['name'] = 'cpu-warning', + ['description'] = 'The CPU usage is high', + ['severity'] = 'warning', + ['trigger'] = { + ['logical_operator'] = 'or', + ['rules'] = { + { + ['metric'] =
'intel.procfs.cpu.idle_percentage', + ['fields'] = { + ['cpuID'] = 'all' + }, + ['relational_operator'] = '<=', + ['threshold'] = '15', + ['window'] = '120', + ['periods'] = '0', + ['function'] = 'avg', + }, + { + ['metric'] = 'intel.procfs.cpu.iowait_percentage', + ['fields'] = { + ['cpuID'] = 'all' + }, + ['relational_operator'] = '>=', + ['threshold'] = '25', + ['window'] = '120', + ['periods'] = '0', + ['function'] = 'avg', + }, + }, + }, + }, +} + +return alarms diff --git a/service/files/hindsight_afd_node_default_cpu_alarms.cfg.j2 b/service/files/hindsight_afd_node_default_cpu_alarms.cfg.j2 new file mode 100644 index 0000000..a75ff2f --- /dev/null +++ b/service/files/hindsight_afd_node_default_cpu_alarms.cfg.j2 @@ -0,0 +1,9 @@ +filename = "afd.lua" +log_level = 7 +message_matcher = "TRUE" +ticker_interval = 10 +afd_type = "node" +afd_file = "afd_node_default_cpu_alarms" +afd_cluster_name = "default" +afd_logical_name = "cpu" +hostname = "{{ CCP_HINDSIGHT_NODE_NAME }}" diff --git a/service/stacklight-collector.yaml b/service/stacklight-collector.yaml index 177bd52..07286bc 100644 --- a/service/stacklight-collector.yaml +++ b/service/stacklight-collector.yaml @@ -15,6 +15,7 @@ service: - prune-input.cfg - influxdb-tcp.cfg - kubelet-stats.cfg + - afd-node-default-cpu-alarms.cfg volumes: - name: hindsight-output type: empty-dir @@ -70,6 +71,10 @@ files: path: /var/lib/hindsight/run/input/kubelet_stats.cfg content: hindsight_kubelet_stats.cfg.j2 perm: "0600" + afd-node-default-cpu-alarms.cfg: + path: /var/lib/hindsight/run/analysis/afd_node_default_cpu_alarms.cfg + content: hindsight_afd_node_default_cpu_alarms.cfg.j2 + perm: "0600" snap.conf: path: /etc/snap/snap.conf content: snap.conf.j2