Replace the workers AFD filter

This patch replaces the custom workers AFD filter with the generic
AFD filter, driven by new alarm definitions.

Blueprint: allow-all-alarms-to-be-specified-in-alarming-file
Change-Id: I6c432e60a16da5bb3c8d0ecd0bd22a1246fe6f82
Guillaume Thouvenin 2016-09-09 11:45:48 +02:00 committed by Swann Croiset
parent 215f693307
commit 9dbf48dbfe
6 changed files with 599 additions and 163 deletions
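
In short, the new mechanism is purely declarative: each worker group gets three generic alarms (all down, majority down, at least one down) plus a per-service mapping that attaches them under a 'workers' source. A condensed sketch of that pattern, with names taken from the definitions in the diff below (only one of the three alarms shown):

alarms:
  - name: 'nova-scheduler-all-down'
    severity: 'down'
    trigger:
      rules:
        - metric: openstack_nova_services
          fields:
            service: 'scheduler'
            state: 'up'
          relational_operator: '=='
          threshold: 0
          window: 60
          function: last
# ...and the service cluster that picks the three alarms up:
nova-scheduler:
  alarms:
    workers:
      - 'nova-scheduler-all-down'
      - 'nova-scheduler-majority-down'
      - 'nova-scheduler-one-down'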

View File

@@ -259,9 +259,6 @@ if hiera('lma::collector::influxdb::server', false) {
  class { 'lma_collector::logs::http_metrics': }
  class { 'lma_collector::logs::aggregated_http_metrics': }
  # AFD filters
  class { 'lma_collector::afd::workers': }
}
$alerting_mode = $lma_collector['alerting_mode']

View File

@@ -2427,6 +2427,506 @@ lma_collector:
            periods: 0
            function: last
    # Following are the AFD alarms generated to check the workers
    # All workers are down
    - name: 'nova-scheduler-all-down'
      description: 'All Nova schedulers are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'scheduler'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-cert-all-down'
      description: 'All Nova certs are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'cert'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-consoleauth-all-down'
      description: 'All Nova consoleauths are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'consoleauth'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-compute-all-down'
      description: 'All Nova computes are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'compute'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-conductor-all-down'
      description: 'All Nova conductors are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'conductor'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-scheduler-all-down'
      description: 'All Cinder schedulers are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_cinder_services
            fields:
              service: 'scheduler'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-volume-all-down'
      description: 'All Cinder volumes are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_cinder_services
            fields:
              service: 'volume'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-l3-all-down'
      description: 'All Neutron L3 agents are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'l3'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-dhcp-all-down'
      description: 'All Neutron DHCP agents are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'dhcp'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-metadata-all-down'
      description: 'All Neutron metadata agents are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'metadata'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-openvswitch-all-down'
      description: 'All Neutron openvswitch agents are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'openvswitch'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    # At least one backend is down
    - name: 'nova-scheduler-one-down'
      description: 'At least one Nova scheduler is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'scheduler'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-cert-one-down'
      description: 'At least one Nova cert is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'cert'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-consoleauth-one-down'
      description: 'At least one Nova consoleauth is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'consoleauth'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-compute-one-down'
      description: 'At least one Nova compute is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'compute'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-conductor-one-down'
      description: 'At least one Nova conductor is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'conductor'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-scheduler-one-down'
      description: 'At least one Cinder scheduler is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_cinder_services
            fields:
              service: 'scheduler'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-volume-one-down'
      description: 'At least one Cinder volume is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_cinder_services
            fields:
              service: 'volume'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-l3-one-down'
      description: 'At least one L3 agent is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'l3'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-dhcp-one-down'
      description: 'At least one DHCP agent is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'dhcp'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-metadata-one-down'
      description: 'At least one metadata agent is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'metadata'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-openvswitch-one-down'
      description: 'At least one openvswitch agent is down'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'openvswitch'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    # Less than 50% of services are up (compared to up and down).
    - name: 'nova-scheduler-majority-down'
      description: 'Less than 50% of Nova schedulers are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services_percent
            fields:
              service: 'scheduler'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'nova-cert-majority-down'
      description: 'Less than 50% of Nova certs are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services_percent
            fields:
              service: 'cert'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'nova-consoleauth-majority-down'
      description: 'Less than 50% of Nova consoleauths are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services_percent
            fields:
              service: 'consoleauth'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'nova-compute-majority-down'
      description: 'Less than 50% of Nova computes are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services_percent
            fields:
              service: 'compute'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'nova-conductor-majority-down'
      description: 'Less than 50% of Nova conductors are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services_percent
            fields:
              service: 'conductor'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'cinder-scheduler-majority-down'
      description: 'Less than 50% of Cinder schedulers are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_cinder_services_percent
            fields:
              service: 'scheduler'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'cinder-volume-majority-down'
      description: 'Less than 50% of Cinder volumes are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_cinder_services_percent
            fields:
              service: 'volume'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'neutron-l3-majority-down'
      description: 'Less than 50% of Neutron L3 agents are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents_percent
            fields:
              service: 'l3'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'neutron-dhcp-majority-down'
      description: 'Less than 50% of Neutron DHCP agents are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents_percent
            fields:
              service: 'dhcp'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'neutron-metadata-majority-down'
      description: 'Less than 50% of Neutron metadata agents are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents_percent
            fields:
              service: 'metadata'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'neutron-openvswitch-majority-down'
      description: 'Less than 50% of Neutron openvswitch agents are up'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents_percent
            fields:
              service: 'openvswitch'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
  # Definition of the AFD node filters
  node_cluster_alarms:
    controller:
@@ -2620,6 +3120,51 @@ lma_collector:
      activate_alerting: true
      alarms:
        error: ['nova-logs-error']
    nova-cert:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'nova-cert-all-down'
          - 'nova-cert-majority-down'
          - 'nova-cert-one-down'
    nova-consoleauth:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'nova-consoleauth-all-down'
          - 'nova-consoleauth-majority-down'
          - 'nova-consoleauth-one-down'
    nova-compute:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'nova-compute-all-down'
          - 'nova-compute-majority-down'
          - 'nova-compute-one-down'
    nova-conductor:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'nova-conductor-all-down'
          - 'nova-conductor-majority-down'
          - 'nova-conductor-one-down'
    nova-scheduler:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'nova-scheduler-all-down'
          - 'nova-scheduler-majority-down'
          - 'nova-scheduler-one-down'
    heat-api:
      apply_to_node: controller
      enable_notification: false
@@ -2742,6 +3287,24 @@ lma_collector:
      activate_alerting: true
      alarms:
        error: ['cinder-logs-error']
    cinder-scheduler:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'cinder-scheduler-all-down'
          - 'cinder-scheduler-majority-down'
          - 'cinder-scheduler-one-down'
    cinder-volume:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'cinder-volume-all-down'
          - 'cinder-volume-majority-down'
          - 'cinder-volume-one-down'
<% if not @storage_options["volumes_ceph"] then -%>
    cinder-volume-logs:
      apply_to_node: storage
@@ -2813,6 +3376,42 @@ lma_collector:
      activate_alerting: true
      alarms:
        error: ['neutron-logs-error']
    neutron-l3:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'neutron-l3-all-down'
          - 'neutron-l3-majority-down'
          - 'neutron-l3-one-down'
    neutron-dhcp:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'neutron-dhcp-all-down'
          - 'neutron-dhcp-majority-down'
          - 'neutron-dhcp-one-down'
    neutron-metadata:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'neutron-metadata-all-down'
          - 'neutron-metadata-majority-down'
          - 'neutron-metadata-one-down'
    neutron-openvswitch:
      apply_to_node: controller
      enable_notification: false
      activate_alerting: true
      alarms:
        workers:
          - 'neutron-openvswitch-all-down'
          - 'neutron-openvswitch-majority-down'
          - 'neutron-openvswitch-one-down'
    neutron-logs-compute:
      apply_to_node: compute
      enable_notification: false
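
All of these alarms share the same trigger semantics: take the last value of the metric seen in a 60-second window and compare it to the threshold with the given operator. The sketch below illustrates that evaluation; it uses assumed helper names and is not the plugin's actual AFD implementation (it also assumes the datapoints are ordered by time).

-- Illustrative sketch only: how a single 'workers' rule such as
-- nova-scheduler-all-down evaluates.
-- points: array of {ts = <unix seconds>, value = <number>} observed for
-- openstack_nova_services with service='scheduler' and state='up'.
local function rule_matches(points, window, op, threshold)
    local now = os.time()
    local last
    for _, p in ipairs(points) do
        if p.ts >= now - window then
            last = p.value          -- function: last -> most recent value wins
        end
    end
    if last == nil then
        return nil                  -- no datapoint in the window: unknown
    end
    if op == '==' then return last == threshold
    elseif op == '>' then return last > threshold
    elseif op == '<=' then return last <= threshold
    end
end

-- nova-scheduler-all-down:      rule_matches(points_up, 60, '==', 0)
-- nova-scheduler-one-down:      rule_matches(points_down, 60, '>', 0)
-- nova-scheduler-majority-down: rule_matches(points_percent, 60, '<=', 50)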

View File

@@ -449,7 +449,6 @@ Public Classes:
* [`lma_collector::notifications::metrics`](#class-lma_collectornotificationsmetrics)
* [`lma_collector::aggregator::client`](#class-lma_collectoraggregatorclient)
* [`lma_collector::aggregator::server`](#class-lma_collectoraggregatorserver)
* [`lma_collector::afd::workers`](#class-lma_collectorafdworkers)
* [`lma_collector::gse_policies`](#class-lma_collectorgse_policies)
* [`lma_collector::metrics::heka_monitoring`](#class-lma_collectormetricsheka_monitoring)
* [`lma_collector::smtp_alert`](#class-lma_collectorsmtp_alert)
@@ -870,12 +869,6 @@ Declare this class to make Heka run the aggregator service.
to check the health of the aggregator service. Valid options: an integer.
Default: `undef`.
#### Class: `lma_collector::afd::workers`
Declare this class to configure the Heka filter that sends AFD metrics
reporting the availability of the Neutron agents and the Cinder and Nova
services.
#### Class: `lma_collector::gse_policies`
Declare this class to configure the GSE cluster policies on the aggregator node.

View File

@@ -1,94 +0,0 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
require 'string'
local afd = require 'afd'
local consts = require 'gse_constants'
local worker_states = {}
-- emit AFD event metrics based on openstack_nova_services, openstack_cinder_services and openstack_neutron_agents metrics
function process_message()
    local metric_name = read_message('Fields[name]')
    local service = string.format('%s-%s',
        string.match(metric_name, 'openstack_([^_]+)'),
        read_message('Fields[service]'))
    local worker_key = string.format('%s.%s', metric_name, service)

    if not worker_states[worker_key] then
        worker_states[worker_key] = {}
    end

    local worker = worker_states[worker_key]
    worker[read_message('Fields[state]')] = read_message('Fields[value]')

    local state = consts.OKAY
    if not (worker.up and worker.down) then
        -- not enough data for now
        return 0
    end

    if worker.up == 0 then
        state = consts.DOWN
        afd.add_to_alarms(consts.DOWN,
                          'last',
                          metric_name,
                          {service=service, state='up'},
                          {},
                          '==',
                          worker.up,
                          0,
                          nil,
                          nil,
                          string.format("All instances for the service %s are down or disabled", service))
    elseif worker.down >= worker.up then
        state = consts.CRIT
        afd.add_to_alarms(consts.CRIT,
                          'last',
                          metric_name,
                          {service=service, state='down'},
                          {},
                          '>=',
                          worker.down,
                          worker.up,
                          nil,
                          nil,
                          string.format("More instances of %s are down than up", service))
    elseif worker.down > 0 then
        state = consts.WARN
        afd.add_to_alarms(consts.WARN,
                          'last',
                          metric_name,
                          {service=service, state='down'},
                          {},
                          '>',
                          worker.down,
                          0,
                          nil,
                          nil,
                          string.format("At least one %s instance is down", service))
    end

    afd.inject_afd_service_metric(service,
                                  state,
                                  read_message('Fields[hostname]'),
                                  0,
                                  'workers')

    -- reset the cache for this worker
    worker_states[worker_key] = {}

    return 0
end
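
For comparison with the new alarm definitions, the three-way decision that this removed filter hard-coded maps directly onto the three alarm families added above. Isolated as a pure function (a hypothetical helper, for illustration only):

-- Illustrative only: the decision logic the removed filter hard-coded.
-- Each branch corresponds to one of the new alarm families defined in the
-- YAML above.
local function worker_status(up, down)
    if up == 0 then
        return 'down'       -- '*-all-down' alarms: count of 'up' workers == 0
    elseif down >= up then
        return 'critical'   -- '*-majority-down' alarms: <= 50% of workers up
    elseif down > 0 then
        return 'warning'    -- '*-one-down' alarms: count of 'down' workers > 0
    end
    return 'okay'
end

-- e.g. worker_status(0, 3) -> 'down', worker_status(1, 2) -> 'critical',
--      worker_status(2, 1) -> 'warning', worker_status(3, 0) -> 'okay'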

View File

@@ -1,33 +0,0 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
class lma_collector::afd::workers () {
  include lma_collector::params
  include lma_collector::service::metric

  $lua_modules_dir = $lma_collector::params::lua_modules_dir

  $metrics_matcher = join([
    '(Type == \'metric\' || Type == \'heka.sandbox.metric\')', ' && ',
    'Fields[name] =~ /^openstack_(nova|cinder|neutron)_(services|agents)$/',
  ], '')

  heka::filter::sandbox { 'afd_workers':
    config_dir       => $lma_collector::params::metric_config_dir,
    filename         => "${lma_collector::params::plugins_dir}/filters/afd_workers.lua",
    message_matcher  => $metrics_matcher,
    module_directory => $lua_modules_dir,
    notify           => Class['lma_collector::service::metric'],
  }
}

View File

@@ -1,26 +0,0 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
require 'spec_helper'
describe 'lma_collector::afd::workers' do
  let(:facts) do
    {:kernel => 'Linux', :operatingsystem => 'Ubuntu',
     :osfamily => 'Debian'}
  end

  describe 'with defaults' do
    it { is_expected.to contain_heka__filter__sandbox('afd_workers') }
  end
end