Alarm definition refactoring

DocImpact
blueprint: alarming-refactoring

Change-Id: I8c053f2fbc4b4b85958be8413919f9bf1b168027
This commit is contained in:
Swann Croiset 2016-09-12 17:35:42 +02:00
parent 385da2a160
commit 7deace8726
11 changed files with 652 additions and 207 deletions

View File

@ -38,8 +38,7 @@ if $is_controller or $is_rabbitmq or $is_mysql_server {
class { 'fuel_lma_collector::afds':
roles => hiera('roles'),
node_cluster_roles => $lma['node_cluster_roles'],
service_cluster_roles => $lma['service_cluster_roles'],
node_profiles => $lma['node_profiles'],
node_cluster_alarms => $lma['node_cluster_alarms'],
service_cluster_alarms => $lma['service_cluster_alarms'],
alarms => $alarms_definitions,

View File

@ -239,11 +239,15 @@ lma::collector::infrastructure_alerting::password: <%= @nagios_password %>
$detach_database = hiera('detach-database', {})
$detach_database_enabled = $detach_database['metadata'] and $detach_database['metadata']['enabled']
fuel_lma_collector::hiera_data { 'gse_filters':
content => template('fuel_lma_collector/gse_filters.yaml.erb')
fuel_lma_collector::hiera_data { 'clusters':
content => template('fuel_lma_collector/clusters.yaml.erb')
}
fuel_lma_collector::hiera_data { 'alarming':
content => template('fuel_lma_collector/alarming.yaml.erb')
}
fuel_lma_collector::hiera_data { 'node_profiles':
content => template('fuel_lma_collector/node_profiles.yaml.erb')
}
}

View File

@ -23,10 +23,10 @@
# Ex:
#
# ARG0:
# {"rabbitmq"=>{"queue"=>["rabbitmq-queue-warning"]},
# "apache"=>{"worker"=>["apache-warning"]},
# "memcached"=>{"all"=>["memcached-warning"]},
# "haproxy"=>{"alive"=>["haproxy-warning"]}}
# {"rabbitmq"=>{"apply_to_node" => "controller", "alarms" => {"queue"=>["rabbitmq-queue-warning"]}},
# "apache"=>{"apply_to_node" => "controller", "alarms" => {"worker"=>["apache-warning"]}},
# "memcached"=>{"apply_to_node"=>"controller", "alarms" => {"all"=>["memcached-warning"]}},
# "haproxy"=>{"apply_to_node" => "controller", "alarms" => {"alive"=>["haproxy-warning"]}}}
#
# ARG1:
#
@ -63,7 +63,7 @@
# "function"=>"min"}]}}
# ]
#
# ARG2: ["rabbitmq", "apache"]
# ARG2: ["controller", "compute"]
#
# ARG3: type (node|service)
#
@ -96,31 +96,47 @@ module Puppet::Parser::Functions
afd_filters = {}
afd_profiles.each do |afd_profile|
next unless afd_alarms.has_key?(afd_profile)
afd_alarms[afd_profile].each do |afd_name, alarms|
# Collect the metrics which are required by this AFD filter
metrics = Set.new([])
alarms.each do |a_name|
alarm_definitions.each do |alarm_def|
if alarm_def['name'] == a_name
alarm_def['trigger']['rules'].each do |r|
metrics << r['metric']
end
end
afds = afd_alarms.select {|k,v| v.has_key?('apply_to_node') and v['apply_to_node'] == afd_profile }
afds.each do |k, v|
activate_alerting=true
if v.has_key?('activate_alerting')
if v['activate_alerting'] == false
activate_alerting=false
end
end
enable_notification=false
if v.has_key?('enable_notification')
if v['enable_notification'] == true
enable_notification=true
end
end
afd_cluster_name = k
v['alarms'].each do |afd_name, alarms|
# Collect the metrics which are required by this AFD filter
metrics = Set.new([])
alarms.each do |a_name|
alarm_definitions.each do |alarm_def|
if alarm_def['name'] == a_name
alarm_def['trigger']['rules'].each do |r|
metrics << r['metric']
end
end
message_matcher = metrics.collect{|x| "Fields[name] == \'#{x}\'" }.join(' || ')
end
end
message_matcher = metrics.collect{|x| "Fields[name] == \'#{x}\'" }.join(' || ')
afd_filters["#{afd_profile}_#{afd_name}"] = {
'type' => type,
'cluster_name' => afd_profile,
'logical_name' => afd_name,
'alarms' => alarms,
'alarms_definitions' => alarm_definitions,
'message_matcher' => message_matcher
}
afd_filters["#{afd_cluster_name}_#{afd_name}"] = {
'type' => type,
'cluster_name' => afd_cluster_name,
'logical_name' => afd_name,
'alarms' => alarms,
'alarms_definitions' => alarm_definitions,
'message_matcher' => message_matcher,
'activate_alerting' => activate_alerting,
'enable_notification' => enable_notification,
}
end
end
end

View File

@ -44,7 +44,7 @@ module Puppet::Parser::Functions
roles.each do |role|
data.each do |k,v|
cluster_names << k if v.include?(role)
cluster_names << k if v['roles'].include?(role)
end
end

View File

@ -15,37 +15,28 @@
class fuel_lma_collector::afds (
$roles = undef,
$node_cluster_roles = undef,
$service_cluster_roles = undef,
$node_profiles = undef,
$node_cluster_alarms = undef,
$service_cluster_alarms = undef,
$alarms = undef,
){
validate_array($roles)
validate_hash($node_cluster_roles)
validate_hash($service_cluster_roles)
validate_hash($node_profiles)
validate_hash($node_cluster_alarms)
validate_hash($service_cluster_alarms)
validate_array($alarms)
$node_cluster_names_tmp = get_cluster_names($node_cluster_roles, $roles)
$service_cluster_names = get_cluster_names($service_cluster_roles, $roles)
if size($node_cluster_names_tmp) == 0 and $node_cluster_alarms['default'] {
$node_cluster_names = ['default']
} else {
$node_cluster_names = $node_cluster_names_tmp
}
$clusters = get_cluster_names($node_profiles, $roles)
$node_afd_filters = get_afd_filters($node_cluster_alarms,
$alarms,
$node_cluster_names,
$clusters,
'node')
$service_afd_filters = get_afd_filters($service_cluster_alarms,
$alarms,
$service_cluster_names,
$clusters,
'service')
create_resources(lma_collector::afd_filter, $node_afd_filters)

View File

@ -22,10 +22,24 @@ describe 'fuel_lma_collector::afds' do
describe 'with defaults' do
let(:params) do
{:roles => ['primary-controller'],
:node_cluster_roles => {'controller' => ['primary-controller']},
:service_cluster_roles => {'mysql' => ['primary-controller']},
:node_cluster_alarms => {'controller' => {'cpu' => ['cpu_warning']}},
:service_cluster_alarms => {'mysql' => {'all' => ['db_warning']}},
:node_profiles => {'controller' => {'roles' => ['primary-controller']}},
:node_cluster_alarms => {
'controller' =>
{
'apply_to_node' => 'controller',
'alarms' => {
'cpu' => ['cpu_warning']
}
}
},
:service_cluster_alarms => {
'mysql' => {
'apply_to_node' => 'controller',
'alarms' => {
'all' => ['db_warning']
}
}
},
:alarms => [
{"name"=>"cpu_warning",
"description"=>"Fake alarm",
@ -63,9 +77,15 @@ describe 'fuel_lma_collector::afds' do
describe 'with enabled false' do
let(:params) do
{:roles => ['primary-controller'],
:node_cluster_roles => {'controller' => ['primary-controller']},
:service_cluster_roles => {},
:node_cluster_alarms => {'controller' => {'cpu' => ['cpu_warning']}},
:node_profiles => {'controller' => {'roles' => ['primary-controller']}},
:node_cluster_alarms => {
'controller' => {
'apply_to_node' => 'controller',
'alarms' => {
'cpu' => ['cpu_warning']
}
}
},
:service_cluster_alarms => {},
:alarms => [
{"name"=>"cpu_warning",

View File

@ -0,0 +1,288 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
require 'spec_helper'
describe 'get_afd_filters' do
# Alarm definitions passed as the second argument of get_afd_filters in the
# node-level examples below. Each definition carries a name, a description,
# a severity and a trigger; the "metric" of every trigger rule is what the
# expected "message_matcher" strings in the examples are made of.
alarms_nodes = [
  {
    "name" => "cpu-critical-controller",
    "description" => "The CPU usage is too high (controller node)",
    "severity" => "critical",
    "trigger" => {
      "logical_operator" => "or",
      "rules" => [
        {"metric" => "cpu_idle",
         "relational_operator" => "<=",
         "threshold" => 5,
         "window" => 120,
         "periods" => 0,
         "function" => "avg"},
        {"metric" => "cpu_wait",
         "relational_operator" => ">=",
         "threshold" => 35,
         "window" => 120,
         "periods" => 0,
         "function" => "avg"},
      ]
    }
  },
  {
    "name" => "cpu-warning-controller",
    "description" => "The CPU usage is high (controller node)",
    "severity" => "warning",
    "trigger" => {
      "logical_operator" => "or",
      "rules" => [
        {"metric" => "cpu_idle",
         "relational_operator" => "<=",
         "threshold" => 15,
         "window" => 120,
         "periods" => 0,
         "function" => "avg"},
        {"metric" => "cpu_wait",
         "relational_operator" => ">=",
         "threshold" => 25,
         "window" => 120,
         "periods" => 0,
         "function" => "avg"},
      ]
    }
  },
  {
    "name" => "cpu-critical-compute",
    # Was "The CPU usage is high (critical node)": a copy/paste slip, aligned
    # here with the wording used by the other critical/compute fixtures.
    "description" => "The CPU usage is too high (compute node)",
    "severity" => "critical",
    "trigger" => {
      "logical_operator" => "or",
      "rules" => [
        {"metric" => "cpu_idle",
         "relational_operator" => "<=",
         "threshold" => 30,
         "window" => 120,
         "periods" => 0,
         "function" => "avg"},
      ]
    }
  },
  {
    "name" => "cpu-warning-compute",
    "description" => "The CPU usage is high (compute node)",
    "severity" => "warning",
    "trigger" => {
      "logical_operator" => "or",
      "rules" => [
        {"metric" => "cpu_idle",
         "relational_operator" => "<=",
         "threshold" => 20,
         "window" => 120,
         "periods" => 0,
         "function" => "avg"},
      ]
    }
  },
  {
    "name" => "fs-critical",
    "description" => "The FS usage is critical",
    "severity" => "critical",
    "trigger" => {
      "logical_operator" => "or",
      "rules" => [
        {"metric" => "fs_percent_free",
         "relational_operator" => "<=",
         "threshold" => 8,
         "window" => 120,
         "periods" => 0,
         "function" => "avg"},
      ]
    }
  },
]
# Node AFD definitions (first argument of get_afd_filters in the node-level
# examples). Each top-level key is the cluster name; "apply_to_node" names the
# profile the entry applies to and "alarms" maps a logical name to the list of
# alarm names it groups.
afds_nodes = {
  "controller" => {
    "apply_to_node" => "controller",
    "enable_notification" => true,
    "activate_alerting" => true,
    "alarms" => {
      "system" => ["cpu-critical-controller", "cpu-warning-controller"]
    }
  },
  "compute" => {
    "apply_to_node" => "compute",
    "enable_notification" => true,
    "activate_alerting" => true,
    "alarms" => {
      "system" => ["cpu-critical-compute", "cpu-warning-compute"],
      "fs" => ["fs-critical"]
    }
  }
}
describe 'For controller nodes' do
  # Expected result when only the 'controller' profile is requested: a single
  # filter keyed "<cluster name>_<logical name>".
  controller_filters = {
    "controller_system" => {
      "type" => "node",
      "cluster_name" => "controller",
      "logical_name" => "system",
      "alarms" => ["cpu-critical-controller", "cpu-warning-controller"],
      "alarms_definitions" => alarms_nodes,
      "message_matcher" => "Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'",
      "enable_notification" => true,
      "activate_alerting" => true
    }
  }

  it do
    should run.with_params(afds_nodes, alarms_nodes, ['controller'], 'node').and_return(controller_filters)
  end
end
describe 'For compute nodes' do
  # Expected result when only the 'compute' profile is requested: one filter
  # per logical name declared under the compute AFD ("system" and "fs").
  compute_filters = {
    "compute_system" => {
      "type" => "node",
      "cluster_name" => "compute",
      "logical_name" => "system",
      "alarms" => ["cpu-critical-compute", "cpu-warning-compute"],
      "alarms_definitions" => alarms_nodes,
      "message_matcher" => "Fields[name] == 'cpu_idle'",
      "activate_alerting" => true,
      "enable_notification" => true
    },
    "compute_fs" => {
      "type" => "node",
      "cluster_name" => "compute",
      "logical_name" => "fs",
      "alarms" => ["fs-critical"],
      "alarms_definitions" => alarms_nodes,
      "message_matcher" => "Fields[name] == 'fs_percent_free'",
      "activate_alerting" => true,
      "enable_notification" => true
    }
  }

  it do
    should run.with_params(afds_nodes, alarms_nodes, ['compute'], 'node').and_return(compute_filters)
  end
end
describe 'For compute and controller nodes' do
  # Expected result when both profiles are requested: the filters of the
  # compute AFD and of the controller AFD together.
  compute_and_controller_filters = {
    "compute_system" => {
      "type" => "node",
      "cluster_name" => "compute",
      "logical_name" => "system",
      "alarms" => ["cpu-critical-compute", "cpu-warning-compute"],
      "alarms_definitions" => alarms_nodes,
      "message_matcher" => "Fields[name] == 'cpu_idle'",
      "activate_alerting" => true,
      "enable_notification" => true
    },
    "compute_fs" => {
      "type" => "node",
      "cluster_name" => "compute",
      "logical_name" => "fs",
      "alarms" => ["fs-critical"],
      "alarms_definitions" => alarms_nodes,
      "message_matcher" => "Fields[name] == 'fs_percent_free'",
      "activate_alerting" => true,
      "enable_notification" => true
    },
    "controller_system" => {
      "type" => "node",
      "cluster_name" => "controller",
      "logical_name" => "system",
      "alarms" => ["cpu-critical-controller", "cpu-warning-controller"],
      "alarms_definitions" => alarms_nodes,
      "message_matcher" => "Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'",
      "activate_alerting" => true,
      "enable_notification" => true
    }
  }

  it do
    should run.with_params(afds_nodes, alarms_nodes, ['compute', 'controller'], 'node').and_return(compute_and_controller_filters)
  end
end
# Alarm definitions passed as the second argument of get_afd_filters in the
# service-level example. The "metric" of each trigger rule is what the
# expected "message_matcher" strings are made of.
alarms_services = [
  {
    "name" => "rabbitmq-queue-warning",
    "description" => "Number of message in queues too high",
    "severity" => "warning",
    "trigger" => {
      "logical_operator" => "or",
      "rules" => [
        {
          "metric" => "rabbitmq_messages",
          "relational_operator" => ">=",
          "threshold" => 200,
          "window" => 120,
          "periods" => 0,
          "function" => "avg"
        }
      ]
    }
  },
  {
    "name" => "apache-warning",
    "description" => "",
    "severity" => "warning",
    "trigger" => {
      "logical_operator" => "or",
      "rules" => [
        {
          "metric" => "apache_idle_workers",
          "relational_operator" => "=",
          "threshold" => 0,
          "window" => 60,
          "periods" => 0,
          "function" => "min"
        },
        {
          "metric" => "apache_status",
          "relational_operator" => "=",
          "threshold" => 0,
          "window" => 60,
          "periods" => 0,
          "function" => "min"
        }
      ]
    }
  }
]
# Service AFD definitions (first argument of get_afd_filters in the
# service-level example). Both services apply to the 'controller' profile.
# Each option is listed once: the original literals repeated
# "enable_notification", which makes Ruby warn about a duplicated key and
# silently lets the later entry override the earlier one.
afds_services = {
  "rabbitmq" => {
    "apply_to_node" => "controller",
    "enable_notification" => false,
    "activate_alerting" => true,
    "alarms" => {
      # "pacemaker" => ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical'],
      "queue" => ["rabbitmq-queue-warning"]
    },
  },
  "apache" => {
    "apply_to_node" => "controller",
    "enable_notification" => false,
    "activate_alerting" => true,
    "alarms" => {
      "worker" => ['apache-warning'],
    },
  },
}
describe 'For services' do
  # Expected result for the 'controller' profile with type 'service': the
  # filters are keyed and named after the service (rabbitmq, apache), not
  # after the profile they apply to.
  service_filters = {
    "rabbitmq_queue" => {
      "type" => "service",
      "cluster_name" => "rabbitmq",
      "logical_name" => "queue",
      "alarms_definitions" => alarms_services,
      "alarms" => ["rabbitmq-queue-warning"],
      "message_matcher" => "Fields[name] == 'rabbitmq_messages'",
      "activate_alerting" => true,
      "enable_notification" => false
    },
    "apache_worker" => {
      "type" => "service",
      "cluster_name" => "apache",
      "logical_name" => "worker",
      "alarms_definitions" => alarms_services,
      "alarms" => ["apache-warning"],
      "message_matcher" => "Fields[name] == 'apache_idle_workers' || Fields[name] == 'apache_status'",
      "activate_alerting" => true,
      "enable_notification" => false
    }
  }

  it do
    should run.with_params(afds_services, alarms_services, ['controller'], 'service').and_return(service_filters)
  end
end
end

View File

@ -1090,200 +1090,289 @@ lma_collector:
periods: 0
function: max
# Mapping between the Fuel roles and the AFD node filters
node_cluster_roles:
controller: ['primary-controller', 'controller']
<% if @detach_database_enabled -%>
mysql-nodes: ['primary-standalone-database', 'standalone-database']
<% else -%>
mysql-nodes: ['primary-controller', 'controller']
<% end -%>
compute: ['compute']
storage: ['cinder', 'ceph-osd']
elasticsearch-nodes: ['primary-elasticsearch_kibana', 'elasticsearch_kibana']
influxdb-nodes: ['primary-influxdb_grafana', 'influxdb_grafana']
# Mapping between the Fuel roles and the AFD service filters
service_cluster_roles:
<% if @detach_rabbitmq_enabled -%>
rabbitmq-cluster: ['primary-standalone-rabbitmq', 'standalone-rabbitmq']
rabbitmq-service: ['primary-standalone-rabbitmq', 'standalone-rabbitmq']
<% else -%>
rabbitmq-cluster: ['primary-controller', 'controller']
rabbitmq-service: ['primary-controller', 'controller']
<% end -%>
<% if @detach_database_enabled -%>
mysql: ['primary-standalone-database', 'standalone-database']
<% else -%>
mysql: ['primary-controller', 'controller']
<% end -%>
apache: ['primary-controller', 'controller']
nova-api: ['primary-controller', 'controller']
nova-logs: ['primary-controller', 'controller', 'compute']
heat-api: ['primary-controller', 'controller']
heat-logs: ['primary-controller', 'controller']
<% if not @storage_options["objects_ceph"] then -%>
swift-api: ['primary-controller', 'controller']
<% end -%>
cinder-api: ['primary-controller', 'controller']
cinder-logs: ['primary-controller', 'controller', 'cinder']
glance-api: ['primary-controller', 'controller']
glance-logs: ['primary-controller', 'controller']
neutron-api: ['primary-controller', 'controller']
neutron-logs: ['primary-controller', 'controller', 'compute']
keystone-response-time: ['primary-controller', 'controller']
keystone-public-api: ['primary-controller', 'controller']
keystone-admin-api: ['primary-controller', 'controller']
keystone-logs: ['primary-controller', 'controller']
nova-instances: ['primary-controller', 'controller']
<% if @storage_options["volumes_ceph"] then -%>
ceph-mon-cluster: ['primary-controller', 'controller']
ceph-mon-service: ['primary-controller', 'controller']
ceph-osd-service: ['ceph-osd']
<% end -%>
elasticsearch-cluster: ['primary-elasticsearch_kibana', 'elasticsearch_kibana']
elasticsearch-service: ['primary-elasticsearch_kibana', 'elasticsearch_kibana']
influxdb-service: ['primary-influxdb_grafana', 'influxdb_grafana']
pacemaker-service: ['primary-controller', 'controller']
haproxy-openstack: ['primary-controller', 'controller']
libvirt-service: ['compute']
memcached-service: ['primary-controller', 'controller']
# Definition of the AFD node filters
node_cluster_alarms:
controller:
cpu: ['cpu-critical-controller', 'cpu-warning-controller']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
log-fs: ['log-fs-critical', 'log-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
cpu: ['cpu-critical-controller', 'cpu-warning-controller']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
log-fs: ['log-fs-critical', 'log-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
<% if @detach_rabbitmq_enabled -%>
rabbitmq-nodes:
cpu: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
apply_to_node: rabbitmq-nodes
enable_notification: false
activate_alerting: true
alarms:
cpu: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
<% end -%>
mysql-nodes:
apply_to_node: mysql-nodes
enable_notification: false
activate_alerting: true
alarms:
<% if @detach_database_enabled -%>
cpu: ['cpu-critical-mysql', 'cpu-warning-mysql']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
cpu: ['cpu-critical-mysql', 'cpu-warning-mysql']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
<% end -%>
mysql-fs: ['mysql-fs-critical', 'mysql-fs-warning']
mysql-fs: ['mysql-fs-critical', 'mysql-fs-warning']
compute:
cpu: ['cpu-critical-compute', 'cpu-warning-compute']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
nova-fs: ['nova-fs-critical', 'nova-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
apply_to_node: compute
enable_notification: false
activate_alerting: true
alarms:
cpu: ['cpu-critical-compute', 'cpu-warning-compute']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
nova-fs: ['nova-fs-critical', 'nova-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
storage:
cpu: ['cpu-critical-storage', 'cpu-warning-storage']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
default:
cpu: ['cpu-critical-default']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
apply_to_node: storage
enable_notification: false
activate_alerting: true
alarms:
cpu: ['cpu-critical-storage', 'cpu-warning-storage']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
elasticsearch-nodes:
cpu: ['cpu-critical-default']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
data-fs: ['elasticsearch-fs-critical', 'elasticsearch-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
apply_to_node: elasticsearch-nodes
enable_notification: false
activate_alerting: true
alarms:
cpu: ['cpu-critical-default']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
data-fs: ['elasticsearch-fs-critical', 'elasticsearch-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
influxdb-nodes:
cpu: ['cpu-critical-default']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
data-fs: ['influxdb-fs-critical', 'influxdb-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
apply_to_node: influxdb-nodes
enable_notification: false
activate_alerting: true
alarms:
cpu: ['cpu-critical-default']
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs: ['root-fs-critical', 'root-fs-warning']
data-fs: ['influxdb-fs-critical', 'influxdb-fs-warning']
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors: ['hdd-errors-critical']
# Definition of the AFD service filters
service_cluster_alarms:
rabbitmq-cluster:
pacemaker: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning']
queue: ['rabbitmq-queue-warning']
memory: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning']
disk: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning']
apply_to_node: rabbitmq-nodes
enable_notification: false
activate_alerting: true
alarms:
pacemaker: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning']
queue: ['rabbitmq-queue-warning']
memory: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning']
disk: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning']
rabbitmq-service:
check: ['rabbitmq-check']
apply_to_node: rabbitmq-nodes
enable_notification: false
activate_alerting: true
alarms:
check: ['rabbitmq-check']
mysql:
node-status: ['mysql-node-connected', 'mysql-node-ready']
check: ['mysql-check']
apply_to_node: mysql-nodes
enable_notification: false
activate_alerting: true
alarms:
node-status: ['mysql-node-connected', 'mysql-node-ready']
check: ['mysql-check']
apache:
worker: ['apache-warning']
check: ['apache-check']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
worker: ['apache-warning']
check: ['apache-check']
nova-api:
http_errors: ['nova-api-http-errors']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
http_errors: ['nova-api-http-errors']
nova-logs:
error: ['nova-logs-error']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
error: ['nova-logs-error']
heat-api:
http_errors: ['heat-api-http-errors']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
http_errors: ['heat-api-http-errors']
heat-logs:
error: ['heat-logs-error']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
error: ['heat-logs-error']
<% if not @storage_options["objects_ceph"] then -%>
swift-api:
http_errors: ['swift-api-http-errors']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
http_errors: ['swift-api-http-errors']
swift-logs:
error: ['swift-logs-error']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
error: ['swift-logs-error']
<% end -%>
cinder-api:
http_errors: ['cinder-api-http-errors']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
http_errors: ['cinder-api-http-errors']
cinder-logs:
error: ['cinder-logs-error']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
error: ['cinder-logs-error']
glance-api:
http_errors: ['glance-api-http-errors']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
http_errors: ['glance-api-http-errors']
glance-logs:
error: ['glance-logs-error']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
error: ['glance-logs-error']
neutron-api:
http_errors: ['neutron-api-http-errors']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
http_errors: ['neutron-api-http-errors']
neutron-logs:
error: ['neutron-logs-error']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
error: ['neutron-logs-error']
keystone-response-time:
duration: ['keystone-response-time-duration']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
duration: ['keystone-response-time-duration']
keystone-public-api:
http_errors: ['keystone-public-api-http-errors']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
http_errors: ['keystone-public-api-http-errors']
keystone-logs:
error: ['keystone-logs-error']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
error: ['keystone-logs-error']
keystone-admin-api:
http_errors: ['keystone-admin-api-http-errors']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
http_errors: ['keystone-admin-api-http-errors']
nova-instances:
creation-time: ['instance-creation-time-warning']
#TODO(scroiset): apply on compute nodes
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
creation-time: ['instance-creation-time-warning']
ceph-mon-cluster:
health: ['ceph-health-critical', 'ceph-health-warning']
capacity: ['ceph-capacity-critical', 'ceph-capacity-warning']
apply_to_node: ceph-mon
enable_notification: false
activate_alerting: true
alarms:
health: ['ceph-health-critical', 'ceph-health-warning']
capacity: ['ceph-capacity-critical', 'ceph-capacity-warning']
ceph-mon-service:
check: ['ceph-mon-check']
apply_to_node: ceph-mon
enable_notification: false
activate_alerting: true
alarms:
check: ['ceph-mon-check']
ceph-osd-service:
check: ['ceph-osd-check']
apply_to_node: ceph-osd
enable_notification: false
activate_alerting: true
alarms:
check: ['ceph-osd-check']
elasticsearch-cluster:
health: ['elasticsearch-health-critical', 'elasticsearch-health-warning']
apply_to_node: elasticsearch-nodes
enable_notification: false
activate_alerting: true
alarms:
health: ['elasticsearch-health-critical', 'elasticsearch-health-warning']
elasticsearch-service:
check: ['elasticsearch-check']
apply_to_node: elasticsearch-nodes
enable_notification: false
activate_alerting: true
alarms:
check: ['elasticsearch-check']
influxdb-service:
check: ['influxdb-check']
apply_to_node: influxdb-nodes
enable_notification: false
activate_alerting: true
alarms:
check: ['influxdb-check']
haproxy-openstack:
check: ['haproxy-check']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
check: ['haproxy-check']
pacemaker-service:
check: ['pacemaker-check']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
check: ['pacemaker-check']
libvirt-service:
check: ['libvirt-check']
apply_to_node: compute
enable_notification: false
activate_alerting: true
alarms:
check: ['libvirt-check']
memcached-service:
check: ['memcached-check']
apply_to_node: controller
enable_notification: false
activate_alerting: true
alarms:
check: ['memcached-check']

View File

@ -0,0 +1,34 @@
---
lma_collector:
# Mapping between each node profile (the name referenced by 'apply_to_node' in the alarm definitions) and the Fuel roles it covers.
node_profiles:
controller:
roles: ['primary-controller', 'controller']
<% if @detach_database_enabled -%>
mysql-nodes:
roles: ['primary-standalone-database', 'standalone-database']
<% else -%>
mysql-nodes:
roles: ['primary-controller', 'controller']
<% end -%>
<% if @detach_rabbitmq_enabled -%>
rabbitmq-nodes:
roles: ['primary-standalone-rabbitmq', 'standalone-rabbitmq']
<% else -%>
rabbitmq-nodes:
roles: ['primary-controller', 'controller']
<% end -%>
<% if @storage_options["volumes_ceph"] then -%>
ceph-mon:
roles: ['primary-controller', 'controller']
ceph-osd:
roles: ['ceph-osd']
<% end -%>
compute:
roles: ['compute']
storage:
roles: ['cinder']
elasticsearch-nodes:
roles: ['primary-elasticsearch_kibana', 'elasticsearch_kibana']
influxdb-nodes:
roles: ['primary-influxdb_grafana', 'influxdb_grafana']

View File

@ -20,6 +20,8 @@ define lma_collector::afd_filter (
$alarms,
$alarms_definitions,
$message_matcher,
$activate_alerting = true,
$enable_notification = false,
) {
include lma_collector::params
include lma_collector::service::metric
@ -44,11 +46,13 @@ define lma_collector::afd_filter (
message_matcher => "(Type == \'metric\' || Type == \'heka.sandbox.metric\') && (${message_matcher})",
ticker_interval => 10,
config => {
hostname => $::hostname,
afd_type => $type,
afd_file => $afd_file,
afd_cluster_name => $cluster_name,
afd_logical_name => $logical_name,
hostname => $::hostname,
afd_type => $type,
afd_file => $afd_file,
afd_cluster_name => $cluster_name,
afd_logical_name => $logical_name,
activate_alerting => $activate_alerting,
enable_notification => $enable_notification,
},
module_directory => $lua_modules_dir,
require => File[$afd_filename],