From 7deace8726aff5c4de0490497415f3d8400e65e1 Mon Sep 17 00:00:00 2001 From: Swann Croiset Date: Mon, 12 Sep 2016 17:35:42 +0200 Subject: [PATCH] Alarm definition refactoring DocImpact blueprint: alarming-refactoring Change-Id: I8c053f2fbc4b4b85958be8413919f9bf1b168027 --- .../puppet/manifests/configure_afd_filters.pp | 3 +- .../puppet/manifests/hiera_override.pp | 8 +- .../parser/functions/get_afd_filters.rb | 68 +-- .../parser/functions/get_cluster_names.rb | 2 +- .../fuel_lma_collector/manifests/afds.pp | 19 +- .../classes/fuel_lma_collector_afds_spec.rb | 34 +- .../spec/functions/get_afd_filters_spec.rb | 288 +++++++++++++ .../templates/alarming.yaml.erb | 389 +++++++++++------- ...gse_filters.yaml.erb => clusters.yaml.erb} | 0 .../templates/node_profiles.yaml.erb | 34 ++ .../lma_collector/manifests/afd_filter.pp | 14 +- 11 files changed, 652 insertions(+), 207 deletions(-) create mode 100644 deployment_scripts/puppet/modules/fuel_lma_collector/spec/functions/get_afd_filters_spec.rb rename deployment_scripts/puppet/modules/fuel_lma_collector/templates/{gse_filters.yaml.erb => clusters.yaml.erb} (100%) create mode 100644 deployment_scripts/puppet/modules/fuel_lma_collector/templates/node_profiles.yaml.erb diff --git a/deployment_scripts/puppet/manifests/configure_afd_filters.pp b/deployment_scripts/puppet/manifests/configure_afd_filters.pp index 5b045a81d..490c17e47 100644 --- a/deployment_scripts/puppet/manifests/configure_afd_filters.pp +++ b/deployment_scripts/puppet/manifests/configure_afd_filters.pp @@ -38,8 +38,7 @@ if $is_controller or $is_rabbitmq or $is_mysql_server { class { 'fuel_lma_collector::afds': roles => hiera('roles'), - node_cluster_roles => $lma['node_cluster_roles'], - service_cluster_roles => $lma['service_cluster_roles'], + node_profiles => $lma['node_profiles'], node_cluster_alarms => $lma['node_cluster_alarms'], service_cluster_alarms => $lma['service_cluster_alarms'], alarms => $alarms_definitions, diff --git 
a/deployment_scripts/puppet/manifests/hiera_override.pp b/deployment_scripts/puppet/manifests/hiera_override.pp index c941d3e5a..c7e37f7c3 100644 --- a/deployment_scripts/puppet/manifests/hiera_override.pp +++ b/deployment_scripts/puppet/manifests/hiera_override.pp @@ -239,11 +239,15 @@ lma::collector::infrastructure_alerting::password: <%= @nagios_password %> $detach_database = hiera('detach-database', {}) $detach_database_enabled = $detach_database['metadata'] and $detach_database['metadata']['enabled'] - fuel_lma_collector::hiera_data { 'gse_filters': - content => template('fuel_lma_collector/gse_filters.yaml.erb') + fuel_lma_collector::hiera_data { 'clusters': + content => template('fuel_lma_collector/clusters.yaml.erb') } fuel_lma_collector::hiera_data { 'alarming': content => template('fuel_lma_collector/alarming.yaml.erb') } + + fuel_lma_collector::hiera_data { 'node_profiles': + content => template('fuel_lma_collector/node_profiles.yaml.erb') + } } diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_afd_filters.rb b/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_afd_filters.rb index 2725a7396..10bf99611 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_afd_filters.rb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_afd_filters.rb @@ -23,10 +23,10 @@ # Ex: # # ARG0: -# {"rabbitmq"=>{"queue"=>["rabbitmq-queue-warning"]}, -# "apache"=>{"worker"=>["apache-warning"]}, -# "memcached"=>{"all"=>["memcached-warning"]}, -# "haproxy"=>{"alive"=>["haproxy-warning"]}} +# {"rabbitmq"=>{"apply_to_node" => "controller", "alarms" => {"queue"=>["rabbitmq-queue-warning"]}}, +# "apache"=>{"apply_to_node" => "controller", "alarms" => {"worker"=>["apache-warning"]}}, +# "memcached"=>{"apply_to_node"=>"controller", "alarms" => {"all"=>["memcached-warning"]}}, +# "haproxy"=>{"apply_to_node" => "controller", 
"alarms" => {"alive"=>["haproxy-warning"]}}} # # ARG1: # @@ -63,7 +63,7 @@ # "function"=>"min"}]}} # ] # -# ARG2: ["rabbitmq", "apache"] +# ARG2: ["controller", "compute"] # # ARG3: type (node|service) # @@ -96,31 +96,47 @@ module Puppet::Parser::Functions afd_filters = {} afd_profiles.each do |afd_profile| - next unless afd_alarms.has_key?(afd_profile) - - afd_alarms[afd_profile].each do |afd_name, alarms| - # Collect the metrics which are required by this AFD filter - metrics = Set.new([]) - alarms.each do |a_name| - alarm_definitions.each do |alarm_def| - if alarm_def['name'] == a_name - alarm_def['trigger']['rules'].each do |r| - metrics << r['metric'] - end - end + afds = afd_alarms.select {|k,v| v.has_key?('apply_to_node') and v['apply_to_node'] == afd_profile } + afds.each do |k, v| + activate_alerting=true + if v.has_key?('activate_alerting') + if v['activate_alerting'] == false + activate_alerting=false end end + enable_notification=false + if v.has_key?('enable_notification') + if v['enable_notification'] == true + enable_notification=true + end + end + afd_cluster_name = k + v['alarms'].each do |afd_name, alarms| + # Collect the metrics which are required by this AFD filter + metrics = Set.new([]) + alarms.each do |a_name| + alarm_definitions.each do |alarm_def| + if alarm_def['name'] == a_name + alarm_def['trigger']['rules'].each do |r| + metrics << r['metric'] + end + end - message_matcher = metrics.collect{|x| "Fields[name] == \'#{x}\'" }.join(' || ') + end + end + message_matcher = metrics.collect{|x| "Fields[name] == \'#{x}\'" }.join(' || ') - afd_filters["#{afd_profile}_#{afd_name}"] = { - 'type' => type, - 'cluster_name' => afd_profile, - 'logical_name' => afd_name, - 'alarms' => alarms, - 'alarms_definitions' => alarm_definitions, - 'message_matcher' => message_matcher - } + afd_filters["#{afd_cluster_name}_#{afd_name}"] = { + 'type' => type, + 'cluster_name' => afd_cluster_name, + 'logical_name' => afd_name, + 'alarms' => alarms, + 
'alarms_definitions' => alarm_definitions, + 'message_matcher' => message_matcher, + 'activate_alerting' => activate_alerting, + 'enable_notification' => enable_notification, + } + end end end diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_cluster_names.rb b/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_cluster_names.rb index bd90ea867..1d07b8af4 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_cluster_names.rb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_cluster_names.rb @@ -44,7 +44,7 @@ module Puppet::Parser::Functions roles.each do |role| data.each do |k,v| - cluster_names << k if v.include?(role) + cluster_names << k if v['roles'].include?(role) end end diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/manifests/afds.pp b/deployment_scripts/puppet/modules/fuel_lma_collector/manifests/afds.pp index c6f4ab37a..171d4cef4 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/manifests/afds.pp +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/manifests/afds.pp @@ -15,37 +15,28 @@ class fuel_lma_collector::afds ( $roles = undef, - $node_cluster_roles = undef, - $service_cluster_roles = undef, + $node_profiles = undef, $node_cluster_alarms = undef, $service_cluster_alarms = undef, $alarms = undef, ){ validate_array($roles) - validate_hash($node_cluster_roles) - validate_hash($service_cluster_roles) + validate_hash($node_profiles) validate_hash($node_cluster_alarms) validate_hash($service_cluster_alarms) validate_array($alarms) - $node_cluster_names_tmp = get_cluster_names($node_cluster_roles, $roles) - $service_cluster_names = get_cluster_names($service_cluster_roles, $roles) - - if size($node_cluster_names_tmp) == 0 and $node_cluster_alarms['default'] { - $node_cluster_names = ['default'] - } else { - $node_cluster_names = $node_cluster_names_tmp - 
} + $clusters = get_cluster_names($node_profiles, $roles) $node_afd_filters = get_afd_filters($node_cluster_alarms, $alarms, - $node_cluster_names, + $clusters, 'node') $service_afd_filters = get_afd_filters($service_cluster_alarms, $alarms, - $service_cluster_names, + $clusters, 'service') create_resources(lma_collector::afd_filter, $node_afd_filters) diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/spec/classes/fuel_lma_collector_afds_spec.rb b/deployment_scripts/puppet/modules/fuel_lma_collector/spec/classes/fuel_lma_collector_afds_spec.rb index 912a49dfa..9d9178fdb 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/spec/classes/fuel_lma_collector_afds_spec.rb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/spec/classes/fuel_lma_collector_afds_spec.rb @@ -22,10 +22,24 @@ describe 'fuel_lma_collector::afds' do describe 'with defaults' do let(:params) do {:roles => ['primary-controller'], - :node_cluster_roles => {'controller' => ['primary-controller']}, - :service_cluster_roles => {'mysql' => ['primary-controller']}, - :node_cluster_alarms => {'controller' => {'cpu' => ['cpu_warning']}}, - :service_cluster_alarms => {'mysql' => {'all' => ['db_warning']}}, + :node_profiles => {'controller' => {'roles' => ['primary-controller']}}, + :node_cluster_alarms => { + 'controller' => + { + 'apply_to_node' => 'controller', + 'alarms' => { + 'cpu' => ['cpu_warning'] + } + } + }, + :service_cluster_alarms => { + 'mysql' => { + 'apply_to_node' => 'controller', + 'alarms' => { + 'all' => ['db_warning'] + } + } + }, :alarms => [ {"name"=>"cpu_warning", "description"=>"Fake alarm", @@ -63,9 +77,15 @@ describe 'fuel_lma_collector::afds' do describe 'with enabled false' do let(:params) do {:roles => ['primary-controller'], - :node_cluster_roles => {'controller' => ['primary-controller']}, - :service_cluster_roles => {}, - :node_cluster_alarms => {'controller' => {'cpu' => ['cpu_warning']}}, + :node_profiles => {'controller' => {'roles' => 
['primary-controller']}}, + :node_cluster_alarms => { + 'controller' => { + 'apply_to_node' => 'controller', + 'alarms' => { + 'cpu' => ['cpu_warning'] + } + } + }, :service_cluster_alarms => {}, :alarms => [ {"name"=>"cpu_warning", diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/spec/functions/get_afd_filters_spec.rb b/deployment_scripts/puppet/modules/fuel_lma_collector/spec/functions/get_afd_filters_spec.rb new file mode 100644 index 000000000..9fbf2b751 --- /dev/null +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/spec/functions/get_afd_filters_spec.rb @@ -0,0 +1,288 @@ +# Copyright 2015 Mirantis, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# +require 'spec_helper' + +describe 'get_afd_filters' do + + alarms_nodes = [ + {"name"=>"cpu-critical-controller", + "description"=>"The CPU usage is too high (controller node)", + "severity"=>"critical", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [ + {"metric"=>"cpu_idle", + "relational_operator"=>"<=", + "threshold"=>5, + "window"=>120, + "periods"=>0, + "function"=>"avg"}, + {"metric"=>"cpu_wait", + "relational_operator"=>">=", + "threshold"=>35, + "window"=>120, + "periods"=>0, + "function"=>"avg"}, + ]}}, + {"name"=>"cpu-warning-controller", + "description"=>"The CPU usage is high (controller node)", + "severity"=>"warning", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [ + {"metric"=>"cpu_idle", + "relational_operator"=>"<=", + "threshold"=>15, + "window"=>120, + "periods"=>0, + "function"=>"avg"}, + {"metric"=>"cpu_wait", + "relational_operator"=>">=", + "threshold"=>25, + "window"=>120, + "periods"=>0, + "function"=>"avg"}, + ]}}, + {"name"=>"cpu-critical-compute", + "description"=>"The CPU usage is too high (compute node)", + "severity"=>"critical", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [ + {"metric"=>"cpu_idle", + "relational_operator"=>"<=", + "threshold"=>30, + "window"=>120, + "periods"=>0, + "function"=>"avg"}, + ]}}, + {"name"=>"cpu-warning-compute", + "description"=>"The CPU usage is high (compute node)", + "severity"=>"warning", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [ + {"metric"=>"cpu_idle", + "relational_operator"=>"<=", + "threshold"=>20, + "window"=>120, + "periods"=>0, + "function"=>"avg"}, + ]}}, + {"name"=>"fs-critical", + "description"=>"The FS usage is critical", + "severity"=>"critical", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [ + {"metric"=>"fs_percent_free", + "relational_operator"=>"<=", + "threshold"=>8, + "window"=>120, + "periods"=>0, + "function"=>"avg"}, + ]}}, + ] + + afds_nodes = { + "controller" => { + "apply_to_node" => "controller", +
"enable_notification" => true, + "activate_alerting" => true, + "alarms" => { + "system" => ["cpu-critical-controller", "cpu-warning-controller"], + }, + }, + "compute" => { + "apply_to_node" => "compute", + "enable_notification" => true, + "activate_alerting" => true, + "alarms" => { + "system" => ["cpu-critical-compute", "cpu-warning-compute"], + "fs" => ["fs-critical"], + }, + } + } + + describe 'For controller nodes' do + it { should run.with_params(afds_nodes, alarms_nodes, ['controller'], 'node') + .and_return( + {"controller_system"=> + {"type"=>"node", + "cluster_name"=>"controller", + "logical_name"=>"system", + "alarms"=>["cpu-critical-controller", "cpu-warning-controller"], + "alarms_definitions"=> alarms_nodes, + "message_matcher"=>"Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'", + "enable_notification" => true, + "activate_alerting" => true, + } + }) + + } + end + describe 'For compute nodes' do + it { should run.with_params(afds_nodes, alarms_nodes, ['compute'], 'node') + .and_return( + {"compute_system"=> + {"type"=>"node", + "cluster_name"=>"compute", + "logical_name"=>"system", + "alarms"=>["cpu-critical-compute", "cpu-warning-compute"], + "alarms_definitions"=> alarms_nodes, + "message_matcher"=>"Fields[name] == 'cpu_idle'", + "activate_alerting" => true, + "enable_notification" => true, + }, + "compute_fs"=> + {"type"=>"node", + "cluster_name"=>"compute", + "logical_name"=>"fs", + "alarms"=>["fs-critical"], + "alarms_definitions"=> alarms_nodes, + "message_matcher"=>"Fields[name] == 'fs_percent_free'", + "activate_alerting" => true, + "enable_notification" => true, + } + }) + } + end + describe 'For compute and controller nodes' do + it { should run.with_params(afds_nodes, alarms_nodes, ['compute', 'controller'], 'node') + .and_return( + {"compute_system"=> + {"type"=>"node", + "cluster_name"=>"compute", + "logical_name"=>"system", + "alarms"=>["cpu-critical-compute", "cpu-warning-compute"], + "alarms_definitions"=> alarms_nodes, + 
"message_matcher"=>"Fields[name] == 'cpu_idle'", + "activate_alerting" => true, + "enable_notification" => true, + }, + "compute_fs"=> + {"type"=>"node", + "cluster_name"=>"compute", + "logical_name"=>"fs", + "alarms"=>["fs-critical"], + "alarms_definitions"=> alarms_nodes, + "message_matcher"=>"Fields[name] == 'fs_percent_free'", + "activate_alerting" => true, + "enable_notification" => true, + }, + "controller_system"=> + {"type"=>"node", + "cluster_name"=>"controller", + "logical_name"=>"system", + "alarms"=>["cpu-critical-controller", "cpu-warning-controller"], + "alarms_definitions"=> alarms_nodes, + "message_matcher"=>"Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'", + "activate_alerting" => true, + "enable_notification" => true, + } + }) + } + end + + alarms_services = [ + {"name"=>"rabbitmq-queue-warning", + "description"=>"Number of messages in queues is too high", + "severity"=>"warning", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [{"metric"=>"rabbitmq_messages", + "relational_operator"=>">=", + "threshold"=>200, + "window"=>120, + "periods"=>0, + "function"=>"avg"}]}}, + {"name"=>"apache-warning", + "description"=>"", + "severity"=>"warning", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [{"metric"=>"apache_idle_workers", + "relational_operator"=>"=", + "threshold"=>0, + "window"=>60, + "periods"=>0, + "function"=>"min"}, + {"metric"=>"apache_status", + "relational_operator"=>"=", + "threshold"=>0, + "window"=>60, + "periods"=>0, + "function"=>"min"}]}} + ] + afds_services = { + "rabbitmq" => { + "apply_to_node" => "controller", + "enable_notification" => false, + "activate_alerting" => true, + "enable_notification" => false, + "alarms" => { +# "pacemaker" => ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical'], + "queue" => ["rabbitmq-queue-warning"] + }, + }, + "apache" => { + "apply_to_node" => "controller", + "enable_notification" => false, + "activate_alerting" => true, + "enable_notification" => false, +
"alarms" => { + "worker" => ['apache-warning'], + }, + }, + } + describe 'For services' do + it { should run.with_params(afds_services, alarms_services, ['controller'], 'service') + .and_return( + { + "rabbitmq_queue"=> + { + "type"=>"service", + "cluster_name"=>"rabbitmq", + "logical_name"=>"queue", + "alarms_definitions"=> alarms_services, + "alarms"=>["rabbitmq-queue-warning"], + "message_matcher"=>"Fields[name] == 'rabbitmq_messages'", + "activate_alerting" => true, + "enable_notification" => false, + }, + "apache_worker"=> + { + "type"=>"service", + "cluster_name"=>"apache", + "logical_name"=>"worker", + "alarms_definitions"=> alarms_services, + "alarms"=>["apache-warning"], + "message_matcher"=>"Fields[name] == 'apache_idle_workers' || Fields[name] == 'apache_status'", + "activate_alerting" => true, + "enable_notification" => false, + }} + + ) + } + end +end + diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb index 0658cb433..0cdcc5976 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb @@ -1090,200 +1090,289 @@ lma_collector: periods: 0 function: max - # Mapping between the Fuel roles and the AFD node filters - node_cluster_roles: - controller: ['primary-controller', 'controller'] -<% if @detach_database_enabled -%> - mysql-nodes: ['primary-standalone-database', 'standalone-database'] -<% else -%> - mysql-nodes: ['primary-controller', 'controller'] -<% end -%> - compute: ['compute'] - storage: ['cinder', 'ceph-osd'] - elasticsearch-nodes: ['primary-elasticsearch_kibana', 'elasticsearch_kibana'] - influxdb-nodes: ['primary-influxdb_grafana', 'influxdb_grafana'] - - # Mapping between the Fuel roles and the AFD service filters - service_cluster_roles: -<% if @detach_rabbitmq_enabled -%> - rabbitmq-cluster: 
['primary-standalone-rabbitmq', 'standalone-rabbitmq'] - rabbitmq-service: ['primary-standalone-rabbitmq', 'standalone-rabbitmq'] -<% else -%> - rabbitmq-cluster: ['primary-controller', 'controller'] - rabbitmq-service: ['primary-controller', 'controller'] -<% end -%> -<% if @detach_database_enabled -%> - mysql: ['primary-standalone-database', 'standalone-database'] -<% else -%> - mysql: ['primary-controller', 'controller'] -<% end -%> - apache: ['primary-controller', 'controller'] - nova-api: ['primary-controller', 'controller'] - nova-logs: ['primary-controller', 'controller', 'compute'] - heat-api: ['primary-controller', 'controller'] - heat-logs: ['primary-controller', 'controller'] -<% if not @storage_options["objects_ceph"] then -%> - swift-api: ['primary-controller', 'controller'] -<% end -%> - cinder-api: ['primary-controller', 'controller'] - cinder-logs: ['primary-controller', 'controller', 'cinder'] - glance-api: ['primary-controller', 'controller'] - glance-logs: ['primary-controller', 'controller'] - neutron-api: ['primary-controller', 'controller'] - neutron-logs: ['primary-controller', 'controller', 'compute'] - keystone-response-time: ['primary-controller', 'controller'] - keystone-public-api: ['primary-controller', 'controller'] - keystone-admin-api: ['primary-controller', 'controller'] - keystone-logs: ['primary-controller', 'controller'] - nova-instances: ['primary-controller', 'controller'] -<% if @storage_options["volumes_ceph"] then -%> - ceph-mon-cluster: ['primary-controller', 'controller'] - ceph-mon-service: ['primary-controller', 'controller'] - ceph-osd-service: ['ceph-osd'] -<% end -%> - elasticsearch-cluster: ['primary-elasticsearch_kibana', 'elasticsearch_kibana'] - elasticsearch-service: ['primary-elasticsearch_kibana', 'elasticsearch_kibana'] - influxdb-service: ['primary-influxdb_grafana', 'influxdb_grafana'] - pacemaker-service: ['primary-controller', 'controller'] - haproxy-openstack: ['primary-controller', 'controller'] - 
libvirt-service: ['compute'] - memcached-service: ['primary-controller', 'controller'] - # Definition of the AFD node filters node_cluster_alarms: controller: - cpu: ['cpu-critical-controller', 'cpu-warning-controller'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - log-fs: ['log-fs-critical', 'log-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + cpu: ['cpu-critical-controller', 'cpu-warning-controller'] + network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: ['root-fs-critical', 'root-fs-warning'] + log-fs: ['log-fs-critical', 'log-fs-warning'] + swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: ['hdd-errors-critical'] <% if @detach_rabbitmq_enabled -%> rabbitmq-nodes: - cpu: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + apply_to_node: rabbitmq-nodes + enable_notification: false + activate_alerting: true + alarms: + cpu: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq'] + network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: ['root-fs-critical', 'root-fs-warning'] + swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: 
['hdd-errors-critical'] <% end -%> mysql-nodes: + apply_to_node: mysql-nodes + enable_notification: false + activate_alerting: true + alarms: <% if @detach_database_enabled -%> - cpu: ['cpu-critical-mysql', 'cpu-warning-mysql'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + cpu: ['cpu-critical-mysql', 'cpu-warning-mysql'] + network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: ['root-fs-critical', 'root-fs-warning'] + swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: ['hdd-errors-critical'] <% end -%> - mysql-fs: ['mysql-fs-critical', 'mysql-fs-warning'] + mysql-fs: ['mysql-fs-critical', 'mysql-fs-warning'] compute: - cpu: ['cpu-critical-compute', 'cpu-warning-compute'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - nova-fs: ['nova-fs-critical', 'nova-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + apply_to_node: compute + enable_notification: false + activate_alerting: true + alarms: + cpu: ['cpu-critical-compute', 'cpu-warning-compute'] + network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: ['root-fs-critical', 'root-fs-warning'] + nova-fs: ['nova-fs-critical', 'nova-fs-warning'] + swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: ['hdd-errors-critical'] storage: - 
cpu: ['cpu-critical-storage', 'cpu-warning-storage'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] - default: - cpu: ['cpu-critical-default'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + apply_to_node: storage + enable_notification: false + activate_alerting: true + alarms: + cpu: ['cpu-critical-storage', 'cpu-warning-storage'] + network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: ['root-fs-critical', 'root-fs-warning'] + swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: ['hdd-errors-critical'] elasticsearch-nodes: - cpu: ['cpu-critical-default'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - data-fs: ['elasticsearch-fs-critical', 'elasticsearch-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + apply_to_node: elasticsearch-nodes + enable_notification: false + activate_alerting: true + alarms: + cpu: ['cpu-critical-default'] + network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: ['root-fs-critical', 'root-fs-warning'] + data-fs: 
['elasticsearch-fs-critical', 'elasticsearch-fs-warning'] + swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: ['hdd-errors-critical'] influxdb-nodes: - cpu: ['cpu-critical-default'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - data-fs: ['influxdb-fs-critical', 'influxdb-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + apply_to_node: influxdb-nodes + enable_notification: false + activate_alerting: true + alarms: + cpu: ['cpu-critical-default'] + network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: ['root-fs-critical', 'root-fs-warning'] + data-fs: ['influxdb-fs-critical', 'influxdb-fs-warning'] + swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: ['hdd-errors-critical'] # Definition of the AFD service filters service_cluster_alarms: rabbitmq-cluster: - pacemaker: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning'] - queue: ['rabbitmq-queue-warning'] - memory: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning'] - disk: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning'] + apply_to_node: rabbitmq-nodes + enable_notification: false + activate_alerting: true + alarms: + pacemaker: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning'] + queue: ['rabbitmq-queue-warning'] + memory: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning'] + disk: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning'] rabbitmq-service: - check: ['rabbitmq-check'] + apply_to_node: rabbitmq-nodes + enable_notification: false + activate_alerting: true + 
alarms: + check: ['rabbitmq-check'] mysql: - node-status: ['mysql-node-connected', 'mysql-node-ready'] - check: ['mysql-check'] + apply_to_node: mysql-nodes + enable_notification: false + activate_alerting: true + alarms: + node-status: ['mysql-node-connected', 'mysql-node-ready'] + check: ['mysql-check'] apache: - worker: ['apache-warning'] - check: ['apache-check'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + worker: ['apache-warning'] + check: ['apache-check'] nova-api: - http_errors: ['nova-api-http-errors'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + http_errors: ['nova-api-http-errors'] nova-logs: - error: ['nova-logs-error'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + error: ['nova-logs-error'] heat-api: - http_errors: ['heat-api-http-errors'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + http_errors: ['heat-api-http-errors'] heat-logs: - error: ['heat-logs-error'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + error: ['heat-logs-error'] +<% if not @storage_options["objects_ceph"] then -%> swift-api: - http_errors: ['swift-api-http-errors'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + http_errors: ['swift-api-http-errors'] swift-logs: - error: ['swift-logs-error'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + error: ['swift-logs-error'] +<% end -%> cinder-api: - http_errors: ['cinder-api-http-errors'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + http_errors: ['cinder-api-http-errors'] cinder-logs: - error: ['cinder-logs-error'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + error: ['cinder-logs-error'] glance-api: - 
http_errors: ['glance-api-http-errors'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + http_errors: ['glance-api-http-errors'] glance-logs: - error: ['glance-logs-error'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + error: ['glance-logs-error'] neutron-api: - http_errors: ['neutron-api-http-errors'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + http_errors: ['neutron-api-http-errors'] neutron-logs: - error: ['neutron-logs-error'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + error: ['neutron-logs-error'] keystone-response-time: - duration: ['keystone-response-time-duration'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + duration: ['keystone-response-time-duration'] keystone-public-api: - http_errors: ['keystone-public-api-http-errors'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + http_errors: ['keystone-public-api-http-errors'] keystone-logs: - error: ['keystone-logs-error'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + error: ['keystone-logs-error'] keystone-admin-api: - http_errors: ['keystone-admin-api-http-errors'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + http_errors: ['keystone-admin-api-http-errors'] nova-instances: - creation-time: ['instance-creation-time-warning'] + #TODO(scroiset): apply on compute nodes + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + creation-time: ['instance-creation-time-warning'] ceph-mon-cluster: - health: ['ceph-health-critical', 'ceph-health-warning'] - capacity: ['ceph-capacity-critical', 'ceph-capacity-warning'] + apply_to_node: ceph-mon + enable_notification: false + activate_alerting: true + 
alarms: + health: ['ceph-health-critical', 'ceph-health-warning'] + capacity: ['ceph-capacity-critical', 'ceph-capacity-warning'] ceph-mon-service: - check: ['ceph-mon-check'] + apply_to_node: ceph-mon + enable_notification: false + activate_alerting: true + alarms: + check: ['ceph-mon-check'] ceph-osd-service: - check: ['ceph-osd-check'] + apply_to_node: ceph-osd + enable_notification: false + activate_alerting: true + alarms: + check: ['ceph-osd-check'] elasticsearch-cluster: - health: ['elasticsearch-health-critical', 'elasticsearch-health-warning'] + apply_to_node: elasticsearch-nodes + enable_notification: false + activate_alerting: true + alarms: + health: ['elasticsearch-health-critical', 'elasticsearch-health-warning'] elasticsearch-service: - check: ['elasticsearch-check'] + apply_to_node: elasticsearch-nodes + enable_notification: false + activate_alerting: true + alarms: + check: ['elasticsearch-check'] influxdb-service: - check: ['influxdb-check'] + apply_to_node: influxdb-nodes + enable_notification: false + activate_alerting: true + alarms: + check: ['influxdb-check'] haproxy-openstack: - check: ['haproxy-check'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + check: ['haproxy-check'] pacemaker-service: - check: ['pacemaker-check'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + check: ['pacemaker-check'] libvirt-service: - check: ['libvirt-check'] + apply_to_node: compute + enable_notification: false + activate_alerting: true + alarms: + check: ['libvirt-check'] memcached-service: - check: ['memcached-check'] + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + check: ['memcached-check'] diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/gse_filters.yaml.erb b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/clusters.yaml.erb similarity index 100% rename from 
deployment_scripts/puppet/modules/fuel_lma_collector/templates/gse_filters.yaml.erb rename to deployment_scripts/puppet/modules/fuel_lma_collector/templates/clusters.yaml.erb diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/node_profiles.yaml.erb b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/node_profiles.yaml.erb new file mode 100644 index 000000000..a387828b9 --- /dev/null +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/node_profiles.yaml.erb @@ -0,0 +1,34 @@ +--- +lma_collector: + # Fuel roles mapping to alarm evaluator key. + node_profiles: + controller: + roles: ['primary-controller', 'controller'] +<% if @detach_database_enabled -%> + mysql-nodes: + roles: ['primary-standalone-database', 'standalone-database'] +<% else -%> + mysql-nodes: + roles: ['primary-controller', 'controller'] +<% end -%> +<% if @detach_rabbitmq_enabled -%> + rabbitmq-nodes: + roles: ['primary-standalone-rabbitmq', 'standalone-rabbitmq'] +<% else -%> + rabbitmq-nodes: + roles: ['primary-controller', 'controller'] +<% end -%> +<% if @storage_options["volumes_ceph"] then -%> + ceph-mon: + roles: ['primary-controller', 'controller'] + ceph-osd: + roles: ['ceph-osd'] +<% end -%> + compute: + roles: ['compute'] + storage: + roles: ['cinder'] + elasticsearch-nodes: + roles: ['primary-elasticsearch_kibana', 'elasticsearch_kibana'] + influxdb-nodes: + roles: ['primary-influxdb_grafana', 'influxdb_grafana'] diff --git a/deployment_scripts/puppet/modules/lma_collector/manifests/afd_filter.pp b/deployment_scripts/puppet/modules/lma_collector/manifests/afd_filter.pp index 4225d42fd..08368e51f 100644 --- a/deployment_scripts/puppet/modules/lma_collector/manifests/afd_filter.pp +++ b/deployment_scripts/puppet/modules/lma_collector/manifests/afd_filter.pp @@ -20,6 +20,8 @@ define lma_collector::afd_filter ( $alarms, $alarms_definitions, $message_matcher, + $activate_alerting = true, + $enable_notification = false, ) { include 
lma_collector::params include lma_collector::service::metric @@ -44,11 +46,13 @@ define lma_collector::afd_filter ( message_matcher => "(Type == \'metric\' || Type == \'heka.sandbox.metric\') && (${message_matcher})", ticker_interval => 10, config => { - hostname => $::hostname, - afd_type => $type, - afd_file => $afd_file, - afd_cluster_name => $cluster_name, - afd_logical_name => $logical_name, + hostname => $::hostname, + afd_type => $type, + afd_file => $afd_file, + afd_cluster_name => $cluster_name, + afd_logical_name => $logical_name, + activate_alerting => $activate_alerting, + enable_notification => $enable_notification, }, module_directory => $lua_modules_dir, require => File[$afd_filename],