Skip service configuration for remote alarms

When an alarm contains metric(s) with collected_on:'aggregator',
the corresponding Nagios service check is skipped.

Change-Id: I758e7bd412a68314e59ec86a40370661525a5af9
This commit is contained in:
Swann Croiset 2016-10-10 11:32:05 +02:00
parent 07526249a7
commit 8ee53ec594
4 changed files with 65 additions and 9 deletions

View File

@ -386,6 +386,8 @@ class { 'lma_infra_alerting::nagios::hosts':
node_profiles => $node_profiles,
node_cluster_alarms => $node_cluster_alarms,
service_cluster_alarms => $service_cluster_alarms,
alarms => $lma_collector['alarms'],
metrics => $lma_collector['metrics'],
require => Class['lma_infra_alerting::nagios'],
}

View File

@ -22,6 +22,8 @@ It expects 5 arguments:
3. The key containing the node's role.
4. The mapping between AFD profiles and node's roles
5. The mapping between AFD profiles and alarms
6. Array of alarm definitions
7. Hash table mapping metric names to the place where they are collected
*Examples:*
@ -45,7 +47,7 @@ Would return:
) do |arguments|
raise(Puppet::ParseError, "afds_to_nagios_services(): Wrong number of arguments " +
"given (#{arguments.size} expecting 5") if arguments.size != 5
"given (#{arguments.size} expecting 7") if arguments.size != 7
nodes = arguments[0]
raise(Puppet::ParseError, "arg0 isn't an array!") if ! nodes.is_a?(Array)
@ -56,6 +58,10 @@ Would return:
raise(Puppet::ParseError, "arg3 isn't a hash!") if ! role_to_cluster.is_a?(Hash)
afds = arguments[4]
raise(Puppet::ParseError, "arg4 isn't a hash!") if ! afds.is_a?(Hash)
alarms = arguments[5]
alarms = [] if ! alarms.is_a?(Array)
metrics = arguments[6]
metrics = {} if ! metrics.is_a?(Hash)
result = {}
@ -90,8 +96,28 @@ Would return:
if a['alerting'] == 'enabled_with_notification'
notifications_enabled = 1
end
a['alarms'].keys.each do |source|
node_services["#{node}.#{logical_cluster}.#{source}"] = "#{ logical_cluster }.#{ source }".gsub(/\s+/, '_')
a['alarms'].each do |source, afd|
# collect metric names
m = Set.new([])
afd.each do |alarm|
# find the alarm definition matching this alarm name
alarm_def = alarms.select {|defi| defi['name'] == alarm}
next if alarm_def.empty?
alarm_def[0]['trigger']['rules'].each do |r|
m << r['metric']
end
end
matches = true
m.each do |metric_name|
if metrics.has_key?(metric_name) and metrics[metric_name]['collected_on'] == 'aggregator'
matches = false
end
end
# skip the source if any of its metrics is collected on the aggregator
if matches
node_services["#{node}.#{logical_cluster}.#{source}"] = "#{ logical_cluster }.#{ source }".gsub(/\s+/, '_')
end
end
end

View File

@ -32,14 +32,16 @@ class lma_infra_alerting::nagios::hosts (
$role_key = undef,
$node_profiles = {},
$node_cluster_alarms = {},
$alarms = [],
$metrics = {},
$service_cluster_alarms = {},
){
include lma_infra_alerting::params
validate_string($host_name_key, $network_role_key)
validate_array($hosts, $host_display_name_keys, $host_custom_vars_keys)
validate_hash($node_profiles, $node_cluster_alarms)
validate_array($hosts, $host_display_name_keys, $host_custom_vars_keys, $alarms)
validate_hash($node_profiles, $node_cluster_alarms, $metrics)
$nagios_hosts = nodes_to_nagios_hosts($hosts,
$host_name_key,
@ -73,14 +75,20 @@ class lma_infra_alerting::nagios::hosts (
$host_name_key,
$role_key,
$node_profiles,
$node_cluster_alarms)
$node_cluster_alarms,
$alarms,
$metrics
)
create_resources(lma_infra_alerting::nagios::services, $afd_nodes)
$afd_services = afds_to_nagios_services($hosts,
$host_name_key,
$role_key,
$node_profiles,
$service_cluster_alarms)
$service_cluster_alarms,
$alarms,
$metrics
)
create_resources(lma_infra_alerting::nagios::services, $afd_services)
if empty($node_profiles) and empty($node_cluster_alarms) {

View File

@ -93,7 +93,8 @@ describe 'afds_to_nagios_services' do
"alerting" => "enabled_with_notification",
"alarms" => {
"system-ctrl" => ["cpu-critical-controller", "cpu-warning-controller"],
"fs" => ["fs-critical", "fs-warning"]
"fs" => ["fs-critical", "fs-warning"],
"rabbitmq" => ["rabbitmq-cluster-warning"]
}
},
"compute" => {
@ -137,8 +138,27 @@ describe 'afds_to_nagios_services' do
}
}
}
alarms_services = [
{"name"=>"rabbitmq-cluster-warning",
"description"=>"The RabbitMQ cluster is degraded because some RabbitMQ nodes are missing",
"severity"=>"warning",
"trigger"=>
{"logical_operator"=>"or",
"rules"=>
[{"metric"=>"pacemaker_resource_percent",
"relational_operator"=>"<",
"threshold"=>50,
"window"=>60,
"periods"=>0,
"function"=>"last"}]}},
]
metrics = {
"pacemaker_resource_percent" => {
"collected_on" => "aggregator"
}
}
describe 'with arguments' do
it { should run.with_params(all_nodes, 'name', 'node_roles', role_to_cluster, afds).and_return(
it { should run.with_params(all_nodes, 'name', 'node_roles', role_to_cluster, afds, alarms_services, metrics).and_return(
{
"default checks for node-1" => {
"hostname" => "node-1",