Add Nagios support for OpenStack service status

implements blueprint alerting-lma-collector

Change-Id: I722b7a83c5dd391a86423d6af526355bc2ed8bbc
This commit is contained in:
Swann Croiset 2015-07-03 17:27:03 +02:00
parent 6e914f0d1c
commit 929e15c324
8 changed files with 262 additions and 5 deletions

View File

@ -55,3 +55,22 @@ if $influxdb_mode == 'local' {
fail("Could not find node '${influxdb_node_name}' in the environment")
}
}
# Sanity checks for the 'local' alerting mode: the LMA-Infrastructure-Alerting
# plugin must be installed and enabled, and the node it designates must be
# part of the environment.
$nagios_mode = $lma_collector['nagios_mode']
if $nagios_mode == 'local' {
  # Check that the LMA-Infrastructure-Alerting plugin is enabled for that
  # environment. hiera() falls back to false when the plugin settings are
  # absent.
  $infra_alerting = hiera('lma_infrastructure_alerting', false)
  if ! $infra_alerting {
    fail('Could not get the LMA Infrastructure Alerting parameters. The LMA-Infrastructure-Alerting plugin is probably not installed.')
  }
  elsif ! $infra_alerting['metadata']['enabled'] {
    fail('Could not get the LMA Infrastructure Alerting parameters. The LMA-Infrastructure-Alerting plugin is probably not enabled for this environment.')
  }

  # Check that the LMA-Infrastructure-Alerting node exists in the environment.
  # The node name must be read from the alerting plugin settings, not from
  # the InfluxDB-Grafana ones (which may not even be defined in this mode).
  $infra_alerting_node_name = $infra_alerting['node_name']
  $infra_alerting_nodes = filter_nodes(hiera('nodes'), 'user_node_name', $infra_alerting_node_name)
  if size($infra_alerting_nodes) < 1 {
    fail("Could not find node '${infra_alerting_node_name}' in the environment")
  }
}

View File

@ -159,3 +159,37 @@ if $lma_collector['influxdb_mode'] != 'disabled' {
class { 'lma_collector::metrics::pacemaker_resources': }
}
}
# Nagios alerting: unless the mode is 'disabled', resolve the Nagios URL and
# credentials for the selected mode, then declare lma_collector::nagios.
$nagios_mode = $lma_collector['nagios_mode']
if $nagios_mode != 'disabled' {
  $deployment_id = hiera('deployment_id')

  case $nagios_mode {
    'remote': {
      # URL and credentials are taken as-is from the collector settings.
      $nagios_url      = $lma_collector['nagios_url']
      $nagios_user     = $lma_collector['nagios_user']
      $nagios_password = $lma_collector['nagios_password']
    }
    'local': {
      # URL is built from the internal address of the node named in the
      # lma_infrastructure_alerting settings; credentials come from there too.
      $lma_infra_alerting = hiera('lma_infrastructure_alerting', false)
      $nagios_node_name   = $lma_infra_alerting['node_name']
      $nagios_nodes       = filter_nodes(hiera('nodes'), 'user_node_name', $nagios_node_name)
      $nagios_server      = $nagios_nodes[0]['internal_address']
      $nagios_user        = $lma_infra_alerting['nagios_user']
      $nagios_password    = $lma_infra_alerting['nagios_password']
      # TODO: $http_port and $http_path must automatically match the
      # lma_infra_monitoring configuration.
      $http_port  = $lma_collector::params::nagios_http_port
      $http_path  = $lma_collector::params::nagios_http_path
      $nagios_url = "http://${nagios_server}:${http_port}/${http_path}"
    }
    default: {
      fail("'${nagios_mode}' mode not supported for the infrastructure alerting service")
    }
  }

  class { 'lma_collector::nagios':
    openstack_deployment_name => $deployment_id,
    url                       => $nagios_url,
    user                      => $nagios_user,
    password                  => $nagios_password,
  }
}

View File

@ -0,0 +1,66 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
require 'table'
require 'string'
require 'cjson'
local utils = require 'lma_utils'
-- Nagios host name the statuses are reported against (plugin configuration).
local host = read_config('nagios_host')

-- Form fields submitted with every status. Only the constant fields are
-- listed here; 'service', 'plugin_state' and 'plugin_output' are assigned
-- per message by process_message().
-- NOTE(review): cmd_typ/cmd_mod look like Nagios cmd.cgi command selectors --
-- confirm their meaning against the Nagios documentation.
local data = {
    host = host,
    cmd_typ = '30',
    cmd_mod = '2',
    performance_data = '',
}

-- Separator placed between detail lines: a literal backslash followed by 'n'
-- (two characters), not a newline character.
local nagios_break_line = '\\n'
-- Percent-encode a value for use in an application/x-www-form-urlencoded body.
--
-- Alphanumerics and '-', '_', '.', '~' are kept as-is, spaces become '+',
-- and every other byte becomes %XX (uppercase hexadecimal).
-- A nil or false argument is returned unchanged.
function url_encode(str)
    if not str then
        return str
    end

    local function to_hex(ch)
        return string.format ("%%%02X", string.byte(ch))
    end

    -- Spaces are excluded from the first pass so that the second pass can
    -- turn them into '+'.
    local escaped = string.gsub (str, "([^%w %-%_%.%~])", to_hex)
    local encoded = string.gsub (escaped, " ", "+")
    return encoded
end
-- Encode the current status message as a URL-encoded form payload.
--
-- Reads Fields[service], Fields[status] and Payload from the message, maps
-- the service to its configured name with read_config(), and injects the
-- form fields of 'data' as 'key=value' pairs joined with '&' (payload name
-- 'nagios', type 'txt').
--
-- Returns 0 once the payload is injected, -1 when the message carries no
-- service field or when the service has no entry in the configuration.
function process_message()
    local service = read_message('Fields[service]')
    -- A message without a service field cannot be mapped; reject it instead
    -- of handing nil to read_config().
    if type(service) ~= 'string' then
        return -1
    end

    local service_name = read_config(service)
    if not service_name then
        return -1
    end

    local status = read_message('Fields[status]')
    local payload = read_message('Payload')

    data['service'] = service_name
    data['plugin_state'] = status

    -- The first output line is '<service name> <status label>'. Fall back to
    -- the raw status when it has no label: string.format('%s', nil) raises
    -- in Lua 5.1.
    local label = utils.global_status_to_label_map[status] or tostring(status)
    local lines = { string.format('%s %s', service_name, label) }

    -- The payload is expected to be a JSON array of detail strings. A decode
    -- failure or a decoded scalar is reported as 'no detail'.
    local ok, details = pcall(cjson.decode, payload)
    if ok and type(details) == 'table' then
        for _, detail in ipairs(details) do
            -- tostring() keeps table.concat from raising on non-string items.
            lines[#lines+1] = tostring(detail)
        end
    else
        lines[#lines+1] = 'no detail'
    end
    data['plugin_output'] = table.concat(lines, nagios_break_line)

    local params = {}
    for k, v in pairs(data) do
        params[#params+1] = string.format("%s=%s", k, url_encode(tostring(v)))
    end

    inject_payload('txt', 'nagios', table.concat(params, '&'))
    return 0
end

View File

@ -69,9 +69,9 @@ function process_message ()
local prev = all_service_status[service_name].global_status or utils.global_status_map.UNKNOWN
local updated
updated = (prev ~= global_status or #events > 0)
if updated then -- append not UP status elements in details
for k, v in pairs(not_up_status) do events[#events+1] = v end
end
-- always append not UP status elements in details
for k, v in pairs(not_up_status) do events[#events+1] = v end
local details = ''
if #events > 0 then
details = cjson.encode(events)

View File

@ -98,6 +98,14 @@ class lma_collector (
require => File[$plugins_dir]
}
# Encoder plugins: copy the module's encoders directory under the plugins
# directory and notify the collector service when its content changes.
file { "${plugins_dir}/encoders":
  ensure  => directory,
  recurse => remote,
  source  => 'puppet:///modules/lma_collector/plugins/encoders',
  require => File[$plugins_dir],
  notify  => Class['lma_collector::service'],
}
if size($lma_collector::params::additional_packages) > 0 {
package { $lma_collector::params::additional_packages:
ensure => present,

View File

@ -0,0 +1,54 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# == Class: lma_collector::nagios
#
# Declares the Heka 'nagios' sandbox encoder (event_nagios.lua) and the
# 'nagios' HTTP output that posts the encoded 'heka.sandbox.status' messages
# to the given URL.
#
# === Parameters
#
# [*openstack_deployment_name*]
#   Appended as "-env<name>" to the Nagios host name used for the statuses.
# [*url*]
#   URL the HTTP output posts to. Mandatory; the catalog fails when undef.
# [*user*]
#   User name for the HTTP output.
# [*password*]
#   Password for the HTTP output.
# [*ensure*]
#   Accepted but not referenced in the class body.
#
class lma_collector::nagios (
  $openstack_deployment_name = '',
  $url                       = undef,
  $user                      = $lma_collector::params::nagios_user,
  $password                  = $lma_collector::params::nagios_password,
  $ensure                    = present,
) inherits lma_collector::params {
  include lma_collector::service

  if $url == undef {
    fail('url parameter is undef!')
  }
  validate_string($url)

  # This must be the same logic as in lma-infra-alerting-plugin
  $nagios_host  = $lma_collector::params::nagios_hostname_service_status
  $_nagios_host = "${nagios_host}-env${openstack_deployment_name}"

  # Build the encoder configuration as a new hash. Assigning a key on the
  # params hash in place would modify shared data and is rejected by newer
  # Puppet parsers.
  $config = merge(
    $lma_collector::params::nagios_event_status_name_to_service_name_map,
    { 'nagios_host' => $_nagios_host }
  )

  heka::encoder::sandbox { 'nagios':
    config_dir => $lma_collector::params::config_dir,
    filename   => "${lma_collector::params::plugins_dir}/encoders/event_nagios.lua",
    config     => $config,
    notify     => Class['lma_collector::service'],
  }

  heka::output::http { 'nagios':
    config_dir      => $lma_collector::params::config_dir,
    url             => $url,
    message_matcher => 'Type == \'heka.sandbox.status\'',
    username        => $user,
    password        => $password,
    encoder         => 'nagios',
    timeout         => $lma_collector::params::nagios_timeout,
    headers         => {'Content-Type' => 'application/x-www-form-urlencoded'},
    require         => Heka::Encoder::Sandbox['nagios'],
    notify          => Class['lma_collector::service'],
  }
}

View File

@ -54,7 +54,7 @@ class lma_collector::params {
$hekad_max_process_inject = 2
# We inject as many messages as the number of OpenStack services in the Heka
# filter 'service_accumulator_states'. Currently 9 services.
# filter 'service_accumulator_states'. Currently 10 services.
# Hekad default is fine so far with 10 messages allowed from TimerEvent function
$hekad_max_timer_inject = 10
@ -102,9 +102,14 @@ class lma_collector::params {
$heartbeat_timeout = 30
$service_status_timeout = 65
$service_status_payload_name = 'service_status'
# Catch all metrics used to compute OpenStack service statuses
$service_status_metrics_regexp = [
'^openstack.(nova|cinder|neutron).(services|agents).*(up|down|disabled)$',
'^haproxy.backend.*.servers.(down|up)$',
# Exception for mysqld backend because the MySQL service status is
# computed by a dedicated filter and this avoids to send an annoying
# status Heka message.
'^haproxy.backend.(horizon|nova|cinder|neutron|ceilometer|keystone|swift|heat|glance|radosgw)(-.+)?.servers.(down|up)$',
'^pacemaker.resource.vip__public.active$',
'^openstack.*check_api$'
]
@ -149,4 +154,29 @@ class lma_collector::params {
'sahara' => 'sahara-api',
'swift' => 'swift-api',
}
# Nagios parameters
#
# Defaults used by lma_collector::nagios and by the manifest that declares it.
$nagios_server = 'localhost'
# Port and path used to build the Nagios URL in 'local' mode:
# http://<server>:<port>/<path>
$nagios_http_port = 80
$nagios_http_path = 'nagios3/cgi-bin/cmd.cgi'
# Default credentials of lma_collector::nagios
$nagios_user = 'nagiosadmin'
$nagios_password = ''
# Timeout given to the 'nagios' HTTP output
$nagios_timeout = 2
# Base Nagios host name for service statuses; lma_collector::nagios appends
# "-env<deployment name>" to it.
# The following parameter must match lma_infrastructure_alerting::params::nagios_openstack_dummy_hostname
$nagios_hostname_service_status = '00-openstack-services'
# Passed as configuration to the 'nagios' encoder, which looks up the
# message's service field in it to get the reported service name.
# The following parameter must match lma_infrastructure_alerting::params::openstack_core_services
$nagios_event_status_name_to_service_name_map = {
'nova' => 'openstack.nova.status',
'keystone' => 'openstack.keystone.status',
'glance' => 'openstack.glance.status',
'cinder' => 'openstack.cinder.status',
'neutron' => 'openstack.neutron.status',
'heat' => 'openstack.heat.status',
'horizon' => 'openstack.horizon.status',
'swift' => 'openstack.swift.status',
'ceilometer' => 'openstack.ceilometer.status',
'radosgw' => 'openstack.radosgw.status',
}
}

View File

@ -87,6 +87,7 @@ attributes:
type: "text"
regex: *not_empty_parameter
restrictions: *disable_influxdb_parameters
influxdb_password:
value: 'lmapass'
label: 'InfluxDB password'
@ -104,3 +105,48 @@ attributes:
restrictions:
- condition: "true"
action: "hide"
# Alerting settings. The URL, user and password fields are disabled unless
# nagios_mode is 'remote' (see the not_remote restriction).
# NOTE(review): keys are written at the left margin with relative nesting;
# re-indent to the depth of the sibling attributes when applying.
nagios_mode:
  type: "radio"
  weight: 90
  value: "local"
  label: "Alerting"
  values:
    - data: "disabled"
      label: "Disabled"
    - data: "local"
      label: "Local node"
    - data: "remote"
      label: "Remote server"
  restrictions: *all_disabled_msg
nagios_url:
  value: ''
  label: 'Nagios URL'
  description: 'ie: http://<server>/nagios3/cgi-bin/cmd.cgi'
  weight: 100
  type: "text"
  # The optional 'www.' prefix uses a plain '?' quantifier: the possessive
  # form '?+' is not valid in JavaScript or Python 2 regular expressions.
  regex: &node_url_regex
    source: '^(http(s?):\/\/)?(((www\.)?[a-zA-Z0-9\.\-\_]+(\.[a-zA-Z]{2,6}){0,})|(\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b))(\/[a-zA-Z0-9\_\-\s\.\/\?\%\#\&\=]*)?$'
    error: "Invalid URL"
  restrictions: &not_remote
    - condition: "settings:lma_collector.nagios_mode.value != 'remote'"
      action: "disable"
nagios_user:
  value: 'nagiosadmin'
  label: 'Nagios user'
  description: ''
  weight: 105
  type: "text"
  regex: *not_empty_parameter
  restrictions: *not_remote
nagios_password:
  value: ''
  label: 'Nagios password'
  description: ''
  weight: 110
  type: "password"
  regex: *not_empty_parameter
  restrictions: *not_remote