From 3a3ef6f2e3c15e4f38852206017857d6dd5bb968 Mon Sep 17 00:00:00 2001 From: Simon Pasquier Date: Fri, 29 Jul 2016 12:24:08 +0200 Subject: [PATCH] Add Pacemaker collectd plugin This change adds a collectd plugin that gets metrics from the Pacemaker cluster: - cluster metrics - node metrics - resource metrics Most of the metrics are only collected from the node that is the designated controller, except for pacemaker_resource_local_active and pacemaker_dc_local_active. This change also removes the 'pacemaker_resource' plugin because the new plugin provides the exact same metrics and notifications for the other collectd plugins. Finally, the plugin is also installed on the standalone-rabbitmq and standalone-database nodes if they are present. Change-Id: I8b5b987704f69c6a60b13e8ea982f27924f488d1 --- deployment_scripts/puppet/manifests/base.pp | 24 ++ .../puppet/manifests/controller.pp | 34 +- .../puppet/modules/lma_collector/README.md | 11 +- .../files/collectd/collectd_base.py | 4 +- .../files/collectd/collectd_pacemaker.py | 306 ++++++++++++++++++ .../files/collectd/pacemaker_resource.py | 80 ----- .../files/plugins/decoders/collectd.lua | 18 +- .../manifests/collectd/pacemaker.pp | 79 ++--- .../lma_collector_collectd_pacemaker_spec.rb | 32 +- doc/user/source/metrics/pacemaker.rst | 64 +++- 10 files changed, 476 insertions(+), 176 deletions(-) create mode 100644 deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_pacemaker.py delete mode 100644 deployment_scripts/puppet/modules/lma_collector/files/collectd/pacemaker_resource.py diff --git a/deployment_scripts/puppet/manifests/base.pp b/deployment_scripts/puppet/manifests/base.pp index ccbd812dc..69efb41df 100644 --- a/deployment_scripts/puppet/manifests/base.pp +++ b/deployment_scripts/puppet/manifests/base.pp @@ -357,6 +357,30 @@ if hiera('lma::collector::influxdb::server', false) { } } + if ($is_rabbitmq or $is_mysql_server) and !
$is_controller { + if $is_mysql_server { + $mysql_resource = { + 'p_mysqld' => 'mysqld', + } + } + else { + $mysql_resource = {} + } + if $is_rabbitmq { + $rabbitmq_resource = { + 'p_rabbitmq-server' => 'rabbitmq', + } + } + else { + $rabbitmq_resource = {} + } + + class { 'lma_collector::collectd::pacemaker': + resources => merge($rabbitmq_resource, $mysql_resource), + hostname => $::hostname, + } + } + class { 'lma_collector::influxdb': server => hiera('lma::collector::influxdb::server'), port => hiera('lma::collector::influxdb::port'), diff --git a/deployment_scripts/puppet/manifests/controller.pp b/deployment_scripts/puppet/manifests/controller.pp index 2e306cbd9..f6c915825 100644 --- a/deployment_scripts/puppet/manifests/controller.pp +++ b/deployment_scripts/puppet/manifests/controller.pp @@ -21,6 +21,7 @@ $network_metadata = hiera_hash('network_metadata') $node_profiles = hiera_hash('lma::collector::node_profiles') $is_rabbitmq = $node_profiles['rabbitmq'] +$is_mysql_server = $node_profiles['mysql'] $ceilometer = hiera_hash('ceilometer', {}) $lma_collector = hiera_hash('lma_collector') @@ -304,15 +305,34 @@ if hiera('lma::collector::influxdb::server', false) { } $pacemaker_master_resource = 'vip__management' + # Deal with detach-* plugins + if $is_mysql_server { + $mysql_resource = { + 'p_mysqld' => 'mysqld', + } + } + else { + $mysql_resource = {} + } + if $is_rabbitmq { + $rabbitmq_resource = { + 'p_rabbitmq-server' => 'rabbitmq', + } + } + else { + $rabbitmq_resource = {} + } + class { 'lma_collector::collectd::pacemaker': - resources => [ - 'vip__public', - 'vip__management', - 'vip__vrouter_pub', - 'vip__vrouter', - ], - master_resource => $pacemaker_master_resource, + resources => merge({ + 'vip__public' => 'vip__public', + 'vip__management' => 'vip__management', + 'vip__vrouter_pub' => 'vip__vrouter_pub', + 'vip__vrouter' => 'vip__vrouter', + 'p_haproxy' => 'haproxy', + }, $mysql_resource, $rabbitmq_resource), + notify_resource => $pacemaker_master_resource, hostname => $::fqdn, } diff --git a/deployment_scripts/puppet/modules/lma_collector/README.md b/deployment_scripts/puppet/modules/lma_collector/README.md index 0ff8f7aa8..d6c21ed89 100644 --- a/deployment_scripts/puppet/modules/lma_collector/README.md +++ b/deployment_scripts/puppet/modules/lma_collector/README.md @@ -737,15 +737,14 @@ which uses Pacemaker's `crm_resource` command to get statistics from Pacemaker. ##### Parameters * `resources`: *Required*. The Pacemaker resources to get statistics for. Valid - options: an array of strings. -* `master_resource`: *Optional*. If this is set a collectd `PostCache` chain is - created to generate a collectd notification each time the Python plugin - generates a metric for the Pacemaker resource identified to by - `master_resource`. Users of + options: a hash of strings. +* `notify_resource`: *Optional*. If this is set, the collectd plugin generates + a collectd notification reporting the state of the Pacemaker resource + identified by `notify_resource`. Users of [`lma_collector::collectd::openstack`](#define-lma_collectorcollectdopenstack), [`lma_collector::collectd::openstack_checks`](#class-lma_collectorcollectdopenstackchecks) and [`lma_collector::collectd::hypervisor`](#class-lma_collectorcollectdhypervisor) - with the `pacemaker_resource_master` parameter needs to declare the + with the `notify_resource` parameter need to declare the `lma_collector::collectd::pacemaker` class and use that parameter. Valid options: a string. Default: `undef`. * `hostname`: *Optional*.
If this is set it will be used to identify the local diff --git a/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_base.py b/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_base.py index 157ca8966..f042628d2 100644 --- a/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_base.py +++ b/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_base.py @@ -109,13 +109,14 @@ class Base(object): """Iterate over the collected metrics This class must be implemented by the subclass and should yield dict - objects that represent the collected values. Each dict has 3 keys: + objects that represent the collected values. Each dict has 6 keys: - 'values', a scalar number or a list of numbers if the type defines several datasources. - 'type_instance' (optional) - 'plugin_instance' (optional) - 'type' (optional, default='gauge') - 'meta' (optional) + - 'hostname' (optional) For example: @@ -141,6 +142,7 @@ class Base(object): v = self.collectd.Values( plugin=self.plugin, + host=metric.get('hostname', ''), type=metric.get('type', 'gauge'), plugin_instance=self.plugin_instance, type_instance=type_instance, diff --git a/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_pacemaker.py b/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_pacemaker.py new file mode 100644 index 000000000..83ef58d95 --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_pacemaker.py @@ -0,0 +1,306 @@ +#!/usr/bin/python +# Copyright 2016 Mirantis, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collectd +from collections import Counter +from collections import defaultdict +from sets import Set +import socket +import xml.etree.ElementTree as ET + +import collectd_base as base + +NAME = 'pacemaker' +CRM_MON_BINARY = '/usr/sbin/crm_mon' + +# Node status +OFFLINE_STATUS = 0 +MAINTENANCE_STATUS = 1 +ONLINE_STATUS = 2 + + +class CrmMonitorPlugin(base.Base): + + def __init__(self, *args, **kwargs): + super(CrmMonitorPlugin, self).__init__(*args, **kwargs) + self.plugin = NAME + self.crm_mon_binary = CRM_MON_BINARY + self.hostname = socket.getfqdn() + self.notify_resource = None + self.resources = {} + self.history = {} + + def config_callback(self, conf): + super(CrmMonitorPlugin, self).config_callback(conf) + + for node in conf.children: + if node.key == 'Hostname': + self.hostname = node.values[0] + elif node.key == 'CrmMonBinary': + self.crm_mon_binary = node.values[0] + elif node.key == 'Resource': + self.resources[node.values[0]] = node.values[-1] + elif node.key == 'NotifyResource': + self.notify_resource = node.values[0] + + def itermetrics(self): + def str_to_bool(v): + return str(v).lower() == 'true' + + def str_to_boolint(v): + if str_to_bool(v): + return 1 + else: + return 0 + + def shorten_hostname(v): + return v.split('.')[0] + + def same_hostname(v): + if v is not None and v.get('name') == self.hostname: + return 1 + return 0 + + out, err = self.execute([self.crm_mon_binary, '--as-xml', '-r', '-f'], + shell=False) + if not out: + raise base.CheckException( + "Failed to execute crm_mon '{}'".format(err)) + + try: + root = ET.fromstring(out) + except ET.ParseError: + raise base.CheckException( + "Failed to parse XML '{}'".format(out[:64])) + + if self.notify_resource: + # Notify the other collectd plugins whether the resource runs + # locally or not + node = root.find('resources/resource[@id="{}"]/node'.format( + self.notify_resource)) + self.collectd.Notification( + type='gauge', + message='{{"resource":"{}","value":{}}}'.format( + self.notify_resource, same_hostname(node)), + severity=self.collectd.NOTIF_OKAY + ).dispatch() + # The metric needs to be emitted too for the Lua plugins executed + # by the metric_collector service + yield { + 'type_instance': 'resource_local_active', + 'values': same_hostname(node), + 'meta': {'resource': self.notify_resource} + } + + summary = root.find('summary') + current_dc = summary.find('current_dc') + # The metric needs to be emitted for the alarms that leverage the other + # metrics emitted by the plugin + yield { + 'type_instance': 'dc_local_active', + 'values': same_hostname(current_dc), + } + + if current_dc.get('name') != self.hostname: + # The other metrics are only collected from the cluster's DC + return + + # Report global cluster metrics + yield { + 'type_instance': 'dc', + 'values': str_to_boolint(current_dc.get('present', 'false')) + } + + yield { + 'type_instance': 'quorum_status', + 'values': str_to_boolint(current_dc.get('with_quorum', 'false')) + } + yield { + 'type_instance': 'configured_nodes', + 'values': int(summary.find('nodes_configured').get('number')) + } + yield { + 'type_instance': 'configured_resources', + 'values': int(summary.find('resources_configured').get('number')) + } + + # Report node status metrics + cluster_nodes = [] + aggregated_nodes_status = {'online': 0, 'offline': 0, 'maintenance': 0} + nodes_total = 0 + for node in root.find('nodes').iter('node'): + nodes_total += 1 + hostname = shorten_hostname(node.get('name')) + cluster_nodes.append(node.get('name')) + if 
str_to_bool(node.get('online')): + if str_to_bool(node.get('maintenance')): + aggregated_nodes_status['maintenance'] += 1 + yield { + 'type_instance': 'node_status', + 'values': MAINTENANCE_STATUS, + 'hostname': hostname, + 'meta': {'status': 'maintenance'} + } + else: + aggregated_nodes_status['online'] += 1 + yield { + 'type_instance': 'node_status', + 'values': ONLINE_STATUS, + 'hostname': hostname, + 'meta': {'status': 'online'} + } + else: + aggregated_nodes_status['offline'] += 1 + yield { + 'type_instance': 'node_status', + 'values': OFFLINE_STATUS, + 'hostname': hostname, + 'meta': {'status': 'offline'} + } + + for status, cnt in aggregated_nodes_status.items(): + yield { + 'type_instance': 'nodes_count', + 'values': cnt, + 'meta': {'status': status} + } + yield { + 'type_instance': 'nodes_percent', + 'values': 100.0 * cnt / nodes_total, + 'meta': {'status': status} + } + + # Report the number of resources per status + # Clone resources can run on multiple nodes while "simple" resources run + # on only one node at a time + aggregated_resources = defaultdict(Counter) + resources = root.find('resources') + for resource_id, resource_name in self.resources.iteritems(): + resource_elts = [] + simple_resource = None + clone_resource = resources.find( + 'clone/resource[@id="{}"]/..'.format(resource_id)) + if not clone_resource: + simple_resource = resources.find('resource[@id="{}"]'.format( + resource_id)) + if simple_resource: + resource_elts = [simple_resource] + else: + resource_elts = clone_resource.findall('resource') + + if not resource_elts: + self.logger.error("{}: Couldn't find resource '{}'".format( + self.plugin, resource_id)) + continue + + total = 0 + for item in resource_elts: + total += 1 + if (item.get('role') in ('Slave', 'Master') and + not str_to_bool(item.get('failed'))): + # Multi-master resource + aggregated_resources[resource_name]['up'] += 1 + elif item.get('role') == 'Started': + aggregated_resources[resource_name]['up'] += 1 + else: + aggregated_resources[resource_name]['down'] += 1 + + if simple_resource: + # Report on which node the "simple" resource is running + for node in cluster_nodes: + yield { + 'type_instance': 'local_resource_active', + 'values': str_to_boolint( + node == simple_resource.find('node').get('name')), + 'hostname': shorten_hostname(node), + 'meta': {'resource': resource_name} + } + + for status in ('up', 'down'): + cnt = aggregated_resources[resource_name][status] + yield { + 'type_instance': 'resource_count', + 'values': cnt, + 'meta': {'status': status, 'resource': resource_name} + } + yield { + 'type_instance': 'resource_percent', + 'values': 100.0 * cnt / total, + 'meta': {'status': status, 'resource': resource_name} + } + + # Collect operations' history metrics for the monitored resources + # + # The reported count for the resource's operations is an approximate + # value because crm_mon doesn't provide the exact number. To estimate + # the number of operations applied to a resource, the plugin keeps a + # copy of call_ids and compares it with the current value.
+ for node in root.find('node_history').iter('node'): + hostname = shorten_hostname(node.get('name')) + if hostname not in self.history: + self.history[hostname] = {} + + for resource_id, resource_name in self.resources.iteritems(): + if resource_id not in self.history[hostname]: + self.history[hostname][resource_id] = { + 'fail_count': 0, + 'ops_count': 0, + 'call_ids': Set([]) + } + v = self.history[hostname][resource_id] + + res_history = node.find('resource_history[@id="{}"]'.format( + resource_id)) + if res_history: + # For simple resources, the resource_history element only + # exists for the node that runs the resource + v['fail_count'] += int(res_history.get('fail-count', 0)) + call_ids = Set([ + i.get('call') for i in res_history.findall( + 'operation_history')]) + if call_ids: + v['ops_count'] += len(call_ids - v['call_ids']) + v['call_ids'] = call_ids + + yield { + 'type_instance': 'resource_failures', + 'values': v['fail_count'], + 'hostname': hostname, + 'meta': {'resource': resource_name} + } + yield { + 'type_instance': 'resource_operations', + 'values': v['ops_count'], + 'hostname': hostname, + 'meta': {'resource': resource_name} + } + + +plugin = CrmMonitorPlugin(collectd) + + +def init_callback(): + plugin.restore_sigchld() + + +def config_callback(conf): + plugin.config_callback(conf) + + +def read_callback(): + plugin.read_callback() + +collectd.register_config(config_callback) +collectd.register_read(read_callback) diff --git a/deployment_scripts/puppet/modules/lma_collector/files/collectd/pacemaker_resource.py b/deployment_scripts/puppet/modules/lma_collector/files/collectd/pacemaker_resource.py deleted file mode 100644 index c4b08ba11..000000000 --- a/deployment_scripts/puppet/modules/lma_collector/files/collectd/pacemaker_resource.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/python -# Copyright 2015 Mirantis, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import collectd -import socket - -import collectd_base as base - -NAME = 'pacemaker_resource' -CRM_RESOURCE_BIN = '/usr/sbin/crm_resource' - - -class PacemakerResourcePlugin(base.Base): - - def __init__(self, *args, **kwargs): - super(PacemakerResourcePlugin, self).__init__(*args, **kwargs) - self.plugin = NAME - self.crm_resource_bin = CRM_RESOURCE_BIN - self.hostname = socket.getfqdn() - self.resources = [] - - def config_callback(self, conf): - super(PacemakerResourcePlugin, self).config_callback(conf) - - for node in conf.children: - if node.key == 'Resource': - self.resources.extend(node.values) - elif node.key == 'Hostname': - self.hostname = node.values[0] - elif node.key == 'CrmResourceBin': - self.crm_resource_bin = node.values[0] - - def itermetrics(self): - for resource in self.resources: - out, err = self.execute([self.crm_resource_bin, '--locate', - '--quiet', '--resource', resource], - shell=False) - if not out: - msg = "{}: Failed to get the status for '%s'".format( - self.plugin, resource) - raise base.CheckException(msg) - - else: - value = 0 - if self.hostname == out.lstrip("\n"): - value = 1 - yield { - 'type_instance': resource, - 'values': value - } - -plugin = PacemakerResourcePlugin(collectd, 'pacemaker') - - -def init_callback(): - plugin.restore_sigchld() - - -def config_callback(conf): - plugin.config_callback(conf) - - -def read_callback(): - plugin.read_callback() - -collectd.register_init(init_callback) -collectd.register_config(config_callback) -collectd.register_read(read_callback) diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua index e9fdffc3a..b95894b82 100644 --- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua +++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua @@ -306,10 +306,20 @@ function process_message () msg['Fields'][additional_tag] = sample['type_instance'] end end - elseif metric_source == 'pacemaker_resource' then - msg['Fields']['name'] = 'pacemaker_local_resource_active' - msg['Fields']['tag_fields'] = { 'resource' } - msg['Fields']['resource'] = sample['type_instance'] + elseif metric_source == 'pacemaker' then + msg['Fields']['name'] = metric_source .. sep .. 
sample['type_instance'] + + -- add dimension fields + local t = {} + for _, v in ipairs({'status', 'resource'}) do + if sample['meta'] and sample['meta'][v] then + t[#t+1] = v + msg['Fields'][v] = sample['meta'][v] + end + end + if #t > 0 then + msg['Fields']['tag_fields'] = t + end elseif metric_source == 'users' then -- 'users' is a reserved name for InfluxDB v0.9 msg['Fields']['name'] = 'logged_users' diff --git a/deployment_scripts/puppet/modules/lma_collector/manifests/collectd/pacemaker.pp b/deployment_scripts/puppet/modules/lma_collector/manifests/collectd/pacemaker.pp index b6a997e24..bd0ba6914 100644 --- a/deployment_scripts/puppet/modules/lma_collector/manifests/collectd/pacemaker.pp +++ b/deployment_scripts/puppet/modules/lma_collector/manifests/collectd/pacemaker.pp @@ -15,71 +15,40 @@ class lma_collector::collectd::pacemaker ( $resources, - $master_resource = undef, + $notify_resource = undef, $hostname = undef, ) { - validate_array($resources) + validate_hash($resources) - # Add quotes around the array values - $real_resources = suffix(prefix($resources, '"'), '"') + # Add quotes around the hash keys and values + $resources_keys = suffix(prefix(keys($resources), '"'), '"') + $resources_values = suffix(prefix(values($resources), '"'), '"') + $real_resources = hash(flatten(zip($resources_keys, $resources_values))) if $hostname { - $config = { - 'Resource' => $real_resources, - 'Hostname' => "\"${hostname}\"", - } + $_hostname = {'Hostname' => "\"${hostname}\""} } else { - $config = { - 'Resource' => $real_resources, - } + $_hostname = {} + } + if $notify_resource { + $_notify_resource = {'NotifyResource' => "\"${notify_resource}\""} + } else { + $_notify_resource = {} } - lma_collector::collectd::python { 'pacemaker_resource': - config => $config + lma_collector::collectd::python { 'collectd_pacemaker': + config => merge({'Resource' => $real_resources}, $_hostname, $_notify_resource) } - if $master_resource { - - if ! member($resources, $master_resource) { - fail("${master_resource} not a member of ${resources}") - } - - # Configure a PostCache chain to create a collectd notification each time - # the pacemaker_resource plugin generates a metric whose "type instance" - # matches the resource specified by the $master_resource parameter. - # - # The notifications are caught by other plugins to know the state of that - # Pacemaker resource. 
- - collectd::plugin { 'target_notification': } - collectd::plugin { 'match_regex': } - - class { 'collectd::plugin::chain': - chainname => 'PostCache', - defaulttarget => 'write', - rules => [ - { - 'match' => { - 'type' => 'regex', - 'matches' => { - 'Plugin' => '^pacemaker_resource$', - 'TypeInstance' => "^${master_resource}$", - }, - }, - 'targets' => [ - { - 'type' => 'notification', - 'attributes' => { - 'Message' => '{\"resource\":\"%{type_instance}\",\"value\":%{ds:value}}', - 'Severity' => 'OKAY', - }, - }, - ], - }, - ], - } - + # Remove configuration bits from versions < 1.0 + collectd::plugin { 'target_notification': + ensure => absent + } + collectd::plugin { 'match_regex': + ensure => absent + } + class { 'collectd::plugin::chain': + ensure => absent } - } diff --git a/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_collectd_pacemaker_spec.rb b/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_collectd_pacemaker_spec.rb index 12f1e5ac9..29e89aaa0 100644 --- a/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_collectd_pacemaker_spec.rb +++ b/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_collectd_pacemaker_spec.rb @@ -20,34 +20,26 @@ describe 'lma_collector::collectd::pacemaker' do end describe 'with "resources" param' do - let(:params) {{:resources => ['vip__public', 'vip__management']}} - it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \ - .with_config({'Resource' => ['"vip__public"', '"vip__management"']}) } - it { is_expected.not_to contain_collectd__plugin('target_notification') } - it { is_expected.not_to contain_collectd__plugin('match_regex') } - it { is_expected.not_to contain_class('collectd::plugin::chain') } + let(:params) {{:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'}}} + it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \ + .with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'}}) } end describe 'with "hostname" param' do - let(:params) {{:resources => ['vip__public', 'vip__management'], + let(:params) {{:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'}, :hostname => 'foo.example.com'}} - it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \ - .with_config({'Resource' => ['"vip__public"', '"vip__management"'], + it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \ + .with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'}, 'Hostname' => '"foo.example.com"'}) } - it { is_expected.not_to contain_collectd__plugin('target_notification') } - it { is_expected.not_to contain_collectd__plugin('match_regex') } - it { is_expected.not_to contain_class('collectd::plugin::chain') } end - describe 'with "master_resource" param' do + describe 'with "notify_resource" param' do let(:params) do - {:resources => ['vip__public', 'vip__management'], - :master_resource => 'vip__management'} + {:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'}, + :notify_resource => 'vip__management'} end - it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \ - .with_config({'Resource' => ['"vip__public"', '"vip__management"'],}) } - it { is_expected.to contain_collectd__plugin('target_notification') } - it { is_expected.to contain_collectd__plugin('match_regex') } - it { is_expected.to 
contain_class('collectd::plugin::chain') } + it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \ + .with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'}, + "NotifyResource"=>"\"vip__management\""}) } end end diff --git a/doc/user/source/metrics/pacemaker.rst b/doc/user/source/metrics/pacemaker.rst index 9da3a8184..ed0ad5439 100644 --- a/doc/user/source/metrics/pacemaker.rst +++ b/doc/user/source/metrics/pacemaker.rst @@ -1,9 +1,67 @@ .. _pacemaker-metrics: -Resource location -^^^^^^^^^^^^^^^^^ +Cluster +^^^^^^^ -* ``pacemaker_resource_local_active``, ``1`` when the resource is located on +* ``pacemaker_dc_local_active``, ``1`` when the Designated Controller (DC) is + the local host, if not, then ``0``. + +* ``pacemaker_dc`` [#f1]_, ``1`` when the Designated Controller (DC) is + present, if not, then ``0``. +* ``pacemaker_quorum_status`` [#f1]_, ``1`` when the cluster's quorum is + reached, if not, then ``0``. +* ``pacemaker_configured_nodes`` [#f1]_, the number of configured nodes in the + cluster. +* ``pacemaker_configured_resources`` [#f1]_, the number of configured resources in + the cluster. + +.. [#f1] this metric is only emitted from the node that is the Designated + Controller (DC) of the Pacemaker cluster. + +Node +^^^^ +The following metrics are only emitted from the node that is the Designated +Controller (DC) of the Pacemaker cluster. They have a ``status`` field which is +one of 'offline', 'maintenance', or 'online': + +* ``pacemaker_node_status``, the status of the node, ``0`` when offline, ``1`` + when in maintenance or ``2`` when online. +* ``pacemaker_nodes_count``, the total number of nodes with the given + ``status``. +* ``pacemaker_nodes_percent``, the percentage of nodes with the given + ``status``. + +Resource +^^^^^^^^ + +* ``pacemaker_resource_local_active``, ``1`` when the resource is located on the host reporting the metric, if not, then ``0``. The metric contains a ``resource`` field which is one of 'vip__public', 'vip__management', 'vip__vrouter_pub', or 'vip__vrouter'. + +* ``pacemaker_resource_failures`` [#f2]_, the total number of failures that + Pacemaker detected for the ``resource``. The counter is reset every time the + collector restarts. The metric contains a ``resource`` field which is one of + 'vip__management', 'vip__public', 'vip__vrouter_pub', 'vip__vrouter', + 'rabbitmq', 'mysqld' or 'haproxy'. + +* ``pacemaker_resource_operations`` [#f2]_, the total number of operations that + Pacemaker applied to the ``resource``. The counter is reset every time the + collector restarts. The metric contains a ``resource`` field which is one of + 'vip__management', 'vip__public', 'vip__vrouter_pub', 'vip__vrouter', + 'rabbitmq', 'mysqld' or 'haproxy'. + +The following metrics have ``resource`` and ``status`` fields. + +``status`` is one of 'up' or 'down'. + +``resource`` is one of 'vip__management', 'vip__public', 'vip__vrouter_pub', +'vip__vrouter', 'rabbitmq', 'mysqld' or 'haproxy'. + +* ``pacemaker_resource_count`` [#f2]_, the total number of instances for the given + ``status`` and ``resource``. +* ``pacemaker_resource_percent`` [#f2]_, the percentage of instances for the given + ``status`` and ``resource``. + +.. [#f2] this metric is only emitted from the node that is the Designated + Controller (DC) of the Pacemaker cluster.
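For readers who want to see how the new plugin maps ``crm_mon`` output to the metrics listed above, the short sketch below (not part of the patch) mimics the lookups done by ``collectd_pacemaker.py`` on the output of ``crm_mon --as-xml -r -f``. The XML sample and host names are hypothetical and trimmed down to the attributes that the plugin actually reads::

    import xml.etree.ElementTree as ET

    # Hypothetical, truncated ``crm_mon --as-xml`` output.
    CRM_MON_XML = """
    <crm_mon version="1.1.12">
      <summary>
        <current_dc present="true" with_quorum="true" name="node-1.example.com"/>
        <nodes_configured number="2" expected_votes="unknown"/>
        <resources_configured number="2"/>
      </summary>
      <nodes>
        <node name="node-1.example.com" online="true" maintenance="false"/>
        <node name="node-2.example.com" online="false" maintenance="false"/>
      </nodes>
      <resources>
        <resource id="vip__management" role="Started" failed="false">
          <node name="node-1.example.com" id="1" cached="false"/>
        </resource>
      </resources>
    </crm_mon>
    """

    root = ET.fromstring(CRM_MON_XML)

    # Cluster metrics (pacemaker_dc, pacemaker_quorum_status, ...) come from
    # the <summary> element.
    dc = root.find('summary').find('current_dc')
    print('dc present={0} quorum={1}'.format(dc.get('present'),
                                             dc.get('with_quorum')))

    # Node metrics (pacemaker_node_status, ...) come from the <nodes> element.
    for node in root.find('nodes').iter('node'):
        print('{0} online={1} maintenance={2}'.format(
            node.get('name'), node.get('online'), node.get('maintenance')))

    # pacemaker_resource_local_active compares the <node> child of the
    # monitored resource with the local hostname.
    running_on = root.find('resources/resource[@id="vip__management"]/node')
    print('vip__management runs on {0}'.format(running_on.get('name')))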