diff --git a/deployment_scripts/puppet/manifests/controller.pp b/deployment_scripts/puppet/manifests/controller.pp index 8757a29c1..0b6a2b68f 100644 --- a/deployment_scripts/puppet/manifests/controller.pp +++ b/deployment_scripts/puppet/manifests/controller.pp @@ -259,9 +259,6 @@ if hiera('lma::collector::influxdb::server', false) { class { 'lma_collector::logs::http_metrics': } class { 'lma_collector::logs::aggregated_http_metrics': } - - # AFD filters - class { 'lma_collector::afd::workers': } } $alerting_mode = $lma_collector['alerting_mode'] diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb index d2b11a82e..497d521a2 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb @@ -2427,6 +2427,506 @@ lma_collector: periods: 0 function: last + # Following are the AFD generated to check workers + # All workers are down + - name: 'nova-scheduler-all-down' + description: 'All Nova schedulers are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'scheduler' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-cert-all-down' + description: 'All Nova certs are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'cert' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-consoleauth-all-down' + description: 'All Nova consoleauths are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'consoleauth' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - 
name: 'nova-compute-all-down' + description: 'All Nova computes are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'compute' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-conductor-all-down' + description: 'All Nova conductors are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'conductor' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-scheduler-all-down' + description: 'All Cinder schedulers are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services + fields: + service: 'scheduler' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-volume-all-down' + description: 'All Cinder volumes are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services + fields: + service: 'volume' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-l3-all-down' + description: 'All Neutron L3 agents are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'l3' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-dhcp-all-down' + description: 'All Neutron DHCP agents are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'dhcp' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-metadata-all-down' + description: 'All Neutron metadata agents are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: 
openstack_neutron_agents + fields: + service: 'metadata' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-openvswitch-all-down' + description: 'All Neutron openvswitch agents are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'openvswitch' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + # At least one backend is down + - name: 'nova-scheduler-one-down' + description: 'At least one Nova scheduler is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'scheduler' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-cert-one-down' + description: 'At least one Nova cert is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'cert' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-consoleauth-one-down' + description: 'At least one Nova consoleauth is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'consoleauth' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-compute-one-down' + description: 'At least one Nova compute is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'compute' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-conductor-one-down' + description: 'At least one Nova conductor is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'conductor' + 
state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-scheduler-one-down' + description: 'At least one Cinder scheduler is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services + fields: + service: 'scheduler' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-volume-one-down' + description: 'At least one Cinder volume is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services + fields: + service: 'volume' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-l3-one-down' + description: 'At least one L3 agent is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'l3' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-dhcp-one-down' + description: 'At least one DHCP agent is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'dhcp' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-metadata-one-down' + description: 'At least one metadata agent is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'metadata' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-openvswitch-one-down' + description: 'At least one openvswitch agent is down' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'openvswitch' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: 
last + # At most 50% of the services are up (compared to the total of up and down). + - name: 'nova-scheduler-majority-down' + description: 'Less than 50% of Nova schedulers are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'scheduler' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-cert-majority-down' + description: 'Less than 50% of Nova certs are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'cert' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-consoleauth-majority-down' + description: 'Less than 50% of Nova consoleauths are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'consoleauth' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-compute-majority-down' + description: 'Less than 50% of Nova computes are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'compute' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-conductor-majority-down' + description: 'Less than 50% of Nova conductors are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'conductor' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'cinder-scheduler-majority-down' + description: 'Less than 50% of Cinder schedulers are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services_percent + fields: + service: 'scheduler' + state: 'up' + 
relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'cinder-volume-majority-down' + description: 'Less than 50% of Cinder volumes are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services_percent + fields: + service: 'volume' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-l3-majority-down' + description: 'Less than 50% of Neutron L3 agents are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents_percent + fields: + service: 'l3' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-dhcp-majority-down' + description: 'Less than 50% of Neutron DHCP agents are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents_percent + fields: + service: 'dhcp' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-metadata-majority-down' + description: 'Less than 50% of Neutron metadata agents are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents_percent + fields: + service: 'metadata' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-openvswitch-majority-down' + description: 'Less than 50% of Neutron openvswitch agents are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents_percent + fields: + service: 'openvswitch' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + # Definition of the AFD node filters node_cluster_alarms: controller: @@ -2620,6 +3120,51 @@ lma_collector: activate_alerting: true alarms: error: ['nova-logs-error'] + nova-cert: + apply_to_node: controller + 
enable_notification: false + activate_alerting: true + alarms: + workers: + - 'nova-cert-all-down' + - 'nova-cert-majority-down' + - 'nova-cert-one-down' + nova-consoleauth: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'nova-consoleauth-all-down' + - 'nova-consoleauth-majority-down' + - 'nova-consoleauth-one-down' + nova-compute: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'nova-compute-all-down' + - 'nova-compute-majority-down' + - 'nova-compute-one-down' + nova-conductor: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'nova-conductor-all-down' + - 'nova-conductor-majority-down' + - 'nova-conductor-one-down' + nova-scheduler: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'nova-scheduler-all-down' + - 'nova-scheduler-majority-down' + - 'nova-scheduler-one-down' heat-api: apply_to_node: controller enable_notification: false @@ -2742,6 +3287,24 @@ lma_collector: activate_alerting: true alarms: error: ['cinder-logs-error'] + cinder-scheduler: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'cinder-scheduler-all-down' + - 'cinder-scheduler-majority-down' + - 'cinder-scheduler-one-down' + cinder-volume: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'cinder-volume-all-down' + - 'cinder-volume-majority-down' + - 'cinder-volume-one-down' <% if not @storage_options["volumes_ceph"] then -%> cinder-volume-logs: apply_to_node: storage @@ -2813,6 +3376,42 @@ lma_collector: activate_alerting: true alarms: error: ['neutron-logs-error'] + neutron-l3: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'neutron-l3-all-down' + - 'neutron-l3-majority-down' + - 
'neutron-l3-one-down' + neutron-dhcp: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'neutron-dhcp-all-down' + - 'neutron-dhcp-majority-down' + - 'neutron-dhcp-one-down' + neutron-metadata: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'neutron-metadata-all-down' + - 'neutron-metadata-majority-down' + - 'neutron-metadata-one-down' + neutron-openvswitch: + apply_to_node: controller + enable_notification: false + activate_alerting: true + alarms: + workers: + - 'neutron-openvswitch-all-down' + - 'neutron-openvswitch-majority-down' + - 'neutron-openvswitch-one-down' neutron-logs-compute: apply_to_node: compute enable_notification: false diff --git a/deployment_scripts/puppet/modules/lma_collector/README.md b/deployment_scripts/puppet/modules/lma_collector/README.md index e8c92579a..71639645f 100644 --- a/deployment_scripts/puppet/modules/lma_collector/README.md +++ b/deployment_scripts/puppet/modules/lma_collector/README.md @@ -449,7 +449,6 @@ Public Classes: * [`lma_collector::notifications::metrics`](#class-lma_collectornotificationsmetrics) * [`lma_collector::aggregator::client`](#class-lma_collectoraggregatorclient) * [`lma_collector::aggregator::server`](#class-lma_collectoraggregatorserver) -* [`lma_collector::afd::workers`](#class-lma_collectorafdworkers) * [`lma_collector::gse_policies`](#class-lma_collectorgse_policies) * [`lma_collector::metrics::heka_monitoring`](#class-lma_collectormetricsheka_monitoring) * [`lma_collector::smtp_alert`](#class-lma_collectorsmtp_alert) @@ -870,12 +869,6 @@ Declare this class to make Heka run the aggregator service. to check the health of the aggregator service. Valid options: an integer. Default: `undef`. -#### Class: `lma_collector::afd::workers` - -Declare this class to configure the Heka filter that sends AFD metrics -reporting the availability of the Neutron agents and the Cinder and Nova -services. 
- #### Class: `lma_collector::gse_policies` Declare this class to configure the GSE cluster policies on the aggregator node. diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/afd_workers.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/afd_workers.lua deleted file mode 100644 index 8eeb2223d..000000000 --- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/afd_workers.lua +++ /dev/null @@ -1,94 +0,0 @@ --- Copyright 2015 Mirantis, Inc. --- --- Licensed under the Apache License, Version 2.0 (the "License"); --- you may not use this file except in compliance with the License. --- You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, software --- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --- See the License for the specific language governing permissions and --- limitations under the License. 
- -require 'string' - -local afd = require 'afd' -local consts = require 'gse_constants' - -local worker_states = {} - --- emit AFD event metrics based on openstack_nova_services, openstack_cinder_services and openstack_neutron_agents metrics -function process_message() - local metric_name = read_message('Fields[name]') - local service = string.format('%s-%s', - string.match(metric_name, 'openstack_([^_]+)'), - read_message('Fields[service]')) - local worker_key = string.format('%s.%s', metric_name, service) - - if not worker_states[worker_key] then - worker_states[worker_key] = {} - end - - local worker = worker_states[worker_key] - worker[read_message('Fields[state]')] = read_message('Fields[value]') - - local state = consts.OKAY - if not(worker.up and worker.down) then - -- not enough data for now - return 0 - end - - if worker.up == 0 then - state = consts.DOWN - afd.add_to_alarms(consts.DOWN, - 'last', - metric_name, - {service=service,state='up'}, - {}, - '==', - worker.up, - 0, - nil, - nil, - string.format("All instances for the service %s are down or disabled", service)) - elseif worker.down >= worker.up then - state = consts.CRIT - afd.add_to_alarms(consts.CRIT, - 'last', - metric_name, - {service=service,state='down'}, - {}, - '>=', - worker.down, - worker.up, - nil, - nil, - string.format("More instances of %s are down than up", service)) - elseif worker.down > 0 then - state = consts.WARN - afd.add_to_alarms(consts.WARN, - 'last', - metric_name, - {service=service,state='down'}, - {}, - '>', - worker.down, - 0, - nil, - nil, - string.format("At least one %s instance is down", service)) - end - - afd.inject_afd_service_metric(service, - state, - read_message('Fields[hostname]'), - 0, - 'workers') - - -- reset the cache for this worker - worker_states[worker_key] = {} - - return 0 -end diff --git a/deployment_scripts/puppet/modules/lma_collector/manifests/afd/workers.pp b/deployment_scripts/puppet/modules/lma_collector/manifests/afd/workers.pp deleted 
file mode 100644 index 2a51513bf..000000000 --- a/deployment_scripts/puppet/modules/lma_collector/manifests/afd/workers.pp +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2015 Mirantis, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -class lma_collector::afd::workers () { - include lma_collector::params - include lma_collector::service::metric - - $lua_modules_dir = $lma_collector::params::lua_modules_dir - - $metrics_matcher = join([ - '(Type == \'metric\' || Type == \'heka.sandbox.metric\')', ' && ', - 'Fields[name] =~ /^openstack_(nova|cinder|neutron)_(services|agents)$/', - ], '') - - heka::filter::sandbox { 'afd_workers': - config_dir => $lma_collector::params::metric_config_dir, - filename => "${lma_collector::params::plugins_dir}/filters/afd_workers.lua", - message_matcher => $metrics_matcher, - module_directory => $lua_modules_dir, - notify => Class['lma_collector::service::metric'], - } -} diff --git a/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_afd_workers_spec.rb b/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_afd_workers_spec.rb deleted file mode 100644 index 45c06ca81..000000000 --- a/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_afd_workers_spec.rb +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2015 Mirantis, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -require 'spec_helper' - -describe 'lma_collector::afd::workers' do - let(:facts) do - {:kernel => 'Linux', :operatingsystem => 'Ubuntu', - :osfamily => 'Debian'} - end - - describe 'with defaults' do - it { is_expected.to contain_heka__filter__sandbox('afd_workers') } - end -end -