diff --git a/doc/user/source/appendix_alarms.rst b/doc/user/source/appendix_alarms.rst index e3302cda5..8169649d9 100644 --- a/doc/user/source/appendix_alarms.rst +++ b/doc/user/source/appendix_alarms.rst @@ -48,6 +48,50 @@ The following is a list of StackLight built-in alarms:: window: 120 periods: 0 function: avg + - name: 'swap-usage-critical' + description: 'There is no more swap free space' + severity: 'critical' + enabled: 'true' + trigger: + logical_operator: 'or' + rules: + - metric: swap_free + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: max + - name: 'swap-activity-warning' + description: 'The swap activity is high' + severity: 'warning' + enabled: 'true' + trigger: + logical_operator: 'or' + rules: + - metric: swap_io_in + relational_operator: '>=' + threshold: 1048576 # 1 Mb/s + window: 120 + periods: 0 + function: avg + - metric: swap_io_out + relational_operator: '>=' + threshold: 1048576 # 1 Mb/s + window: 120 + periods: 0 + function: avg + - name: 'swap-usage-warning' + description: 'The swap free space is low' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: swap_percent_used + relational_operator: '>=' + threshold: 0.8 + window: 60 + periods: 0 + function: avg - name: 'cpu-critical-compute' description: 'The CPU usage is too high (compute node)' severity: 'critical' @@ -186,6 +230,9 @@ The following is a list of StackLight built-in alarms:: - name: 'rabbitmq-disk-limit-critical' description: 'RabbitMQ has reached the free disk threshold. All producers are blocked' severity: 'critical' + # If the local RabbitMQ instance is down, it will be caught by the + # rabbitmq-check alarm + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -199,6 +246,9 @@ The following is a list of StackLight built-in alarms:: - name: 'rabbitmq-disk-limit-warning' description: 'RabbitMQ is getting close to the free disk threshold' severity: 'warning' + # If the local RabbitMQ instance is down, it will be caught by the + # rabbitmq-check alarm + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -212,6 +262,9 @@ The following is a list of StackLight built-in alarms:: - name: 'rabbitmq-memory-limit-critical' description: 'RabbitMQ has reached the memory threshold. All producers are blocked' severity: 'critical' + # If the local RabbitMQ instance is down, it will be caught by the + # rabbitmq-check alarm + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -225,6 +278,9 @@ The following is a list of StackLight built-in alarms:: - name: 'rabbitmq-memory-limit-warning' description: 'RabbitMQ is getting close to the memory threshold' severity: 'warning' + # If the local RabbitMQ instance is down, it will be caught by the + # rabbitmq-check alarm + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -238,6 +294,9 @@ The following is a list of StackLight built-in alarms:: - name: 'rabbitmq-queue-warning' description: 'The number of outstanding messages is too high' severity: 'warning' + # If the local RabbitMQ instance is down, it will be caught by the + # rabbitmq-check alarm + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -248,6 +307,57 @@ The following is a list of StackLight built-in alarms:: window: 120 periods: 0 function: avg + - name: 'rabbitmq-pacemaker-down' + description: 'The RabbitMQ cluster is down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + logical_operator: 'and' + rules: + - metric: pacemaker_resource_percent + fields: + resource: rabbitmq + status: up + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'rabbitmq-pacemaker-critical' + description: 'The RabbitMQ cluster is critical because less than half of the nodes are up' + severity: 'critical' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + logical_operator: 'and' + rules: + - metric: pacemaker_resource_percent + fields: + resource: rabbitmq + status: up + relational_operator: '<' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'rabbitmq-pacemaker-warning' + description: 'The RabbitMQ cluster is degraded because some RabbitMQ nodes are missing' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + logical_operator: 'and' + rules: + - metric: pacemaker_resource_percent + fields: + resource: rabbitmq + status: up + relational_operator: '<' + threshold: 100 + window: 60 + periods: 0 + function: last - name: 'apache-warning' description: 'There is no Apache idle workers available' severity: 'warning' @@ -261,6 +371,18 @@ The following is a list of StackLight built-in alarms:: window: 60 periods: 0 function: min + - name: 'apache-check' + description: 'Apache cannot be checked' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: apache_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last - name: 'log-fs-warning' description: "The log filesystem's free space is low" severity: 'warning' @@ -271,7 +393,7 @@ The following is a list of StackLight built-in alarms:: fields: fs: '/var/log' relational_operator: '<' - threshold: 10 + threshold: 10 window: 60 periods: 0 function: min @@ -285,7 +407,7 @@ The following is a list of StackLight built-in alarms:: fields: fs: '/var/log' relational_operator: '<' - threshold: 5 + threshold: 5 window: 60 periods: 0 function: min @@ -299,7 +421,7 @@ The following is a list of StackLight built-in alarms:: fields: fs: '/' relational_operator: '<' - threshold: 5 + threshold: 10 window: 60 periods: 0 function: min @@ -313,7 +435,7 @@ The following is a list of StackLight built-in alarms:: fields: fs: '/' relational_operator: '<' - threshold: 2 + threshold: 5 window: 60 periods: 0 function: min @@ -327,7 +449,7 @@ The following is a list of StackLight built-in alarms:: fields: fs: '/var/lib/mysql' relational_operator: '<' - threshold: 5 + threshold: 10 window: 60 periods: 0 function: min @@ -341,7 +463,7 @@ The following is a list of StackLight built-in alarms:: fields: fs: '/var/lib/mysql' relational_operator: '<' - threshold: 2 + threshold: 5 window: 60 periods: 0 function: min @@ -373,6 +495,54 @@ The following is a list of StackLight built-in alarms:: window: 60 periods: 0 function: min + - name: 'other-fs-warning' + description: "The filesystem's free space is low" + severity: 'warning' + enabled: 'true' + no_data_policy: 'okay' + trigger: + rules: + - metric: fs_space_percent_free + fields: + fs: '!= /var/lib/nova && != /var/log && != /var/lib/mysql && != / && !~ ceph%-%d+$' + group_by: [fs] + relational_operator: '<' + threshold: 10 + window: 60 + periods: 0 + function: min + - name: 'other-fs-critical' + description: "The filesystem's free space is too low" + severity: 'critical' + enabled: 'true' + no_data_policy: 'okay' + trigger: + rules: + - metric: fs_space_percent_free + fields: + fs: '!= /var/lib/nova && != /var/log && != /var/lib/mysql && != / && !~ ceph%-%d+$' + group_by: [fs] + relational_operator: '<' + threshold: 5 + window: 60 + periods: 0 + function: min + - name: 'osd-disk-critical' + description: "The filesystem's free space is too low (OSD disk)" + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: fs_space_percent_free + fields: + # Real FS is /var/lib/ceph/osd/ceph-0 but Collectd substituted '/' by '-' + fs: '=~ ceph/%d+$' + group_by: [fs] + relational_operator: '<' + threshold: 5 + window: 60 + periods: 0 + function: min - name: 'nova-api-http-errors' description: 'Too many 5xx HTTP errors have been detected on nova-api' severity: 'warning' @@ -391,6 +561,7 @@ The following is a list of StackLight built-in alarms:: - name: 'nova-logs-error' description: 'Too many errors have been detected in Nova logs' severity: 'warning' + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -422,6 +593,7 @@ The following is a list of StackLight built-in alarms:: - name: 'heat-logs-error' description: 'Too many errors have been detected in Heat logs' severity: 'warning' + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -444,12 +616,29 @@ The following is a list of StackLight built-in alarms:: rules: - metric: haproxy_backend_response_5xx fields: - backend: 'swift-api' + backend: 'swift-api || object-storage' relational_operator: '>' threshold: 0 window: 60 periods: 1 function: diff + - name: 'swift-logs-error' + description: 'Too many errors have been detected in Swift logs' + severity: 'warning' + no_data_policy: 'okay' + enabled: 'true' + trigger: + logical_operator: 'or' + rules: + - metric: log_messages + fields: + service: 'swift' + level: 'error' + relational_operator: '>' + threshold: 0.1 + window: 70 + periods: 0 + function: max - name: 'cinder-api-http-errors' description: 'Too many 5xx HTTP errors have been detected on cinder-api' severity: 'warning' @@ -468,6 +657,7 @@ The following is a list of StackLight built-in alarms:: - name: 'cinder-logs-error' description: 'Too many errors have been detected in Cinder logs' severity: 'warning' + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -499,6 +689,7 @@ The following is a list of StackLight built-in alarms:: - name: 'glance-logs-error' description: 'Too many errors have been detected in Glance logs' severity: 'warning' + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -530,6 +721,7 @@ The following is a list of StackLight built-in alarms:: - name: 'neutron-logs-error' description: 'Too many errors have been detected in Neutron logs' severity: 'warning' + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -543,6 +735,24 @@ The following is a list of StackLight built-in alarms:: window: 70 periods: 0 function: max + - name: 'keystone-response-time-duration' + description: 'Keystone API is too slow' + severity: 'warning' + no_data_policy: 'okay' + enabled: 'true' + trigger: + logical_operator: 'or' + rules: + - metric: openstack_keystone_http_response_times + fields: + http_method: '== GET || == POST' + http_status: '!= 5xx' + relational_operator: '>' + threshold: 0.3 + window: 60 + periods: 0 + value: upper_90 + function: max - name: 'keystone-public-api-http-errors' description: 'Too many 5xx HTTP errors have been detected on keystone-public-api' severity: 'warning' @@ -573,9 +783,25 @@ The following is a list of StackLight built-in alarms:: window: 60 periods: 1 function: diff + - name: 'horizon-web-http-errors' + description: 'Too many 5xx HTTP errors have been detected on horizon' + severity: 'warning' + enabled: 'true' + trigger: + logical_operator: 'or' + rules: + - metric: haproxy_backend_response_5xx + fields: + backend: 'horizon-web || horizon-https' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 1 + function: diff - name: 'keystone-logs-error' description: 'Too many errors have been detected in Keystone logs' severity: 'warning' + no_data_policy: 'okay' enabled: 'true' trigger: logical_operator: 'or' @@ -691,7 +917,7 @@ The following is a list of StackLight built-in alarms:: fields: fs: '/opt/es/data' # Real FS is /opt/es-data but Collectd substituted '/' by '-' relational_operator: '<' - threshold: 20 + threshold: 20 # The low watermark for disk usage is 85% by default window: 60 periods: 0 function: min @@ -705,7 +931,7 @@ The following is a list of StackLight built-in alarms:: fields: fs: '/opt/es/data' # Real FS is /opt/es-data but Collectd substituted '/' by '-' relational_operator: '<' - threshold: 15 + threshold: 15 # The high watermark for disk usage is 90% by default window: 60 periods: 0 function: min @@ -736,4 +962,1998 @@ The following is a list of StackLight built-in alarms:: threshold: 5 window: 60 periods: 0 - function: min \ No newline at end of file + function: min + - name: 'haproxy-check' + description: "HAProxy cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'rabbitmq-check' + description: "RabbitMQ cannot be checked" + # This alarm's severity is warning because the effective status of the + # RabbitMQ cluster is computed by rabbitmq-pacemaker-* alarms. + # This alarm is still useful because it will report the node(s) on which + # RabbitMQ isn't running. + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: rabbitmq_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'ceph-mon-check' + description: "Ceph monitor cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: ceph_mon_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'ceph-osd-check' + description: "Ceph OSD cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: ceph_osd_check + relational_operator: '==' + threshold: 0 + window: 80 # The metric interval collection is 60s + periods: 0 + function: last + - name: 'pacemaker-check' + description: "Pacemaker cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: pacemaker_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'elasticsearch-check' + description: "Elasticsearch cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: elasticsearch_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'influxdb-check' + description: "InfluxDB cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: influxdb_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'libvirt-check' + description: "Libvirt cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: libvirt_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'memcached-check' + description: "memcached cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: memcached_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'mysql-check' + description: "MySQL cannot be checked" + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: mysql_check + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'network-warning-dropped-rx' + description: "Some received packets have been dropped" + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: if_dropped_rx + relational_operator: '>' + threshold: 100 + window: 60 + periods: 0 + function: avg + - name: 'network-critical-dropped-rx' + description: "Too many received packets have been dropped" + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: if_dropped_rx + relational_operator: '>' + threshold: 1000 + window: 60 + periods: 0 + function: avg + - name: 'network-warning-dropped-tx' + description: "Some transmitted packets have been dropped" + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: if_dropped_tx + relational_operator: '>' + threshold: 100 + window: 60 + periods: 0 + function: avg + - name: 'network-critical-dropped-tx' + description: "Too many transmitted packets have been dropped" + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: if_dropped_tx + relational_operator: '>' + threshold: 1000 + function: avg + window: 60 + - name: 'instance-creation-time-warning' + description: "Instance creation takes too much time" + severity: 'warning' + no_data_policy: 'okay' # This is a sporadic metric + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_instance_creation_time + relational_operator: '>' + threshold: 20 + window: 600 + periods: 0 + function: avg + - name: 'hdd-errors-critical' + description: 'Errors on hard drive(s) have been detected' + severity: 'critical' + enabled: 'true' + no_data_policy: okay + trigger: + rules: + - metric: hdd_errors_rate + group_by: ['device'] + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: max + - name: 'total-nova-free-vcpu-warning' + description: 'There is none VCPU available for new instances' + severity: 'warning' + enabled: 'true' + no_data_policy: skip # the metric is only collected from the aggregator node + trigger: + rules: + - metric: openstack_nova_total_free_vcpus + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: max + - name: 'total-nova-free-memory-warning' + description: 'There is none memory available for new instances' + severity: 'warning' + enabled: 'true' + no_data_policy: skip # the metric is only collected from the aggregator node + trigger: + rules: + - metric: openstack_nova_total_free_ram + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: max + + # Adds alarm on local check for OpenStack services endpoint + - name: 'cinder-api-local-endpoint' + description: 'Cinder API is locally down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_check_local_api + fields: + service: 'cinder-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'glance-api-local-endpoint' + description: 'Glance API is locally down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_check_local_api + fields: + service: 'glance-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-api-local-endpoint' + description: 'Heat API is locally down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_check_local_api + fields: + service: 'heat-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-cfn-api-local-endpoint' + description: 'Heat CFN API is locally down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_check_local_api + fields: + service: 'heat-cfn-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'keystone-public-api-local-endpoint' + description: 'Keystone public API is locally down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_check_local_api + fields: + service: 'keystone-public-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-api-local-endpoint' + description: 'Neutron API is locally down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_check_local_api + fields: + service: 'neutron-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-api-local-endpoint' + description: 'Nova API is locally down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_check_local_api + fields: + service: 'nova-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'swift-api-local-endpoint' + description: 'Swift API is locally down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: openstack_check_local_api + fields: + service: 'swift-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + + # Following are the OpenStack service check API definitions and + # also InfluxDB API + - name: 'influxdb-api-check-failed' + description: 'Endpoint check for InfluxDB is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: http_check + fields: + service: 'influxdb-cluster' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-api-check-failed' + description: 'Endpoint check for nova-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'nova-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-api-check-failed' + description: 'Endpoint check for neutron-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'neutron-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-api-check-failed' + description: 'Endpoint check for cinder-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'cinder-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-v2-api-check-failed' + description: 'Endpoint check for cinder-v2-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'cinder-v2-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'glance-api-check-failed' + description: 'Endpoint check for glance-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'glance-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-api-check-failed' + description: 'Endpoint check for heat-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'heat-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-cfn-api-check-failed' + description: 'Endpoint check for heat-cfn-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'heat-cfn-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'swift-api-check-failed' + description: 'Endpoint check for swift-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'swift-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'swift-s3-api-check-failed' + description: 'Endpoint check for swift-s3-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'swift-s3-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'keystone-public-api-check-failed' + description: 'Endpoint check for keystone-public-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'keystone-public-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'ceilometer-api-check-failed' + description: 'Endpoint check for ceilometer-api is failed' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP + enabled: 'true' + trigger: + rules: + - metric: openstack_check_api + fields: + service: 'ceilometer-api' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + + # Following are the AFD generated to check API backends + # All backends are down + - name: 'elasticsearch-api-backends-all-down' + description: 'All Elasticsearch backends are down' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'elasticsearch-rest' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'kibana-api-backends-all-down' + description: 'All API backends are down for Kibana' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'kibana' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'influxdb-api-backends-all-down' + description: 'All API backends are down for InfluxDB' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'influxdb' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'grafana-api-backends-all-down' + description: 'All API backends are down for Grafana' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'grafana' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'glance-registry-api-backends-all-down' + description: 'All API backends are down for glance-registry-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'glance-registry-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-api-backends-all-down' + description: 'All API backends are down for nova-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'nova-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-api-backends-all-down' + description: 'All API backends are down for cinder-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'cinder-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'object-storage-api-backends-all-down' + description: 'All API backends are down for object-storage' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'object-storage' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-cfn-api-backends-all-down' + description: 'All API backends are down for heat-cfn-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'heat-cfn-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'horizon-web-api-backends-all-down' + description: 'All API backends are down for horizon-web' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'horizon-web || horizon-https' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-novncproxy-websocket-api-backends-all-down' + description: 'All API backends are down for nova-novncproxy-websocket' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'nova-novncproxy-websocket' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-api-backends-all-down' + description: 'All API backends are down for heat-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'heat-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'keystone-public-api-backends-all-down' + description: 'All API backends are down for keystone-public-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'keystone-public-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-cloudwatch-api-backends-all-down' + description: 'All API backends are down for heat-cloudwatch-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'heat-cloudwatch-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-metadata-api-backends-all-down' + description: 'All API backends are down for nova-metadata-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'nova-metadata-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'mysqld-tcp-api-backends-all-down' + description: 'All API backends are down for mysqld-tcp' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'mysqld-tcp' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'keystone-admin-api-backends-all-down' + description: 'All API backends are down for keystone-admin-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'keystone-admin-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'glance-api-backends-all-down' + description: 'All API backends are down for glance-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'glance-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-api-backends-all-down' + description: 'All API backends are down for neutron-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'neutron-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'swift-api-backends-all-down' + description: 'All API backends are down for swift-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'swift-api || object-storage' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'ceilometer-api-backends-all-down' + description: 'All API backends are down for ceilometer-api' + severity: 'down' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'ceilometer-api' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + # At least one backend is down + - name: 'elasticsearch-api-backends-one-down' + description: 'At least one API backend is down for elasticsearch' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'elasticsearch-rest' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'kibana-api-backends-one-down' + description: 'At least one API backend is down for kibana' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'kibana' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'influxdb-api-backends-one-down' + description: 'At least one API backend is down for influxdb' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'influxdb' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'grafana-api-backends-one-down' + description: 'At least one API backend is down for grafana' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'grafana' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'glance-registry-api-backends-one-down' + description: 'At least one API backend is down for glance-registry-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'glance-registry-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-api-backends-one-down' + description: 'At least one API backend is down for nova-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'nova-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-api-backends-one-down' + description: 'At least one API backend is down for cinder-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'cinder-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'object-storage-api-backends-one-down' + description: 'At least one API backend is down for object-storage' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'object-storage' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-cfn-api-backends-one-down' + description: 'At least one API backend is down for heat-cfn-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'heat-cfn-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'horizon-web-api-backends-one-down' + description: 'At least one API backend is down for horizon-web' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'horizon-web || horizon-https' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-novncproxy-websocket-api-backends-one-down' + description: 'At least one API backend is down for nova-novncproxy-websocket' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'nova-novncproxy-websocket' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-api-backends-one-down' + description: 'At least one API backend is down for heat-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'heat-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'keystone-public-api-backends-one-down' + description: 'At least one API backend is down for keystone-public-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'keystone-public-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'heat-cloudwatch-api-backends-one-down' + description: 'At least one API backend is down for heat-cloudwatch-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'heat-cloudwatch-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-metadata-api-backends-one-down' + description: 'At least one API backend is down for nova-metadata-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'nova-metadata-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'mysqld-tcp-api-backends-one-down' + description: 'At least one API backend is down for mysqld-tcp' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'mysqld-tcp' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'keystone-admin-api-backends-one-down' + description: 'At least one API backend is down for keystone-admin-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'keystone-admin-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'glance-api-backends-one-down' + description: 'At least one API backend is down for glance-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'glance-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-api-backends-one-down' + description: 'At least one API backend is down for neutron-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'neutron-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'swift-api-backends-one-down' + description: 'At least one API backend is down for swift-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'swift-api || object-storage' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'ceilometer-api-backends-one-down' + description: 'At least one API backend is down for ceilometer-api' + severity: 'warning' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers + fields: + backend: 'ceilometer-api' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + # Less than 50% of backends are up + - name: 'elasticsearch-api-backends-majority-down' + description: 'Less than 50% of backends are up for elasticsearch' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'elasticsearch-rest' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'kibana-api-backends-majority-down' + description: 'Less than 50% of backends are up for kibana' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'kibana' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'influxdb-api-backends-majority-down' + description: 'Less than 50% of backends are up for influxdb' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'influxdb' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'grafana-api-backends-majority-down' + description: 'Less than 50% of backends are up for grafana' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'grafana' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'glance-registry-api-backends-majority-down' + description: 'Less than 50% of backends are up for glance-registry-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'glance-registry-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-api-backends-majority-down' + description: 'Less than 50% of backends are up for nova-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'nova-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'cinder-api-backends-majority-down' + description: 'Less than 50% of backends are up for cinder-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'cinder-api' + state: 'up' + + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'object-storage-api-backends-majority-down' + description: 'Less than 50% of backends are up for object-storage' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'object-storage' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'heat-cfn-api-backends-majority-down' + description: 'Less than 50% of backends are up for heat-cfn-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'heat-cfn-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'horizon-web-api-backends-majority-down' + description: 'Less than 50% of backends are up for horizon-web' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'horizon-web || horizon-https' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-novncproxy-websocket-api-backends-majority-down' + description: 'Less than 50% of backends are up for nova-novncproxy-websocket' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'nova-novncproxy-websocket' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'heat-api-backends-majority-down' + description: 'Less than 50% of backends are up for heat-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'heat-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'keystone-public-api-backends-majority-down' + description: 'Less than 50% of backends are up for keystone-public-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'keystone-public-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'heat-cloudwatch-api-backends-majority-down' + description: 'Less than 50% of backends are up for heat-cloudwatch-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'heat-cloudwatch-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-metadata-api-backends-majority-down' + description: 'Less than 50% of backends are up for nova-metadata-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'nova-metadata-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'mysqld-tcp-api-backends-majority-down' + description: 'Less than 50% of backends are up for mysqld-tcp' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'mysqld-tcp' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'keystone-admin-api-backends-majority-down' + description: 'Less than 50% of backends are up for keystone-admin-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'keystone-admin-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'glance-api-backends-majority-down' + description: 'Less than 50% of backends are up for glance-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'glance-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-api-backends-majority-down' + description: 'Less than 50% of backends are up for neutron-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'neutron-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'swift-api-backends-majority-down' + description: 'Less than 50% of backends are up for swift-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'swift-api || object-storage' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'ceilometer-api-backends-majority-down' + description: 'Less than 50% of backends are up for ceilometer-api' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: haproxy_backend_servers_percent + fields: + backend: 'ceilometer-api' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + + # Following are the AFD generated to check workers + # All workers are down + - name: 'nova-scheduler-all-down' + description: 'All Nova schedulers are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'scheduler' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-cert-all-down' + description: 'All Nova certs are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'cert' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-consoleauth-all-down' + description: 'All Nova consoleauths are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'consoleauth' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-compute-all-down' + description: 'All Nova computes are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'compute' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-conductor-all-down' + description: 'All Nova conductors are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'conductor' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-scheduler-all-down' + description: 'All Cinder schedulers are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services + fields: + service: 'scheduler' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-volume-all-down' + description: 'All Cinder volumes are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services + fields: + service: 'volume' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-l3-all-down' + description: 'All Neutron L3 agents are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'l3' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-dhcp-all-down' + description: 'All Neutron DHCP agents are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'dhcp' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-metadata-all-down' + description: 'All Neutron metadata agents are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'metadata' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-openvswitch-all-down' + description: 'All Neutron openvswitch agents are down' + severity: 'down' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'openvswitch' + state: 'up' + relational_operator: '==' + threshold: 0 + window: 60 + periods: 0 + function: last + # At least one backend is down + - name: 'nova-scheduler-one-down' + description: 'At least one Nova scheduler is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'scheduler' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-cert-one-down' + description: 'At least one Nova cert is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'cert' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-consoleauth-one-down' + description: 'At least one Nova consoleauth is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'consoleauth' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-compute-one-down' + description: 'At least one Nova compute is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'compute' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'nova-conductor-one-down' + description: 'At least one Nova conductor is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services + fields: + service: 'conductor' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-scheduler-one-down' + description: 'At least one Cinder scheduler is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services + fields: + service: 'scheduler' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'cinder-volume-one-down' + description: 'At least one Cinder volume is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services + fields: + service: 'volume' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-l3-one-down' + description: 'At least one L3 agent is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'l3' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-dhcp-one-down' + description: 'At least one DHCP agent is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'dhcp' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-metadata-one-down' + description: 'At least one metadata agents is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'metadata' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + - name: 'neutron-openvswitch-one-down' + description: 'At least one openvswitch agents is down' + severity: 'warning' + no_data_policy: 'skip' # the metric is only collected from the DC node + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents + fields: + service: 'openvswitch' + state: 'down' + relational_operator: '>' + threshold: 0 + window: 60 + periods: 0 + function: last + # Less than 50% of service are up (compared to up and down). + - name: 'nova-scheduler-majority-down' + description: 'Less than 50% of Nova schedulers are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'scheduler' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-cert-majority-down' + description: 'Less than 50% of Nova certs are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'cert' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-consoleauth-majority-down' + description: 'Less than 50% of Nova consoleauths are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'consoleauth' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-compute-majority-down' + description: 'Less than 50% of Nova computes are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'compute' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'nova-conductor-majority-down' + description: 'Less than 50% of Nova conductors are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_nova_services_percent + fields: + service: 'conductor' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'cinder-scheduler-majority-down' + description: 'Less than 50% of Cinder schedulers are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services_percent + fields: + service: 'scheduler' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'cinder-volume-majority-down' + description: 'Less than 50% of Cinder volumes are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_cinder_services_percent + fields: + service: 'volume' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-l3-majority-down' + description: 'Less than 50% of Neutron L3 agents are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents_percent + fields: + service: 'l3' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-dhcp-majority-down' + description: 'Less than 50% of Neutron DHCP agents are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents_percent + fields: + service: 'dhcp' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-metadata-majority-down' + description: 'Less than 50% of Neutron metadata agents are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents_percent + fields: + service: 'metadata' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last + - name: 'neutron-openvswitch-majority-down' + description: 'Less than 50% of Neutron openvswitch agents are up' + severity: 'critical' + enabled: 'true' + trigger: + rules: + - metric: openstack_neutron_agents_percent + fields: + service: 'openvswitch' + state: 'up' + relational_operator: '<=' + threshold: 50 + window: 60 + periods: 0 + function: last diff --git a/doc/user/source/configure_alarms.rst b/doc/user/source/configure_alarms.rst index cbc850c73..0c302a97c 100644 --- a/doc/user/source/configure_alarms.rst +++ b/doc/user/source/configure_alarms.rst @@ -368,12 +368,21 @@ file. This file has the following sections: to that category of nodes. For example:: node_cluster_alarms: - controller: - cpu: ['cpu-critical-controller', 'cpu-warning-controller'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - log-fs: ['log-fs-critical', 'log-fs-warning'] + controller-nodes: + apply_to_node: controller + alerting: enabled + members: + cpu: + alarms: ['cpu-critical-controller', 'cpu-warning-controller'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + log-fs: + alarms: ['log-fs-critical', 'log-fs-warning'] + hdd-errors: + alerting: enabled_with_notification + alarms: ['hdd-errors-critical'] - Creates three alarm groups for the cluster of nodes called 'controller': + Creates four alarm groups for the cluster of controller nodes: * The *cpu* alarm group is mapped to two alarms defined in the ``alarms`` section known as the 'cpu-critical-controller' and @@ -388,6 +397,13 @@ file. This file has the following sections: section known as the 'log-fs-critical' and 'log-fs-warning' alarms. These alarms monitor the file system where the logs are created on the controller nodes. + * The *hdd-errors* alarm group is mapped to the 'hdd-errors-critical' alarm + defined in the ``alarms`` section. This alarm monitors the ``kern.log`` + log entries containing critical IO errors detected by the kernel. + The *hdd-error* alarm obtains the *enabled_with_notification* alerting + attribute, meaning that the operator will be notified if any of the + controller nodes encounters a disk failure. Other alarms do not trigger + notification per node but at an aggregated cluster level. .. note:: An *alarm group* is a mere implementation artifact (although it has functional value) that is primarily used to distribute the alarms @@ -425,7 +441,7 @@ structure of that file. important to keep exactly the same copy of ``/etc/hiera/override/gse_filters.yaml`` across all the nodes of the OpenStack environment including the node(s) where Nagios is installed. - + The aggregation rules and correlation policies are defined in the ``/etc/hiera/override/gse_filters.yaml`` configuration file. This file has the following sections: @@ -590,6 +606,7 @@ the service cluster aggregation rules:: output_metric_name: cluster_service_status interval: 10 warm_up_period: 20 + alerting: enabled_with_notification clusters: nova-api: policy: highest_severity @@ -638,6 +655,10 @@ Where | The number of seconds after a (re)start that the GSE plugin will wait before emitting its metric messages. +| alerting +| Type: string (one of 'disabled', 'enabled' or 'enabled_with_notification'). +| The alerting configuration of the service clusters. + | clusters | Type: list | The list of service clusters that the plugin handles. See @@ -720,6 +741,7 @@ cluster aggregation rules:: output_metric_name: cluster_node_status interval: 10 warm_up_period: 80 + alerting: enabled_with_notification clusters: controller: policy: majority_of_members @@ -768,6 +790,10 @@ Where | The number of seconds after a (re)start that the GSE plugin will wait before emitting its metric messages. +| alerting +| Type: string (one of 'disabled', 'enabled' or 'enabled_with_notification'). +| The alerting configuration of the node clusters. + | clusters | Type: list | The list of node clusters that the plugin handles. See diff --git a/doc/user/source/release_notes.rst b/doc/user/source/release_notes.rst index fd4e58010..838dc27fa 100644 --- a/doc/user/source/release_notes.rst +++ b/doc/user/source/release_notes.rst @@ -10,6 +10,34 @@ Release notes Version 1.0.0 +++++++++++++ +The StackLight Collector plugin 1.0.0 for Fuel contains the following updates: + +New alarms: + + * Monitor RabbitMQ based on Pacemaker point-of-view + * Monitor all partitions and OSD disk(s) + * Horizon HTTP 5xx errors + * Keystone slow response times + * HDD errors + * SWAP percent usage + * Network packet drops + * Local OpenStack API checks + * Local checks for services: Apache, Memcached, MySQL, RabbitMQ, Pacemaker + +Alarm enhancements: + + * Added the ``group by`` attribute support for alarm rules + * Added support for ``pattern matching`` to filter metric dimensions + +Bug fixes: + + * Fixed the concurrent execution of logrotate. + See `#1455104 `_. + * Implemented the capability for the Elasticsearch bulk size to increase when + required. See `#1617211 `_. + * Implemented the capability to use RabbitMQ management API in place of the + :command:`rabbitmqctl` command. + Version 0.10.0 ++++++++++++++