# fuel-plugin-lma-collector/deployment_scripts/puppet/modules/fuel_lma_collector/templates/clusters.yaml.erb
---
lma_collector:
  # The GSE policies are applied to clusters and describe how the GSE
  # cluster filter computes the cluster's status. The LMA toolchain is
  # pre-configured with a set of policies, but everything can be customized:
  # thresholds can be modified, new policies can be defined, and so on.
#
  # A policy consists of a list of rules which are evaluated against the
  # current status of the cluster's members. When one of the rules matches,
  # the cluster's status gets the value associated with that rule and the
  # evaluation stops. The last rule of the list is usually a catch-all rule
  # that defines the default status when no other rule matches.
#
# The declaration of a policy rule is similar to an alarm rule except that:
  # - there are no 'metric', 'window' or 'period' parameters.
  # - only two functions are supported:
  #   - 'count', which returns the number of members that match the passed value(s)
  #   - 'percent', which returns the percentage of members that match the passed value(s)
#
# The following rule definition reads as "the cluster's status is critical if
# more than 50% of its members are either down or critical":
#
# - status: critical
# trigger:
# logical_operator: or
# rules:
# - function: percent
# arguments: [ down, critical ]
# relational_operator: '>'
# threshold: 50
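  #
  # A complete policy is an ordered list of such rules. As an illustration,
  # a custom 'at_least_one_okay' policy (hypothetical, not shipped with the
  # toolchain) could be defined as follows, with a catch-all rule at the end:
  #
  #   at_least_one_okay:
  #     - status: okay
  #       trigger:
  #         logical_operator: or
  #         rules:
  #           - function: count
  #             arguments: [ okay ]
  #             relational_operator: '>'
  #             threshold: 0
  #     - status: critical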
#
gse_policies:
# A policy defining that the cluster's status depends on the member with the
# highest severity, typically used for a cluster of services.
highest_severity:
- status: down
trigger:
logical_operator: or
rules:
- function: count
arguments: [ down ]
relational_operator: '>'
threshold: 0
- status: critical
trigger:
logical_operator: or
rules:
- function: count
arguments: [ critical ]
relational_operator: '>'
threshold: 0
- status: warning
trigger:
logical_operator: or
rules:
- function: count
arguments: [ warning ]
relational_operator: '>'
threshold: 0
- status: okay
trigger:
logical_operator: or
rules:
- function: count
arguments: [ okay ]
relational_operator: '>'
threshold: 0
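      # Catch-all rule: when no other rule matches, the cluster's status is
      # unknown.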
- status: unknown
# A policy which is typically used for clusters managed by Pacemaker
# with the no-quorum-policy set to 'stop'.
majority_of_members:
- status: down
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ down ]
relational_operator: '>'
threshold: 50
- status: critical
trigger:
logical_operator: and
rules:
- function: percent
arguments: [ down, critical ]
relational_operator: '>'
threshold: 20
- function: percent
arguments: [ okay ]
relational_operator: '<'
threshold: 50
- status: warning
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ okay ]
relational_operator: '<'
threshold: 50
- status: okay
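  # The GSE plugin that computes the status of the service clusters from the
  # AFD service metrics emitted by the nodes.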
gse_cluster_service:
input_message_types:
- afd_service_metric
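    # whether or not the input messages are received through the aggregator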
aggregator_flag: true
# the field in the input messages to identify the cluster
cluster_field: service
# the field in the input messages to identify the cluster's member
member_field: source
output_message_type: gse_service_cluster_metric
output_metric_name: cluster_service_status
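    # 'interval' is the period (in seconds) at which the cluster statuses are
    # computed and emitted, and 'warm_up_period' is the delay (in seconds)
    # after startup before the plugin starts emitting metrics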
interval: 10
warm_up_period: 20
clusters:
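      # Each cluster definition supports the following parameters:
      #   policy:   the GSE policy used to compute the cluster's status
      #   group_by: how the AFD metrics are grouped before the policy is
      #             evaluated, either by 'member' or by 'hostname'
      #   members:  the list of AFD members that make up the cluster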
nova-logs:
policy: highest_severity
group_by: hostname
members:
- error
heat-logs:
policy: highest_severity
group_by: hostname
members:
- error
cinder-logs:
policy: highest_severity
group_by: hostname
members:
- error
glance-logs:
policy: highest_severity
group_by: hostname
members:
- error
neutron-logs:
policy: highest_severity
group_by: hostname
members:
- error
keystone-logs:
policy: highest_severity
group_by: hostname
members:
- error
mysqld-tcp:
policy: highest_severity
group_by: member
members:
- backends
mysql:
policy: majority_of_members
group_by: hostname
members:
- node-status
- check
haproxy-openstack:
policy: majority_of_members
group_by: hostname
members:
- check
apache:
policy: majority_of_members
group_by: hostname
members:
- worker
- check
memcached-service:
policy: majority_of_members
group_by: hostname
members:
- check
rabbitmq-cluster:
policy: highest_severity
group_by: member
members:
- memory
- disk
- queue
- pacemaker
rabbitmq-service:
        # A check failure on a single node doesn't mean that the whole
        # cluster is down. This is why the 'hostname' group_by and the
        # 'majority_of_members' policy are used here.
policy: majority_of_members
group_by: hostname
members:
- check
nova-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
nova-novncproxy-websocket:
policy: highest_severity
group_by: member
members:
- backends
nova-metadata-api:
policy: highest_severity
group_by: member
members:
- backends
nova-scheduler:
policy: highest_severity
group_by: member
members:
- workers
nova-cert:
policy: highest_severity
group_by: member
members:
- workers
nova-consoleauth:
policy: highest_severity
group_by: member
members:
- workers
nova-compute:
policy: highest_severity
group_by: member
members:
- workers
nova-conductor:
policy: highest_severity
group_by: member
members:
- workers
      # The AFDs are emitted by all the controller nodes because all of them
      # collect the openstack_nova_instance_creation_time metric. Hence the
      # service is considered not okay only when a majority of the
      # controllers report a not-okay status.
nova-instances:
policy: majority_of_members
group_by: hostname
members:
- creation-time
cinder-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
cinder-v2-api:
policy: highest_severity
group_by: member
members:
# Cinder V2 backends are in fact the same as the Cinder backends
- endpoint
cinder-scheduler:
policy: highest_severity
group_by: member
members:
- workers
cinder-volume:
policy: highest_severity
group_by: member
members:
- workers
neutron-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
neutron-l3:
policy: highest_severity
group_by: member
members:
- workers
neutron-dhcp:
policy: highest_severity
group_by: member
members:
- workers
neutron-metadata:
policy: highest_severity
group_by: member
members:
- workers
neutron-openvswitch:
policy: highest_severity
group_by: member
members:
- workers
keystone-response-time:
policy: highest_severity
group_by: member
members:
- duration
keystone-public-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
keystone-admin-api:
policy: highest_severity
group_by: member
members:
# TODO(pasquier-s): add a metric reporting the status of the keystone-admin-api endpoint
- backends
- http_errors
glance-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
glance-registry-api:
policy: highest_severity
group_by: member
members:
- backends
heat-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
heat-cfn-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
heat-cloudwatch-api:
policy: highest_severity
group_by: member
members:
- backends
<% if @tls_enabled then -%>
horizon-https:
policy: highest_severity
group_by: member
members:
- backends
<% else -%>
horizon-ui:
policy: highest_severity
group_by: member
members:
- backends
<% end -%>
<% if not @storage_options["objects_ceph"] then -%>
swift-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
swift-s3-api:
policy: highest_severity
group_by: member
members:
# Swift S3 backends are in fact the same as the Swift backends
- endpoint
<% end -%>
<% if @ceilometer_enabled -%>
ceilometer-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
<% end -%>
<% if @storage_options["volumes_ceph"] then -%>
ceph-mon-cluster:
policy: highest_severity
group_by: member
members:
- health
- capacity
ceph-mon-service:
policy: majority_of_members
group_by: hostname
members:
- check
ceph-osd-service:
policy: majority_of_members
group_by: hostname
members:
- check
<% end -%>
<% if @monitor_elasticsearch -%>
elasticsearch-cluster:
policy: highest_severity
group_by: member
members:
- health
elasticsearch-service:
policy: majority_of_members
group_by: hostname
members:
- check
<% end -%>
<% if @monitor_influxdb -%>
influxdb-cluster:
policy: highest_severity
group_by: member
members:
- endpoint
influxdb-service:
policy: majority_of_members
group_by: hostname
members:
- check
<% end -%>
pacemaker-service:
policy: majority_of_members
group_by: hostname
members:
- check
libvirt-service:
policy: majority_of_members
group_by: hostname
members:
- check
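  # The GSE plugin that computes the status of the node clusters from the AFD
  # node metrics emitted by the nodes.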
gse_cluster_node:
input_message_types:
- afd_node_metric
aggregator_flag: true
# the field in the input messages to identify the cluster
cluster_field: node_role
# the field in the input messages to identify the cluster's member
member_field: source
output_message_type: gse_node_cluster_metric
output_metric_name: cluster_node_status
interval: 10
warm_up_period: 80
clusters:
controller:
policy: majority_of_members
group_by: hostname
members:
- cpu
- network-rx
- network-tx
- root-fs
- log-fs
- swap
- hdd-errors
<% if @detach_rabbitmq_enabled -%>
rabbitmq-nodes:
policy: majority_of_members
group_by: hostname
members:
- cpu
- network-rx
- network-tx
- root-fs
- swap
- hdd-errors
<% end -%>
mysql-nodes:
policy: majority_of_members
group_by: hostname
members:
<% if @detach_database_enabled -%>
- cpu
- network-rx
- network-tx
- root-fs
- swap
- hdd-errors
<% end -%>
- mysql-fs
compute:
policy: majority_of_members
group_by: hostname
members:
- cpu
- network-rx
- network-tx
- root-fs
- nova-fs
- swap
- hdd-errors
storage:
policy: majority_of_members
group_by: hostname
members:
- cpu
- network-rx
- network-tx
- root-fs
- swap
- hdd-errors
<% if @monitor_elasticsearch -%>
elasticsearch-nodes:
policy: majority_of_members
group_by: hostname
members:
- data-fs
- root-fs
- cpu
- network-rx
- network-tx
- swap
- hdd-errors
<% end -%>
<% if @monitor_influxdb -%>
influxdb-nodes:
policy: majority_of_members
group_by: hostname
members:
- data-fs
- root-fs
- cpu
- network-rx
- network-tx
- swap
- hdd-errors
<% end -%>
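  # The top-level GSE plugin that combines the service and node cluster
  # metrics into one global status per cluster. There is no 'cluster_field'
  # here because the clusters are defined statically below; members are
  # identified by the 'cluster_name' field of the input messages.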
gse_cluster_global:
input_message_types:
- gse_service_cluster_metric
- gse_node_cluster_metric
aggregator_flag: false
# the field in the input messages to identify the cluster's member
member_field: cluster_name
output_message_type: gse_cluster_metric
output_metric_name: cluster_status
interval: 10
warm_up_period: 30
clusters:
mysql:
policy: highest_severity
group_by: member
members:
- mysqld-tcp
- mysql
- mysql-nodes
<% if not @detach_database_enabled -%>
- controller
<% end -%>
haproxy-openstack:
policy: highest_severity
group_by: member
members:
- haproxy-openstack
- controller
apache:
policy: highest_severity
group_by: member
members:
- apache
- controller
memcached:
policy: highest_severity
group_by: member
members:
- memcached-service
- controller
rabbitmq:
policy: highest_severity
group_by: member
members:
- rabbitmq-cluster
- rabbitmq-service
<% if @detach_rabbitmq_enabled -%>
- rabbitmq-nodes
<% else -%>
- controller
<% end -%>
nova:
policy: highest_severity
group_by: member
members:
- nova-logs
- nova-api
- nova-metadata-api
- nova-scheduler
- nova-compute
- nova-conductor
- nova-cert
- nova-consoleauth
- nova-novncproxy-websocket
- nova-instances
- controller
- compute
- libvirt-service
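        # 'hints' lists other clusters whose status is not taken into account
        # to compute this cluster's status, but which may be reported as a
        # possible root cause when this cluster isn't healthy.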
hints:
- cinder
- glance
- keystone
- neutron
- mysql
- rabbitmq
cinder:
policy: highest_severity
group_by: member
members:
- cinder-logs
- cinder-api
- cinder-v2-api
- cinder-scheduler
- cinder-volume
- controller
- storage
hints:
- keystone
- mysql
- rabbitmq
neutron:
policy: highest_severity
group_by: member
members:
- neutron-logs
- neutron-api
<% if not @contrail_plugin then -%>
- neutron-l3
- neutron-dhcp
- neutron-metadata
- neutron-openvswitch
<% end -%>
- controller
hints:
- keystone
- mysql
- rabbitmq
keystone:
policy: highest_severity
group_by: member
members:
- keystone-logs
- keystone-response-time
- keystone-public-api
- keystone-admin-api
- controller
hints:
- mysql
- apache
glance:
policy: highest_severity
group_by: member
members:
- glance-logs
- glance-api
- glance-registry-api
- controller
hints:
- keystone
- mysql
heat:
policy: highest_severity
group_by: member
members:
- heat-logs
- heat-api
- heat-cfn-api
- heat-cloudwatch-api
- controller
hints:
- cinder
- glance
- keystone
- neutron
- nova
- mysql
- rabbitmq
horizon:
policy: highest_severity
group_by: member
members:
<% if @tls_enabled then -%>
- horizon-https
<% else -%>
- horizon-ui
<% end -%>
- controller
hints:
- keystone
- apache
<% if not @storage_options["objects_ceph"] then -%>
swift:
policy: highest_severity
group_by: member
members:
- swift-api
- swift-s3-api
- controller
hints:
- keystone
<% end -%>
<% if @ceilometer_enabled -%>
ceilometer:
policy: highest_severity
group_by: member
members:
- ceilometer-api
- controller
hints:
- keystone
- rabbitmq
<% end -%>
<% if @storage_options["volumes_ceph"] then -%>
ceph:
policy: highest_severity
group_by: member
members:
- ceph-mon-cluster
- ceph-mon-service
- ceph-osd-service
- controller
- storage
<% end -%>
<% if @monitor_elasticsearch -%>
elasticsearch:
policy: highest_severity
group_by: member
members:
- elasticsearch-cluster
- elasticsearch-service
- elasticsearch-nodes
<% end -%>
<% if @monitor_influxdb -%>
influxdb:
policy: highest_severity
group_by: member
members:
- influxdb-cluster
- influxdb-service
- influxdb-nodes
<% end -%>
pacemaker:
policy: highest_severity
group_by: member
members:
- pacemaker-service