Nagios: Update ceph_health check

The ceph_health check in Nagios incorrectly sets both the warning and critical levels to 0. For the ceph_health_status metric, a value of 0 indicates the cluster is healthy, 1 indicates a warning, and 2 indicates an error state. The Nagios check for ceph_health is updated to reflect these values. Change-Id: Iffe80f1c34f6edee6370dd7e707e5f55f83f1ec1
2018-11-06 14:51:40 -06:00 · 2018-11-06 14:51:40 -06:00 · ba22b0e726
parent fca344900f
commit ba22b0e726
1 changed files with 1 additions and 1 deletions
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -453,7 +453,7 @@ conf:
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
      - check_ceph_health:
          command_name: check_ceph_health
-          command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
+          command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 2 --warning 1
      - check_prometheus_hosts:
          command_name: check_prometheus_hosts
          command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg