From ba22b0e726fc08801be76c0cd79e7c02b2a40378 Mon Sep 17 00:00:00 2001 From: Steve Wilkerson Date: Tue, 6 Nov 2018 14:51:40 -0600 Subject: [PATCH] Nagios: Update ceph_health check The ceph_health check in Nagios incorrectly sets both the warning and critical thresholds to 0. The ceph_health_status metric's value of 0 indicates the cluster is healthy, while 1 indicates a warning and 2 indicates an error state. The Nagios check for ceph_health is updated to reflect these values. Change-Id: Iffe80f1c34f6edee6370dd7e707e5f55f83f1ec1 --- nagios/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nagios/values.yaml b/nagios/values.yaml index 139c2db29..351cd2ee6 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -453,7 +453,7 @@ conf: command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.' - check_ceph_health: command_name: check_ceph_health - command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0 + command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 2 --warning 1 - check_prometheus_hosts: command_name: check_prometheus_hosts command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg