Prometheus: Ceph Alerts Scalar/Vector Conversion

This change updates the prometheus alerting rules to use range vectors in their expressions, to avoid situations where missed scrapes would cause scalar metrics to "go stale" - resetting the alert timer. Only the ceph alerts are affected by this change. Change-Id: Ib47866d12616aaa808e6a09c58aa4352e338a152 Co-Authored-By: Meghan Heisler <mkheisler93@gmail.com>
2019-11-08 14:00:12 -06:00 · 2019-11-08 14:00:12 -06:00 · f37865d6a0
parent aa48b16896
commit f37865d6a0
2 changed files with 38 additions and 19 deletions
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -990,7 +990,15 @@ conf:
          }

          define service {
-            check_command check_prom_alert!ceph_mon_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
+            check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description CEPH_quorum
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!ceph_monitor_quorum_absent!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description CEPH_quorum
--- a/prometheus/values_overrides/ceph.yaml
+++ b/prometheus/values_overrides/ceph.yaml
@@ -3,7 +3,17 @@ conf:
    rules:
      ceph:
        groups:
-        - name: ceph.rules
+        - name: ceph.recording_rules
+          rules:
+          - record: ceph_cluster_usage_percent
+            expr: 100 * (ceph_cluster_total_used_bytes / ceph_cluster_total_bytes)
+          - record: ceph_placement_group_degrade_percent
+            expr: 100 * (ceph_pg_degraded / ceph_pg_total)
+          - record: ceph_osd_down_percent
+            expr: 100 * (count(ceph_osd_up == 0) / count(ceph_osd_metadata))
+          - record: ceph_osd_out_percent
+            expr: 100 * (count(ceph_osd_in == 0) / count(ceph_osd_metadata))
+        - name: ceph.alerting_rules
          rules:
          - alert: prom_exporter_ceph_unavailable
            expr: absent(ceph_health_status)
@@ -14,14 +24,13 @@ conf:
              description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
              title: Ceph exporter is not collecting metrics or is not available
          - alert: no_active_ceph_mgr
-            expr: count(up{job="ceph-mgr"} == 1) == 0
-            for: 5m
+            expr: avg_over_time(up{job="ceph-mgr"}[5m]) == 0
            labels:
              severity: warning
            annotations:
              description: 'no ceph active mgr is present or all ceph mgr are down'
              summary: 'no ceph active mgt is present'
-          - alert: ceph_mon_quorum_low
+          - alert: ceph_monitor_quorum_low
            expr:  ceph_mon_quorum_count < 3
            for: 5m
            labels:
@@ -29,43 +38,45 @@ conf:
            annotations:
              description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
              summary: 'ceph high availability is at risk'
+          - alert: ceph_monitor_quorum_absent
+            expr:  absent(avg_over_time(ceph_mon_quorum_status[5m]))
+            labels:
+              severity: page
+            annotations:
+              description: 'ceph monitor quorum has been gone for more than 5 minutes'
+              summary: 'ceph high availability is at risk'
          - alert: ceph_cluster_usage_high
-            expr: 100* ceph_cluster_total_used_bytes/ceph_cluster_total_bytes > 80
-            for: 5m
+            expr: avg_over_time(ceph_cluster_usage_percent[5m]) > 80
            labels:
              severity: page
            annotations:
              description: 'ceph cluster capacity usage more than 80 percent'
              summary: 'ceph cluster usage is more than 80 percent'
          - alert: ceph_placement_group_degrade_pct_high
-            expr: 100 * sum(ceph_pg_degraded)/sum(ceph_osd_numpg) > 80
-            for: 5m
+            expr: avg_over_time(ceph_placement_group_degrade_percent[5m]) > 80
            labels:
              severity: critical
            annotations:
              description: 'ceph placement group degradation is more than 80 percent'
              summary: 'ceph placement groups degraded'
          - alert: ceph_osd_down_pct_high
-            expr:  100 * count(ceph_osd_up==0)/count(ceph_osd_metadata) > 80
-            for: 5m
+            expr:  avg_over_time(ceph_osd_down_percent[5m]) > 80
            labels:
              severity: critical
            annotations:
              description: 'ceph OSDs down percent is more than 80 percent'
              summary: 'ceph OSDs down percent is high'
          - alert: ceph_osd_down
-            expr: ceph_osd_up == 0
-            for: 1m
+            expr: avg_over_time(ceph_osd_up[5m]) == 0
            labels:
              severity: critical
            annotations:
-              description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
-              summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
+              description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.'
+              summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.'
          - alert: ceph_osd_out
-            expr: ceph_osd_in == 0
-            for: 5m
+            expr: avg_over_time(ceph_osd_in[5m]) == 0
            labels:
              severity: page
            annotations:
-              description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
-              summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
+              description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'
+              summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'