Prometheus: Node Alerts Scalar/Vector Conversion

This change converts alert expressions which relied on instant vectors
to use range aggregate functions instead - For just the 'basic_linux'
rules.

Change-Id: I30d6ab71d747b297f522bbeb12b8f4dbfce1eefe
Co-Authored-By: Meghan Heisler <mkheisler93@gmail.com>
This commit is contained in:
Steven Fitzpatrick 2019-11-12 09:14:14 -06:00
parent aa48b16896
commit a41262e459
1 changed files with 15 additions and 12 deletions

View File

@ -3,7 +3,15 @@ conf:
rules:
nodes:
groups:
- name: nodes.rules
- name: node.recording_rules
rules:
# Percentage of space still free on each xfs/ext3/ext4 filesystem.
# The numerator must name its metric: a selector made only of label matchers
# selects every series carrying an fstype label, so the division could not
# pair each free-space series with exactly one size series.
- record: node_filesystem_free_percent
expr: 100 * node_filesystem_free{fstype =~ "xfs|ext[34]"} / node_filesystem_size{fstype =~ "xfs|ext[34]"}
# Percentage of RAM in use: total minus reclaimable memory (free + buffers +
# cached), divided by total. Alerts compare this against "used" thresholds
# (e.g. > 90), so the value must grow as memory fills up.
- record: node_ram_usage_percent
expr: 100 * (node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / node_memory_MemTotal
# Percentage of swap in use: total minus (free + swap-cached), divided by
# total. Alerts compare this against a "used" threshold (> 80), so the value
# must grow as swap fills up.
- record: node_swap_usage_percent
expr: 100 * (node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached)) / node_memory_SwapTotal
- name: nodes.alerting_rules
rules:
- alert: prom_exporter_node_unavailable
expr: absent(node_uname_info)
@ -14,14 +22,13 @@ conf:
description: node exporter is not collecting metrics or is not available for past 10 minutes
title: node exporter is not collecting metrics or is not available
- alert: node_filesystem_full_80percent
expr: sort(node_filesystem_free{fstype =~ "xfs|ext[34]"} < node_filesystem_size{fstype =~ "xfs|ext[34]"}
* 0.2) / 1024 ^ 3
# node_filesystem_free_percent is the share of space still free, so a
# filesystem that is 80% full has a 2-minute average free share below 20.
expr: avg_over_time(node_filesystem_free_percent[2m]) < 20
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
got less than 10% space left on its filesystem.'
has less than 20% free space left.'
summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
- alert: node_filesystem_full_in_4h
expr: predict_linear(node_filesystem_free{fstype =~ "xfs|ext[34]"}[1h], 4 * 3600) <= 0
@ -61,8 +68,7 @@ conf:
1h.'
summary: '{{$labels.alias}}: High CPU utilization.'
- alert: node_ram_using_90percent
expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal
* 0.1
expr: avg_over_time(node_ram_usage_percent[2m]) > 90
for: 30m
labels:
severity: page
@ -71,8 +77,7 @@ conf:
30 minutes now.'
summary: '{{$labels.alias}}: Using lots of RAM.'
- alert: node_swap_using_80percent
expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached)
> node_memory_SwapTotal * 0.8
expr: avg_over_time(node_swap_usage_percent[2m]) > 80
for: 10m
labels:
severity: page
@ -89,8 +94,7 @@ conf:
description: '{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}'
summary: '{{$labels.alias}}: Running on high load: {{$value}}'
- alert: node_high_memory_load
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers
+ node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
expr: avg_over_time(node_ram_usage_percent[2m]) > 85
for: 1m
labels:
severity: warning
@ -99,8 +103,7 @@ conf:
instance {{ $labels.instance }} of job {{ $labels.job }}.
summary: Server memory is almost full
- alert: node_high_storage_load
expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
/ node_filesystem_size{mountpoint="/"} * 100 > 85
# Root filesystem usage percent, computed from the raw gauges averaged over
# 2 minutes. No node_storage_usage_percent series is recorded by the rule
# groups shown here, so the alert does not depend on one.
expr: (1 - avg_over_time(node_filesystem_free{mountpoint="/"}[2m]) / avg_over_time(node_filesystem_size{mountpoint="/"}[2m])) * 100 > 85
for: 30s
labels:
severity: warning