Implement Prometheus alert rules

Alert rules can be attached as a resource and will be transmitted via
the metrics-endpoint relation. Default alert rules taken from upstream
Ceph have been added for reference.

Change-Id: I6a3c6f06e9b9d911b35c8ced1968becc6471b362
Peter Sabaini 2022-09-22 10:28:06 +02:00
parent ade9df195e
commit 9c7101f573
8 changed files with 862 additions and 47 deletions


@ -143,6 +143,16 @@ The charm supports Ceph metric monitoring with Prometheus. Add relations to the
Alternatively, integration with the [COS Lite][cos-lite] observability
stack is available via the metrics-endpoint relation.
Relating to prometheus-k8s via the metrics-endpoint interface (as found
in the [COS Lite][cos-lite] bundle) will send metrics to Prometheus and
also configure alerting rules for it. Alerting rules are supplied as the
`alert-rules` resource; the default rules are taken from the [upstream
Ceph rules][ceph-rules]. The defaults can be replaced with customized
rules by attaching a resource:
juju attach ceph-mon alert-rules=./my-prom-alerts.yaml.rules
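For illustration, a minimal custom rules file can be generated with a short
script such as the one below; the group name, alert, and file name are made
up, and the format simply mirrors the default rules shipped with this change.

```python
# Write a minimal custom Prometheus rules file suitable for attaching as the
# alert-rules resource. The alert below is purely illustrative.
import pathlib
import textwrap

rules = textwrap.dedent(
    """\
    groups:
      - name: "my-custom-alerts"
        rules:
          - alert: "CephHealthWarnExtended"
            expr: "ceph_health_status == 1"
            for: "30m"
            labels:
              severity: "warning"
    """
)
pathlib.Path("my-prom-alerts.yaml.rules").write_text(rules)
```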
## Actions
This section lists Juju [actions][juju-docs-actions] supported by the charm.
@ -228,3 +238,4 @@ For general charm questions refer to the OpenStack [Charm Guide][cg].
[upstream-ceph-buckets]: https://docs.ceph.com/docs/master/rados/operations/crush-map/#types-and-buckets
[jq]: https://stedolan.github.io/jq/
[cos-lite]: https://charmhub.io/cos-lite
[ceph-rules]: https://github.com/ceph/ceph/blob/351e1ac63950164ea5f08a6bfc7c14af586bb208/monitoring/ceph-mixin/prometheus_alerts.yml


@ -0,0 +1,635 @@
groups:
- name: "cluster health"
rules:
- alert: "CephHealthError"
annotations:
description: "The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information."
summary: "Ceph is in the ERROR state"
expr: "ceph_health_status == 2"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.2.1"
severity: "critical"
type: "ceph_default"
- alert: "CephHealthWarning"
annotations:
description: "The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information."
summary: "Ceph is in the WARNING state"
expr: "ceph_health_status == 1"
for: "15m"
labels:
severity: "warning"
type: "ceph_default"
- name: "mon"
rules:
- alert: "CephMonDownQuorumAtRisk"
annotations:
description: "{{ $min := query \"floor(count(ceph_mon_metadata) / 2) + 1\" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query \"(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
summary: "Monitor quorum is at risk"
expr: |
(
(ceph_health_detail{name="MON_DOWN"} == 1) * on() (
count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1)
)
) == 1
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.3.1"
severity: "critical"
type: "ceph_default"
- alert: "CephMonDown"
annotations:
description: |
{{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
summary: "One or more monitors down"
expr: |
count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephMonDiskspaceCritical"
annotations:
description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit"
summary: "Filesystem space on at least one monitor is critically low"
expr: "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.3.2"
severity: "critical"
type: "ceph_default"
- alert: "CephMonDiskspaceLow"
annotations:
description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low"
summary: "Drive space on at least one monitor is approaching full"
expr: "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephMonClockSkew"
annotations:
description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew"
summary: "Clock skew detected among monitors"
expr: "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- name: "osd"
rules:
- alert: "CephOSDDownHigh"
annotations:
description: "{{ $value | humanize }}% or {{ with query \"count(ceph_osd_up == 0)\" }}{{ . | first | value }}{{ end }} of {{ with query \"count(ceph_osd_up)\" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
summary: "More than 10% of OSDs are down"
expr: "count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.1"
severity: "critical"
type: "ceph_default"
- alert: "CephOSDHostDown"
annotations:
description: "The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}"
summary: "An OSD host is offline"
expr: "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.8"
severity: "warning"
type: "ceph_default"
- alert: "CephOSDDown"
annotations:
description: |
{{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down"
summary: "An OSD has been marked down"
expr: "ceph_health_detail{name=\"OSD_DOWN\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.2"
severity: "warning"
type: "ceph_default"
- alert: "CephOSDNearFull"
annotations:
description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull"
summary: "OSD(s) running low on free space (NEARFULL)"
expr: "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.3"
severity: "warning"
type: "ceph_default"
- alert: "CephOSDFull"
annotations:
description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full"
summary: "OSD full, writes blocked"
expr: "ceph_health_detail{name=\"OSD_FULL\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.6"
severity: "critical"
type: "ceph_default"
- alert: "CephOSDBackfillFull"
annotations:
description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull"
summary: "OSD(s) too full for backfill operations"
expr: "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDTooManyRepairs"
annotations:
description: "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs"
summary: "OSD reports a high number of read errors"
expr: "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1"
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDTimeoutsPublicNetwork"
annotations:
description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
summary: "Network issues delaying OSD heartbeats (public network)"
expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDTimeoutsClusterNetwork"
annotations:
description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
summary: "Network issues delaying OSD heartbeats (cluster network)"
expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDInternalDiskSizeMismatch"
annotations:
description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch"
summary: "OSD size inconsistency error"
expr: "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephDeviceFailurePredicted"
annotations:
description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2"
summary: "Device(s) predicted to fail soon"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephDeviceFailurePredictionTooHigh"
annotations:
description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availabililty. Prevent data integrity issues by adding new OSDs so that data may be relocated."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany"
summary: "Too many devices are predicted to fail, unable to resolve"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.7"
severity: "critical"
type: "ceph_default"
- alert: "CephDeviceFailureRelocationIncomplete"
annotations:
description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use"
summary: "Device failure is predicted, but unable to relocate data"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDFlapping"
annotations:
description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
documentation: "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds"
summary: "Network issues are causing OSDs to flap (mark each other down)"
expr: "(rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.4"
severity: "warning"
type: "ceph_default"
- alert: "CephOSDReadErrors"
annotations:
description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors"
summary: "Device read errors detected"
expr: "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1"
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPGImbalance"
annotations:
description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count."
summary: "PGs are not balanced across OSDs"
expr: |
abs(
((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) /
on (job) group_left avg(ceph_osd_numpg > 0) by (job)
) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.5"
severity: "warning"
type: "ceph_default"
- name: "mds"
rules:
- alert: "CephFilesystemDamaged"
annotations:
description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
summary: "CephFS filesystem is damaged."
expr: "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.1"
severity: "critical"
type: "ceph_default"
- alert: "CephFilesystemOffline"
annotations:
description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down"
summary: "CephFS filesystem is offline"
expr: "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.3"
severity: "critical"
type: "ceph_default"
- alert: "CephFilesystemDegraded"
annotations:
description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded"
summary: "CephFS filesystem is degraded"
expr: "ceph_health_detail{name=\"FS_DEGRADED\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.4"
severity: "critical"
type: "ceph_default"
- alert: "CephFilesystemMDSRanksLow"
annotations:
description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max"
summary: "Ceph MDS daemon count is lower than configured"
expr: "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephFilesystemInsufficientStandby"
annotations:
description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby"
summary: "Ceph filesystem standby daemons too few"
expr: "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephFilesystemFailureNoStandby"
annotations:
description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds"
summary: "MDS daemon failed, no further standby available"
expr: "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.5"
severity: "critical"
type: "ceph_default"
- alert: "CephFilesystemReadOnly"
annotations:
description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
summary: "CephFS filesystem in read only mode due to write error(s)"
expr: "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.2"
severity: "critical"
type: "ceph_default"
- name: "mgr"
rules:
- alert: "CephMgrModuleCrash"
annotations:
description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash"
summary: "A manager module has recently crashed"
expr: "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.6.1"
severity: "critical"
type: "ceph_default"
- alert: "CephMgrPrometheusModuleInactive"
annotations:
description: "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
summary: "The mgr/prometheus module is not available"
expr: "up{job=\"ceph\"} == 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.6.2"
severity: "critical"
type: "ceph_default"
- name: "pgs"
rules:
- alert: "CephPGsInactive"
annotations:
description: "{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests."
summary: "One or more placement groups are inactive"
expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.1"
severity: "critical"
type: "ceph_default"
- alert: "CephPGsUnclean"
annotations:
description: "{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure."
summary: "One or more placement groups are marked unclean"
expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0"
for: "15m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.2"
severity: "warning"
type: "ceph_default"
- alert: "CephPGsDamaged"
annotations:
description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged"
summary: "Placement group damaged, manual intervention needed"
expr: "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.4"
severity: "critical"
type: "ceph_default"
- alert: "CephPGRecoveryAtRisk"
annotations:
description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full"
summary: "OSDs are too full for recovery"
expr: "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.5"
severity: "critical"
type: "ceph_default"
- alert: "CephPGUnavilableBlockingIO"
annotations:
description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability"
summary: "PG is unavailable, blocking I/O"
expr: "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.3"
severity: "critical"
type: "ceph_default"
- alert: "CephPGBackfillAtRisk"
annotations:
description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full"
summary: "Backfill operations are blocked due to lack of free space"
expr: "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.6"
severity: "critical"
type: "ceph_default"
- alert: "CephPGNotScrubbed"
annotations:
description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed"
summary: "Placement group(s) have not been scrubbed"
expr: "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPGsHighPerOSD"
annotations:
description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs"
summary: "Placement groups per OSD is too high"
expr: "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPGNotDeepScrubbed"
annotations:
description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed"
summary: "Placement group(s) have not been deep scrubbed"
expr: "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- name: "nodes"
rules:
- alert: "CephNodeRootFilesystemFull"
annotations:
description: "Root volume is dangerously full: {{ $value | humanize }}% free."
summary: "Root filesystem is dangerously full"
expr: "node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100 < 5"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.1"
severity: "critical"
type: "ceph_default"
- alert: "CephNodeNetworkPacketDrops"
annotations:
description: "Node {{ $labels.instance }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ $labels.device }}."
summary: "One or more NICs reports packet drops"
expr: |
(
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0050000000000000001 and (
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeNetworkPacketErrors"
annotations:
description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
summary: "One or more NICs reports packet errors"
expr: |
(
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeDiskspaceWarning"
annotations:
description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate."
summary: "Host filesystem free space is getting low"
expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeInconsistentMTU"
annotations:
description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}."
summary: "MTU settings across Ceph hosts are inconsistent"
expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )"
labels:
severity: "warning"
type: "ceph_default"
- name: "pools"
rules:
- alert: "CephPoolGrowthWarning"
annotations:
description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
summary: "Pool growth rate may soon exceed capacity"
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
severity: "warning"
type: "ceph_default"
- alert: "CephPoolBackfillFull"
annotations:
description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity."
summary: "Free space in a pool is too low for recovery/backfill"
expr: "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPoolFull"
annotations:
description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full"
summary: "Pool is full - writes are blocked"
expr: "ceph_health_detail{name=\"POOL_FULL\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.9.1"
severity: "critical"
type: "ceph_default"
- alert: "CephPoolNearFull"
annotations:
description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
summary: "One or more Ceph pools are nearly full"
expr: "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- name: "healthchecks"
rules:
- alert: "CephSlowOps"
annotations:
description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
summary: "OSD operations are slow to complete"
expr: "ceph_healthcheck_slow_ops > 0"
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- name: "cephadm"
rules:
- alert: "CephadmUpgradeFailed"
annotations:
description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue"
summary: "Ceph version upgrade has failed"
expr: "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.11.2"
severity: "critical"
type: "ceph_default"
- alert: "CephadmDaemonFailed"
annotations:
description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'"
summary: "A ceph daemon manged by cephadm is down"
expr: "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.11.1"
severity: "critical"
type: "ceph_default"
- alert: "CephadmPaused"
annotations:
description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
documentation: "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused"
summary: "Orchestration tasks via cephadm are PAUSED"
expr: "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- name: "PrometheusServer"
rules:
- alert: "PrometheusJobMissing"
annotations:
description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
summary: "The scrape job for Ceph is missing from Prometheus"
expr: "absent(up{job=\"ceph\"})"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
severity: "critical"
type: "ceph_default"
- name: "rados"
rules:
- alert: "CephObjectMissing"
annotations:
description: "The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound"
summary: "Object(s) marked UNFOUND"
expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.1"
severity: "critical"
type: "ceph_default"
- name: "generic"
rules:
- alert: "CephDaemonCrash"
annotations:
description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash"
summary: "One or more Ceph daemons have crashed, and are pending acknowledgement"
expr: "ceph_health_detail{name=\"RECENT_CRASH\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.1.2"
severity: "critical"
type: "ceph_default"


@ -322,7 +322,6 @@ import re
import socket
import subprocess
import tempfile
import uuid
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
@ -339,7 +338,7 @@ LIBAPI = 0
# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 21
LIBPATCH = 22
logger = logging.getLogger(__name__)
@ -356,7 +355,10 @@ ALLOWED_KEYS = {
"sample_limit",
"label_limit",
"label_name_length_limit",
"label_value_lenght_limit",
"label_value_length_limit",
"scheme",
"basic_auth",
"tls_config",
}
DEFAULT_JOB = {
"metrics_path": "/metrics",
@ -639,6 +641,12 @@ class AlertRules:
logger.error("Failed to read alert rules from %s: %s", file_path.name, e)
return []
if not rule_file:
logger.warning("Empty rules file: %s", file_path.name)
return []
if not isinstance(rule_file, dict):
logger.error("Invalid rules file (must be a dict): %s", file_path.name)
return []
if _is_official_alert_rule_format(rule_file):
alert_groups = rule_file["groups"]
elif _is_single_alert_rule_format(rule_file):
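As context for the two new guards above: loading an empty YAML document yields
None, and non-mapping content yields a bare scalar, so neither can be treated
as a rules dict. A quick sketch of the behaviour being guarded against:

```python
# Why both guards are needed when loading a rules file with PyYAML.
import yaml

for text in ("", "not-a-rule", "groups: []"):
    print(repr(yaml.safe_load(text)))
# prints: None, then 'not-a-rule', then {'groups': []}
```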
@ -920,7 +928,7 @@ class MetricsEndpointConsumer(Object):
for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items():
filename = "juju_" + topology_identifier + ".rules"
path = os.path.join(PROMETHEUS_RULES_DIR, filename)
rules = yaml.dump(alert_rule_groups)
rules = yaml.safe_dump(alert_rule_groups)
container.push(path, rules, make_dirs=True)
```
@ -937,7 +945,6 @@ class MetricsEndpointConsumer(Object):
if not alert_rules:
continue
identifier = None
try:
scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
identifier = JujuTopology.from_dict(scrape_metadata).identifier
@ -1118,7 +1125,7 @@ class MetricsEndpointConsumer(Object):
# label all static configs in the Prometheus job
# labeling inserts Juju topology information and
# sets a relable config for instance labels
# sets a relabeling config for instance labels
for static_config in static_configs:
labels = static_config.get("labels", {}) if static_configs else {}
all_targets = static_config.get("targets", [])
@ -1187,7 +1194,7 @@ class MetricsEndpointConsumer(Object):
Returns:
a copy of the `labels` dictionary augmented with Juju
topology information with the exception of unit name.
topology information except for unit name.
"""
juju_labels = labels.copy() # deep copy not needed
juju_labels.update(JujuTopology.from_dict(scrape_metadata).label_matcher_dict)
@ -1262,7 +1269,7 @@ class MetricsEndpointConsumer(Object):
def _dedupe_job_names(jobs: List[dict]):
"""Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key.
Additionally fully dedeuplicate any identical jobs.
Additionally, fully de-duplicate any identical jobs.
Args:
jobs: A list of prometheus scrape jobs
@ -1345,6 +1352,7 @@ class MetricsEndpointProvider(Object):
jobs=None,
alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
external_hostname: str = None,
):
"""Construct a metrics provider for a Prometheus charm.
@ -1430,7 +1438,7 @@ class MetricsEndpointProvider(Object):
Args:
charm: a `CharmBase` object that manages this
`MetricsEndpointProvider` object. Typically this is
`MetricsEndpointProvider` object. Typically, this is
`self` in the instantiating class.
relation_name: an optional string name of the relation between `charm`
and the Prometheus charmed service. The default is "metrics-endpoint".
@ -1449,6 +1457,8 @@ class MetricsEndpointProvider(Object):
The alert rules are automatically updated on charm upgrade.
refresh_event: an optional bound event or list of bound events which
will be observed to re-set scrape job data (IP address and others)
external_hostname: an optional external hostname (for example, one provided
by an ingress or a proxy) that is advertised to Prometheus instead of the unit address.
Raises:
RelationNotFoundError: If there is no relation in the charm's metadata.yaml
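A hypothetical use of the new argument from a provider charm (the class name,
job, and hostname below are made up) might look like this:

```python
# Sketch: advertise an ingress hostname to Prometheus instead of the unit IP.
# Assumes the charm declares a 'metrics-endpoint' provides relation.
from ops.charm import CharmBase

from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider


class MyMetricsProviderCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        self.metrics_endpoint = MetricsEndpointProvider(
            self,
            jobs=[{"static_configs": [{"targets": ["*:9283"]}]}],
            external_hostname="metrics.example.com",
        )
```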
@ -1482,7 +1492,7 @@ class MetricsEndpointProvider(Object):
# sanitize job configurations to the supported subset of parameters
jobs = [] if jobs is None else jobs
self._jobs = [_sanitize_scrape_configuration(job) for job in jobs]
self.external_hostname = external_hostname
events = self._charm.on[self._relation_name]
self.framework.observe(events.relation_joined, self._set_scrape_job_spec)
self.framework.observe(events.relation_changed, self._on_relation_changed)
@ -1510,7 +1520,7 @@ class MetricsEndpointProvider(Object):
refresh_event = [refresh_event]
for ev in refresh_event:
self.framework.observe(ev, self._set_unit_ip)
self.framework.observe(ev, self._set_scrape_job_spec)
self.framework.observe(self._charm.on.upgrade_charm, self._set_scrape_job_spec)
@ -1539,7 +1549,7 @@ class MetricsEndpointProvider(Object):
When a metrics provider charm is related to a prometheus charm, the
metrics provider sets specification and metadata related to its own
scrape configuration. This information is set using Juju application
data. In addition each of the consumer units also sets its own
data. In addition, each of the consumer units also sets its own
host address in Juju unit relation data.
"""
self._set_unit_ip(event)
@ -1568,16 +1578,21 @@ class MetricsEndpointProvider(Object):
Each time a metrics provider charm container is restarted it updates its own
host address in the unit relation data for the prometheus charm.
The only argument specified is an event and it ignored. this is for expediency
The only argument specified is an event, and it is ignored. This is for expediency
to be able to use this method as an event handler, although no access to the
event is actually needed.
"""
for relation in self._charm.model.relations[self._relation_name]:
unit_ip = str(self._charm.model.get_binding(relation).network.bind_address)
relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = (
unit_ip if self._is_valid_unit_address(unit_ip) else socket.getfqdn()
)
if self.external_hostname:
unit_address = self.external_hostname
elif self._is_valid_unit_address(unit_ip):
unit_address = unit_ip
else:
unit_address = socket.getfqdn()
relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address
relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str(
self._charm.model.unit.name
)
@ -1634,7 +1649,7 @@ class PrometheusRulesProvider(Object):
relation_name: Name of the relation in `metadata.yaml` that
has the `prometheus_scrape` interface.
dir_path: Root directory for the collection of rule files.
recursive: Whether or not to scan for rule files recursively.
recursive: Whether to scan for rule files recursively.
"""
def __init__(
@ -1696,7 +1711,7 @@ class MetricsEndpointAggregator(Object):
`MetricsEndpointAggregator` collects scrape target information from one
or more related charms and forwards this to a `MetricsEndpointConsumer`
charm, which may be in a different Juju model. However it is
charm, which may be in a different Juju model. However, it is
essential that `MetricsEndpointAggregator` itself resides in the same
model as its scrape targets, as this is currently the only way to
ensure in Juju that the `MetricsEndpointAggregator` will be able to
@ -1765,7 +1780,7 @@ class MetricsEndpointAggregator(Object):
information, just like `MetricsEndpointProvider` and
`MetricsEndpointConsumer` do.
By default `MetricsEndpointAggregator` ensures that Prometheus
By default, `MetricsEndpointAggregator` ensures that Prometheus
"instance" labels refer to Juju topology. This ensures that
instance labels are stable over unit recreation. While it is not
advisable to change this option, if required it can be done by
@ -1778,7 +1793,7 @@ class MetricsEndpointAggregator(Object):
Args:
charm: a `CharmBase` object that manages this
`MetricsEndpointAggregator` object. Typically this is
`MetricsEndpointAggregator` object. Typically, this is
`self` in the instantiating class.
relation_names: a dictionary with three keys. The value
of the "scrape_target" and "alert_rules" keys are
@ -1843,7 +1858,7 @@ class MetricsEndpointAggregator(Object):
When there is any change in relation data with any scrape
target, the Prometheus scrape job, for that specific target is
updated. Additionally, if this method is called manually, do the
sameself.
same.
Args:
targets: a `dict` containing target information
@ -1985,7 +2000,7 @@ class MetricsEndpointAggregator(Object):
Scrape target information is returned for each unit in the
relation. This information contains the unit name, network
hostname (or address) for that unit, and port on which an
hostname (or address) for that unit, and port on which a
metrics endpoint is exposed in that unit.
Args:
@ -2142,7 +2157,7 @@ class MetricsEndpointAggregator(Object):
labels are stable across unit recreation.
Returns:
a list of Prometheus relabling configurations. Each item in
a list of Prometheus relabeling configurations. Each item in
this list is one relabel configuration.
"""
return (
@ -2216,22 +2231,7 @@ class CosTool:
with tempfile.TemporaryDirectory() as tmpdir:
rule_path = Path(tmpdir + "/validate_rule.yaml")
# Smash "our" rules format into what upstream actually uses, which is more like:
#
# groups:
# - name: foo
# rules:
# - alert: SomeAlert
# expr: up
# - alert: OtherAlert
# expr: up
transformed_rules = {"groups": []} # type: ignore
for rule in rules["groups"]:
transformed = {"name": str(uuid.uuid4()), "rules": [rule]}
transformed_rules["groups"].append(transformed)
rule_path.write_text(yaml.dump(transformed_rules))
rule_path.write_text(yaml.dump(rules))
args = [str(self.path), "validate", str(rule_path)]
# noinspection PyBroadException
@ -2240,7 +2240,13 @@ class CosTool:
return True, ""
except subprocess.CalledProcessError as e:
logger.debug("Validating the rules failed: %s", e.output)
return False, ", ".join([line for line in e.output if "error validating" in line])
return False, ", ".join(
[
line
for line in e.output.decode("utf8").splitlines()
if "error validating" in line
]
)
def inject_label_matchers(self, expression, topology) -> str:
"""Add label matchers to an expression."""
@ -2277,6 +2283,5 @@ class CosTool:
return None
def _exec(self, cmd) -> str:
result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
output = result.stdout.decode("utf-8").strip()
return output
result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
return result.stdout.decode("utf-8").strip()


@ -43,3 +43,8 @@ provides:
requires:
bootstrap-source:
interface: ceph-bootstrap
resources:
alert-rules:
type: file
filename: alert.yaml.rules
description: "Alerting rules"


@ -1235,7 +1235,7 @@ def is_unsupported_cmr(unit_name):
return unsupported
def assess_status():
def assess_status(charm=None):
'''Assess status of current unit'''
application_version_set(get_upstream_version(VERSION_PACKAGE))
if not config('permit-insecure-cmr'):
@ -1291,6 +1291,9 @@ def assess_status():
status_set('blocked', str(e))
return
if charm is not None and charm.metrics_endpoint.assess_alert_rule_errors():
return
# active - bootstrapped + quorum status check
if ceph.is_bootstrapped() and ceph.is_quorum():
expected_osd_count = config('expected-osd-count') or 3


@ -5,9 +5,16 @@
Configure prometheus scrape jobs via the metrics-endpoint relation.
"""
import json
import logging
import os.path
import pathlib
from typing import Optional, Union, List
import ops.model
from ops.model import BlockedStatus
import charm
from charms.prometheus_k8s.v0 import prometheus_scrape
from charms_ceph import utils as ceph_utils
from ops.framework import BoundEvent
@ -19,15 +26,16 @@ DEFAULT_CEPH_JOB = {
"metrics_path": "/metrics",
"static_configs": [{"targets": ["*:9283"]}],
}
DEFAULT_ALERT_RULES_RELATIVE_PATH = "files/prometheus_alert_rules"
class CephMetricsEndpointProvider(prometheus_scrape.MetricsEndpointProvider):
def __init__(
self,
charm,
charm: charm.CephMonCharm,
relation_name: str = prometheus_scrape.DEFAULT_RELATION_NAME,
jobs=None,
alert_rules_path: str = prometheus_scrape.DEFAULT_ALERT_RULES_RELATIVE_PATH, # noqa
alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
):
if jobs is None:
@ -43,6 +51,11 @@ class CephMetricsEndpointProvider(prometheus_scrape.MetricsEndpointProvider):
self.framework.observe(
events.relation_departed, self._on_relation_departed
)
self.framework.observe(
self.on.alert_rule_status_changed,
self._on_alert_rule_status_changed,
)
charm._stored.set_default(alert_rule_errors=None)
def _on_relation_changed(self, event):
"""Enable prometheus on relation change"""
@ -63,3 +76,64 @@ class CephMetricsEndpointProvider(prometheus_scrape.MetricsEndpointProvider):
)
ceph_utils.mgr_disable_module("prometheus")
logger.debug("module_disabled")
# We're not related to prom, don't care about alert rules
self._charm._stored.alert_rule_errors = None
def assess_alert_rule_errors(self):
if self._charm._stored.alert_rule_errors:
self._charm.unit.status = BlockedStatus(
"invalid alert rules, check unit logs"
)
return True
def _on_alert_rule_status_changed(self, event):
logger.debug(
"alert rule status changed: %s, %s, %s",
event,
event.valid,
event.errors,
)
if event.errors:
logger.warning("invalid alert rules: %s", event.errors)
self._charm._stored.alert_rule_errors = event.errors
else:
self._charm._stored.alert_rule_errors = None
def get_alert_rules_resource(self):
try:
return self._charm.model.resources.fetch("alert-rules")
except ops.model.ModelError as e:
logger.warning("can't get alert-rules resource: %s", e)
def _set_alert_rules(self, rules_dict):
logger.debug("set alert rules: %s", rules_dict)
# alert rules seem ok locally, clear any errors
# prometheus may still signal alert rule errors
# via the relation though
self._charm._stored.alert_rule_errors = None
for relation in self._charm.model.relations[self._relation_name]:
relation.data[self._charm.app]["alert_rules"] = json.dumps(
rules_dict
)
def update_alert_rules(self):
if self._charm.unit.is_leader() and ceph_utils.is_bootstrapped():
resource = self.get_alert_rules_resource()
if resource is None or not os.path.getsize(resource):
logger.debug("empty rules resource, clearing alert rules")
self._set_alert_rules({})
return
sink = pathlib.Path(self._alert_rules_path) / "alert.yaml.rules"
if sink.exists():
sink.unlink()
sink.symlink_to(resource)
alert_rules = prometheus_scrape.AlertRules(topology=self.topology)
alert_rules.add_path(str(sink), recursive=True)
alert_rules_as_dict = alert_rules.as_dict()
if not alert_rules_as_dict:
msg = "invalid alert rules: {}".format(sink.open().read())
logger.warning(msg)
self._charm._stored.alert_rule_errors = msg
return
self._set_alert_rules(alert_rules_as_dict)
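For orientation, the value published under the `alert_rules` key is the JSON
rendering of `AlertRules.as_dict()`, which is keyed on `groups` (the unit test
at the end of this change asserts exactly that). An invented minimal payload
might look like this; note that the library may also inject Juju topology
labels into each rule:

```python
# Illustrative shape of the "alert_rules" app relation data.
# Group and alert names are invented.
import json

payload = {
    "groups": [
        {
            "name": "testgroup",
            "rules": [
                {
                    "alert": "ExampleAlert",
                    "expr": "up == 0",
                    "labels": {"severity": "warning"},
                }
            ],
        }
    ]
}
print(json.dumps(payload))
```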


@ -40,58 +40,77 @@ class CephMonCharm(ops_openstack.core.OSBaseCharm):
systemd.service_pause('ceph-create-keys')
except systemd.SystemdError:
pass
hooks.assess_status(self)
def on_config(self, event):
hooks.config_changed()
hooks.assess_status(self)
def on_pre_series_upgrade(self, event):
hooks.pre_series_upgrade()
hooks.assess_status(self)
def on_upgrade(self, event):
self.metrics_endpoint.update_alert_rules()
hooks.upgrade_charm()
hooks.assess_status(self)
def on_post_series_upgrade(self, event):
hooks.post_series_upgrade()
hooks.assess_status(self)
# Relations.
def on_mon_relation_joined(self, event):
hooks.mon_relation_joined()
hooks.assess_status(self)
def on_bootstrap_source_relation_changed(self, event):
hooks.bootstrap_source_relation_changed()
hooks.assess_status(self)
def on_prometheus_relation_joined_or_changed(self, event):
hooks.prometheus_relation()
hooks.assess_status(self)
def on_prometheus_relation_departed(self, event):
hooks.prometheus_left()
hooks.assess_status(self)
def on_mon_relation(self, event):
hooks.mon_relation()
hooks.assess_status(self)
def on_osd_relation(self, event):
hooks.osd_relation()
hooks.assess_status(self)
def on_dashboard_relation_joined(self, event):
hooks.dashboard_relation()
hooks.assess_status(self)
def on_radosgw_relation(self, event):
hooks.radosgw_relation()
hooks.assess_status(self)
def on_rbd_mirror_relation(self, event):
hooks.rbd_mirror_relation()
hooks.assess_status(self)
def on_mds_relation(self, event):
hooks.mds_relation_joined()
hooks.assess_status(self)
def on_admin_relation(self, event):
hooks.admin_relation_joined()
hooks.assess_status(self)
def on_client_relation(self, event):
hooks.client_relation()
hooks.assess_status(self)
def on_nrpe_relation(self, event):
hooks.update_nrpe_config()
hooks.assess_status(self)
# Actions.
@ -195,4 +214,3 @@ class CephMonCharm(ops_openstack.core.OSBaseCharm):
if __name__ == '__main__':
main(CephMonCharm)
hooks.assess_status()


@ -1,4 +1,8 @@
#!/usr/bin/env python3
import json
import pathlib
import tempfile
import textwrap
# Copyright 2022 Canonical Ltd.
# See LICENSE file for licensing details.
@ -9,10 +13,30 @@ import unittest
from ops import storage, model, framework
from ops.testing import Harness, _TestingModelBackend
import ceph_metrics # noqa: avoid circ. import
import charm
class TestCephMetrics(unittest.TestCase):
@classmethod
def setUpClass(cls):
"""Run once before tests begin."""
cls.tempdir = tempfile.TemporaryDirectory()
cls.tmp = pathlib.Path(cls.tempdir.name)
cls.rules_dir = cls.tmp / "rules"
cls.rules_dir.mkdir()
cls.rules = textwrap.dedent(
"""
groups:
- name: "testgroup"
rules: []
"""
)
@classmethod
def tearDownClass(cls):
cls.tempdir.cleanup()
def setUp(self):
super().setUp()
self.harness = Harness(charm.CephMonCharm)
@ -42,9 +66,12 @@ class TestCephMetrics(unittest.TestCase):
self.harness._model,
)
# END Workaround
self.addCleanup(self.harness.cleanup)
self.harness.begin()
self.harness.set_leader(True)
self.harness.charm.metrics_endpoint._alert_rules_path = self.rules_dir
def test_init(self):
self.assertEqual(
@ -94,3 +121,40 @@ class TestCephMetrics(unittest.TestCase):
self.harness.remove_relation(rel_id)
mgr_disable_module.assert_called_once()
def get_alert_rules(self, rel_id):
app_rel_data = self.harness.get_relation_data(
rel_id, self.harness.model.app
)
return json.loads(app_rel_data["alert_rules"])
@patch("ceph_metrics.ceph_utils.is_bootstrapped", return_value=True)
@patch("ceph_metrics.CephMetricsEndpointProvider._set_alert_rules")
def test_update_alert_rules_empty(
self, set_alert_rules, _is_bootstrapped,
):
"""Test: no alert rules created with empty alert rules file."""
rel_id = self.harness.add_relation("metrics-endpoint", "prometheus")
self.harness.add_relation_unit(rel_id, "prometheus/0")
self.harness.add_resource("alert-rules", "")
self.harness.charm.metrics_endpoint.update_alert_rules()
set_alert_rules.assert_called_with({})
@patch("ceph_metrics.ceph_utils.is_bootstrapped", return_value=True)
def test_update_alert_rules_invalid(self, _is_bootstrapped):
rel_id = self.harness.add_relation("metrics-endpoint", "prometheus")
self.harness.add_relation_unit(rel_id, "prometheus/0")
self.harness.add_resource("alert-rules", "not-a-rule")
self.harness.charm.metrics_endpoint.update_alert_rules()
self.assertTrue(
self.harness.charm.metrics_endpoint.assess_alert_rule_errors()
)
@patch("ceph_metrics.ceph_utils.is_bootstrapped", return_value=True)
def test_update_alert_rules(self, _is_bootstrapped):
rel_id = self.harness.add_relation("metrics-endpoint", "prometheus")
self.harness.add_relation_unit(rel_id, "prometheus/0")
self.harness.add_resource("alert-rules", self.rules)
self.harness.charm.metrics_endpoint.update_alert_rules()
alert_rules = self.get_alert_rules(rel_id)
self.assertTrue(alert_rules.get("groups"))