Implement Prometheus alert rules

Alert rules can be attached as a resource and will be transmitted via
the metrics-endpoint relation. Default alert rules taken from upstream
Ceph have been added for reference.

Change-Id: I6a3c6f06e9b9d911b35c8ced1968becc6471b362
Peter Sabaini 2022-09-22 10:28:06 +02:00
parent ade9df195e
commit 9c7101f573
8 changed files with 862 additions and 47 deletions


@ -143,6 +143,16 @@ The charm supports Ceph metric monitoring with Prometheus. Add relations to the
Alternatively, integration with the [COS Lite][cos-lite] observability
stack is available via the metrics-endpoint relation.
Relating to prometheus-k8s via the metrics-endpoint interface (as found
in the [COS Lite][cos-lite] bundle) will send metrics to Prometheus and
also configure alerting rules for it. Alerting rules are supplied as the
`alert-rules` resource; the default rules are taken from the [upstream
Ceph rules][ceph-rules]. The defaults can be replaced with customized
rules by attaching a resource:
juju attach ceph-mon alert-rules=./my-prom-alerts.yaml.rules
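For illustration, a minimal custom rules file can be generated with a short
script such as the one below; the group name, alert, and file name are made
up, and the format simply mirrors the default rules shipped with this change.

```python
# Write a minimal custom Prometheus rules file suitable for attaching as the
# alert-rules resource. The alert below is purely illustrative.
import pathlib
import textwrap

rules = textwrap.dedent(
    """\
    groups:
      - name: "my-custom-alerts"
        rules:
          - alert: "CephHealthWarnExtended"
            expr: "ceph_health_status == 1"
            for: "30m"
            labels:
              severity: "warning"
    """
)
pathlib.Path("my-prom-alerts.yaml.rules").write_text(rules)
```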
## Actions
This section lists Juju [actions][juju-docs-actions] supported by the charm.
@ -228,3 +238,4 @@ For general charm questions refer to the OpenStack [Charm Guide][cg].
[upstream-ceph-buckets]: https://docs.ceph.com/docs/master/rados/operations/crush-map/#types-and-buckets
[jq]: https://stedolan.github.io/jq/
[cos-lite]: https://charmhub.io/cos-lite
[ceph-rules]: https://github.com/ceph/ceph/blob/351e1ac63950164ea5f08a6bfc7c14af586bb208/monitoring/ceph-mixin/prometheus_alerts.yml


@ -0,0 +1,635 @@
groups:
- name: "cluster health"
rules:
- alert: "CephHealthError"
annotations:
description: "The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information."
summary: "Ceph is in the ERROR state"
expr: "ceph_health_status == 2"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.2.1"
severity: "critical"
type: "ceph_default"
- alert: "CephHealthWarning"
annotations:
description: "The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information."
summary: "Ceph is in the WARNING state"
expr: "ceph_health_status == 1"
for: "15m"
labels:
severity: "warning"
type: "ceph_default"
- name: "mon"
rules:
- alert: "CephMonDownQuorumAtRisk"
annotations:
description: "{{ $min := query \"floor(count(ceph_mon_metadata) / 2) + 1\" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query \"(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
summary: "Monitor quorum is at risk"
expr: |
(
(ceph_health_detail{name="MON_DOWN"} == 1) * on() (
count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1)
)
) == 1
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.3.1"
severity: "critical"
type: "ceph_default"
- alert: "CephMonDown"
annotations:
description: |
{{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
summary: "One or more monitors down"
expr: |
count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephMonDiskspaceCritical"
annotations:
description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit"
summary: "Filesystem space on at least one monitor is critically low"
expr: "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.3.2"
severity: "critical"
type: "ceph_default"
- alert: "CephMonDiskspaceLow"
annotations:
description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low"
summary: "Drive space on at least one monitor is approaching full"
expr: "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephMonClockSkew"
annotations:
description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew"
summary: "Clock skew detected among monitors"
expr: "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- name: "osd"
rules:
- alert: "CephOSDDownHigh"
annotations:
description: "{{ $value | humanize }}% or {{ with query \"count(ceph_osd_up == 0)\" }}{{ . | first | value }}{{ end }} of {{ with query \"count(ceph_osd_up)\" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
summary: "More than 10% of OSDs are down"
expr: "count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.1"
severity: "critical"
type: "ceph_default"
- alert: "CephOSDHostDown"
annotations:
description: "The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}"
summary: "An OSD host is offline"
expr: "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.8"
severity: "warning"
type: "ceph_default"
- alert: "CephOSDDown"
annotations:
description: |
{{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down"
summary: "An OSD has been marked down"
expr: "ceph_health_detail{name=\"OSD_DOWN\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.2"
severity: "warning"
type: "ceph_default"
- alert: "CephOSDNearFull"
annotations:
description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull"
summary: "OSD(s) running low on free space (NEARFULL)"
expr: "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.3"
severity: "warning"
type: "ceph_default"
- alert: "CephOSDFull"
annotations:
description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full"
summary: "OSD full, writes blocked"
expr: "ceph_health_detail{name=\"OSD_FULL\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.6"
severity: "critical"
type: "ceph_default"
- alert: "CephOSDBackfillFull"
annotations:
description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull"
summary: "OSD(s) too full for backfill operations"
expr: "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDTooManyRepairs"
annotations:
description: "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs"
summary: "OSD reports a high number of read errors"
expr: "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1"
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDTimeoutsPublicNetwork"
annotations:
description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
summary: "Network issues delaying OSD heartbeats (public network)"
expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDTimeoutsClusterNetwork"
annotations:
description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
summary: "Network issues delaying OSD heartbeats (cluster network)"
expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDInternalDiskSizeMismatch"
annotations:
description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch"
summary: "OSD size inconsistency error"
expr: "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephDeviceFailurePredicted"
annotations:
description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2"
summary: "Device(s) predicted to fail soon"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephDeviceFailurePredictionTooHigh"
annotations:
description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availabililty. Prevent data integrity issues by adding new OSDs so that data may be relocated."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany"
summary: "Too many devices are predicted to fail, unable to resolve"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.7"
severity: "critical"
type: "ceph_default"
- alert: "CephDeviceFailureRelocationIncomplete"
annotations:
description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use"
summary: "Device failure is predicted, but unable to relocate data"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephOSDFlapping"
annotations:
description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
documentation: "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds"
summary: "Network issues are causing OSDs to flap (mark each other down)"
expr: "(rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.4"
severity: "warning"
type: "ceph_default"
- alert: "CephOSDReadErrors"
annotations:
description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors"
summary: "Device read errors detected"
expr: "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1"
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPGImbalance"
annotations:
description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count."
summary: "PGs are not balanced across OSDs"
expr: |
abs(
((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) /
on (job) group_left avg(ceph_osd_numpg > 0) by (job)
) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.5"
severity: "warning"
type: "ceph_default"
- name: "mds"
rules:
- alert: "CephFilesystemDamaged"
annotations:
description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
summary: "CephFS filesystem is damaged."
expr: "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.1"
severity: "critical"
type: "ceph_default"
- alert: "CephFilesystemOffline"
annotations:
description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down"
summary: "CephFS filesystem is offline"
expr: "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.3"
severity: "critical"
type: "ceph_default"
- alert: "CephFilesystemDegraded"
annotations:
description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded"
summary: "CephFS filesystem is degraded"
expr: "ceph_health_detail{name=\"FS_DEGRADED\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.4"
severity: "critical"
type: "ceph_default"
- alert: "CephFilesystemMDSRanksLow"
annotations:
description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max"
summary: "Ceph MDS daemon count is lower than configured"
expr: "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephFilesystemInsufficientStandby"
annotations:
description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby"
summary: "Ceph filesystem standby daemons too few"
expr: "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephFilesystemFailureNoStandby"
annotations:
description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds"
summary: "MDS daemon failed, no further standby available"
expr: "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.5"
severity: "critical"
type: "ceph_default"
- alert: "CephFilesystemReadOnly"
annotations:
description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
summary: "CephFS filesystem in read only mode due to write error(s)"
expr: "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.5.2"
severity: "critical"
type: "ceph_default"
- name: "mgr"
rules:
- alert: "CephMgrModuleCrash"
annotations:
description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash"
summary: "A manager module has recently crashed"
expr: "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.6.1"
severity: "critical"
type: "ceph_default"
- alert: "CephMgrPrometheusModuleInactive"
annotations:
description: "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
summary: "The mgr/prometheus module is not available"
expr: "up{job=\"ceph\"} == 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.6.2"
severity: "critical"
type: "ceph_default"
- name: "pgs"
rules:
- alert: "CephPGsInactive"
annotations:
description: "{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests."
summary: "One or more placement groups are inactive"
expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.1"
severity: "critical"
type: "ceph_default"
- alert: "CephPGsUnclean"
annotations:
description: "{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure."
summary: "One or more placement groups are marked unclean"
expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0"
for: "15m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.2"
severity: "warning"
type: "ceph_default"
- alert: "CephPGsDamaged"
annotations:
description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged"
summary: "Placement group damaged, manual intervention needed"
expr: "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.4"
severity: "critical"
type: "ceph_default"
- alert: "CephPGRecoveryAtRisk"
annotations:
description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full"
summary: "OSDs are too full for recovery"
expr: "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.5"
severity: "critical"
type: "ceph_default"
- alert: "CephPGUnavilableBlockingIO"
annotations:
description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability"
summary: "PG is unavailable, blocking I/O"
expr: "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.3"
severity: "critical"
type: "ceph_default"
- alert: "CephPGBackfillAtRisk"
annotations:
description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full"
summary: "Backfill operations are blocked due to lack of free space"
expr: "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.6"
severity: "critical"
type: "ceph_default"
- alert: "CephPGNotScrubbed"
annotations:
description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed"
summary: "Placement group(s) have not been scrubbed"
expr: "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPGsHighPerOSD"
annotations:
description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs"
summary: "Placement groups per OSD is too high"
expr: "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPGNotDeepScrubbed"
annotations:
description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed"
summary: "Placement group(s) have not been deep scrubbed"
expr: "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- name: "nodes"
rules:
- alert: "CephNodeRootFilesystemFull"
annotations:
description: "Root volume is dangerously full: {{ $value | humanize }}% free."
summary: "Root filesystem is dangerously full"
expr: "node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100 < 5"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.1"
severity: "critical"
type: "ceph_default"
- alert: "CephNodeNetworkPacketDrops"
annotations:
description: "Node {{ $labels.instance }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ $labels.device }}."
summary: "One or more NICs reports packet drops"
expr: |
(
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0050000000000000001 and (
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeNetworkPacketErrors"
annotations:
description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
summary: "One or more NICs reports packet errors"
expr: |
(
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeDiskspaceWarning"
annotations:
description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate."
summary: "Host filesystem free space is getting low"
expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeInconsistentMTU"
annotations:
description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}."
summary: "MTU settings across Ceph hosts are inconsistent"
expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )"
labels:
severity: "warning"
type: "ceph_default"
- name: "pools"
rules:
- alert: "CephPoolGrowthWarning"
annotations:
description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
summary: "Pool growth rate may soon exceed capacity"
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
severity: "warning"
type: "ceph_default"
- alert: "CephPoolBackfillFull"
annotations:
description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity."
summary: "Free space in a pool is too low for recovery/backfill"
expr: "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPoolFull"
annotations:
description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full"
summary: "Pool is full - writes are blocked"
expr: "ceph_health_detail{name=\"POOL_FULL\"} > 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.9.1"
severity: "critical"
type: "ceph_default"
- alert: "CephPoolNearFull"
annotations:
description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
summary: "One or more Ceph pools are nearly full"
expr: "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- name: "healthchecks"
rules:
- alert: "CephSlowOps"
annotations:
description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
summary: "OSD operations are slow to complete"
expr: "ceph_healthcheck_slow_ops > 0"
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- name: "cephadm"
rules:
- alert: "CephadmUpgradeFailed"
annotations:
description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue"
summary: "Ceph version upgrade has failed"
expr: "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.11.2"
severity: "critical"
type: "ceph_default"
- alert: "CephadmDaemonFailed"
annotations:
description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'"
summary: "A ceph daemon manged by cephadm is down"
expr: "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.11.1"
severity: "critical"
type: "ceph_default"
- alert: "CephadmPaused"
annotations:
description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
documentation: "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused"
summary: "Orchestration tasks via cephadm are PAUSED"
expr: "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- name: "PrometheusServer"
rules:
- alert: "PrometheusJobMissing"
annotations:
description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
summary: "The scrape job for Ceph is missing from Prometheus"
expr: "absent(up{job=\"ceph\"})"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
severity: "critical"
type: "ceph_default"
- name: "rados"
rules:
- alert: "CephObjectMissing"
annotations:
description: "The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound"
summary: "Object(s) marked UNFOUND"
expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.1"
severity: "critical"
type: "ceph_default"
- name: "generic"
rules:
- alert: "CephDaemonCrash"
annotations:
description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash"
summary: "One or more Ceph daemons have crashed, and are pending acknowledgement"
expr: "ceph_health_detail{name=\"RECENT_CRASH\"} == 1"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.1.2"
severity: "critical"
type: "ceph_default"


@ -322,7 +322,6 @@ import re
import socket
import subprocess
import tempfile
import uuid
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
@ -339,7 +338,7 @@ LIBAPI = 0
# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 21
LIBPATCH = 22
logger = logging.getLogger(__name__)
@ -356,7 +355,10 @@ ALLOWED_KEYS = {
"sample_limit",
"label_limit",
"label_name_length_limit",
"label_value_lenght_limit",
"label_value_length_limit",
"scheme",
"basic_auth",
"tls_config",
}
DEFAULT_JOB = {
"metrics_path": "/metrics",
@ -639,6 +641,12 @@ class AlertRules:
logger.error("Failed to read alert rules from %s: %s", file_path.name, e)
return []
if not rule_file:
logger.warning("Empty rules file: %s", file_path.name)
return []
if not isinstance(rule_file, dict):
logger.error("Invalid rules file (must be a dict): %s", file_path.name)
return []
if _is_official_alert_rule_format(rule_file):
alert_groups = rule_file["groups"]
elif _is_single_alert_rule_format(rule_file):
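As context for the two new guards above: loading an empty YAML document yields
None, and non-mapping content yields a bare scalar, so neither can be treated
as a rules dict. A quick sketch of the behaviour being guarded against:

```python
# Why both guards are needed when loading a rules file with PyYAML.
import yaml

for text in ("", "not-a-rule", "groups: []"):
    print(repr(yaml.safe_load(text)))
# prints: None, then 'not-a-rule', then {'groups': []}
```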
@ -920,7 +928,7 @@ class MetricsEndpointConsumer(Object):
for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items():
filename = "juju_" + topology_identifier + ".rules"
path = os.path.join(PROMETHEUS_RULES_DIR, filename)
rules = yaml.dump(alert_rule_groups)
rules = yaml.safe_dump(alert_rule_groups)
container.push(path, rules, make_dirs=True)
```
@ -937,7 +945,6 @@ class MetricsEndpointConsumer(Object):
if not alert_rules:
continue
identifier = None
try:
scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
identifier = JujuTopology.from_dict(scrape_metadata).identifier
@ -1118,7 +1125,7 @@ class MetricsEndpointConsumer(Object):
# label all static configs in the Prometheus job
# labeling inserts Juju topology information and
# sets a relable config for instance labels
# sets a relabeling config for instance labels
for static_config in static_configs:
labels = static_config.get("labels", {}) if static_configs else {}
all_targets = static_config.get("targets", [])
@ -1187,7 +1194,7 @@ class MetricsEndpointConsumer(Object):
Returns:
a copy of the `labels` dictionary augmented with Juju
topology information with the exception of unit name.
topology information except for unit name.
"""
juju_labels = labels.copy() # deep copy not needed
juju_labels.update(JujuTopology.from_dict(scrape_metadata).label_matcher_dict)
@ -1262,7 +1269,7 @@ class MetricsEndpointConsumer(Object):
def _dedupe_job_names(jobs: List[dict]):
"""Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key.
Additionally fully dedeuplicate any identical jobs.
Additionally, fully de-duplicate any identical jobs.
Args:
jobs: A list of prometheus scrape jobs
@ -1345,6 +1352,7 @@ class MetricsEndpointProvider(Object):
jobs=None,
alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
external_hostname: str = None,
):
"""Construct a metrics provider for a Prometheus charm.
@ -1430,7 +1438,7 @@ class MetricsEndpointProvider(Object):
Args:
charm: a `CharmBase` object that manages this
`MetricsEndpointProvider` object. Typically this is
`MetricsEndpointProvider` object. Typically, this is
`self` in the instantiating class.
relation_name: an optional string name of the relation between `charm`
and the Prometheus charmed service. The default is "metrics-endpoint".
@ -1449,6 +1457,8 @@ class MetricsEndpointProvider(Object):
The alert rules are automatically updated on charm upgrade.
refresh_event: an optional bound event or list of bound events which
will be observed to re-set scrape job data (IP address and others)
external_hostname: an optional external hostname (for example, one provided
by an ingress or a proxy) that is advertised to Prometheus instead of the unit address.
Raises:
RelationNotFoundError: If there is no relation in the charm's metadata.yaml
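A hypothetical use of the new argument from a provider charm (the class name,
job, and hostname below are made up) might look like this:

```python
# Sketch: advertise an ingress hostname to Prometheus instead of the unit IP.
# Assumes the charm declares a 'metrics-endpoint' provides relation.
from ops.charm import CharmBase

from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider


class MyMetricsProviderCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        self.metrics_endpoint = MetricsEndpointProvider(
            self,
            jobs=[{"static_configs": [{"targets": ["*:9283"]}]}],
            external_hostname="metrics.example.com",
        )
```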
@ -1482,7 +1492,7 @@ class MetricsEndpointProvider(Object):
# sanitize job configurations to the supported subset of parameters
jobs = [] if jobs is None else jobs
self._jobs = [_sanitize_scrape_configuration(job) for job in jobs]
self.external_hostname = external_hostname
events = self._charm.on[self._relation_name]
self.framework.observe(events.relation_joined, self._set_scrape_job_spec)
self.framework.observe(events.relation_changed, self._on_relation_changed)
@ -1510,7 +1520,7 @@ class MetricsEndpointProvider(Object):
refresh_event = [refresh_event]
for ev in refresh_event:
self.framework.observe(ev, self._set_unit_ip)
self.framework.observe(ev, self._set_scrape_job_spec)
self.framework.observe(self._charm.on.upgrade_charm, self._set_scrape_job_spec)
@ -1539,7 +1549,7 @@ class MetricsEndpointProvider(Object):
When a metrics provider charm is related to a prometheus charm, the
metrics provider sets specification and metadata related to its own
scrape configuration. This information is set using Juju application
data. In addition each of the consumer units also sets its own
data. In addition, each of the consumer units also sets its own
host address in Juju unit relation data.
"""
self._set_unit_ip(event)
@ -1568,16 +1578,21 @@ class MetricsEndpointProvider(Object):
Each time a metrics provider charm container is restarted it updates its own
host address in the unit relation data for the prometheus charm.
The only argument specified is an event and it ignored. this is for expediency
The only argument specified is an event, and it is ignored. This is for expediency
to be able to use this method as an event handler, although no access to the
event is actually needed.
"""
for relation in self._charm.model.relations[self._relation_name]:
unit_ip = str(self._charm.model.get_binding(relation).network.bind_address)
relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = (
unit_ip if self._is_valid_unit_address(unit_ip) else socket.getfqdn()
)
if self.external_hostname:
unit_address = self.external_hostname
elif self._is_valid_unit_address(unit_ip):
unit_address = unit_ip
else:
unit_address = socket.getfqdn()
relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address
relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str(
self._charm.model.unit.name
)
@ -1634,7 +1649,7 @@ class PrometheusRulesProvider(Object):
relation_name: Name of the relation in `metadata.yaml` that
has the `prometheus_scrape` interface.
dir_path: Root directory for the collection of rule files.
recursive: Whether or not to scan for rule files recursively.
recursive: Whether to scan for rule files recursively.
"""
def __init__(
@ -1696,7 +1711,7 @@ class MetricsEndpointAggregator(Object):
`MetricsEndpointAggregator` collects scrape target information from one
or more related charms and forwards this to a `MetricsEndpointConsumer`
charm, which may be in a different Juju model. However it is
charm, which may be in a different Juju model. However, it is
essential that `MetricsEndpointAggregator` itself resides in the same
model as its scrape targets, as this is currently the only way to
ensure in Juju that the `MetricsEndpointAggregator` will be able to
@ -1765,7 +1780,7 @@ class MetricsEndpointAggregator(Object):
information, just like `MetricsEndpointProvider` and
`MetricsEndpointConsumer` do.
By default `MetricsEndpointAggregator` ensures that Prometheus
By default, `MetricsEndpointAggregator` ensures that Prometheus
"instance" labels refer to Juju topology. This ensures that
instance labels are stable over unit recreation. While it is not
advisable to change this option, if required it can be done by
@ -1778,7 +1793,7 @@ class MetricsEndpointAggregator(Object):
Args:
charm: a `CharmBase` object that manages this
`MetricsEndpointAggregator` object. Typically this is
`MetricsEndpointAggregator` object. Typically, this is
`self` in the instantiating class.
relation_names: a dictionary with three keys. The value
of the "scrape_target" and "alert_rules" keys are
@ -1843,7 +1858,7 @@ class MetricsEndpointAggregator(Object):
When there is any change in relation data with any scrape
target, the Prometheus scrape job, for that specific target is
updated. Additionally, if this method is called manually, do the
sameself.
same.
Args:
targets: a `dict` containing target information
@ -1985,7 +2000,7 @@ class MetricsEndpointAggregator(Object):
Scrape target information is returned for each unit in the
relation. This information contains the unit name, network
hostname (or address) for that unit, and port on which an
hostname (or address) for that unit, and port on which a
metrics endpoint is exposed in that unit.
Args:
@ -2142,7 +2157,7 @@ class MetricsEndpointAggregator(Object):
labels are stable across unit recreation.
Returns:
a list of Prometheus relabling configurations. Each item in
a list of Prometheus relabeling configurations. Each item in
this list is one relabel configuration.
"""
return (
@ -2216,22 +2231,7 @@ class CosTool:
with tempfile.TemporaryDirectory() as tmpdir:
rule_path = Path(tmpdir + "/validate_rule.yaml")
# Smash "our" rules format into what upstream actually uses, which is more like:
#
# groups:
# - name: foo
# rules:
# - alert: SomeAlert
# expr: up
# - alert: OtherAlert
# expr: up
transformed_rules = {"groups": []} # type: ignore
for rule in rules["groups"]:
transformed = {"name": str(uuid.uuid4()), "rules": [rule]}
transformed_rules["groups"].append(transformed)
rule_path.write_text(yaml.dump(transformed_rules))
rule_path.write_text(yaml.dump(rules))
args = [str(self.path), "validate", str(rule_path)]
# noinspection PyBroadException
@ -2240,7 +2240,13 @@ class CosTool:
return True, ""
except subprocess.CalledProcessError as e:
logger.debug("Validating the rules failed: %s", e.output)
return False, ", ".join([line for line in e.output if "error validating" in line])
return False, ", ".join(
[
line
for line in e.output.decode("utf8").splitlines()
if "error validating" in line
]
)
def inject_label_matchers(self, expression, topology) -> str:
"""Add label matchers to an expression."""
@ -2277,6 +2283,5 @@ class CosTool:
return None
def _exec(self, cmd) -> str:
result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
output = result.stdout.decode("utf-8").strip()
return output
result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
return result.stdout.decode("utf-8").strip()


@ -43,3 +43,8 @@ provides:
requires:
bootstrap-source:
interface: ceph-bootstrap
resources:
alert-rules:
type: file
filename: alert.yaml.rules
description: "Alerting rules"


@ -1235,7 +1235,7 @@ def is_unsupported_cmr(unit_name):
return unsupported
def assess_status():
def assess_status(charm=None):
'''Assess status of current unit'''
application_version_set(get_upstream_version(VERSION_PACKAGE))
if not config('permit-insecure-cmr'):
@ -1291,6 +1291,9 @@ def assess_status():
status_set('blocked', str(e))
return
if charm is not None and charm.metrics_endpoint.assess_alert_rule_errors():
return
# active - bootstrapped + quorum status check
if ceph.is_bootstrapped() and ceph.is_quorum():
expected_osd_count = config('expected-osd-count') or 3


@ -5,9 +5,16 @@
Configure prometheus scrape jobs via the metrics-endpoint relation.
"""
import json
import logging
import os.path
import pathlib
from typing import Optional, Union, List
import ops.model
from ops.model import BlockedStatus
import charm
from charms.prometheus_k8s.v0 import prometheus_scrape
from charms_ceph import utils as ceph_utils
from ops.framework import BoundEvent
@ -19,15 +26,16 @@ DEFAULT_CEPH_JOB = {
"metrics_path": "/metrics",
"static_configs": [{"targets": ["*:9283"]}],
}
DEFAULT_ALERT_RULES_RELATIVE_PATH = "files/prometheus_alert_rules"
class CephMetricsEndpointProvider(prometheus_scrape.MetricsEndpointProvider):
def __init__(
self,
charm,
charm: charm.CephMonCharm,
relation_name: str = prometheus_scrape.DEFAULT_RELATION_NAME,
jobs=None,
alert_rules_path: str = prometheus_scrape.DEFAULT_ALERT_RULES_RELATIVE_PATH, # noqa
alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
):
if jobs is None:
@ -43,6 +51,11 @@ class CephMetricsEndpointProvider(prometheus_scrape.MetricsEndpointProvider):
self.framework.observe(
events.relation_departed, self._on_relation_departed
)
self.framework.observe(
self.on.alert_rule_status_changed,
self._on_alert_rule_status_changed,
)
charm._stored.set_default(alert_rule_errors=None)
def _on_relation_changed(self, event):
"""Enable prometheus on relation change"""
@ -63,3 +76,64 @@ class CephMetricsEndpointProvider(prometheus_scrape.MetricsEndpointProvider):
)
ceph_utils.mgr_disable_module("prometheus")
logger.debug("module_disabled")
# We're not related to prom, don't care about alert rules
self._charm._stored.alert_rule_errors = None
def assess_alert_rule_errors(self):
if self._charm._stored.alert_rule_errors:
self._charm.unit.status = BlockedStatus(
"invalid alert rules, check unit logs"
)
return True
def _on_alert_rule_status_changed(self, event):
logger.debug(
"alert rule status changed: %s, %s, %s",
event,
event.valid,
event.errors,
)
if event.errors:
logger.warning("invalid alert rules: %s", event.errors)
self._charm._stored.alert_rule_errors = event.errors
else:
self._charm._stored.alert_rule_errors = None
def get_alert_rules_resource(self):
try:
return self._charm.model.resources.fetch("alert-rules")
except ops.model.ModelError as e:
logger.warning("can't get alert-rules resource: %s", e)
def _set_alert_rules(self, rules_dict):
logger.debug("set alert rules: %s", rules_dict)
# alert rules seem ok locally, clear any errors
# prometheus may still signal alert rule errors
# via the relation though
self._charm._stored.alert_rule_errors = None
for relation in self._charm.model.relations[self._relation_name]:
relation.data[self._charm.app]["alert_rules"] = json.dumps(
rules_dict
)
def update_alert_rules(self):
if self._charm.unit.is_leader() and ceph_utils.is_bootstrapped():
resource = self.get_alert_rules_resource()
if resource is None or not os.path.getsize(resource):
logger.debug("empty rules resource, clearing alert rules")
self._set_alert_rules({})
return
sink = pathlib.Path(self._alert_rules_path) / "alert.yaml.rules"
if sink.exists():
sink.unlink()
sink.symlink_to(resource)
alert_rules = prometheus_scrape.AlertRules(topology=self.topology)
alert_rules.add_path(str(sink), recursive=True)
alert_rules_as_dict = alert_rules.as_dict()
if not alert_rules_as_dict:
msg = "invalid alert rules: {}".format(sink.open().read())
logger.warning(msg)
self._charm._stored.alert_rule_errors = msg
return
self._set_alert_rules(alert_rules_as_dict)
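For orientation, the value published under the `alert_rules` key is the JSON
rendering of `AlertRules.as_dict()`, which is keyed on `groups` (the unit test
at the end of this change asserts exactly that). An invented minimal payload
might look like this; note that the library may also inject Juju topology
labels into each rule:

```python
# Illustrative shape of the "alert_rules" app relation data.
# Group and alert names are invented.
import json

payload = {
    "groups": [
        {
            "name": "testgroup",
            "rules": [
                {
                    "alert": "ExampleAlert",
                    "expr": "up == 0",
                    "labels": {"severity": "warning"},
                }
            ],
        }
    ]
}
print(json.dumps(payload))
```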


@ -40,58 +40,77 @@ class CephMonCharm(ops_openstack.core.OSBaseCharm):
systemd.service_pause('ceph-create-keys')
except systemd.SystemdError:
pass
hooks.assess_status(self)
def on_config(self, event):
hooks.config_changed()
hooks.assess_status(self)
def on_pre_series_upgrade(self, event):
hooks.pre_series_upgrade()
hooks.assess_status(self)
def on_upgrade(self, event):
self.metrics_endpoint.update_alert_rules()
hooks.upgrade_charm()
hooks.assess_status(self)
def on_post_series_upgrade(self, event):
hooks.post_series_upgrade()
hooks.assess_status(self)
# Relations.
def on_mon_relation_joined(self, event):
hooks.mon_relation_joined()
hooks.assess_status(self)
def on_bootstrap_source_relation_changed(self, event):
hooks.bootstrap_source_relation_changed()
hooks.assess_status(self)
def on_prometheus_relation_joined_or_changed(self, event):
hooks.prometheus_relation()
hooks.assess_status(self)
def on_prometheus_relation_departed(self, event):
hooks.prometheus_left()
hooks.assess_status(self)
def on_mon_relation(self, event):
hooks.mon_relation()
hooks.assess_status(self)
def on_osd_relation(self, event):
hooks.osd_relation()
hooks.assess_status(self)
def on_dashboard_relation_joined(self, event):
hooks.dashboard_relation()
hooks.assess_status(self)
def on_radosgw_relation(self, event):
hooks.radosgw_relation()
hooks.assess_status(self)
def on_rbd_mirror_relation(self, event):
hooks.rbd_mirror_relation()
hooks.assess_status(self)
def on_mds_relation(self, event):
hooks.mds_relation_joined()
hooks.assess_status(self)
def on_admin_relation(self, event):
hooks.admin_relation_joined()
hooks.assess_status(self)
def on_client_relation(self, event):
hooks.client_relation()
hooks.assess_status(self)
def on_nrpe_relation(self, event):
hooks.update_nrpe_config()
hooks.assess_status(self)
# Actions.
@ -195,4 +214,3 @@ class CephMonCharm(ops_openstack.core.OSBaseCharm):
if __name__ == '__main__':
main(CephMonCharm)
hooks.assess_status()


@ -1,4 +1,8 @@
#!/usr/bin/env python3
import json
import pathlib
import tempfile
import textwrap
# Copyright 2022 Canonical Ltd.
# See LICENSE file for licensing details.
@ -9,10 +13,30 @@ import unittest
from ops import storage, model, framework
from ops.testing import Harness, _TestingModelBackend
import ceph_metrics # noqa: avoid circ. import
import charm
class TestCephMetrics(unittest.TestCase):
@classmethod
def setUpClass(cls):
"""Run once before tests begin."""
cls.tempdir = tempfile.TemporaryDirectory()
cls.tmp = pathlib.Path(cls.tempdir.name)
cls.rules_dir = cls.tmp / "rules"
cls.rules_dir.mkdir()
cls.rules = textwrap.dedent(
"""
groups:
- name: "testgroup"
rules: []
"""
)
@classmethod
def tearDownClass(cls):
cls.tempdir.cleanup()
def setUp(self):
super().setUp()
self.harness = Harness(charm.CephMonCharm)
@ -42,9 +66,12 @@ class TestCephMetrics(unittest.TestCase):
self.harness._model,
)
# END Workaround
self.addCleanup(self.harness.cleanup)
self.harness.begin()
self.harness.set_leader(True)
self.harness.charm.metrics_endpoint._alert_rules_path = self.rules_dir
def test_init(self):
self.assertEqual(
@ -94,3 +121,40 @@ class TestCephMetrics(unittest.TestCase):
self.harness.remove_relation(rel_id)
mgr_disable_module.assert_called_once()
def get_alert_rules(self, rel_id):
app_rel_data = self.harness.get_relation_data(
rel_id, self.harness.model.app
)
return json.loads(app_rel_data["alert_rules"])
@patch("ceph_metrics.ceph_utils.is_bootstrapped", return_value=True)
@patch("ceph_metrics.CephMetricsEndpointProvider._set_alert_rules")
def test_update_alert_rules_empty(
self, set_alert_rules, _is_bootstrapped,
):
"""Test: no alert rules created with empty alert rules file."""
rel_id = self.harness.add_relation("metrics-endpoint", "prometheus")
self.harness.add_relation_unit(rel_id, "prometheus/0")
self.harness.add_resource("alert-rules", "")
self.harness.charm.metrics_endpoint.update_alert_rules()
set_alert_rules.assert_called_with({})
@patch("ceph_metrics.ceph_utils.is_bootstrapped", return_value=True)
def test_update_alert_rules_invalid(self, _is_bootstrapped):
rel_id = self.harness.add_relation("metrics-endpoint", "prometheus")
self.harness.add_relation_unit(rel_id, "prometheus/0")
self.harness.add_resource("alert-rules", "not-a-rule")
self.harness.charm.metrics_endpoint.update_alert_rules()
self.assertTrue(
self.harness.charm.metrics_endpoint.assess_alert_rule_errors()
)
@patch("ceph_metrics.ceph_utils.is_bootstrapped", return_value=True)
def test_update_alert_rules(self, _is_bootstrapped):
rel_id = self.harness.add_relation("metrics-endpoint", "prometheus")
self.harness.add_relation_unit(rel_id, "prometheus/0")
self.harness.add_resource("alert-rules", self.rules)
self.harness.charm.metrics_endpoint.update_alert_rules()
alert_rules = self.get_alert_rules(rel_id)
self.assertTrue(alert_rules.get("groups"))