140 lines
5.0 KiB
Python
140 lines
5.0 KiB
Python
# Copyright 2022 Canonical Ltd.
# See LICENSE file for licensing details.

"""Provide ceph metrics to prometheus.

Configure prometheus scrape jobs via the metrics-endpoint relation.
"""
|
|
import json
|
|
import logging
|
|
import os.path
|
|
import pathlib
|
|
from typing import Optional, Union, List, TYPE_CHECKING
|
|
|
|
import ops.model
|
|
|
|
if TYPE_CHECKING:
|
|
import charm
|
|
|
|
from charms.prometheus_k8s.v0 import prometheus_scrape
|
|
from charms_ceph import utils as ceph_utils
|
|
from ops.framework import BoundEvent
|
|
from utils import mgr_config_set_rbd_stats_pools
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Scrape job used when the caller does not supply its own ``jobs`` list.
# NOTE(review): 9283 is presumably the ceph-mgr prometheus module port and
# "*" a per-unit wildcard expanded by the scrape library -- confirm.
DEFAULT_CEPH_JOB = {
    "metrics_path": "/metrics",
    "static_configs": [
        {"targets": ["*:9283"]},
    ],
}

# Default directory for alert rules; passed through as ``alert_rules_path``.
DEFAULT_ALERT_RULES_RELATIVE_PATH = "files/prometheus_alert_rules"
|
|
|
|
|
|
class CephMetricsEndpointProvider(prometheus_scrape.MetricsEndpointProvider):
    """Metrics endpoint provider for the ceph-mon charm.

    Extends the generic ``MetricsEndpointProvider`` so that, on the leader
    of a bootstrapped cluster, the ceph mgr "prometheus" module is enabled
    when the relation changes and disabled when it departs. It also
    publishes alert rules taken from the ``alert-rules`` resource and keeps
    track of alert rule errors in the charm's stored state
    (``charm._stored.alert_rule_errors``).
    """

    def __init__(
        self,
        charm: "charm.CephMonCharm",
        relation_name: str = prometheus_scrape.DEFAULT_RELATION_NAME,
        jobs=None,
        alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
        refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
    ):
        """Set up the provider and register the extra event observers.

        Args:
            charm: the charm owning this provider; its ``_stored`` state
                is used to record alert rule errors.
            relation_name: name of the metrics relation.
            jobs: scrape job definitions; ``[DEFAULT_CEPH_JOB]`` is used
                when ``None``.
            alert_rules_path: directory holding the alert rule files.
            refresh_event: passed through unchanged to the base class.
        """
        if jobs is None:
            jobs = [DEFAULT_CEPH_JOB]
        super().__init__(
            charm,
            relation_name=relation_name,
            jobs=jobs,
            alert_rules_path=alert_rules_path,
            refresh_event=refresh_event,
        )
        events = charm.on[relation_name]
        self.framework.observe(
            events.relation_departed, self._on_relation_departed
        )
        self.framework.observe(
            self.on.alert_rule_status_changed,
            self._on_alert_rule_status_changed,
        )
        charm._stored.set_default(alert_rule_errors=None)

    def _on_relation_changed(self, event):
        """Enable prometheus on relation change"""
        if self._charm.unit.is_leader() and ceph_utils.is_bootstrapped():
            logger.debug(
                "is_leader and is_bootstrapped, running rel changed: %s", event
            )
            mgr_config_set_rbd_stats_pools()
            ceph_utils.mgr_enable_module("prometheus")
            logger.debug("module_enabled")
            self.update_alert_rules()
            # NOTE(review): statement nesting was reconstructed from a copy
            # of this file that had lost its indentation; confirm that the
            # base-class handler is meant to run only under this guard.
            super()._on_relation_changed(event)

    def _on_relation_departed(self, event):
        """Disable prometheus on depart of relation"""
        if self._charm.unit.is_leader() and ceph_utils.is_bootstrapped():
            logger.debug(
                "is_leader and is_bootstrapped, running rel departed: %s",
                event,
            )
            ceph_utils.mgr_disable_module("prometheus")
            logger.debug("module_disabled")
            # We're not related to prom, don't care about alert rules
            self._charm._stored.alert_rule_errors = None

    def have_alert_rule_errors(self) -> bool:
        """Return True if alert rule errors are currently recorded."""
        return bool(self._charm._stored.alert_rule_errors)

    def _on_alert_rule_status_changed(self, event):
        """Record or clear alert rule errors carried by the event."""
        logger.debug(
            "alert rule status changed: %s, %s, %s",
            event,
            event.valid,
            event.errors,
        )
        if event.errors:
            logger.warning("invalid alert rules: %s", event.errors)
            self._charm._stored.alert_rule_errors = event.errors
        else:
            self._charm._stored.alert_rule_errors = None

    def get_alert_rules_resource(self):
        """Fetch the ``alert-rules`` resource.

        Returns:
            Whatever ``resources.fetch`` returns for the resource, or
            ``None`` when fetching raises ``ModelError`` (a warning is
            logged in that case).
        """
        try:
            return self._charm.model.resources.fetch("alert-rules")
        except ops.model.ModelError as e:
            logger.warning("can't get alert-rules resource: %s", e)
            # Explicit so the None check in update_alert_rules() is clearly
            # part of the contract rather than an accident.
            return None

    def _set_alert_rules(self, rules_dict):
        """Publish ``rules_dict`` as JSON on every relation's app data."""
        logger.debug("set alert rules: %s", rules_dict)
        # alert rules seem ok locally, clear any errors
        # prometheus may still signal alert rule errors
        # via the relation though
        self._charm._stored.alert_rule_errors = None

        for relation in self._charm.model.relations[self._relation_name]:
            relation.data[self._charm.app]["alert_rules"] = json.dumps(
                rules_dict
            )

    def update_alert_rules(self):
        """Load alert rules from the resource and publish them.

        Does nothing unless this unit is the leader and the cluster is
        bootstrapped. A missing or empty resource clears the published
        rules. Otherwise the resource is symlinked into the alert rules
        directory and parsed; if parsing yields no rules, an error is
        stored in the charm state and nothing is published.
        """
        if not (
            self._charm.unit.is_leader() and ceph_utils.is_bootstrapped()
        ):
            return

        resource = self.get_alert_rules_resource()
        if resource is None or not os.path.getsize(resource):
            logger.debug("empty rules resource, clearing alert rules")
            self._set_alert_rules({})
            return

        sink = pathlib.Path(self._alert_rules_path) / "alert.yaml.rules"
        # is_symlink() also catches a dangling link, for which exists()
        # reports False.
        if sink.exists() or sink.is_symlink():
            sink.unlink()
        sink.symlink_to(resource)

        alert_rules = prometheus_scrape.AlertRules(topology=self.topology)
        alert_rules.add_path(str(sink), recursive=True)
        alert_rules_as_dict = alert_rules.as_dict()
        if not alert_rules_as_dict:
            # read_text() closes the file, unlike a bare open().read().
            msg = "invalid alert rules: {}".format(sink.read_text())
            logger.warning(msg)
            self._charm._stored.alert_rule_errors = msg
            return

        self._set_alert_rules(alert_rules_as_dict)
|