244 lines
11 KiB
Python
244 lines
11 KiB
Python
# (C) Copyright 2017 Hewlett Packard Enterprise Development LP
|
|
import requests
|
|
|
|
from monasca_agent.collector import checks
|
|
from monasca_agent.collector.checks import utils
|
|
|
|
# Fallback for the 'connection_timeout' init_config option (passed to the
# HTTP client as its timeout value).
DEFAULT_TIMEOUT = 5

# Node condition types that are turned into metrics.  For each condition type:
#   metric_name     - metric suffix, reported as "kubernetes.<metric_name>"
#   expected_status - status string for which the metric is reported as 0;
#                     any other status is reported as 1
NODE_CONDITIONS_MAP = {
    "OutOfDisk": {
        "metric_name": "node.out_of_disk",
        "expected_status": "False"
    },
    "MemoryPressure": {
        "metric_name": "node.memory_pressure",
        "expected_status": "False"
    },
    "DiskPressure": {
        "metric_name": "node.disk_pressure",
        "expected_status": "False"
    },
    "Ready": {
        "metric_name": "node.ready_status",
        "expected_status": "True"
    }
}
|
|
|
|
|
|
class KubernetesAPI(checks.AgentCheck):
    """Queries Kubernetes API to get metrics about the Kubernetes deployment.

    Reports gauges for API health, component statuses, node conditions and
    resources, deployments and replication controllers.  Status gauges use
    0 for "as expected" and 1 for "not as expected".
    """

    def __init__(self, name, init_config, agent_config, instances=None):
        """Initialise the check.

        :param name: check name, forwarded to AgentCheck
        :param init_config: plugin-wide config; 'connection_timeout' is read
            from it (defaults to DEFAULT_TIMEOUT)
        :param agent_config: agent config, forwarded to AgentCheck
        :param instances: configured instances; at most one is supported
        :raises Exception: if more than one instance is configured
        """
        checks.AgentCheck.__init__(self, name, init_config, agent_config, instances)
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes api check only supports one configured instance.')
        self.connection_timeout = int(init_config.get('connection_timeout', DEFAULT_TIMEOUT))
        # Exactly one of the two below is set by prepare_run().
        self.kubernetes_connector = None
        self.kubernetes_api = None

    def prepare_run(self):
        """Set up Kubernetes connection information.

        Uses the instance's 'host' (and 'kubernetes_api_port', default
        "8080") to build a plain-HTTP base URL.  Without a host, a
        utils.KubernetesConnector is created when 'derive_api_url' is set.

        :raises Exception: if neither 'host' nor 'derive_api_url' is set
        """
        instance = self.instances[0]
        host = instance.get("host", None)
        derive_api_url = instance.get("derive_api_url", None)
        if host:
            kubernetes_api_port = instance.get("kubernetes_api_port", "8080")
            self.kubernetes_api = "http://{}:{}".format(host, kubernetes_api_port)
            return
        if derive_api_url:
            self.kubernetes_connector = utils.KubernetesConnector(self.connection_timeout)
            return
        exception_message = "Either Kubernetes API url (host and port)" \
                            " or derive_api_url=True must be set" \
                            " when running Kubernetes API plugin."
        self.log.error(exception_message)
        raise Exception(exception_message)

    def check(self, instance):
        """Collect and report all Kubernetes API metrics for one instance.

        :param instance: instance config; 'kubernetes_labels' (default
            ["app"]) lists the labels copied into metric dimensions
        """
        kubernetes_labels = instance.get('kubernetes_labels', ["app"])
        dimensions = self._set_dimensions(None, instance)
        # Remove hostname from dimensions as the majority of the metrics are not
        # tied to the hostname.  pop() tolerates the key being absent.
        dimensions.pop('hostname', None)
        kubernetes_api_health = self._get_api_health()
        self.gauge("kubernetes.api.health_status", 0 if kubernetes_api_health else 1, dimensions,
                   hostname="SUPPRESS")
        self._report_cluster_component_statuses(dimensions)
        self._report_nodes_metrics(dimensions)
        self._report_deployment_metrics(dimensions, kubernetes_labels)
        self._report_replication_controller_metrics(dimensions, kubernetes_labels)

    def _send_request(self, endpoint, as_json=True):
        """Issue a GET against the Kubernetes API.

        :param endpoint: API path, with or without a leading '/'
        :param as_json: return the decoded JSON body when True, otherwise
            the raw response object
        :returns: decoded JSON or the response, depending on ``as_json``
        """
        if self.kubernetes_connector:
            return self.kubernetes_connector.get_request(endpoint, as_json=as_json)
        # Strip the leading slash so the URL does not contain '//' after the
        # port, and honour the configured timeout so a stuck API server
        # cannot block the collector indefinitely.
        url = "{}/{}".format(self.kubernetes_api, endpoint.lstrip('/'))
        result = requests.get(url, timeout=self.connection_timeout)
        return result.json() if as_json else result

    def _get_api_health(self):
        """Return True if the 'healthz' endpoint answers with an 'ok' line."""
        try:
            result = self._send_request("healthz", as_json=False)
        except Exception as e:
            self.log.error("Error connecting to the health endpoint with exception {}".format(e))
            return False
        for line in result.iter_lines():
            # iter_lines() yields bytes on Python 3; decode before comparing,
            # otherwise 'ok' never matches and the API always looks unhealthy.
            if isinstance(line, bytes):
                line = line.decode('utf-8', 'replace')
            if line.strip() == 'ok':
                return True
        return False

    def _report_cluster_component_statuses(self, dimensions):
        """Report kubernetes.component_status (0 healthy, 1 not) per component.

        :param dimensions: base dimensions copied for every component
        """
        try:
            component_statuses = self._send_request("/api/v1/componentstatuses")
        except Exception as e:
            self.log.error("Error getting data from Kubernetes API - {}".format(e))
            return
        for component in component_statuses['items']:
            component_dimensions = dimensions.copy()
            component_dimensions['component_name'] = component['metadata']['name']
            # The API reports status as the strings "True"/"False"/"Unknown";
            # a plain truthiness test would treat "False" as healthy.
            component_status = any(
                condition.get('type') == 'Healthy' and condition.get('status') == 'True'
                for condition in component.get('conditions') or [])
            self.gauge(
                "kubernetes.component_status",
                0 if component_status else 1,
                component_dimensions,
                hostname="SUPPRESS")

    def _set_kubernetes_dimensions(self, dimensions, type, metadata, kubernetes_labels):
        """Add name, namespace and selected labels of a resource to dimensions.

        :param dimensions: dict updated in place
        :param type: resource kind passed by callers (e.g. "deployment")
        :param metadata: the resource's metadata dict ('name', 'namespace',
            optional 'labels')
        :param kubernetes_labels: label keys to copy into dimensions
        """
        # NOTE(review): the ``type`` argument is accepted but never used; the
        # resource name is stored under the literal key 'type'.  Left as-is
        # because changing the key would alter the emitted dimensions -
        # confirm the intended behaviour.
        dimensions['type'] = metadata['name']
        dimensions['namespace'] = metadata['namespace']
        labels = metadata.get('labels') or {}
        for label in kubernetes_labels:
            if label in labels:
                dimensions[label] = labels[label]

    def _report_node_resource_metrics(self, resource, metrics, node_dimensions):
        """Report kubernetes.node.<resource>.<metric> gauges for one node.

        :param resource: resource group name used in the metric name
            (callers pass 'capacity' or 'allocatable')
        :param metrics: mapping of resource name to quantity
        :param node_dimensions: dimensions of the node; not modified
        """
        for metric_name, metric_value in metrics.items():
            if "gpu" in metric_name:
                continue
            # Fresh copy per metric so a 'unit' set for one metric does not
            # leak into the metrics reported after it.
            metric_dimensions = node_dimensions.copy()
            if metric_name == "memory":
                metric_name += "_bytes"
                metric_value = utils.convert_memory_string_to_bytes(metric_value)
                metric_dimensions['unit'] = 'bytes'
            try:
                if metric_name == "cpu":
                    metric_dimensions['unit'] = 'cores'
                    cpu_text = str(metric_value)
                    # Kubernetes may express CPU in millicores, e.g. "100m".
                    if cpu_text.endswith('m'):
                        metric_value = float(cpu_text[:-1]) / 1000.0
                metric_value = float(metric_value)
            except (TypeError, ValueError):
                # Quantities with other suffixes cannot be converted here;
                # skip them rather than abort the whole check.
                self.log.warning("Skipping node {} metric {} with non-numeric value {}".format(
                    resource, metric_name, metric_value))
                continue
            metric_name = "kubernetes.node.{}.{}".format(resource, metric_name)
            self.gauge(metric_name, metric_value, metric_dimensions)

    def _report_node_conditions_metrics(self, node_conditions, node_dimensions):
        """Report a 0/1 gauge for each node condition in NODE_CONDITIONS_MAP.

        0 means the condition has its expected status; otherwise 1 is
        reported with the condition message (truncated to 1024 characters)
        attached as value_meta.

        :param node_conditions: list of condition dicts from the node status
        :param node_dimensions: dimensions of the node
        """
        for condition in node_conditions:
            condition_map = NODE_CONDITIONS_MAP.get(condition["type"])
            if condition_map is None:
                continue
            metric_name = "kubernetes." + condition_map['metric_name']
            if condition['status'] == condition_map['expected_status']:
                self.gauge(metric_name, 0, node_dimensions)
            else:
                # 'message' may be absent; do not fail the check over it.
                value_meta = {"reason": (condition.get('message') or '')[:1024]}
                self.gauge(metric_name, 1, node_dimensions, value_meta=value_meta)

    def _report_nodes_metrics(self, dimensions):
        """Report condition and resource metrics for every node.

        Resource (capacity/allocatable) metrics are skipped for nodes marked
        unschedulable; condition metrics are always reported.

        :param dimensions: base dimensions copied for every node
        """
        try:
            nodes = self._send_request("/api/v1/nodes")
        except Exception as e:
            self.log.error("Error getting node data from Kubernetes API - {}".format(e))
            return
        for node in nodes['items']:
            node_dimensions = dimensions.copy()
            node_dimensions['hostname'] = node['metadata']['name']
            node_status = node['status']
            self._report_node_conditions_metrics(node_status['conditions'], node_dimensions)
            if (node.get('spec') or {}).get('unschedulable'):
                continue
            self._report_node_resource_metrics(
                'capacity', node_status['capacity'], node_dimensions)
            self._report_node_resource_metrics(
                'allocatable', node_status['allocatable'], node_dimensions)

    def _report_deployment_metrics(self, dimensions, kubernetes_labels):
        """Report replica gauges for every deployment.

        A deployment that cannot be parsed is logged and skipped.

        :param dimensions: base dimensions copied for every deployment
        :param kubernetes_labels: label keys to copy into dimensions
        """
        try:
            deployments = self._send_request("/apis/extensions/v1beta1/deployments")
        except Exception as e:
            self.log.error("Error getting deployment data from Kubernetes API - {}".format(e))
            return
        for deployment in deployments['items']:
            try:
                deployment_dimensions = dimensions.copy()
                self._set_kubernetes_dimensions(
                    deployment_dimensions,
                    "deployment",
                    deployment['metadata'],
                    kubernetes_labels)
                deployment_status = deployment['status']
                # The API omits zero-valued counters, so default them to 0
                # instead of skipping exactly the deployments that are down.
                deployment_replicas = deployment_status.get('replicas', 0)
                deployment_updated_replicas = deployment_status.get('updatedReplicas', 0)
                deployment_available_replicas = deployment_status.get('availableReplicas', 0)
                # Unavailable = desired minus available (the reverse order
                # yields negative counts).
                deployment_unavailable_replicas = \
                    deployment_replicas - deployment_available_replicas
                self.gauge("kubernetes.deployment.replicas", deployment_replicas,
                           deployment_dimensions, hostname="SUPPRESS")
                self.gauge(
                    "kubernetes.deployment.available_replicas",
                    deployment_available_replicas,
                    deployment_dimensions,
                    hostname="SUPPRESS")
                self.gauge(
                    "kubernetes.deployment.unavailable_replicas",
                    deployment_unavailable_replicas,
                    deployment_dimensions,
                    hostname="SUPPRESS")
                self.gauge("kubernetes.deployment.updated_replicas", deployment_updated_replicas,
                           deployment_dimensions, hostname="SUPPRESS")
            except Exception as e:
                self.log.info(
                    "Error {} parsing deployment {}. Skipping".format(
                        e, deployment), exc_info=e)

    def _report_replication_controller_metrics(self, dimensions, kubernetes_labels):
        """Report replica gauges for replication controllers in all namespaces.

        Controllers without a non-zero 'replicas' status are skipped.

        :param dimensions: base dimensions copied for every controller
        :param kubernetes_labels: label keys to copy into dimensions
        """
        # Get namespaces first
        try:
            namespaces = self._send_request("/api/v1/namespaces")
        except Exception as e:
            self.log.error("Error getting namespaces from API - {}. "
                           "Skipping getting replication controller metrics".format(e))
            return
        for namespace in namespaces['items']:
            namespace_name = namespace['metadata']['name']
            try:
                replication_controllers = self._send_request(
                    "/api/v1/namespaces/{}/replicationcontrollers".format(namespace_name))
            except Exception as e:
                # Log the namespace name, not the whole namespace object.
                self.log.error("Error getting replication controllers for the namespace {} "
                               "with the error {}".format(namespace_name, e))
                continue
            if 'items' not in replication_controllers:
                continue
            for rc in replication_controllers['items']:
                rc_dimensions = dimensions.copy()
                self._set_kubernetes_dimensions(
                    rc_dimensions,
                    "replication_controller",
                    rc['metadata'],
                    kubernetes_labels)
                rc_status = rc['status']
                rc_replicas = rc_status.get('replicas')
                if not rc_replicas:
                    continue
                self.gauge("kubernetes.replication.controller.replicas", rc_replicas,
                           rc_dimensions, hostname="SUPPRESS")
                # 'readyReplicas' is omitted when no replica is ready.
                self.gauge(
                    "kubernetes.replication.controller.ready_replicas",
                    rc_status.get('readyReplicas', 0),
                    rc_dimensions,
                    hostname="SUPPRESS")
|