monasca-agent/monasca_agent/collector/checks_d/kubernetes_api.py

256 lines
12 KiB
Python

# (C) Copyright 2017 Hewlett Packard Enterprise Development LP
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import requests
from monasca_agent.collector import checks
from monasca_agent.collector.checks import utils
DEFAULT_TIMEOUT = 5
NODE_CONDITIONS_MAP = {
"OutOfDisk": {
"metric_name": "node.out_of_disk",
"expected_status": "False"
},
"MemoryPressure": {
"metric_name": "node.memory_pressure",
"expected_status": "False"
},
"DiskPressure": {
"metric_name": "node.disk_pressure",
"expected_status": "False"
},
"Ready": {
"metric_name": "node.ready_status",
"expected_status": "True"
}
}
class KubernetesAPI(checks.AgentCheck):
"""Queries Kubernetes API to get metrics about the Kubernetes deployment
"""
def __init__(self, name, init_config, agent_config, instances=None):
checks.AgentCheck.__init__(self, name, init_config, agent_config, instances)
if instances is not None and len(instances) > 1:
raise Exception('Kubernetes api check only supports one configured instance.')
self.connection_timeout = int(init_config.get('connection_timeout', DEFAULT_TIMEOUT))
self.kubernetes_connector = None
self.kubernetes_api = None
def prepare_run(self):
"""Set up Kubernetes connection information"""
instance = self.instances[0]
host = instance.get("host", None)
derive_api_url = instance.get("derive_api_url", None)
if not host:
if derive_api_url:
self.kubernetes_connector = utils.KubernetesConnector(self.connection_timeout)
else:
exception_message = "Either Kubernetes API url (host and port)" \
" or derive_api_url=True must be set" \
" when running Kubernetes API plugin."
self.log.error(exception_message)
raise Exception(exception_message)
else:
kubernetes_api_port = instance.get("kubernetes_api_port", "8080")
self.kubernetes_api = "http://{}:{}".format(host, kubernetes_api_port)
def check(self, instance):
kubernetes_labels = instance.get('kubernetes_labels', ["app"])
dimensions = self._set_dimensions(None, instance)
# Remove hostname from dimensions as the majority of the metrics are not
# tied to the hostname.
del dimensions['hostname']
kubernetes_api_health = self._get_api_health()
self.gauge("kubernetes.api.health_status", 0 if kubernetes_api_health else 1, dimensions,
hostname="SUPPRESS")
self._report_cluster_component_statuses(dimensions)
self._report_nodes_metrics(dimensions)
self._report_deployment_metrics(dimensions, kubernetes_labels)
self._report_replication_controller_metrics(dimensions, kubernetes_labels)
def _send_request(self, endpoint, as_json=True):
if self.kubernetes_connector:
return self.kubernetes_connector.get_request(endpoint, as_json=as_json)
else:
result = requests.get("{}/{}".format(self.kubernetes_api, endpoint))
return result.json() if as_json else result
def _get_api_health(self):
try:
result = self._send_request("healthz", as_json=False)
except Exception as e:
self.log.error("Error connecting to the health endpoint with exception {}".format(e))
return False
else:
# Return true if 'ok' is in result
return 'ok' in result.iter_lines()
def _report_cluster_component_statuses(self, dimensions):
try:
component_statuses = self._send_request("/api/v1/componentstatuses")
except Exception as e:
self.log.error("Error getting data from Kubernetes API - {}".format(e))
return
for component in component_statuses['items']:
component_dimensions = dimensions.copy()
component_dimensions['component_name'] = component['metadata']['name']
component_status = False
component_conditions = component['conditions']
for condition in component_conditions:
if 'type' in condition and condition['type'] == 'Healthy':
if condition['status']:
component_status = True
break
self.gauge(
"kubernetes.component_status",
0 if component_status else 1,
component_dimensions,
hostname="SUPPRESS")
def _set_kubernetes_dimensions(self, dimensions, type, metadata, kubernetes_labels):
dimensions['type'] = metadata['name']
dimensions['namespace'] = metadata['namespace']
if 'labels' in metadata:
labels = metadata['labels']
for label in kubernetes_labels:
if label in labels:
dimensions[label] = labels[label]
def _report_node_resource_metrics(self, resource, metrics, node_dimensions):
resource_metrics_dimensions = node_dimensions.copy()
for metric_name, metric_value in metrics.items():
if "gpu" in metric_name:
continue
if metric_name == "memory":
metric_name += "_bytes"
metric_value = utils.convert_memory_string_to_bytes(metric_value)
resource_metrics_dimensions.update({'unit': 'bytes'})
elif metric_name == "cpu":
resource_metrics_dimensions.update({'unit': 'cores'})
metric_name = "kubernetes.node.{}.{}".format(resource, metric_name)
self.gauge(metric_name, float(metric_value), resource_metrics_dimensions)
def _report_node_conditions_metrics(self, node_conditions, node_dimensions):
for condition in node_conditions:
condition_type = condition["type"]
if condition_type in NODE_CONDITIONS_MAP:
condition_map = NODE_CONDITIONS_MAP[condition_type]
condition_status = condition['status']
if condition_status == condition_map['expected_status']:
self.gauge("kubernetes." + condition_map['metric_name'], 0, node_dimensions)
else:
value_meta = {"reason": condition['message'][:1024]}
self.gauge(
"kubernetes." +
condition_map['metric_name'],
1,
node_dimensions,
value_meta=value_meta)
def _report_nodes_metrics(self, dimensions):
try:
nodes = self._send_request("/api/v1/nodes")
except Exception as e:
self.log.error("Error getting node data from Kubernetes API - {}".format(e))
return
for node in nodes['items']:
node_dimensions = dimensions.copy()
node_dimensions['hostname'] = node['metadata']['name']
node_status = node['status']
self._report_node_conditions_metrics(node_status['conditions'], node_dimensions)
if 'spec' in node and 'unschedulable' in node['spec']:
if node['spec']['unschedulable']:
continue
node_capacity = node_status['capacity']
node_allocatable = node_status['allocatable']
self._report_node_resource_metrics('capacity', node_capacity, node_dimensions)
self._report_node_resource_metrics('allocatable', node_allocatable, node_dimensions)
def _report_deployment_metrics(self, dimensions, kubernetes_labels):
try:
deployments = self._send_request("/apis/extensions/v1beta1/deployments")
except Exception as e:
self.log.error("Error getting deployment data from Kubernetes API - {}".format(e))
return
for deployment in deployments['items']:
try:
deployment_dimensions = dimensions.copy()
self._set_kubernetes_dimensions(
deployment_dimensions,
"deployment",
deployment['metadata'],
kubernetes_labels)
deployment_status = deployment['status']
deployment_replicas = deployment_status['replicas']
deployment_updated_replicas = deployment_status['updatedReplicas']
deployment_available_replicas = deployment_status['availableReplicas']
deployment_unavailable_replicas = \
deployment_available_replicas - deployment_replicas
self.gauge("kubernetes.deployment.replicas", deployment_replicas,
deployment_dimensions, hostname="SUPPRESS")
self.gauge(
"kubernetes.deployment.available_replicas",
deployment_available_replicas,
deployment_dimensions,
hostname="SUPPRESS")
self.gauge(
"kubernetes.deployment.unavailable_replicas",
deployment_unavailable_replicas,
deployment_dimensions,
hostname="SUPPRESS")
self.gauge("kubernetes.deployment.updated_replicas", deployment_updated_replicas,
deployment_dimensions, hostname="SUPPRESS")
except Exception as e:
self.log.info(
"Error {} parsing deployment {}. Skipping".format(
e, deployment), exc_info=e)
def _report_replication_controller_metrics(self, dimensions, kubernetes_labels):
# Get namespaces first
try:
namespaces = self._send_request("/api/v1/namespaces")
except Exception as e:
self.log.error("Error getting namespaces from API - {}. "
"Skipping getting replication controller metrics".format(e))
return
for namespace in namespaces['items']:
namespace_name = namespace['metadata']['name']
try:
replication_controllers = self._send_request(
"/api/v1/namespaces/{}/replicationcontrollers".format(namespace_name))
except Exception as e:
self.log.error("Error getting replication controllers for the namespace {} "
"with the error {}".format(namespace, e))
continue
if 'items' not in replication_controllers:
continue
for rc in replication_controllers['items']:
rc_dimensions = dimensions.copy()
self._set_kubernetes_dimensions(
rc_dimensions,
"replication_controller",
rc['metadata'],
kubernetes_labels)
rc_status = rc['status']
if 'replicas' not in rc_status or not rc_status['replicas']:
continue
self.gauge("kubernetes.replication.controller.replicas", rc_status['replicas'],
rc_dimensions, hostname="SUPPRESS")
self.gauge(
"kubernetes.replication.controller.ready_replicas",
rc_status['readyReplicas'],
rc_dimensions,
hostname="SUPPRESS")