From 44e6cadd124dfb116825a00825ad68337d7ed80c Mon Sep 17 00:00:00 2001
From: bandorf
Date: Mon, 18 Dec 2017 11:18:08 +0100
Subject: [PATCH] Add new metrics for Cadvisor plugin

improvements for monasca self-monitoring
add metrics:
- number of cores
- memory used (percentage)
- file system used (percentage)

story: 2001407
task: 6099

Change-Id: I11dd367543b6c17b9935aa4826345dd5df721445
---
Review notes (text in this section is ignored when the patch is applied):
- mem.used_perc is now sent as ["gauge"] / ["percent"]. It previously reused
  metric_types / metric_units left over from the last iteration of the
  memory_metrics loop, so the percentage inherited an unrelated type/unit.
- cadvisor_machine_url is now cached only after the machine request
  succeeded, and the request honours connection_timeout. Before, a single
  failed request disabled mem.used_perc and cpu.num_cores until restart.
- The number of added lines is unchanged, so hunk headers and the diffstat
  are still accurate; the post-image hash in the index line was not
  recomputed.

 docs/Plugins.md                               |  3 +
 .../collector/checks_d/cadvisor_host.py       | 65 +++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/docs/Plugins.md b/docs/Plugins.md
index 5b4fe416..dc945226 100644
--- a/docs/Plugins.md
+++ b/docs/Plugins.md
@@ -572,6 +572,7 @@ The cAdvisor host check returns the following metrics:
 
 | Metric Name | Dimensions | Semantics |
 | ----------- | ---------- | --------- |
+| cpu.num_cores | hostname, unit | Number of cores of host
 | cpu.system_time | hostname, unit | Cumulative system CPU time consumed in core seconds
 | cpu.system_time_sec | hostname, unit | Rate of system CPU time consumed in core seconds per second
 | cpu.total_time | hostname, unit | Cumulative CPU time consumed in core seconds
@@ -580,6 +581,7 @@ The cAdvisor host check returns the following metrics:
 | cpu.user_time_sec | hostname, unit | Rate of user CPU time consumed in core seconds per second
 | fs.total_bytes | hostname, device, unit | Number of bytes available
 | fs.usage_bytes | hostname, device, unit | Number of bytes consumed
+| fs.usage_perc | hostname, device, unit | Usage of fs as percentage
 | io.read_bytes | hostname, unit | Total number of bytes read by all devices
 | io.read_bytes_sec | hostname, unit | Total number of bytes read by all devices per second
 | io.write_bytes | hostname, unit | Total number of bytes written by all devices
@@ -587,6 +589,7 @@ The cAdvisor host check returns the following metrics:
 | mem.cache_bytes | hostname, unit | Number of bytes of page cache memory
 | mem.swap_bytes | hostname, unit | Swap usage in memory in bytes
 | mem.used_bytes | hostname, unit | Current memory in use in bytes
+| mem.used_perc | hostname, unit | Current memory usage as percentage
 | mem.working_set_bytes | hostname, unit | Current working set of memory in bytes (total minus cache)
 | net.in_bytes | hostname, interface, unit | Total network bytes received by all interfaces
 | net.in_bytes_sec | hostname, interface, unit | Total number of network bytes received by all interfaces per second
diff --git a/monasca_agent/collector/checks_d/cadvisor_host.py b/monasca_agent/collector/checks_d/cadvisor_host.py
index db21df42..daf58270 100644
--- a/monasca_agent/collector/checks_d/cadvisor_host.py
+++ b/monasca_agent/collector/checks_d/cadvisor_host.py
@@ -1,6 +1,9 @@
 # (C) Copyright 2017 Hewlett Packard Enterprise Development LP
 import requests
 
+from urlparse import urlparse
+from urlparse import urlunparse
+
 from monasca_agent.collector.checks import AgentCheck
 from monasca_agent.collector.checks import utils
 from monasca_agent.common.util import rollup_dictionaries
@@ -66,6 +69,26 @@ class CadvisorHost(AgentCheck):
         self.connection_timeout = int(init_config.get('connection_timeout',
                                                       DEFAULT_TIMEOUT))
         self.cadvisor_url = None
+        self.cadvisor_machine_url = None
+        self.total_mem = 0
+        self.num_cores = 0
+
+    def _parse_machine_info(self, machine_info):
+        topo_info = machine_info['topology']
+        # Grab first topology entry only (NOTE(review): may under-report memory on multi-node hosts - confirm)
+        topo_info = topo_info[0]
+        # Store info about total machine memory
+        if topo_info['memory']:
+            self.total_mem = topo_info['memory']
+            self.log.debug("host memory = {}".format(self.total_mem))
+        else:
+            self.log.warn("Failed to retrieve host memory size")
+        # Store information about number of cores (incl. threads)
+        if machine_info['num_cores']:
+            self.num_cores = int(machine_info['num_cores'])
+            self.log.debug("number of cores of machine: {}".format(self.num_cores))
+        else:
+            self.log.warn("Failed to retrieve number of cores of host")
 
     def check(self, instance):
         if not self.cadvisor_url:
@@ -89,6 +112,17 @@ class CadvisorHost(AgentCheck):
         except Exception as e:
             self.log.error("Error communicating with cAdvisor to collect data - {}".format(e))
         else:
+            # Retrieve machine info only once (a failed request is retried on the next run)
+            if not self.cadvisor_machine_url:
+                # Replace path in current cadvisor_url
+                machine_url = urlunparse(urlparse(self.cadvisor_url)._replace(path="api/v2.0/machine"))
+                try:
+                    machine_info = requests.get(machine_url, timeout=self.connection_timeout).json()
+                except Exception as ex:
+                    self.log.error("Error communicating with cAdvisor to collect machine data - {}".format(ex))
+                else:
+                    self.cadvisor_machine_url = machine_url
+                    self._parse_machine_info(machine_info)
             self._parse_send_metrics(host_metrics, dimensions)
 
     def _send_metrics(self, metric_name, value, dimensions, metric_types,
@@ -103,24 +137,52 @@ class CadvisorHost(AgentCheck):
 
     def _parse_memory(self, memory_data, dimensions):
         memory_metrics = METRICS['memory_metrics']
+        used_mem = -1
         for cadvisor_key, (metric_name, metric_types, metric_units) in memory_metrics.items():
             if cadvisor_key in memory_data:
                 self._send_metrics("mem." + metric_name,
                                    memory_data[cadvisor_key],
                                    dimensions,
                                    metric_types, metric_units)
+                if cadvisor_key == "usage":
+                    used_mem = int(memory_data[cadvisor_key])
+        # Calculate memory used percent
+        if used_mem < 0:
+            self.log.warn("no value for used memory, memory usage (percent) couldn't be calculated")
+        elif self.total_mem > 0:
+            used_mem_perc = (float(used_mem) / float(self.total_mem)) * 100
+            # Send metric percent used (explicit type/unit, not the loop's leftovers)
+            self._send_metrics("mem.used_perc",
+                               used_mem_perc,
+                               dimensions,
+                               ["gauge"], ["percent"])
 
     def _parse_filesystem(self, filesystem_data, dimensions):
         filesystem_metrics = METRICS['filesystem_metrics']
         for filesystem in filesystem_data:
             file_dimensions = dimensions.copy()
             file_dimensions['device'] = filesystem['device']
+            usage_fs = -1
+            capacity_fs = 0
             for cadvisor_key, (metric_name, metric_types, metric_units) in filesystem_metrics.items():
                 if cadvisor_key in filesystem:
                     self._send_metrics("fs." + metric_name,
                                        filesystem[cadvisor_key],
                                        file_dimensions,
                                        metric_types, metric_units)
+                    if cadvisor_key == "usage":
+                        usage_fs = int(filesystem[cadvisor_key])
+                    elif cadvisor_key == "capacity":
+                        capacity_fs = int(filesystem[cadvisor_key])
+            if usage_fs < 0:
+                self.log.warn("no value for usage size of {}, file system usage (percent) couldn't be calculated".format(filesystem['device']))
+            elif capacity_fs > 0:
+                self._send_metrics("fs.usage_perc",
+                                   (float(usage_fs) / capacity_fs) * 100,
+                                   file_dimensions,
+                                   ["gauge"], ["percent"])
+            else:
+                self.log.warn("no value for capacity of {}, file system usage (percent) couldn't be calculated".format(filesystem['device']))
 
     def _parse_network(self, network_data, dimensions):
         network_interfaces = network_data['interfaces']
@@ -160,6 +222,9 @@ class CadvisorHost(AgentCheck):
                 # Convert nanoseconds to seconds
                 cpu_usage_sec = cpu_usage[cadvisor_key] / 1000000000.0
                 self._send_metrics("cpu." + metric_name, cpu_usage_sec, dimensions, metric_types, metric_units)
+        # Provide metrics for number of cores if given
+        if self.num_cores > 0:
+            self._send_metrics("cpu.num_cores", self.num_cores, dimensions, ["gauge"], ["number_of_cores"])
 
     def _parse_send_metrics(self, metrics, dimensions):
         for host, cadvisor_metrics in metrics.items():