# (C) Copyright 2015,2016 Hewlett Packard Enterprise Development LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from __future__ import absolute_import

import os
import re

import docker

from monasca_agent.collector import checks

CONTAINER_ID_RE = re.compile('[0-9a-f]{64}')
DEFAULT_BASE_URL = "unix://var/run/docker.sock"
DEFAULT_VERSION = "auto"
DEFAULT_TIMEOUT = 3
DEFAULT_ADD_KUBERNETES_DIMENSIONS = False
# Number of clock ticks (jiffies) per second, used to convert the
# USER_HZ counters in /proc/stat into seconds.
JIFFY_HZ = os.sysconf(os.sysconf_names['SC_CLK_TCK'])

CGROUPS = ['cpuacct', 'memory', 'blkio']
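
# A minimal configuration sketch for this check (illustrative only; the
# file location and values are assumptions, not shipped defaults). The
# keys mirror what __init__ and check() read below:
#
#     # e.g. /etc/monasca/agent/conf.d/docker.yaml
#     init_config:
#         connection_timeout: 3
#         version: auto
#         docker_root: /
#     instances:
#         - url: unix://var/run/docker.sock
#           add_kubernetes_dimensions: true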


class Docker(checks.AgentCheck):
    """Collect container metrics from the Docker API and cgroups."""

    def __init__(self, name, init_config, agent_config, instances=None):
        checks.AgentCheck.__init__(self, name, init_config, agent_config, instances)
        if instances is not None and len(instances) > 1:
            raise Exception('Docker check only supports one configured instance.')
        self.connection_timeout = int(init_config.get('connection_timeout', DEFAULT_TIMEOUT))
        self.docker_version = init_config.get('version', DEFAULT_VERSION)
        self.docker_root = init_config.get('docker_root', '/')
        # Locate cgroups directories
        self._mount_points = {}
        self._cgroup_filename_pattern = None
        for cgroup in CGROUPS:
            self._mount_points[cgroup] = self._find_cgroup(cgroup)
        self._prev_cpu = {}
        self._curr_cpu = {}
        self._cpu_count = None
        self._prev_system_cpu = None

    def check(self, instance):
        docker_url = instance.get('url', DEFAULT_BASE_URL)
        try:
            docker_client = docker.Client(base_url=docker_url,
                                          version=self.docker_version,
                                          timeout=self.connection_timeout)
            running_containers = {container['Id']: container
                                  for container in self._get_containers(docker_client)}
        except Exception as e:
            self.log.error("Could not get containers from Docker API, "
                           "skipping Docker check - {}".format(e))
            return
        add_kubernetes_dimensions = instance.get('add_kubernetes_dimensions',
                                                 DEFAULT_ADD_KUBERNETES_DIMENSIONS)
        dimensions = self._set_dimensions(None, instance)
        self.gauge("container.running_count", len(running_containers), dimensions=dimensions)
        self._set_container_pids(running_containers)
        # Report container metrics from cgroups
        self._report_container_metrics(running_containers, add_kubernetes_dimensions,
                                       dimensions)

    def _report_rate_gauge_metric(self, metric_name, value, dimensions):
        # Report each counter twice: as a per-second rate with a "_sec"
        # suffix and as a raw gauge under the plain name.
        self.rate(metric_name + "_sec", value, dimensions=dimensions)
        self.gauge(metric_name, value, dimensions=dimensions)

    def _report_container_metrics(self, container_dict, add_kubernetes_dimensions, dimensions):
        self._curr_system_cpu, self._cpu_count = self._get_system_cpu_ns()
        system_memory = self._get_total_memory()
        for container in container_dict.itervalues():
            try:
                container_dimensions = dimensions.copy()
                container_id = container['Id']
                container_dimensions['name'] = self._get_container_name(container['Names'],
                                                                        container_id)
                container_dimensions['image'] = container['Image']
                container_labels = container['Labels']
                if add_kubernetes_dimensions:
                    if 'io.kubernetes.pod.name' in container_labels:
                        container_dimensions['kubernetes_pod_name'] = \
                            container_labels['io.kubernetes.pod.name']
                    if 'io.kubernetes.pod.namespace' in container_labels:
                        container_dimensions['kubernetes_namespace'] = \
                            container_labels['io.kubernetes.pod.namespace']
                self._report_cgroup_cpuacct(container_id, container_dimensions)
                self._report_cgroup_memory(container_id, container_dimensions, system_memory)
                self._report_cgroup_blkio(container_id, container_dimensions)
                if "_proc_root" in container:
                    self._report_net_metrics(container, container_dimensions)
                self._report_cgroup_cpu_pct(container_id, container_dimensions)
            except IOError as err:
                # It is possible that the container got stopped between the
                # API call and now
                self.log.info("IO error while collecting cgroup metrics, "
                              "skipping container...", exc_info=err)
            except Exception as err:
                self.log.error("Error collecting data for container: {}".format(err))
        self._prev_system_cpu = self._curr_system_cpu

    def _get_container_name(self, container_names, container_id):
        container_name = None
        if container_names:
            for name in container_names:
                # if there is more than one / the name is actually an alias
                if name.count('/') <= 1:
                    container_name = str(name).lstrip('/')
                    break
        return container_name if container_name else container_id
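
    # The _report_cgroup_* methods below parse per-container cgroup
    # pseudo-files. For reference, their contents look roughly like this
    # (values are illustrative):
    #
    #     cpuacct.stat:
    #         user 4052
    #         system 1284
    #
    #     memory.stat (excerpt; 'swap' appears only with swap accounting):
    #         cache 8192000
    #         rss 102400000
    #         swap 0
    #
    #     blkio.throttle.io_service_bytes:
    #         8:0 Read 428032
    #         8:0 Write 1048576
    #         Total 1476608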

    def _report_cgroup_cpuacct(self, container_id, container_dimensions):
        stat_file = self._get_cgroup_file('cpuacct', container_id, 'cpuacct.stat')
        stats = self._parse_cgroup_pairs(stat_file)
        self._report_rate_gauge_metric('container.cpu.user_time', stats['user'],
                                       container_dimensions)
        self._report_rate_gauge_metric('container.cpu.system_time', stats['system'],
                                       container_dimensions)

    def _report_cgroup_memory(self, container_id, container_dimensions, system_memory_limit):
        stat_file = self._get_cgroup_file('memory', container_id, 'memory.stat')
        stats = self._parse_cgroup_pairs(stat_file)
        cache_memory = stats['cache']
        rss_memory = stats['rss']
        self.gauge('container.mem.cache', cache_memory, dimensions=container_dimensions)
        self.gauge('container.mem.rss', rss_memory, dimensions=container_dimensions)
        swap_memory = 0
        if 'swap' in stats:
            swap_memory = stats['swap']
            self.gauge('container.mem.swap', swap_memory, dimensions=container_dimensions)
        # Get container max memory; an unconstrained container reports a
        # huge sentinel value here, so cap it at the host's total memory.
        memory_limit_file = self._get_cgroup_file('memory', container_id,
                                                  'memory.limit_in_bytes')
        memory_limit = self._parse_cgroup_value(memory_limit_file, convert=float)
        if memory_limit > system_memory_limit:
            memory_limit = float(system_memory_limit)
        used_perc = round((((cache_memory + rss_memory + swap_memory) / memory_limit) * 100), 2)
        self.gauge('container.mem.used_perc', used_perc, dimensions=container_dimensions)

    def _report_cgroup_blkio(self, container_id, container_dimensions):
        stat_file = self._get_cgroup_file('blkio', container_id,
                                          'blkio.throttle.io_service_bytes')
        stats = self._parse_cgroup_blkio_metrics(stat_file)
        self._report_rate_gauge_metric('container.io.read_bytes', stats['io_read'],
                                       container_dimensions)
        self._report_rate_gauge_metric('container.io.write_bytes', stats['io_write'],
                                       container_dimensions)

    def _report_cgroup_cpu_pct(self, container_id, container_dimensions):
        usage_file = self._get_cgroup_file('cpuacct', container_id, 'cpuacct.usage')
        prev_cpu = self._prev_cpu.get(container_id, None)
        curr_cpu = self._parse_cgroup_value(usage_file)
        self._prev_cpu[container_id] = curr_cpu
        if prev_cpu is None:
            # probably first run, we need 2 data points
            return
        system_cpu_delta = float(self._curr_system_cpu - self._prev_system_cpu)
        container_cpu_delta = float(curr_cpu - prev_cpu)
        if system_cpu_delta > 0 and container_cpu_delta > 0:
            cpu_pct = (container_cpu_delta / system_cpu_delta) * self._cpu_count * 100
            self.gauge('container.cpu.utilization_perc', cpu_pct,
                       dimensions=container_dimensions)
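
    # The utilization computed above mirrors `docker stats`: the container's
    # cpuacct.usage delta is divided by the host's total CPU-time delta
    # (both in ns), then scaled by the core count. A worked example with
    # made-up numbers: a container burning 2e8 ns while the 4-core host
    # accumulates 4e9 ns yields (2e8 / 4e9) * 4 * 100 = 20%.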

    def _report_net_metrics(self, container, container_dimensions):
        """Find container network metrics by looking at /proc/$PID/net/dev
        of the container process.
        """
        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')
        try:
            with open(proc_net_file, 'r') as f:
                lines = f.readlines()
                """The first two lines are headers:
                Inter-|   Receive                          |  Transmit
                 face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed
                """
                for line in lines[2:]:
                    cols = line.split(':', 1)
                    interface_name = str(cols[0]).strip()
                    if interface_name != 'lo':
                        container_network_dimensions = container_dimensions.copy()
                        container_network_dimensions['interface'] = interface_name
                        network_values = cols[1].split()
                        self._report_rate_gauge_metric("container.net.in_bytes",
                                                       long(network_values[0]),
                                                       container_network_dimensions)
                        self._report_rate_gauge_metric("container.net.out_bytes",
                                                       long(network_values[8]),
                                                       container_network_dimensions)
                        # only the first non-loopback interface is reported
                        break
        except Exception as e:
            self.log.error("Failed to report network metrics from file {0}. "
                           "Exception: {1}".format(proc_net_file, e))

    # Docker API

    def _get_containers(self, docker_client):
        """Gets the list of running containers in Docker."""
        return docker_client.containers()

    def _find_cgroup_filename_pattern(self, container_id):
        # We try with different cgroups so that it works even if only one is
        # properly working
        for mountpoint in self._mount_points.itervalues():
            stat_file_path_lxc = os.path.join(mountpoint, "lxc")
            stat_file_path_docker = os.path.join(mountpoint, "docker")
            stat_file_path_coreos = os.path.join(mountpoint, "system.slice")
            stat_file_path_kubernetes = os.path.join(mountpoint, container_id)
            stat_file_path_kubernetes_docker = os.path.join(mountpoint, "system", "docker",
                                                            container_id)
            stat_file_path_docker_daemon = os.path.join(mountpoint, "docker-daemon", "docker",
                                                        container_id)
            if os.path.exists(stat_file_path_lxc):
                return '%(mountpoint)s/lxc/%(id)s/%(file)s'
            elif os.path.exists(stat_file_path_docker):
                return '%(mountpoint)s/docker/%(id)s/%(file)s'
            elif os.path.exists(stat_file_path_coreos):
                return '%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s'
            elif os.path.exists(stat_file_path_kubernetes):
                return '%(mountpoint)s/%(id)s/%(file)s'
            elif os.path.exists(stat_file_path_kubernetes_docker):
                return '%(mountpoint)s/system/docker/%(id)s/%(file)s'
            elif os.path.exists(stat_file_path_docker_daemon):
                return '%(mountpoint)s/docker-daemon/docker/%(id)s/%(file)s'
        raise Exception("Cannot find Docker cgroup directory. "
                        "Be sure your system is supported.")

    def _get_cgroup_file(self, cgroup, container_id, filename):
        # This can't be initialized at startup because cgroups may not be
        # mounted yet
        if not self._cgroup_filename_pattern:
            self._cgroup_filename_pattern = self._find_cgroup_filename_pattern(container_id)
        return self._cgroup_filename_pattern % (dict(
            mountpoint=self._mount_points[cgroup],
            id=container_id,
            file=filename,
        ))
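
    # As an example (path is illustrative, not guaranteed): with the
    # 'docker' layout and the cpuacct hierarchy mounted at
    # /sys/fs/cgroup/cpuacct, _get_cgroup_file('cpuacct', <id>,
    # 'cpuacct.stat') resolves to
    #
    #     /sys/fs/cgroup/cpuacct/docker/<id>/cpuacct.stat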

    def _get_total_memory(self):
        # The leading slash is dropped from 'proc/meminfo' so that
        # os.path.join honors docker_root (an absolute second argument
        # would discard it).
        with open(os.path.join(self.docker_root, 'proc/meminfo')) as f:
            for line in f.readlines():
                tokens = line.split()
                if tokens[0] == 'MemTotal:':
                    # MemTotal is given in kB
                    return int(tokens[1]) * 1024
        raise Exception('Invalid formatting in /proc/meminfo: unable to '
                        'determine MemTotal')

    def _get_system_cpu_ns(self):
        # see also: getSystemCPUUsage of docker's stats_collector_unix.go
        total_jiffies = None
        cpu_count = 0
        with open(os.path.join(self.docker_root, 'proc/stat'), 'r') as f:
            for line in f.readlines():
                tokens = line.split()
                if tokens[0] == 'cpu':
                    if len(tokens) < 8:
                        raise Exception("Invalid formatting in /proc/stat")
                    total_jiffies = sum(map(lambda t: int(t), tokens[1:8]))
                elif tokens[0].startswith('cpu'):
                    # startswith but does not equal implies /cpu\d+/ or so
                    # we don't need full per-cpu usage to calculate %,
                    # so just count cores
                    cpu_count += 1
        if not total_jiffies:
            raise Exception("Unable to find CPU usage in /proc/stat")
        # use float division so sub-second precision is not truncated away
        cpu_time_ns = (total_jiffies / float(JIFFY_HZ)) * 1e9
        return cpu_time_ns, cpu_count

    def _find_cgroup(self, hierarchy):
        """Finds the mount point for a specified cgroup hierarchy.

        Works with old style and new style mounts.
        """
        with open(os.path.join(self.docker_root, 'proc/mounts'), 'r') as f:
            mounts = map(lambda x: x.split(), f.read().splitlines())
        cgroup_mounts = filter(lambda x: x[2] == "cgroup", mounts)
        if len(cgroup_mounts) == 0:
            raise Exception("Can't find mounted cgroups. If you run the Agent inside a"
                            " container, please refer to the documentation.")
        # Old cgroup style
        if len(cgroup_mounts) == 1:
            return os.path.join(self.docker_root, cgroup_mounts[0][1])

        candidate = None
        for _, mountpoint, _, opts, _, _ in cgroup_mounts:
            if hierarchy in opts:
                if mountpoint.startswith("/host/"):
                    return os.path.join(self.docker_root, mountpoint)
                candidate = mountpoint
        if candidate is not None:
            return os.path.join(self.docker_root, candidate)
        raise Exception("Can't find mounted %s cgroups." % hierarchy)

    def _parse_cgroup_value(self, stat_file, convert=int):
        """Parse a cgroup info file containing a single value."""
        with open(stat_file, 'r') as f:
            return convert(f.read().strip())

    def _parse_cgroup_pairs(self, stat_file, convert=int):
        """Parse a cgroup file for key/values."""
        with open(stat_file, 'r') as f:
            split_lines = map(lambda x: x.split(' ', 1), f.readlines())
            return {k: convert(v) for k, v in split_lines}

    def _parse_cgroup_blkio_metrics(self, stat_file):
        """Parse the blkio metrics."""
        with open(stat_file, 'r') as f:
            stats = f.read().splitlines()
            metrics = {
                'io_read': 0,
                'io_write': 0,
            }
            for line in stats:
                if 'Read' in line:
                    metrics['io_read'] += int(line.split()[2])
                if 'Write' in line:
                    metrics['io_write'] += int(line.split()[2])
            return metrics

    def _is_container_cgroup(self, line, selinux_policy):
        """Check whether a parsed /proc/<pid>/cgroup entry is a container's."""
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or \
                line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]:
            return True
        if 'docker' in selinux_policy:
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]):
            # kubernetes
            return True
        return False

    def _set_container_pids(self, containers):
        """Find all proc paths for running containers."""
        proc_path = os.path.join(self.docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]
        for pid_dir in pid_dirs:
            try:
                path = os.path.join(proc_path, pid_dir, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]

                selinux_policy = ''
                path = os.path.join(proc_path, pid_dir, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                self.log.debug("Cannot read %s, process likely raced to finish: %s"
                               % (path, str(e)))
                continue
            except Exception as e:
                self.log.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            try:
                cpuacct = None
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                matches = re.findall(CONTAINER_ID_RE, cpuacct) if cpuacct else None
                if matches:
                    container_id = matches[-1]
                    if container_id not in containers:
                        self.log.debug("Container %s not in container_dict, "
                                       "it's likely excluded", container_id)
                        continue
                    containers[container_id]['_pid'] = pid_dir
                    containers[container_id]['_proc_root'] = os.path.join(proc_path, pid_dir)
            except Exception as e:
                self.log.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue
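
# For reference, a /proc/<pid>/cgroup entry that _is_container_cgroup
# accepts looks roughly like this (ID shortened here; real container IDs
# are 64 hex characters, per CONTAINER_ID_RE):
#
#     4:cpuacct,cpu:/docker/3b5a28cd65e7...
#
# After the strip/split in _set_container_pids this becomes
# ['4', 'cpuacct,cpu', '/docker/3b5a28cd65e7...'], and the last
# CONTAINER_ID_RE match in the path is taken as the container ID.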