monasca-agent/monasca_agent/collector/checks_d/docker.py

# (C) Copyright 2015,2016 Hewlett Packard Enterprise Development LP
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import

import os
import re

import docker

from monasca_agent.collector import checks

CONTAINER_ID_RE = re.compile('[0-9a-f]{64}')

DEFAULT_BASE_URL = "unix://var/run/docker.sock"
DEFAULT_VERSION = "auto"
DEFAULT_TIMEOUT = 3
DEFAULT_ADD_KUBERNETES_DIMENSIONS = False

# Number of clock ticks per second, used to convert jiffies to seconds
JIFFY_HZ = os.sysconf(os.sysconf_names['SC_CLK_TCK'])

# cgroup hierarchies the check reads metrics from
CGROUPS = ['cpuacct', 'memory', 'blkio']
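
# A typical plugin configuration (usually conf.d/docker.yaml; the key names
# match the lookups in __init__ and check() below, the values are
# illustrative):
#   init_config:
#     connection_timeout: 3
#     version: auto
#     docker_root: /
#   instances:
#     - url: "unix://var/run/docker.sock"
#       add_kubernetes_dimensions: true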


class Docker(checks.AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agent_config, instances=None):
        checks.AgentCheck.__init__(self, name, init_config, agent_config, instances)
        if instances is not None and len(instances) > 1:
            raise Exception('Docker check only supports one configured instance.')
        self.connection_timeout = int(init_config.get('connection_timeout', DEFAULT_TIMEOUT))
        self.docker_version = init_config.get('version', DEFAULT_VERSION)
        self.docker_root = init_config.get('docker_root', '/')

        # Locate cgroups directories
        self._mount_points = {}
        self._cgroup_filename_pattern = None
        for cgroup in CGROUPS:
            self._mount_points[cgroup] = self._find_cgroup(cgroup)

        # State kept between runs to compute CPU utilization deltas
        self._prev_cpu = {}
        self._curr_cpu = {}
        self._cpu_count = None
        self._prev_system_cpu = None

    def check(self, instance):
        docker_url = instance.get('url', DEFAULT_BASE_URL)
        try:
            docker_client = docker.Client(base_url=docker_url, version=self.docker_version,
                                          timeout=self.connection_timeout)
            running_containers = {
                container['Id']: container for container in self._get_containers(docker_client)}
        except Exception as e:
            self.log.error(
                "Could not get containers from Docker API, skipping Docker check - {}".format(e))
            return
        add_kubernetes_dimensions = instance.get(
            'add_kubernetes_dimensions',
            DEFAULT_ADD_KUBERNETES_DIMENSIONS)
        dimensions = self._set_dimensions(None, instance)
        self.gauge("container.running_count", len(running_containers), dimensions=dimensions)
        self._set_container_pids(running_containers)
        # Report container metrics from cgroups
        self._report_container_metrics(running_containers, add_kubernetes_dimensions, dimensions)
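
    # Counters are reported twice: once as a plain gauge under the given name
    # and once as a per-second rate under "<name>_sec" (e.g.
    # container.io.read_bytes and container.io.read_bytes_sec).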
    def _report_rate_gauge_metric(self, metric_name, value, dimensions):
        self.rate(metric_name + "_sec", value, dimensions=dimensions)
        self.gauge(metric_name, value, dimensions=dimensions)

    def _report_container_metrics(self, container_dict, add_kubernetes_dimensions, dimensions):
        self._curr_system_cpu, self._cpu_count = self._get_system_cpu_ns()
        system_memory = self._get_total_memory()
        for container in container_dict.itervalues():
            try:
                container_dimensions = dimensions.copy()
                container_id = container['Id']
                container_dimensions['name'] = self._get_container_name(
                    container['Names'], container_id)
                container_dimensions['image'] = container['Image']
                container_labels = container['Labels']
                if add_kubernetes_dimensions:
                    if 'io.kubernetes.pod.name' in container_labels:
                        container_dimensions['kubernetes_pod_name'] = \
                            container_labels['io.kubernetes.pod.name']
                    if 'io.kubernetes.pod.namespace' in container_labels:
                        container_dimensions['kubernetes_namespace'] = \
                            container_labels['io.kubernetes.pod.namespace']
                self._report_cgroup_cpuacct(container_id, container_dimensions)
                self._report_cgroup_memory(container_id, container_dimensions, system_memory)
                self._report_cgroup_blkio(container_id, container_dimensions)
                if "_proc_root" in container:
                    self._report_net_metrics(container, container_dimensions)
                self._report_cgroup_cpu_pct(container_id, container_dimensions)
            except IOError as err:
                # It is possible that the container got stopped between the
                # API call and now
                self.log.info("IO error while collecting cgroup metrics, "
                              "skipping container...", exc_info=err)
            except Exception as err:
                self.log.error("Error while collecting container data: {}".format(err))
        self._prev_system_cpu = self._curr_system_cpu
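
    # The Docker API returns names like '/consul' for the container itself and
    # '/other-container/alias' for linked aliases; only entries with a single
    # leading '/' are the container's real name.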
    def _get_container_name(self, container_names, container_id):
        container_name = None
        if container_names:
            for name in container_names:
                # if there is more than one '/' the name is actually an alias
                if name.count('/') <= 1:
                    container_name = str(name).lstrip('/')
                    break
        return container_name if container_name else container_id
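
    # cpuacct.stat holds two "key value" pairs, in USER_HZ ticks, e.g.:
    #   user 46409
    #   system 22162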
    def _report_cgroup_cpuacct(self, container_id, container_dimensions):
        stat_file = self._get_cgroup_file('cpuacct', container_id, 'cpuacct.stat')
        stats = self._parse_cgroup_pairs(stat_file)
        self._report_rate_gauge_metric(
            'container.cpu.user_time',
            stats['user'],
            container_dimensions)
        self._report_rate_gauge_metric(
            'container.cpu.system_time',
            stats['system'],
            container_dimensions)
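
    # memory.limit_in_bytes is effectively unbounded (a very large number,
    # e.g. 9223372036854771712) when the container has no limit, so it is
    # clamped to the host's MemTotal before computing:
    #   used_perc = (cache + rss + swap) / limit * 100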
    def _report_cgroup_memory(self, container_id, container_dimensions, system_memory_limit):
        stat_file = self._get_cgroup_file('memory', container_id, 'memory.stat')
        stats = self._parse_cgroup_pairs(stat_file)
        cache_memory = stats['cache']
        rss_memory = stats['rss']
        self.gauge('container.mem.cache', cache_memory, dimensions=container_dimensions)
        self.gauge('container.mem.rss', rss_memory, dimensions=container_dimensions)
        swap_memory = 0
        if 'swap' in stats:
            swap_memory = stats['swap']
            self.gauge('container.mem.swap', swap_memory, dimensions=container_dimensions)
        # Get container max memory
        memory_limit_file = self._get_cgroup_file('memory', container_id, 'memory.limit_in_bytes')
        memory_limit = self._parse_cgroup_value(memory_limit_file, convert=float)
        if memory_limit > system_memory_limit:
            memory_limit = float(system_memory_limit)
        used_perc = round((((cache_memory + rss_memory + swap_memory) / memory_limit) * 100), 2)
        self.gauge('container.mem.used_perc', used_perc, dimensions=container_dimensions)
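
    # blkio.throttle.io_service_bytes lists per-device counters plus a Total
    # line, e.g.:
    #   8:0 Read 4096
    #   8:0 Write 12288
    #   8:0 Sync 0
    #   Total 16384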
    def _report_cgroup_blkio(self, container_id, container_dimensions):
        stat_file = self._get_cgroup_file('blkio', container_id,
                                          'blkio.throttle.io_service_bytes')
        stats = self._parse_cgroup_blkio_metrics(stat_file)
        self._report_rate_gauge_metric(
            'container.io.read_bytes',
            stats['io_read'],
            container_dimensions)
        self._report_rate_gauge_metric(
            'container.io.write_bytes',
            stats['io_write'],
            container_dimensions)
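
    # Utilization is derived the same way `docker stats` does it:
    #   cpu_pct = (delta container ns / delta system ns) * num_cpus * 100
    # where both deltas cover the same interval between two check runs.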
    def _report_cgroup_cpu_pct(self, container_id, container_dimensions):
        usage_file = self._get_cgroup_file('cpuacct', container_id, 'cpuacct.usage')
        prev_cpu = self._prev_cpu.get(container_id, None)
        curr_cpu = self._parse_cgroup_value(usage_file)
        self._prev_cpu[container_id] = curr_cpu
        if prev_cpu is None:
            # probably first run, we need 2 data points
            return
        system_cpu_delta = float(self._curr_system_cpu - self._prev_system_cpu)
        container_cpu_delta = float(curr_cpu - prev_cpu)
        if system_cpu_delta > 0 and container_cpu_delta > 0:
            cpu_pct = (container_cpu_delta / system_cpu_delta) * self._cpu_count * 100
            self.gauge('container.cpu.utilization_perc', cpu_pct, dimensions=container_dimensions)
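
    # A data line of /proc/net/dev looks like (receive fields first, then
    # transmit; bytes received is field 0, bytes transmitted is field 8):
    #   eth0: 150392 1285 0 0 0 0 0 0 332438 1532 0 0 0 0 0 0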
    def _report_net_metrics(self, container, container_dimensions):
        """Find container network metrics by looking at /proc/$PID/net/dev
        of the container process.
        """
        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')
        try:
            with open(proc_net_file, 'r') as f:
                lines = f.readlines()
                # The first two lines are headers:
                # Inter-|   Receive                          |  Transmit
                #  face |bytes packets errs drop fifo frame  |bytes packets errs
                #       |compressed multicast                |drop fifo colls
                #       |                                    |carrier compressed
                for line in lines[2:]:
                    cols = line.split(':', 1)
                    interface_name = str(cols[0]).strip()
                    if interface_name != 'lo':
                        container_network_dimensions = container_dimensions.copy()
                        container_network_dimensions['interface'] = interface_name
                        network_values = cols[1].split()
                        self._report_rate_gauge_metric(
                            "container.net.in_bytes",
                            long(network_values[0]),
                            container_network_dimensions)
                        self._report_rate_gauge_metric(
                            "container.net.out_bytes",
                            long(network_values[8]),
                            container_network_dimensions)
                        break
        except Exception as e:
            self.log.error(
                "Failed to report network metrics from file {0}. Exception: {1}".format(
                    proc_net_file, e))
    # Docker API

    def _get_containers(self, docker_client):
        """Gets the list of running containers in Docker."""
        return docker_client.containers()

    def _find_cgroup_filename_pattern(self, container_id):
        # We try different cgroup layouts so that this works even if only one
        # of them is properly set up
        for mountpoint in self._mount_points.itervalues():
            stat_file_path_lxc = os.path.join(mountpoint, "lxc")
            stat_file_path_docker = os.path.join(mountpoint, "docker")
            stat_file_path_coreos = os.path.join(mountpoint, "system.slice")
            stat_file_path_kubernetes = os.path.join(mountpoint, container_id)
            stat_file_path_kubernetes_docker = os.path.join(
                mountpoint, "system", "docker", container_id)
            stat_file_path_docker_daemon = os.path.join(
                mountpoint, "docker-daemon", "docker", container_id)
            if os.path.exists(stat_file_path_lxc):
                return '%(mountpoint)s/lxc/%(id)s/%(file)s'
            elif os.path.exists(stat_file_path_docker):
                return '%(mountpoint)s/docker/%(id)s/%(file)s'
            elif os.path.exists(stat_file_path_coreos):
                return '%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s'
            elif os.path.exists(stat_file_path_kubernetes):
                return '%(mountpoint)s/%(id)s/%(file)s'
            elif os.path.exists(stat_file_path_kubernetes_docker):
                return '%(mountpoint)s/system/docker/%(id)s/%(file)s'
            elif os.path.exists(stat_file_path_docker_daemon):
                return '%(mountpoint)s/docker-daemon/docker/%(id)s/%(file)s'
        raise Exception("Cannot find Docker cgroup directory. Be sure your system is supported.")
    def _get_cgroup_file(self, cgroup, container_id, filename):
        # This can't be initialized at startup because cgroups may not be
        # mounted yet
        if not self._cgroup_filename_pattern:
            self._cgroup_filename_pattern = self._find_cgroup_filename_pattern(container_id)
        return self._cgroup_filename_pattern % (dict(
            mountpoint=self._mount_points[cgroup],
            id=container_id,
            file=filename,
        ))

    def _get_total_memory(self):
        # Note: 'proc/meminfo' is deliberately relative so a non-default
        # docker_root is honored; os.path.join drops all preceding components
        # when given an absolute path.
        with open(os.path.join(self.docker_root, 'proc/meminfo')) as f:
            for line in f.readlines():
                tokens = line.split()
                if tokens[0] == 'MemTotal:':
                    # value is reported in kB
                    return int(tokens[1]) * 1024
        raise Exception('Invalid formatting in /proc/meminfo: unable to '
                        'determine MemTotal')
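
    # The aggregate line of /proc/stat supplies the system-wide jiffy counters
    # (user nice system idle iowait irq softirq), e.g.:
    #   cpu  79242 0 74306 842486413 756859 6140 67701
    # total time in ns = sum(jiffies) / SC_CLK_TCK * 1e9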
    def _get_system_cpu_ns(self):
        # see also: getSystemCPUUsage of docker's stats_collector_unix.go
        total_jiffies = None
        cpu_count = 0
        with open(os.path.join(self.docker_root, 'proc/stat'), 'r') as f:
            for line in f.readlines():
                tokens = line.split()
                if tokens[0] == 'cpu':
                    if len(tokens) < 8:
                        raise Exception("Invalid formatting in /proc/stat")
                    total_jiffies = sum(map(lambda t: int(t), tokens[1:8]))
                elif tokens[0].startswith('cpu'):
                    # startswith but does not equal implies /cpu\d+/ or so;
                    # we don't need full per-cpu usage to calculate %,
                    # so just count cores
                    cpu_count += 1
        if not total_jiffies:
            raise Exception("Unable to find CPU usage in /proc/stat")
        # use float division so the conversion to nanoseconds doesn't truncate
        cpu_time_ns = (float(total_jiffies) / JIFFY_HZ) * 1e9
        return cpu_time_ns, cpu_count
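
    # cgroup mounts appear in /proc/mounts one line per hierarchy, e.g.:
    #   cgroup /sys/fs/cgroup/memory cgroup rw,nosuid,nodev,relatime,memory 0 0
    # The options field (fourth column) names the hierarchy.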
    def _find_cgroup(self, hierarchy):
        """Find the mount point for a specified cgroup hierarchy.

        Works with old style and new style mounts.
        """
        with open(os.path.join(self.docker_root, 'proc/mounts'), 'r') as f:
            mounts = map(lambda x: x.split(), f.read().splitlines())
        cgroup_mounts = filter(lambda x: x[2] == "cgroup", mounts)
        if len(cgroup_mounts) == 0:
            raise Exception("Can't find mounted cgroups. If you run the Agent inside a container,"
                            " please refer to the documentation.")
        # Old cgroup style: a single mount containing every hierarchy
        if len(cgroup_mounts) == 1:
            return os.path.join(self.docker_root, cgroup_mounts[0][1])
        candidate = None
        for _, mountpoint, _, opts, _, _ in cgroup_mounts:
            if hierarchy in opts:
                if mountpoint.startswith("/host/"):
                    return os.path.join(self.docker_root, mountpoint)
                candidate = mountpoint
        if candidate is not None:
            return os.path.join(self.docker_root, candidate)
        raise Exception("Can't find mounted %s cgroups." % hierarchy)

    def _parse_cgroup_value(self, stat_file, convert=int):
        """Parse a cgroup info file containing a single value."""
        with open(stat_file, 'r') as f:
            return convert(f.read().strip())
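
    # Turns "key value" lines such as memory.stat's
    #   cache 4096
    #   rss 1576960
    # into {'cache': 4096, 'rss': 1576960}.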
    def _parse_cgroup_pairs(self, stat_file, convert=int):
        """Parse a cgroup file for key/values."""
        with open(stat_file, 'r') as f:
            split_lines = map(lambda x: x.split(' ', 1), f.readlines())
            return {k: convert(v) for k, v in split_lines}

    def _parse_cgroup_blkio_metrics(self, stat_file):
        """Parse the blkio metrics."""
        with open(stat_file, 'r') as f:
            stats = f.read().splitlines()
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics
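
    # Each line of /proc/<pid>/cgroup is 'id:subsystems:path' and arrives here
    # pre-split on ':', e.g.:
    #   4:cpu,cpuacct:/docker/3f4e... -> ['4', 'cpu,cpuacct', '/docker/3f4e...']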
    # check whether a /proc/<pid>/cgroup entry belongs to a container
    def _is_container_cgroup(self, line, selinux_policy):
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]:
            return True
        if 'docker' in selinux_policy:
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]):  # kubernetes
            return True
        return False

    def _set_container_pids(self, containers):
        """Find all proc paths for running containers."""
        proc_path = os.path.join(self.docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]
        for pid_dir in pid_dirs:
            try:
                path = os.path.join(proc_path, pid_dir, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]
                selinux_policy = ''
                path = os.path.join(proc_path, pid_dir, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                self.log.debug("Cannot read %s, "
                               "process likely raced to finish: %s" %
                               (path, str(e)))
                continue
            except Exception as e:
                self.log.warning("Cannot read %s: %s" % (path, str(e)))
                continue
            try:
                cpuacct = None
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                matches = re.findall(CONTAINER_ID_RE, cpuacct) if cpuacct else None
                if matches:
                    container_id = matches[-1]
                    if container_id not in containers:
                        self.log.debug(
                            "Container %s not in container_dict, it's likely excluded",
                            container_id)
                        continue
                    containers[container_id]['_pid'] = pid_dir
                    containers[container_id]['_proc_root'] = os.path.join(proc_path, pid_dir)
            except Exception as e:
                self.log.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue