#!/usr/bin/env python
# (c) Copyright 2014-2016 Hewlett Packard Enterprise Development LP
# Copyright 2017 Fujitsu LIMITED
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Monasca Agent interface for libvirt metrics"""
import json
import libvirt
import math
import monasca_agent.collector.checks.utils as utils
import os
import re
import stat
import subprocess
import time
from calendar import timegm
from copy import deepcopy
from datetime import datetime
from datetime import timedelta
from multiprocessing.dummy import Pool
from netaddr import all_matching_cidrs
from neutronclient.v2_0 import client as neutron_client
from novaclient import client as n_client
from novaclient.exceptions import NotFound
from monasca_agent.collector.checks import AgentCheck
from monasca_agent.collector.virt import inspector
from monasca_agent.common import keystone
from monasca_agent import version as ma_version

DOM_STATES = {libvirt.VIR_DOMAIN_BLOCKED: 'VM is blocked',
              libvirt.VIR_DOMAIN_CRASHED: 'VM has crashed',
              libvirt.VIR_DOMAIN_NONE: 'VM has no state',
              libvirt.VIR_DOMAIN_PAUSED: 'VM is paused',
              libvirt.VIR_DOMAIN_PMSUSPENDED: 'VM is in power management (s3) suspend',
              libvirt.VIR_DOMAIN_SHUTDOWN: 'VM is shutting down',
              libvirt.VIR_DOMAIN_SHUTOFF: 'VM has been shut off (other reason)'}

DOM_ALIVE_NAMES = {libvirt.VIR_DOMAIN_BLOCKED: 'blocked',
                   libvirt.VIR_DOMAIN_CRASHED: 'crashed',
                   libvirt.VIR_DOMAIN_NONE: 'nostate',
                   libvirt.VIR_DOMAIN_PAUSED: 'paused',
                   libvirt.VIR_DOMAIN_PMSUSPENDED: 'suspended',
                   libvirt.VIR_DOMAIN_RUNNING: 'running',
                   libvirt.VIR_DOMAIN_SHUTDOWN: 'shuttingdown',
                   libvirt.VIR_DOMAIN_SHUTOFF: 'shutoff'}  # shut off/nova suspend

DOM_SHUTOFF_STATES = {libvirt.VIR_DOMAIN_SHUTOFF_UNKNOWN: 'VM has been shutoff (reason unknown)',
                      libvirt.VIR_DOMAIN_SHUTOFF_SHUTDOWN: 'VM has been shut down',
                      libvirt.VIR_DOMAIN_SHUTOFF_DESTROYED: 'VM has been destroyed (forced off)',
                      libvirt.VIR_DOMAIN_SHUTOFF_CRASHED: 'VM has crashed',
                      libvirt.VIR_DOMAIN_SHUTOFF_MIGRATED: 'VM has been migrated',
                      libvirt.VIR_DOMAIN_SHUTOFF_SAVED: 'VM has been suspended',
                      libvirt.VIR_DOMAIN_SHUTOFF_FAILED: 'VM has failed to start',
                      libvirt.VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT: 'VM has been restored from powered off snapshot'}


class LibvirtCheck(AgentCheck):
    """Inherit Agent class and gather libvirt metrics"""

    def __init__(self, name, init_config, agent_config, instances=None):
        AgentCheck.__init__(self, name, init_config, agent_config, instances=[{}])
        self.instance_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                    'libvirt_instances.json')
        self.metric_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                  'libvirt_metrics.json')
        self.use_bits = self.init_config.get('network_use_bits')

        self._collect_intervals = {}
        self._host_aggregate = None
        self._nova_host = None

        self._set_collection_intervals('disk', 'disk_collection_period')
        self._set_collection_intervals('vnic', 'vnic_collection_period')

        pool_size = self.init_config.get('max_ping_concurrency', 8)
        self.pool = Pool(pool_size)

    def _set_collection_intervals(self, interval_name, config_name):
        self._collect_intervals[interval_name] = {
            'period': int(self.init_config.get(config_name, 0)),
            'last_collect': datetime.fromordinal(1),
            'skip': False}

    def _test_vm_probation(self, created):
        """Test to see if a VM was created within the probation period.

        Convert an ISO-8601 creation timestamp into a UNIX epoch timestamp,
        compare the elapsed time since creation against the configured
        vm_probation, and return the number of seconds this VM will remain
        in probation.
        """
        dt = datetime.strptime(created, '%Y-%m-%dT%H:%M:%SZ')
        created_sec = (time.time() - timegm(dt.timetuple()))
        probation_time = self.init_config.get('vm_probation', 300) - created_sec
        return int(probation_time)

    def _get_metric_name(self, orig_name):
        # Rename "tx" to "out" and "rx" to "in"
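        # e.g. "net.rx_bytes" becomes "net.in_bytes" (or "net.in_bits"
        # when network_use_bits is configured)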
        metric_name = orig_name.replace("tx", "out").replace("rx", "in")
        if self.use_bits:
            metric_name = metric_name.replace("bytes", "bits")
        return metric_name

    @staticmethod
    def _get_metric_rate_name(metric_name):
        """Change the metric name to a rate, i.e. "net.rx_bytes"
        gets converted to "net.rx_bytes_sec"
        """
        return "{0}_sec".format(metric_name)

    @staticmethod
    def _validate_secgroup(cache, instance, source_ip):
        """Search through an instance's security groups for pingability
        """
        for instance_secgroup in instance.security_groups:
            for secgroup in cache:
                if (secgroup['tenant_id'] == instance.tenant_id and
                        secgroup['name'] == instance_secgroup['name']):
                    for rule in secgroup['security_group_rules']:
                        if rule['protocol'] == 'icmp':
                            if (not rule['remote_ip_prefix'] or
                                    all_matching_cidrs(source_ip,
                                                       [rule['remote_ip_prefix']])):
                                return True

    def _get_nova_host(self, nova_client):
        if not self._nova_host:
            # Find `nova-compute` on the current node
            services = nova_client.services.list(host=self.hostname,
                                                 binary='nova-compute')
            if not services:
                # Catch the case when `nova-compute` is registered with
                # an unqualified hostname
                services = nova_client.services.list(
                    host=self.hostname.split('.')[0], binary='nova-compute')
            if services:
                self._nova_host = services[0].host
                self.log.info("Found 'nova-compute' registered with host: {}"
                              .format(self._nova_host))
        if self._nova_host:
            return self._nova_host
        else:
            self.log.warn("No 'nova-compute' service found on host: {}"
                          .format(self.hostname))
            # Return hostname as a fallback value
            return self.hostname

    def _update_instance_cache(self):
        """Collect instance_id, project_id, and AZ for all instance UUIDs
        """
        id_cache = {}
        flavor_cache = {}
        port_cache = None
        netns = None
        # Get a list of all instances from the Nova API
        session = keystone.get_session(**self.init_config)
        nova_client = n_client.Client(
            "2.1", session=session,
            endpoint_type=self.init_config.get("endpoint_type", "publicURL"),
            service_type="compute",
            region_name=self.init_config.get('region_name'),
            client_name='monasca-agent[libvirt]',
            client_version=ma_version.version_string)
        self._get_this_host_aggregate(nova_client)
        instances = nova_client.servers.list(
            search_opts={'all_tenants': 1,
                         'host': self._get_nova_host(nova_client)})
        # Lay the groundwork for fetching VM IPs and network namespaces
        if self.init_config.get('ping_check'):
            nu = neutron_client.Client(
                session=session,
                endpoint_type=self.init_config.get("endpoint_type", "publicURL"),
                region_name=self.init_config.get('region_name'),
                client_name='monasca-agent[libvirt]',
                client_version=ma_version.version_string)
            port_cache = nu.list_ports()['ports']
            # Finding existing network namespaces is an indication that either
            # DVR agent_mode is enabled, or this is all-in-one (like devstack)
            netns = subprocess.check_output(['ip', 'netns', 'list'])
            if not netns:
                self.log.warn("Unable to ping VMs, no network namespaces found. "
                              "Either no VMs are present, or routing is centralized.")

        #
        # Only make the keystone call to get the tenant list
        # if we are configured to publish tenant names.
        #
        tenants = []
        if self.init_config.get('metadata') and 'tenant_name' in self.init_config.get('metadata'):
            tenants = utils.get_tenant_list(self.init_config, self.log)

        for instance in instances:
            instance_ports = []
            inst_name = instance.__getattr__('OS-EXT-SRV-ATTR:instance_name')
            inst_az = instance.__getattr__('OS-EXT-AZ:availability_zone')
            if instance.flavor['id'] in flavor_cache:
                inst_flavor = flavor_cache[instance.flavor['id']]
            else:
                try:
                    inst_flavor = nova_client.flavors.get(instance.flavor['id'])
                except NotFound as e:
                    self.log.error('Skipping VM {}: {}'.format(inst_name, e))
                    continue
                flavor_cache[instance.flavor['id']] = inst_flavor
            if port_cache:
                instance_ports = [p['id'] for p in port_cache if p['device_id'] == instance.id]
            id_cache[inst_name] = {'instance_uuid': instance.id,
                                   'hostname': instance.name,
                                   'zone': inst_az,
                                   'created': instance.created,
                                   'tenant_id': instance.tenant_id,
                                   'vcpus': inst_flavor.vcpus,
                                   'ram': inst_flavor.ram,
                                   'disk': inst_flavor.disk,
                                   'instance_ports': instance_ports}
            tenant_name = utils.get_tenant_name(tenants, instance.tenant_id)
            if tenant_name:
                id_cache[inst_name]['tenant_name'] = tenant_name
            for config_var in ['metadata', 'customer_metadata']:
                if self.init_config.get(config_var):
                    for metadata in self.init_config.get(config_var):
                        if instance.metadata.get(metadata):
                            id_cache[inst_name][metadata] = instance.metadata.get(metadata)
            # Build a list of pingable IP addresses attached to this VM and the
            # appropriate namespace, for use in ping tests
            if netns:
                secgroup_cache = nu.list_security_groups()['security_groups']
                self._build_ip_list(instance, inst_name,
                                    secgroup_cache, port_cache, id_cache)

        id_cache['last_update'] = int(time.time())

        # Write the updated cache
        try:
            with open(self.instance_cache_file, 'w') as cache_json:
                json.dump(id_cache, cache_json)
            if stat.S_IMODE(os.stat(self.instance_cache_file).st_mode) != 0o600:
                os.chmod(self.instance_cache_file, 0o600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.instance_cache_file, e))

        return id_cache

    def _build_ip_list(self, instance, inst_name, secgroup_cache, port_cache, id_cache):
        # Find all active fixed IPs for this VM, and fetch each subnet_id
        for net in instance.addresses:
            for ip in instance.addresses[net]:
                if ip['OS-EXT-IPS:type'] == 'fixed' and ip['version'] == 4:
                    subnet_id = None
                    nsuuid = None
                    for port in port_cache:
                        if (port['mac_address'] == ip['OS-EXT-IPS-MAC:mac_addr'] and
                                port['tenant_id'] == instance.tenant_id and
                                port['status'] == 'ACTIVE'):
                            for fixed in port['fixed_ips']:
                                if fixed['ip_address'] == ip['addr']:
                                    subnet_id = fixed['subnet_id']
                                    break

                    # Use the subnet_id to find the router
                    ping_allowed = False
                    if subnet_id is not None:
                        for port in port_cache:
                            if (port['device_owner'].startswith('network:router_interface') and
                                    port['tenant_id'] == instance.tenant_id and
                                    port['status'] == 'ACTIVE'):
                                nsuuid = port['device_id']
                                for fixed in port['fixed_ips']:
                                    if fixed['subnet_id'] == subnet_id:
                                        # Validate the security group
                                        if self._validate_secgroup(secgroup_cache,
                                                                   instance,
                                                                   fixed['ip_address']):
                                            ping_allowed = True
                                            break
                            if nsuuid is not None:
                                break

                    if nsuuid is not None and ping_allowed:
                        if 'network' not in id_cache[inst_name]:
                            id_cache[inst_name]['network'] = []
                        id_cache[inst_name]['network'].append(
                            {'namespace': "qrouter-{0}".format(nsuuid),
                             'ip': ip['addr']})
                    elif ping_allowed is False:
                        self.log.debug("ICMP disallowed for {0} on {1}".format(inst_name,
                                                                               ip['addr']))

    def _load_instance_cache(self):
        """Load the cache map of instance names to Nova data.

        If the cache does not yet exist or is damaged, (re-)build it.
        """
        instance_cache = {}
        try:
            with open(self.instance_cache_file, 'r') as cache_json:
                instance_cache = json.load(cache_json)

                # Is it time to force a refresh of this data?
                if self.init_config.get('nova_refresh') is not None:
                    time_diff = time.time() - instance_cache['last_update']
                    if time_diff > self.init_config.get('nova_refresh'):
                        self._update_instance_cache()
        except (IOError, TypeError, ValueError):
            # The file may not exist yet, or may be corrupt. Rebuild it now.
            self.log.warning("Instance cache missing or corrupt, rebuilding.")
            instance_cache = self._update_instance_cache()

        return instance_cache

    def _load_metric_cache(self):
        """Load the counter metrics from the previous collection iteration
        """
        metric_cache = {}
        try:
            with open(self.metric_cache_file, 'r') as cache_json:
                metric_cache = json.load(cache_json)
        except (IOError, TypeError, ValueError):
            # The file may not exist yet.
            self.log.warning("Metrics cache missing or corrupt, rebuilding.")
            metric_cache = {}

        return metric_cache

    def _update_metric_cache(self, metric_cache, run_time):
        # Remove inactive VMs from the metric cache
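        # An entry is expired when it has no 'cpu.time' sample at all, or
        # when its last 'cpu.time' sample (shifted by this run's duration)
        # is older than the configured probation period.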
        write_metric_cache = deepcopy(metric_cache)
        for instance in metric_cache:
            if ('cpu.time' not in metric_cache[instance] or
                    self._test_vm_probation(time.strftime(
                        '%Y-%m-%dT%H:%M:%SZ',
                        time.gmtime(metric_cache[instance]['cpu.time']['timestamp'] + run_time))) < 0):
                self.log.info("Expiring old/empty {0} from cache".format(instance))
                del write_metric_cache[instance]
        try:
            with open(self.metric_cache_file, 'w') as cache_json:
                json.dump(write_metric_cache, cache_json)
            if stat.S_IMODE(os.stat(self.metric_cache_file).st_mode) != 0o600:
                os.chmod(self.metric_cache_file, 0o600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.metric_cache_file, e))

    def _inspect_network(self, insp, inst, inst_name, instance_cache,
                         metric_cache, dims_customer, dims_operations):
        """Inspect network metrics for an instance"""
        for vnic in insp.inspect_vnics(inst):
            sample_time = time.time()
            vnic_dimensions = {'device': vnic[0].name}
            instance_ports = instance_cache.get(inst_name)['instance_ports']
            partial_port_id = vnic[0].name.split('tap')[1]
            # Match this vNIC to its Neutron port; a guest may be attached
            # to multiple networks.
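            # Tap device names embed a truncated port UUID: the 'tap'
            # prefix plus the first 11 characters of the Neutron port ID,
            # hence the comparison against port[:11] below.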
            for port in instance_ports:
                if partial_port_id == port[:11]:
                    vnic_dimensions['port_id'] = port
                    break
            for metric in vnic[1]._fields:
                metric_name = "net.{0}".format(metric)
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}

                value = int(vnic[1].__getattribute__(metric))
                if vnic[0].name in metric_cache[inst_name][metric_name]:
                    last_update_time = metric_cache[inst_name][metric_name][vnic[0].name]['timestamp']
                    time_diff = sample_time - float(last_update_time)
                    rate_value = self._calculate_rate(
                        value,
                        metric_cache[inst_name][metric_name][vnic[0].name]['value'],
                        time_diff)
                    if rate_value < 0:
                        # Bad value, save the current reading and skip
                        self.log.warn("Ignoring negative network sample for: "
                                      "{0} new value: {1} old value: {2}"
                                      .format(inst_name, value,
                                              metric_cache[inst_name][metric_name][vnic[0].name]['value']))
                        metric_cache[inst_name][metric_name][vnic[0].name] = {
                            'timestamp': sample_time,
                            'value': value}
                        continue
                    rate_name = self._get_metric_rate_name(metric_name)
                    rate_name = self._get_metric_name(rate_name)
                    if self.use_bits:
                        rate_value *= 8
                    # Customer
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, rate_value,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), rate_value,
                               dimensions=this_dimensions)

                # Report the raw counters.
                mapped_name = self._get_metric_name(metric_name)
                weighted_value = value
                if self.use_bits:
                    weighted_value = value * 8
                # Customer
                this_dimensions = vnic_dimensions.copy()
                this_dimensions.update(dims_customer)
                self.gauge(mapped_name, weighted_value,
                           dimensions=this_dimensions,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                # Operations (metric name prefixed with "vm.")
                this_dimensions = vnic_dimensions.copy()
                this_dimensions.update(dims_operations)
                self.gauge("vm.{0}".format(mapped_name),
                           weighted_value, dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][vnic[0].name] = {
                    'timestamp': sample_time,
                    'value': value}

    def _inspect_cpu(self, insp, inst, inst_name, instance_cache,
                     metric_cache, dims_customer, dims_operations):
        """Inspect cpu metrics for an instance"""
        sample_time = float("{:9f}".format(time.time()))
        cpu_info = insp.inspect_cpus(inst)
        if 'cpu.time' in metric_cache[inst_name]:
            # We have a prior value, so calculate used_cores and push the metrics
            cpu_diff = cpu_info.time - metric_cache[inst_name]['cpu.time']['value']
            time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
            # cpu.time is in nanoseconds, so convert time_diff to
            # nanoseconds and divide to get the number of cores used
            used_cores = (cpu_diff / (time_diff * 1000000000))
            # Divide by the number of cores to normalize the percentage
            normalized_perc = (used_cores / cpu_info.number) * 100
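            # Example: a cpu_diff of 30e9 ns over a 60 s interval gives
            # used_cores = 0.5; on a 2-vCPU guest that is
            # cpu.utilization_perc = 50 and cpu.utilization_norm_perc = 25.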
            if used_cores < 0:
                # Bad value, save the current reading and skip
                self.log.warn("Ignoring negative CPU sample for: "
                              "{0} new cpu time: {1} old cpu time: {2}"
                              .format(inst_name, cpu_info.time,
                                      metric_cache[inst_name]['cpu.time']['value']))
                metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                                       'value': cpu_info.time}
                return

            self.gauge('cpu.total_cores', float(cpu_info.number),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge('cpu.used_cores', float(used_cores),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge('cpu.utilization_perc', int(round(used_cores * 100, 0)),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge('cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge('vm.cpu.total_cores', float(cpu_info.number),
                       dimensions=dims_operations)
            self.gauge('vm.cpu.used_cores', float(used_cores),
                       dimensions=dims_operations)
            self.gauge('vm.cpu.utilization_perc', int(round(used_cores * 100, 0)),
                       dimensions=dims_operations)
            self.gauge('vm.cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
                       dimensions=dims_operations)

        cpu_time_name = 'cpu.time_ns'
        # cpu.time_ns for the owning tenant
        self.gauge(cpu_time_name, cpu_info.time,
                   dimensions=dims_customer,
                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                   hostname=instance_cache.get(inst_name)['hostname'])
        # vm.cpu.time_ns for the operations tenant
        self.gauge("vm.{0}".format(cpu_time_name), cpu_info.time,
                   dimensions=dims_operations)

        metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                               'value': cpu_info.time}

    def _inspect_disks(self, insp, inst, inst_name, instance_cache,
                       metric_cache, dims_customer, dims_operations):
        """Inspect disk metrics for an instance"""
        metric_aggregate = {}
        for disk in insp.inspect_disks(inst):
            sample_time = time.time()
            disk_dimensions = {'device': disk[0].device}
            for metric in disk[1]._fields:
                metric_name = "io.{0}".format(metric.replace('requests', 'ops'))
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}

                value = int(disk[1].__getattribute__(metric))
                metric_aggregate[metric_name] = metric_aggregate.get(
                    metric_name, 0) + value
                if disk[0].device in metric_cache[inst_name][metric_name]:
                    cached_val = metric_cache[inst_name][metric_name][disk[0].device]['value']
                    last_update_time = metric_cache[inst_name][metric_name][disk[0].device]['timestamp']
                    time_diff = sample_time - float(last_update_time)
                    rate_value = self._calculate_rate(value, cached_val, time_diff)
                    if rate_value < 0:
                        # Bad value, save the current reading and skip
                        self.log.warn("Ignoring negative disk sample for: "
                                      "{0} new value: {1} old value: {2}"
                                      .format(inst_name, value, cached_val))
                        metric_cache[inst_name][metric_name][disk[0].device] = {
                            'timestamp': sample_time,
                            'value': value}
                        continue
                    # Append "_sec" to convert the counter name to a rate,
                    # e.g. "io.read_ops" becomes "io.read_ops_sec"
                    rate_name = "{0}_sec".format(metric_name)
                    # Customer
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, rate_value, dimensions=this_dimensions,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    self.gauge(metric_name, value, dimensions=this_dimensions,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), rate_value,
                               dimensions=this_dimensions)
                    self.gauge("vm.{0}".format(metric_name), value,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][disk[0].device] = {
                    'timestamp': sample_time,
                    'value': value}

        if self.init_config.get('vm_extended_disks_check_enable'):
            for metric in metric_aggregate:
                sample_time = time.time()
                rate_name = "{0}_total_sec".format(metric)
                if rate_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][rate_name] = {}
                else:
                    last_update_time = metric_cache[inst_name][rate_name]['timestamp']
                    time_diff = sample_time - float(last_update_time)
                    rate_value = self._calculate_rate(metric_aggregate[metric],
                                                      metric_cache[inst_name][rate_name]['value'],
                                                      time_diff)
                    if rate_value < 0:
                        # Bad value, save the current reading and skip
                        self.log.warn("Ignoring negative disk sample for: "
                                      "{0} new value: {1} old value: {2}"
                                      .format(inst_name, metric_aggregate[metric],
                                              metric_cache[inst_name][rate_name]['value']))
                        metric_cache[inst_name][rate_name] = {
                            'timestamp': sample_time,
                            'value': metric_aggregate[metric]}
                        continue
                    self.gauge(rate_name, rate_value, dimensions=dims_customer,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    self.gauge("vm.{0}".format(rate_name), rate_value,
                               dimensions=dims_operations)
                    self.gauge("{0}_total".format(metric), metric_aggregate[metric],
                               dimensions=dims_customer,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    self.gauge("vm.{0}_total".format(metric),
                               metric_aggregate[metric],
                               dimensions=dims_operations)
                # Save this metric to the cache
                metric_cache[inst_name][rate_name] = {
                    'timestamp': sample_time,
                    'value': metric_aggregate[metric]}

    def _inspect_disk_info(self, insp, inst, inst_name, instance_cache, metric_cache,
                           dims_customer, dims_operations):
        """Inspect disk information for an instance"""
        metric_aggregate = {}
        for disk in insp.inspect_disk_info(inst):
            disk_dimensions = {'device': disk[0].device}
            for metric in disk[1]._fields:
                metric_name = "disk.{0}".format(metric)
                value = int(disk[1].__getattribute__(metric))
                metric_aggregate[metric_name] = metric_aggregate.get(
                    metric_name, 0) + value
                this_dimensions = disk_dimensions.copy()
                this_dimensions.update(dims_customer)
                self.gauge(metric_name, value, dimensions=this_dimensions,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                # Operations (metric name prefixed with "vm.")
                this_dimensions = disk_dimensions.copy()
                this_dimensions.update(dims_operations)
                self.gauge("vm.{0}".format(metric_name), value,
                           dimensions=this_dimensions)

        for metric in metric_aggregate:
            self.gauge("{0}_total".format(metric), metric_aggregate[metric],
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge("vm.{0}_total".format(metric),
                       metric_aggregate[metric],
                       dimensions=dims_operations)

    def _inspect_state(self, insp, inst, inst_name, instance_cache,
                       dims_customer, dims_operations):
        """Look at the state of the instance, publish a metric using a
        user-friendly description in the 'detail' metadata, and return
        a status code (calibrated to UNIX status codes where 0 is OK)
        so that remaining metrics can be skipped if the VM is not OK
        """
        inst_state = inst.state()
        dom_status = inst_state[0] - 1
        health_status = 0 if dom_status == 0 else 1  # anything other than 'running' is considered unhealthy
        metatag = None

        if inst_state[0] in DOM_STATES:
            metatag = {'detail': DOM_STATES[inst_state[0]]}
        # A VM being in SHUTOFF state may have many reasons, so try to be
        # more specific here
        if inst_state[0] == libvirt.VIR_DOMAIN_SHUTOFF:
            if inst_state[1] in DOM_SHUTOFF_STATES:
                metatag = {'detail': DOM_SHUTOFF_STATES[inst_state[1]]}

        self.gauge('host_alive_status', dom_status, dimensions=dims_customer,
                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                   hostname=instance_cache.get(inst_name)['hostname'],
                   value_meta=metatag)
        self.gauge('vm.host_alive_status', dom_status,
                   dimensions=dims_operations,
                   value_meta=metatag)
        self.gauge('health_status', health_status,
                   dimensions=dims_customer,
                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                   hostname=instance_cache.get(inst_name)['hostname'])
        self.gauge('vm.health_status', health_status,
                   dimensions=dims_operations)

        return inst_state[0]

    def prepare_run(self):
        """Check if it is time for measurements to be collected"""
        for name, collection in self._collect_intervals.items():
            if collection['period'] <= 0:
                continue

            time_since_last = datetime.now() - collection['last_collect']
            # Handle times that are really close to the collection period
            period_with_fudge_factor = timedelta(0, collection['period'] - 1,
                                                 500000)
            if time_since_last < period_with_fudge_factor:
                self.log.debug('Skipping {} collection for {} seconds'.format(
                    name,
                    (collection['period'] - time_since_last.seconds)))
                collection['skip'] = True
            else:
                collection['skip'] = False
                collection['last_collect'] = datetime.now()

    def _run_ping(self, dims_customer, dims_operations, inst_name, instance_cache, net):
        """Create a ping command and hand it off to the thread pool"""
        ping_cmd = self.init_config.get('ping_check').replace('NAMESPACE',
                                                              net['namespace']).split()
        ping_cmd.append(net['ip'])
        dims_customer_ip = dims_customer.copy()
        dims_operations_ip = dims_operations.copy()
        dims_customer_ip['ip'] = net['ip']
        dims_operations_ip['ip'] = net['ip']
        with open(os.devnull, "w") as fnull:
            try:
                self.log.debug("Running ping test: {0}".format(' '.join(ping_cmd)))
                res = subprocess.call(ping_cmd,
                                      stdout=fnull,
                                      stderr=fnull)
                tenant_id = instance_cache.get(inst_name)['tenant_id']
                hostname = instance_cache.get(inst_name)['hostname']
                return (res, dims_customer_ip, dims_operations_ip, tenant_id,
                        hostname)
            except Exception:
                self.log.exception("Error running ping command '{0}'"
                                   .format(' '.join(ping_cmd)))
                raise

    def _check_ping_results(self, ping_results):
        """Iterate through the ping results and create measurements"""
        for result in ping_results:
            result.wait()
            # If it wasn't successful, a message was already logged in _run_ping
            if result.successful():
                (res, dims_customer_ip, dims_operations_ip, delegated_tenant,
                 hostname) = result.get()
                self.gauge('ping_status', res, dimensions=dims_customer_ip,
                           delegated_tenant=delegated_tenant,
                           hostname=hostname)
                self.gauge('vm.ping_status', res, dimensions=dims_operations_ip)

    def check(self, instance):
        """Gather VM metrics for each instance"""
        time_start = time.time()

        # Load the metric cache
        metric_cache = self._load_metric_cache()

        # Load the nova-obtained instance data cache
        instance_cache = self._load_instance_cache()

        # Build dimensions for both the customer and for operations
        dims_base = self._set_dimensions({'service': 'compute', 'component': 'vm'}, instance)

        # Initialize the aggregate alive status data structure (kept separate
        # from the aggregate gauges because every possible value needs to be
        # counted separately)
        agg_alive_counts = {}
        for code in DOM_ALIVE_NAMES:
            agg_alive_counts[code] = 0
        # Per-host total VM count
        vm_count = 0

        # Define aggregate gauges, gauge name to metric name
        agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                      'ram': 'nova.vm.mem.total_allocated_mb',
                      'disk': 'nova.vm.disk.total_allocated_gb'}
        agg_values = {}
        for gauge in agg_gauges.keys():
            agg_values[gauge] = 0

        insp = inspector.get_hypervisor_inspector()
        updated_cache_this_time = False
        ping_results = []
        for inst in insp._get_connection().listAllDomains():
            # Verify that this instance exists in the cache. Add if necessary.
            inst_name = inst.name()
            if inst_name not in instance_cache and not updated_cache_this_time:
                #
                # If we have multiple ghost VMs, we'll needlessly
                # update the instance cache. Let's limit the cache
                # update to once per agent wakeup.
                #
                updated_cache_this_time = True
                instance_cache = self._update_instance_cache()

            # Build customer dimensions
            try:
                dims_customer = dims_base.copy()
                dims_customer['resource_id'] = instance_cache.get(inst_name)['instance_uuid']
                dims_customer['zone'] = instance_cache.get(inst_name)['zone']
                # Add dimensions that would be helpful for operations
                dims_operations = dims_customer.copy()
                dims_operations['tenant_id'] = instance_cache.get(inst_name)['tenant_id']
                dims_operations = self._update_dims_with_metadata(
                    instance_cache, inst_name, dims_operations)
                if self.init_config.get('customer_metadata'):
                    for metadata in self.init_config.get('customer_metadata'):
                        metadata_value = instance_cache.get(inst_name).get(metadata)
                        if metadata_value:
                            dims_customer[metadata] = metadata_value
                # Remove the customer 'hostname' dimension; it is replaced by the VM name
                del dims_customer['hostname']
                #
                # Add this hypervisor's host aggregate as a dimension if
                # configured to do so and we had a match on the regex for
                # this host.
                #
                if self._host_aggregate:
                    dims_operations['host_aggregate'] = self._host_aggregate
            except TypeError:
                # Nova can potentially get into a state where it can't see an
                # instance, but libvirt can. This would cause TypeErrors as
                # incomplete data is cached for this instance. Log and skip.
                self.log.error("{0} is not known to nova after instance cache "
                               "update -- skipping this ghost VM.".format(inst_name))
                continue

            # Accumulate aggregate data
            for gauge in agg_gauges:
                if gauge in instance_cache.get(inst_name):
                    agg_values[gauge] += instance_cache.get(inst_name)[gauge]

            # Skip instances created within the probation period
            vm_probation_remaining = self._test_vm_probation(
                instance_cache.get(inst_name)['created'])
            if vm_probation_remaining >= 0:
                self.log.info("Libvirt: {0} in probation for another {1} seconds".format(
                    instance_cache.get(inst_name)['hostname'].encode('utf8'),
                    vm_probation_remaining))
                continue

            vm_dom_state = self._inspect_state(insp, inst, inst_name,
                                               instance_cache, dims_customer,
                                               dims_operations)
            agg_alive_counts[vm_dom_state] += 1
            vm_count += 1

            # Skip further processing on VMs that are not in an active state
            if vm_dom_state != libvirt.VIR_DOMAIN_RUNNING:
                continue

            # Skip the remainder of the checks if alive_only is True in the config
            if self.init_config.get('alive_only'):
                continue

            if inst_name not in metric_cache:
                metric_cache[inst_name] = {}

            if self.init_config.get('vm_cpu_check_enable'):
                self._inspect_cpu(insp, inst, inst_name, instance_cache,
                                  metric_cache, dims_customer, dims_operations)

            if not self._collect_intervals['disk']['skip']:
                if self.init_config.get('vm_disks_check_enable'):
                    self._inspect_disks(insp, inst, inst_name, instance_cache,
                                        metric_cache, dims_customer, dims_operations)
                if self.init_config.get('vm_extended_disks_check_enable'):
                    self._inspect_disk_info(insp, inst, inst_name, instance_cache,
                                            metric_cache, dims_customer, dims_operations)

            if not self._collect_intervals['vnic']['skip']:
                if self.init_config.get('vm_network_check_enable'):
                    self._inspect_network(insp, inst, inst_name, instance_cache,
                                          metric_cache, dims_customer, dims_operations)

            # Memory utilization
            # (requires a balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
            try:
                mem_stats = inst.memoryStats()
                mem_metrics = {'mem.free_gb': float(mem_stats['unused']) / 1024 / 1024,
                               'mem.swap_used_gb': float(mem_stats['swap_out']) / 1024 / 1024,
                               'mem.total_gb': float(mem_stats['available']) / 1024 / 1024,
                               'mem.used_gb': float(mem_stats['available'] - mem_stats['unused']) / 1024 / 1024,
                               'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100}
                for name in mem_metrics:
                    self.gauge(name, mem_metrics[name], dimensions=dims_customer,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    self.gauge("vm.{0}".format(name), mem_metrics[name],
                               dimensions=dims_operations)
                memory_info = insp.inspect_memory_resident(inst)
                self.gauge('vm.mem.resident_gb', float(memory_info.resident) / 1024,
                           dimensions=dims_operations)
            except KeyError:
                self.log.debug("Balloon driver not active/available on guest {0} ({1})"
                               .format(inst_name,
                                       instance_cache.get(inst_name)['hostname'].encode('utf8')))

            # Test instance's remote responsiveness (ping check) if possible
            if (self.init_config.get('vm_ping_check_enable') and
                    self.init_config.get('ping_check') and
                    'network' in instance_cache.get(inst_name)):
                for net in instance_cache.get(inst_name)['network']:
                    ping_args = [dims_customer, dims_operations, inst_name,
                                 instance_cache, net]
                    ping_results.append(self.pool.apply_async(self._run_ping,
                                                              ping_args))

        # Save these metrics for the next collector invocation
        self._update_metric_cache(metric_cache, math.ceil(time.time() - time_start))

        # Publish aggregate metrics
        for gauge in agg_gauges:
            self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)

        # Publish aggregate VM counts
        self._gauge_agg_alive_counts(agg_alive_counts, vm_count, dims_base)

        # Check the results of the ping tests
        self._check_ping_results(ping_results)

    def _calculate_rate(self, current_value, cache_value, time_diff):
        """Calculate a rate from the current value, the cached value, and the elapsed time."""
        try:
            rate_value = (current_value - cache_value) / time_diff
        except ZeroDivisionError as e:
            self.log.error("Time difference between current time and "
                           "last_update time is 0. {0}".format(e))
            #
            # Being extra safe here, in case we divide by zero
            # just skip this reading with the check below.
            #
            rate_value = -1
        return rate_value

    def _gauge_agg_alive_counts(self, agg_alive_counts, vm_count, dims_base):
        count_pfx = "nova.vm."
        total_frac = (float(vm_count) / 100)
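        # Dividing a per-state count by vm_count / 100 converts it to a
        # percentage of all VMs on this hypervisor, e.g. 3 running VMs out
        # of 4 total publishes nova.vm.running_perc = 75.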
        self.gauge(count_pfx + 'total_count', vm_count, dimensions=dims_base)
        for agg in agg_alive_counts:
            self.gauge(count_pfx + DOM_ALIVE_NAMES[agg] + "_count",
                       agg_alive_counts[agg],
                       dimensions=dims_base)
            if total_frac != 0:
                self.gauge(count_pfx + DOM_ALIVE_NAMES[agg] + "_perc",
                           agg_alive_counts[agg] / total_frac,
                           dimensions=dims_base)
            else:
                self.gauge(count_pfx + DOM_ALIVE_NAMES[agg] + "_perc",
                           0, dimensions=dims_base)

    def _update_dims_with_metadata(self, instance_cache, inst_name, dim_operations):
        """Update operations dimensions with metadata."""
        dims = dim_operations
        if self.init_config.get('metadata'):
            for metadata in self.init_config.get('metadata'):
                if 'vm_name' == metadata:
                    metadata_value = instance_cache.get(inst_name).get('hostname')
                else:
                    metadata_value = instance_cache.get(inst_name).get(metadata)
                if metadata_value:
                    dims[metadata] = metadata_value
        return dims

    def _get_this_host_aggregate(self, nova_client):
        """Determine the host aggregate for this hypervisor."""
        host_agg_cfg_re = self.init_config.get('host_aggregate_re', None)
        if not host_agg_cfg_re:
            return

        try:
            agg_re = re.compile(host_agg_cfg_re)
            aggs = nova_client.aggregates.list()
            for agg in aggs:
                if agg_re.match(agg.name) and self.hostname in agg.hosts:
                    self._host_aggregate = str(agg.name)
                    #
                    # Not expecting multiple matches; once we have a match,
                    # we're done.
                    #
                    break
        except Exception as e:
            msg = "Failed to list host aggregates, won't publish aggregate dimension: '{0}'"
            self.log.error(msg.format(e))