#!/bin/env python
|
|
|
|
# (c) Copyright 2014-2016 Hewlett Packard Enterprise Development LP
|
|
# Copyright 2017 Fujitsu LIMITED
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
"""Monasca Agent interface for libvirt metrics"""
|
|
|
|
import json
|
|
import libvirt
|
|
import math
|
|
import monasca_agent.collector.checks.utils as utils
|
|
import os
|
|
import re
|
|
import stat
|
|
import subprocess
|
|
import time
|
|
|
|
from calendar import timegm
|
|
from copy import deepcopy
|
|
from datetime import datetime
|
|
from datetime import timedelta
|
|
from multiprocessing.dummy import Pool
|
|
from netaddr import all_matching_cidrs
|
|
from neutronclient.v2_0 import client as neutron_client
|
|
from novaclient import client as n_client
|
|
from novaclient.exceptions import NotFound
|
|
|
|
from monasca_agent.collector.checks import AgentCheck
|
|
from monasca_agent.collector.virt import inspector
|
|
from monasca_agent.common import keystone
|
|
from monasca_agent import version as ma_version
|
|
|
|
# Human-readable descriptions of libvirt domain states, published as the
# 'detail' value_meta alongside host_alive_status metrics.
# VIR_DOMAIN_RUNNING is deliberately absent: a running VM needs no detail.
DOM_STATES = {libvirt.VIR_DOMAIN_BLOCKED: 'VM is blocked',
              libvirt.VIR_DOMAIN_CRASHED: 'VM has crashed',
              libvirt.VIR_DOMAIN_NONE: 'VM has no state',
              libvirt.VIR_DOMAIN_PAUSED: 'VM is paused',
              libvirt.VIR_DOMAIN_PMSUSPENDED: 'VM is in power management (s3) suspend',
              libvirt.VIR_DOMAIN_SHUTDOWN: 'VM is shutting down',
              libvirt.VIR_DOMAIN_SHUTOFF: 'VM has been shut off (other reason)'}

# Short state names keyed by libvirt domain state code, used for per-state
# aggregate counters (one counter per possible state).
DOM_ALIVE_NAMES = {libvirt.VIR_DOMAIN_BLOCKED: 'blocked',
                   libvirt.VIR_DOMAIN_CRASHED: 'crashed',
                   libvirt.VIR_DOMAIN_NONE: 'nostate',
                   libvirt.VIR_DOMAIN_PAUSED: 'paused',
                   libvirt.VIR_DOMAIN_PMSUSPENDED: 'suspended',
                   libvirt.VIR_DOMAIN_RUNNING: 'running',
                   libvirt.VIR_DOMAIN_SHUTDOWN: 'shuttingdown',
                   libvirt.VIR_DOMAIN_SHUTOFF: 'shutoff'}  # shut off/nova suspend

# More specific detail strings for the SHUTOFF state, keyed by the libvirt
# shutoff *reason* code (the second element returned by virDomainGetState).
DOM_SHUTOFF_STATES = {libvirt.VIR_DOMAIN_SHUTOFF_UNKNOWN: 'VM has been shutoff (reason unknown)',
                      libvirt.VIR_DOMAIN_SHUTOFF_SHUTDOWN: 'VM has been shut down',
                      libvirt.VIR_DOMAIN_SHUTOFF_DESTROYED: 'VM has been destroyed (forced off)',
                      libvirt.VIR_DOMAIN_SHUTOFF_CRASHED: 'VM has crashed',
                      libvirt.VIR_DOMAIN_SHUTOFF_MIGRATED: 'VM has been migrated',
                      libvirt.VIR_DOMAIN_SHUTOFF_SAVED: 'VM has been suspended',
                      libvirt.VIR_DOMAIN_SHUTOFF_FAILED: 'VM has failed to start',
                      libvirt.VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT: 'VM has been restored from powered off snapshot'}
class LibvirtCheck(AgentCheck):
|
|
|
|
"""Inherit Agent class and gather libvirt metrics"""
|
|
|
|
    def __init__(self, name, init_config, agent_config, instances=None):
        """Set up cache file locations, collection intervals and ping pool.

        Note: the 'instances' argument is ignored; the check always runs
        with a single implicit instance (instances=[{}]).
        """
        AgentCheck.__init__(self, name, init_config, agent_config, instances=[{}])
        # JSON caches persisted across agent runs, under the configured cache_dir
        self.instance_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                    'libvirt_instances.json')
        self.metric_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                  'libvirt_metrics.json')
        # When set, network metrics are reported in bits rather than bytes
        self.use_bits = self.init_config.get('network_use_bits')

        self._collect_intervals = {}
        self._host_aggregate = None
        # Cached nova-compute service host name (resolved lazily)
        self._nova_host = None

        # Disk and vnic metrics may be collected on a slower cadence than
        # the rest of the check
        self._set_collection_intervals('disk', 'disk_collection_period')
        self._set_collection_intervals('vnic', 'vnic_collection_period')

        # Thread pool used to run VM ping checks concurrently
        pool_size = self.init_config.get('max_ping_concurrency', 8)
        self.pool = Pool(pool_size)
def _set_collection_intervals(self, interval_name, config_name):
|
|
self._collect_intervals[interval_name] = {
|
|
'period': int(self.init_config.get(config_name, 0)),
|
|
'last_collect': datetime.fromordinal(1),
|
|
'skip': False}
|
|
|
|
def _test_vm_probation(self, created):
|
|
"""Test to see if a VM was created within the probation period.
|
|
|
|
Convert an ISO-8601 timestamp into UNIX epoch timestamp from now
|
|
and compare that against configured vm_probation. Return the
|
|
number of seconds this VM will remain in probation.
|
|
"""
|
|
dt = datetime.strptime(created, '%Y-%m-%dT%H:%M:%SZ')
|
|
created_sec = (time.time() - timegm(dt.timetuple()))
|
|
probation_time = self.init_config.get('vm_probation', 300) - created_sec
|
|
return int(probation_time)
|
|
|
|
def _get_metric_name(self, orig_name):
|
|
# Rename "tx" to "out" and "rx" to "in"
|
|
metric_name = orig_name.replace("tx", "out").replace("rx", "in")
|
|
if self.use_bits:
|
|
metric_name = metric_name.replace("bytes", "bits")
|
|
return metric_name
|
|
|
|
@staticmethod
|
|
def _get_metric_rate_name(metric_name):
|
|
"""Change the metric name to a rate, i.e. "net.rx_bytes"
|
|
gets converted to "net.rx_bytes_sec"
|
|
"""
|
|
return "{0}_sec".format(metric_name)
|
|
|
|
@staticmethod
|
|
def _validate_secgroup(cache, instance, source_ip):
|
|
"""Search through an instance's security groups for pingability
|
|
"""
|
|
for instance_secgroup in instance.security_groups:
|
|
for secgroup in cache:
|
|
if ((secgroup['tenant_id'] == instance.tenant_id and
|
|
secgroup['name'] == instance_secgroup['name'])):
|
|
for rule in secgroup['security_group_rules']:
|
|
if rule['protocol'] == 'icmp':
|
|
if ((not rule['remote_ip_prefix'] or
|
|
all_matching_cidrs(source_ip,
|
|
[rule['remote_ip_prefix']]))):
|
|
return True
|
|
|
|
def _get_nova_host(self, nova_client):
|
|
if not self._nova_host:
|
|
# Find `nova-compute` on current node
|
|
services = nova_client.services.list(host=self.hostname,
|
|
binary='nova-compute')
|
|
if not services:
|
|
# Catch the case when `nova-compute` is registered with
|
|
# unqualified hostname
|
|
services = nova_client.services.list(
|
|
host=self.hostname.split('.')[0], binary='nova-compute')
|
|
if services:
|
|
self._nova_host = services[0].host
|
|
self.log.info("Found 'nova-compute' registered with host: {}"
|
|
.format(self._nova_host))
|
|
|
|
if self._nova_host:
|
|
return self._nova_host
|
|
else:
|
|
self.log.warn("No 'nova-compute' service found on host: {}"
|
|
.format(self.hostname))
|
|
# Return hostname as fallback value
|
|
return self.hostname
|
|
|
|
    def _update_instance_cache(self):
        """Collect instance_id, project_id, and AZ for all instance UUIDs

        Queries Nova for every instance scheduled on this compute host,
        builds a map keyed by the libvirt instance name, persists it as JSON
        (file mode 0600) to self.instance_cache_file, and returns the new
        cache dict.
        """

        id_cache = {}
        # Avoids one flavors.get() round trip per instance of the same flavor
        flavor_cache = {}
        # Neutron ports; only fetched when ping_check is configured
        port_cache = None
        netns = None
        # Get a list of all instances from the Nova API
        session = keystone.get_session(**self.init_config)
        nova_client = n_client.Client(
            "2.1", session=session,
            endpoint_type=self.init_config.get("endpoint_type", "publicURL"),
            service_type="compute",
            region_name=self.init_config.get('region_name'),
            client_name='monasca-agent[libvirt]',
            client_version=ma_version.version_string)
        self._get_this_host_aggregate(nova_client)
        instances = nova_client.servers.list(
            search_opts={'all_tenants': 1,
                         'host': self._get_nova_host(nova_client)})
        # Lay the groundwork for fetching VM IPs and network namespaces
        if self.init_config.get('ping_check'):
            nu = neutron_client.Client(
                session=session,
                endpoint_type=self.init_config.get("endpoint_type", "publicURL"),
                region_name=self.init_config.get('region_name'),
                client_name='monasca-agent[libvirt]',
                client_version=ma_version.version_string)
            port_cache = nu.list_ports()['ports']
            # Finding existing network namespaces is an indication that either
            # DVR agent_mode is enabled, or this is all-in-one (like devstack)
            netns = subprocess.check_output(['ip', 'netns', 'list'])
            # NOTE(review): check_output returns bytes on Python 3, so this
            # equality against '' presumably only fires on Python 2 —
            # confirm the intended interpreter.
            if netns == '':
                self.log.warn("Unable to ping VMs, no network namespaces found." +
                              "Either no VMs are present, or routing is centralized.")

        #
        # Only make the keystone call to get the tenant list
        # if we are configured to publish tenant names.
        #
        tenants = []
        if self.init_config.get('metadata') and 'tenant_name' in self.init_config.get('metadata'):
            tenants = utils.get_tenant_list(self.init_config, self.log)

        for instance in instances:
            instance_ports = []
            # Nova extension attributes are only reachable via __getattr__
            # because their names contain ':' and '-'
            inst_name = instance.__getattr__('OS-EXT-SRV-ATTR:instance_name')
            inst_az = instance.__getattr__('OS-EXT-AZ:availability_zone')
            if instance.flavor['id'] in flavor_cache:
                inst_flavor = flavor_cache[instance.flavor['id']]
            else:
                try:
                    inst_flavor = nova_client.flavors.get(instance.flavor['id'])
                except NotFound as e:
                    # Flavor was deleted; without it we cannot report sizing
                    self.log.error('Skipping VM {}: {}'.format(inst_name, e))
                    continue
                flavor_cache[instance.flavor['id']] = inst_flavor
            if port_cache:
                instance_ports = [p['id'] for p in port_cache if p['device_id'] == instance.id]
            id_cache[inst_name] = {'instance_uuid': instance.id,
                                   'hostname': instance.name,
                                   'zone': inst_az,
                                   'created': instance.created,
                                   'tenant_id': instance.tenant_id,
                                   'vcpus': inst_flavor.vcpus,
                                   'ram': inst_flavor.ram,
                                   'disk': inst_flavor.disk,
                                   'instance_ports': instance_ports}

            tenant_name = utils.get_tenant_name(tenants, instance.tenant_id)
            if tenant_name:
                id_cache[inst_name]['tenant_name'] = tenant_name

            # Copy any configured instance metadata keys into the cache entry
            for config_var in ['metadata', 'customer_metadata']:
                if self.init_config.get(config_var):
                    for metadata in self.init_config.get(config_var):
                        if instance.metadata.get(metadata):
                            id_cache[inst_name][metadata] = (instance.metadata.
                                                             get(metadata))

            # Build a list of pingable IP addresses attached to this VM and the
            # appropriate namespace, for use in ping tests
            if netns:
                secgroup_cache = nu.list_security_groups()['security_groups']
                self._build_ip_list(instance, inst_name,
                                    secgroup_cache, port_cache, id_cache)

        id_cache['last_update'] = int(time.time())

        # Write the updated cache
        try:
            with open(self.instance_cache_file, 'w') as cache_json:
                json.dump(id_cache, cache_json)
            # Cache may contain tenant data; keep it owner-readable only
            if stat.S_IMODE(os.stat(self.instance_cache_file).st_mode) != 0o600:
                os.chmod(self.instance_cache_file, 0o600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.instance_cache_file, e))

        return id_cache
    def _build_ip_list(self, instance, inst_name, secgroup_cache, port_cache, id_cache):
        """Record pingable (namespace, ip) pairs for a VM in the id cache.

        For each active fixed IPv4 address on the instance, find the Neutron
        subnet it belongs to, locate an active router interface on that
        subnet (whose device_id names the qrouter network namespace), and —
        if the instance's security groups allow ICMP from the router side —
        append {'namespace': 'qrouter-<uuid>', 'ip': addr} to the cache
        entry's 'network' list.
        """
        # Find all active fixed IPs for this VM, fetch each subnet_id
        for net in instance.addresses:
            for ip in instance.addresses[net]:
                if ip['OS-EXT-IPS:type'] == 'fixed' and ip['version'] == 4:
                    subnet_id = None
                    nsuuid = None
                    # Match the VM's port by MAC/tenant/state to learn the subnet
                    for port in port_cache:
                        if ((port['mac_address'] == ip['OS-EXT-IPS-MAC:mac_addr'] and
                             port['tenant_id'] == instance.tenant_id and
                             port['status'] == 'ACTIVE')):
                            for fixed in port['fixed_ips']:
                                if fixed['ip_address'] == ip['addr']:
                                    subnet_id = fixed['subnet_id']
                                    break
                    # Use the subnet_id to find the router
                    ping_allowed = False
                    if subnet_id is not None:
                        for port in port_cache:
                            if ((port['device_owner'].startswith('network:router_interface') and
                                 port['tenant_id'] == instance.tenant_id and
                                 port['status'] == 'ACTIVE')):
                                # device_id of a router interface is the router
                                # UUID, which names the qrouter-<uuid> namespace
                                nsuuid = port['device_id']
                                for fixed in port['fixed_ips']:
                                    if fixed['subnet_id'] == subnet_id:
                                        # Validate security group
                                        if self._validate_secgroup(secgroup_cache,
                                                                   instance,
                                                                   fixed['ip_address']):
                                            ping_allowed = True
                                            break
                            if nsuuid is not None:
                                break
                    if nsuuid is not None and ping_allowed:
                        if 'network' not in id_cache[inst_name]:
                            id_cache[inst_name]['network'] = []
                        id_cache[inst_name]['network'].append({'namespace': "qrouter-{0}".format(nsuuid),
                                                               'ip': ip['addr']})
                    elif ping_allowed is False:
                        self.log.debug("ICMP disallowed for {0} on {1}".format(inst_name,
                                                                               ip['addr']))
def _load_instance_cache(self):
|
|
"""Load the cache map of instance names to Nova data.
|
|
If the cache does not yet exist or is damaged, (re-)build it.
|
|
"""
|
|
instance_cache = {}
|
|
try:
|
|
with open(self.instance_cache_file, 'r') as cache_json:
|
|
instance_cache = json.load(cache_json)
|
|
|
|
# Is it time to force a refresh of this data?
|
|
if self.init_config.get('nova_refresh') is not None:
|
|
time_diff = time.time() - instance_cache['last_update']
|
|
if time_diff > self.init_config.get('nova_refresh'):
|
|
self._update_instance_cache()
|
|
except (IOError, TypeError, ValueError):
|
|
# The file may not exist yet, or is corrupt. Rebuild it now.
|
|
self.log.warning("Instance cache missing or corrupt, rebuilding.")
|
|
instance_cache = self._update_instance_cache()
|
|
pass
|
|
|
|
return instance_cache
|
|
|
|
def _load_metric_cache(self):
|
|
"""Load the counter metrics from the previous collection iteration
|
|
"""
|
|
metric_cache = {}
|
|
try:
|
|
with open(self.metric_cache_file, 'r') as cache_json:
|
|
metric_cache = json.load(cache_json)
|
|
except (IOError, TypeError, ValueError):
|
|
# The file may not exist yet.
|
|
self.log.warning("Metrics cache missing or corrupt, rebuilding.")
|
|
metric_cache = {}
|
|
pass
|
|
|
|
return metric_cache
|
|
|
|
def _update_metric_cache(self, metric_cache, run_time):
|
|
# Remove inactive VMs from the metric cache
|
|
write_metric_cache = deepcopy(metric_cache)
|
|
for instance in metric_cache:
|
|
if (('cpu.time' not in metric_cache[instance] or
|
|
self._test_vm_probation(time.strftime('%Y-%m-%dT%H:%M:%SZ',
|
|
time.gmtime(metric_cache[instance]['cpu.time']['timestamp'] + run_time))) < 0)):
|
|
self.log.info("Expiring old/empty {0} from cache".format(instance))
|
|
del(write_metric_cache[instance])
|
|
try:
|
|
with open(self.metric_cache_file, 'w') as cache_json:
|
|
json.dump(write_metric_cache, cache_json)
|
|
if stat.S_IMODE(os.stat(self.metric_cache_file).st_mode) != 0o600:
|
|
os.chmod(self.metric_cache_file, 0o600)
|
|
except IOError as e:
|
|
self.log.error("Cannot write to {0}: {1}".format(self.metric_cache_file, e))
|
|
|
|
    def _inspect_network(self, insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations):
        """Inspect network metrics for an instance

        Publishes per-vnic raw counters and (when a previous sample exists in
        metric_cache) per-second rates, each both as a customer metric and as
        a "vm."-prefixed operations metric.  Negative deltas (e.g. after a
        counter reset) are logged and skipped, but the new reading is cached.
        """
        for vnic in insp.inspect_vnics(inst):
            sample_time = time.time()
            vnic_dimensions = {'device': vnic[0].name}
            instance_ports = instance_cache.get(inst_name)['instance_ports']
            # assumes the guest interface is a tap device named
            # 'tap<truncated-port-id>' — TODO confirm for non-tap vif types
            partial_port_id = vnic[0].name.split('tap')[1]
            # Multiple networked guest
            for port in instance_ports:
                if partial_port_id == port[:11]:
                    vnic_dimensions['port_id'] = port
                    break
            for metric in vnic[1]._fields:
                metric_name = "net.{0}".format(metric)
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}

                value = int(vnic[1].__getattribute__(metric))
                # Rates require a prior sample for this vnic
                if vnic[0].name in metric_cache[inst_name][metric_name]:
                    last_update_time = metric_cache[inst_name][metric_name][vnic[0].name]['timestamp']
                    time_diff = sample_time - float(last_update_time)
                    rate_value = self._calculate_rate(value,
                                                      metric_cache[inst_name][metric_name][vnic[0].name]['value'],
                                                      time_diff)
                    if rate_value < 0:
                        # Bad value, save current reading and skip
                        self.log.warn("Ignoring negative network sample for: "
                                      "{0} new value: {1} old value: {2}"
                                      .format(inst_name, value,
                                              metric_cache[inst_name][metric_name][vnic[0].name]['value']))
                        metric_cache[inst_name][metric_name][vnic[0].name] = {
                            'timestamp': sample_time,
                            'value': value}
                        continue
                    rate_name = self._get_metric_rate_name(metric_name)
                    rate_name = self._get_metric_name(rate_name)
                    if self.use_bits:
                        rate_value *= 8
                    # Customer
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, rate_value,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    # Operations (metric name prefixed with "vm."
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), rate_value,
                               dimensions=this_dimensions)
                # Report raw counters.
                mapped_name = self._get_metric_name(metric_name)
                weighted_value = value
                if self.use_bits:
                    weighted_value = value * 8
                # Customer
                this_dimensions = vnic_dimensions.copy()
                this_dimensions.update(dims_customer)
                self.gauge(mapped_name, weighted_value,
                           dimensions=this_dimensions,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                # Operations (metric name prefixed with "vm.")
                this_dimensions = vnic_dimensions.copy()
                this_dimensions.update(dims_operations)
                self.gauge("vm.{0}".format(mapped_name),
                           weighted_value, dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][vnic[0].name] = {
                    'timestamp': sample_time,
                    'value': value}
    def _inspect_cpu(self, insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations):
        """Inspect cpu metrics for an instance

        Always publishes the raw cumulative cpu.time_ns counter; when a
        prior cpu.time sample exists in metric_cache, also publishes used
        cores and (normalized) utilization percentages derived from the
        delta.  A negative delta (e.g. after a counter reset) is logged,
        re-cached, and skipped.
        """

        sample_time = float("{:9f}".format(time.time()))
        cpu_info = insp.inspect_cpus(inst)

        if 'cpu.time' in metric_cache[inst_name]:
            # I have a prior value, so calculate the used_cores & push the metric
            cpu_diff = cpu_info.time - metric_cache[inst_name]['cpu.time']['value']
            time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
            # Convert time_diff to nanoseconds, and calculate percentage
            used_cores = (cpu_diff / (time_diff * 1000000000))
            # Divide by the number of cores to normalize the percentage
            normalized_perc = (used_cores / cpu_info.number) * 100
            if used_cores < 0:
                # Bad value, save current reading and skip
                self.log.warn("Ignoring negative CPU sample for: "
                              "{0} new cpu time: {1} old cpu time: {2}"
                              .format(inst_name, cpu_info.time,
                                      metric_cache[inst_name]['cpu.time']['value']))
                metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                                       'value': cpu_info.time}
                return

            # Customer metrics
            self.gauge('cpu.total_cores', float(cpu_info.number),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge('cpu.used_cores', float(used_cores),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge('cpu.utilization_perc', int(round(used_cores * 100, 0)),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge('cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])

            # Operations metrics (prefixed with "vm.")
            self.gauge('vm.cpu.total_cores', float(cpu_info.number),
                       dimensions=dims_operations)
            self.gauge('vm.cpu.used_cores', float(used_cores),
                       dimensions=dims_operations)
            self.gauge('vm.cpu.utilization_perc', int(round(used_cores * 100, 0)),
                       dimensions=dims_operations)
            self.gauge('vm.cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
                       dimensions=dims_operations)

        cpu_time_name = 'cpu.time_ns'
        # cpu.time_ns for owning tenant
        self.gauge(cpu_time_name, cpu_info.time,
                   dimensions=dims_customer,
                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                   hostname=instance_cache.get(inst_name)['hostname'])
        # vm..cpu.time_ns for operations tenant
        self.gauge("vm.{0}".format(cpu_time_name), cpu_info.time,
                   dimensions=dims_operations)
        metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                               'value': cpu_info.time}
    def _inspect_disks(self, insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations):
        """Inspect disk metrics for an instance

        Publishes per-device raw counters and per-second rates (when a prior
        sample exists in metric_cache), each as both a customer metric and a
        "vm."-prefixed operations metric.  When vm_extended_disks_check_enable
        is configured, also publishes totals aggregated across all devices.
        Negative deltas are logged and skipped, but the new reading is cached.
        """

        metric_aggregate = {}
        for disk in insp.inspect_disks(inst):
            sample_time = time.time()
            disk_dimensions = {'device': disk[0].device}
            for metric in disk[1]._fields:
                metric_name = "io.{0}".format(metric.replace('requests', 'ops'))
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}

                value = int(disk[1].__getattribute__(metric))
                # Accumulate across devices for the _total metrics below
                metric_aggregate[metric_name] = metric_aggregate.get(
                    metric_name, 0) + value
                # Rates require a prior sample for this device
                if disk[0].device in metric_cache[inst_name][metric_name]:
                    cached_val = metric_cache[inst_name][metric_name][disk[
                        0].device]['value']
                    last_update_time = metric_cache[inst_name][metric_name][disk[
                        0].device]['timestamp']
                    time_diff = sample_time - float(last_update_time)
                    rate_value = self._calculate_rate(value, cached_val, time_diff)
                    if rate_value < 0:
                        # Bad value, save current reading and skip
                        self.log.warn("Ignoring negative disk sample for: "
                                      "{0} new value: {1} old value: {2}"
                                      .format(inst_name, value, cached_val))
                        metric_cache[inst_name][metric_name][disk[0].device] = {
                            'timestamp': sample_time,
                            'value': value}
                        continue
                    # Change the metric name to a rate, ie. "io.read_requests"
                    # gets converted to "io.read_ops_sec"
                    rate_name = "{0}_sec".format(metric_name.replace('requests', 'ops'))
                    # Customer
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, rate_value, dimensions=this_dimensions,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    self.gauge(metric_name, value, dimensions=this_dimensions,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])

                    # Operations (metric name prefixed with "vm."
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), rate_value,
                               dimensions=this_dimensions)
                    self.gauge("vm.{0}".format(metric_name), value,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][disk[0].device] = {
                    'timestamp': sample_time,
                    'value': value}

        if self.init_config.get('vm_extended_disks_check_enable'):
            for metric in metric_aggregate:
                sample_time = time.time()
                rate_name = "{0}_total_sec".format(metric)
                if rate_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][rate_name] = {}
                else:
                    last_update_time = metric_cache[inst_name][
                        rate_name]['timestamp']
                    time_diff = sample_time - float(last_update_time)
                    rate_value = self._calculate_rate(metric_aggregate[metric],
                                                      metric_cache[inst_name][rate_name]['value'],
                                                      time_diff)
                    if rate_value < 0:
                        # Bad value, save current reading and skip
                        self.log.warn("Ignoring negative disk sample for: "
                                      "{0} new value: {1} old value: {2}"
                                      .format(inst_name, metric_aggregate[metric],
                                              metric_cache[inst_name][rate_name][
                                                  'value']))
                        metric_cache[inst_name][rate_name] = {
                            'timestamp': sample_time,
                            'value': metric_aggregate[metric]}
                        continue
                    self.gauge(rate_name, rate_value, dimensions=dims_customer,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    self.gauge("vm.{0}".format(rate_name), rate_value,
                               dimensions=dims_operations)
                self.gauge("{0}_total".format(metric), metric_aggregate[metric],
                           dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                self.gauge("vm.{0}_total".format(metric),
                           metric_aggregate[metric],
                           dimensions=dims_operations)
                # Save this metric to the cache
                metric_cache[inst_name][rate_name] = {
                    'timestamp': sample_time,
                    'value': metric_aggregate[metric]}
    def _inspect_disk_info(self, insp, inst, inst_name, instance_cache, metric_cache,
                           dims_customer, dims_operations):
        """Inspect disk capacity/allocation info for an instance.

        Publishes each per-device field from the inspector as both a
        customer metric ("disk.<field>") and a "vm."-prefixed operations
        metric, plus "<name>_total" metrics aggregated across all devices.
        These are point-in-time gauges, so no rate/cache handling is needed.
        """

        metric_aggregate = {}
        for disk in insp.inspect_disk_info(inst):
            disk_dimensions = {'device': disk[0].device}
            for metric in disk[1]._fields:
                metric_name = "disk.{0}".format(metric)
                value = int(disk[1].__getattribute__(metric))
                # Accumulate across devices for the _total metrics below
                metric_aggregate[metric_name] = metric_aggregate.get(
                    metric_name, 0) + value
                this_dimensions = disk_dimensions.copy()
                this_dimensions.update(dims_customer)
                self.gauge(metric_name, value, dimensions=this_dimensions,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                # Operations (metric name prefixed with "vm."
                this_dimensions = disk_dimensions.copy()
                this_dimensions.update(dims_operations)
                self.gauge("vm.{0}".format(metric_name), value,
                           dimensions=this_dimensions)

        for metric in metric_aggregate:
            self.gauge("{0}_total".format(metric), metric_aggregate[metric],
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge("vm.{0}_total".format(metric),
                       metric_aggregate[metric],
                       dimensions=dims_operations)
def _inspect_state(self, insp, inst, inst_name, instance_cache, dims_customer, dims_operations):
|
|
"""Look at the state of the instance, publish a metric using a
|
|
user-friendly description in the 'detail' metadata, and return
|
|
a status code (calibrated to UNIX status codes where 0 is OK)
|
|
so that remaining metrics can be skipped if the VM is not OK
|
|
"""
|
|
inst_state = inst.state()
|
|
dom_status = inst_state[0] - 1
|
|
health_status = 0 if dom_status == 0 else 1 # anything other than 'running' is considered unhealthy
|
|
metatag = None
|
|
|
|
if inst_state[0] in DOM_STATES:
|
|
metatag = {'detail': DOM_STATES[inst_state[0]]}
|
|
# A VM being in SHUTOFF state may have many reasons, we try to be more specific here
|
|
if inst_state[0] == libvirt.VIR_DOMAIN_SHUTOFF:
|
|
if inst_state[1] in DOM_SHUTOFF_STATES:
|
|
metatag = {'detail': DOM_SHUTOFF_STATES[inst_state[1]]}
|
|
|
|
self.gauge('host_alive_status', dom_status, dimensions=dims_customer,
|
|
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
|
hostname=instance_cache.get(inst_name)['hostname'],
|
|
value_meta=metatag)
|
|
self.gauge('vm.host_alive_status', dom_status,
|
|
dimensions=dims_operations,
|
|
value_meta=metatag)
|
|
|
|
self.gauge('health_status', health_status,
|
|
dimensions=dims_customer,
|
|
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
|
hostname=instance_cache.get(inst_name)['hostname'])
|
|
self.gauge('vm.health_status', health_status,
|
|
dimensions=dims_operations)
|
|
|
|
return inst_state[0]
|
|
|
|
def prepare_run(self):
|
|
"""Check if it is time for measurements to be collected"""
|
|
for name, collection in self._collect_intervals.items():
|
|
if collection['period'] <= 0:
|
|
continue
|
|
|
|
time_since_last = datetime.now() - collection['last_collect']
|
|
# Handle times that are really close to the collection period
|
|
period_with_fudge_factor = timedelta(0, collection['period'] - 1,
|
|
500000)
|
|
|
|
if time_since_last < period_with_fudge_factor:
|
|
self.log.debug('Skipping {} collection for {} seconds'.format(
|
|
name,
|
|
(collection['period'] - time_since_last.seconds)))
|
|
collection['skip'] = True
|
|
else:
|
|
collection['skip'] = False
|
|
collection['last_collect'] = datetime.now()
|
|
|
|
def _run_ping(self, dims_customer, dims_operations, inst_name, instance_cache, net):
|
|
"""Create a ping command and hand it off to the Thread Pool"""
|
|
ping_cmd = self.init_config.get('ping_check').replace('NAMESPACE',
|
|
net['namespace']).split()
|
|
ping_cmd.append(net['ip'])
|
|
dims_customer_ip = dims_customer.copy()
|
|
dims_operations_ip = dims_operations.copy()
|
|
dims_customer_ip['ip'] = net['ip']
|
|
dims_operations_ip['ip'] = net['ip']
|
|
with open(os.devnull, "w") as fnull:
|
|
try:
|
|
self.log.debug("Running ping test: {0}".format(' '.join(ping_cmd)))
|
|
res = subprocess.call(ping_cmd,
|
|
stdout=fnull,
|
|
stderr=fnull)
|
|
tenant_id = instance_cache.get(inst_name)['tenant_id']
|
|
hostname = instance_cache.get(inst_name)['hostname']
|
|
return (res, dims_customer_ip, dims_operations_ip, tenant_id,
|
|
hostname)
|
|
|
|
except Exception as e:
|
|
self.log.exception("OS error running '{0}' failed".format(ping_cmd), e)
|
|
raise e
|
|
|
|
def _check_ping_results(self, ping_results):
|
|
"""Iterate through ping results and create measurements"""
|
|
for result in ping_results:
|
|
result.wait()
|
|
# If it wasn't successful, a message was already logged in _run_ping
|
|
if result.successful():
|
|
(res, dims_customer_ip, dims_operations_ip, delegated_tenant,
|
|
hostname) = result.get()
|
|
self.gauge('ping_status', res, dimensions=dims_customer_ip,
|
|
delegated_tenant=delegated_tenant,
|
|
hostname=hostname)
|
|
self.gauge('vm.ping_status', res, dimensions=dims_operations_ip)
|
|
|
|
def check(self, instance):
    """Gather VM metrics for each instance

    Walks every libvirt domain on this hypervisor, correlates it with
    the nova-derived instance cache, and publishes per-VM measurements
    (alive state, CPU, disk, network, memory, ping) as well as per-host
    aggregate gauges and VM counts.

    :param instance: agent instance configuration dict
    """

    time_start = time.time()

    # Load metric cache (raw readings from the previous collector run,
    # used for rate calculations elsewhere)
    metric_cache = self._load_metric_cache()

    # Load the nova-obtained instance data cache
    instance_cache = self._load_instance_cache()

    # Build dimensions for both the customer and for operations
    dims_base = self._set_dimensions({'service': 'compute', 'component': 'vm'}, instance)

    # Initialize aggregate alive status data structure (separate from
    # aggregate gauges because every possible value needs to be counted
    # separately)
    agg_alive_counts = {}
    for code in DOM_ALIVE_NAMES:
        agg_alive_counts[code] = 0

    # Per host total VM count
    vm_count = 0

    # Define aggregate gauges, gauge name to metric name
    agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                  'ram': 'nova.vm.mem.total_allocated_mb',
                  'disk': 'nova.vm.disk.total_allocated_gb'}
    agg_values = {}
    for gauge in agg_gauges.keys():
        agg_values[gauge] = 0

    insp = inspector.get_hypervisor_inspector()
    updated_cache_this_time = False
    ping_results = []
    for inst in insp._get_connection().listAllDomains():
        # Verify that this instance exists in the cache. Add if necessary.
        inst_name = inst.name()
        if inst_name not in instance_cache and not updated_cache_this_time:
            #
            # If we have multiple ghost VMs, we'll needlessly
            # update the instance cache. Let's limit the cache
            # update to once per agent wakeup.
            #
            updated_cache_this_time = True
            instance_cache = self._update_instance_cache()

        # Build customer dimensions
        try:
            dims_customer = dims_base.copy()
            dims_customer['resource_id'] = instance_cache.get(inst_name)['instance_uuid']
            dims_customer['zone'] = instance_cache.get(inst_name)['zone']
            # Add dimensions that would be helpful for operations
            dims_operations = dims_customer.copy()
            dims_operations['tenant_id'] = instance_cache.get(inst_name)['tenant_id']
            dims_operations = self._update_dims_with_metadata(instance_cache, inst_name, dims_operations)
            # Configured customer metadata keys become customer dimensions
            # when the cached instance record has a value for them.
            if self.init_config.get('customer_metadata'):
                for metadata in self.init_config.get('customer_metadata'):
                    metadata_value = (instance_cache.get(inst_name).
                                      get(metadata))
                    if metadata_value:
                        dims_customer[metadata] = metadata_value
            # Remove customer 'hostname' dimension, this will be replaced by the VM name
            del(dims_customer['hostname'])
            #
            # Add this hypervisor's host aggregate as a dimension if
            # configured to do so and we had a match on the regex for
            # this host.
            #
            if self._host_aggregate:
                dims_operations['host_aggregate'] = self._host_aggregate
        except TypeError:
            # Nova can potentially get into a state where it can't see an
            # instance, but libvirt can. This would cause TypeErrors as
            # incomplete data is cached for this instance. Log and skip.
            self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
            continue

        # Accumulate aggregate data (vcpus/ram/disk totals across all VMs)
        for gauge in agg_gauges:
            if gauge in instance_cache.get(inst_name):
                agg_values[gauge] += instance_cache.get(inst_name)[gauge]

        # Skip instances created within the probation period
        vm_probation_remaining = self._test_vm_probation(instance_cache.get(inst_name)['created'])
        if (vm_probation_remaining >= 0):
            self.log.info("Libvirt: {0} in probation for another {1} seconds".format(instance_cache.get(inst_name)['hostname'].encode('utf8'),
                                                                                     vm_probation_remaining))
            continue

        vm_dom_state = self._inspect_state(insp, inst, inst_name,
                                           instance_cache, dims_customer,
                                           dims_operations)

        # Alive-state counting happens for every non-probation VM,
        # even ones that are not running.
        agg_alive_counts[vm_dom_state] += 1
        vm_count += 1

        # Skip further processing on VMs that are not in an active state
        if vm_dom_state != libvirt.VIR_DOMAIN_RUNNING:
            continue

        # Skip the remainder of the checks if alive_only is True in the config
        if self.init_config.get('alive_only'):
            continue

        if inst_name not in metric_cache:
            metric_cache[inst_name] = {}

        # Per-subsystem inspections, each independently switchable via
        # config; disk and vnic checks additionally honor their own
        # collection intervals.
        if self.init_config.get('vm_cpu_check_enable'):
            self._inspect_cpu(insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations)
        if not self._collect_intervals['disk']['skip']:
            if self.init_config.get('vm_disks_check_enable'):
                self._inspect_disks(insp, inst, inst_name, instance_cache, metric_cache, dims_customer,
                                    dims_operations)
            if self.init_config.get('vm_extended_disks_check_enable'):
                self._inspect_disk_info(insp, inst, inst_name, instance_cache, metric_cache, dims_customer,
                                        dims_operations)

        if not self._collect_intervals['vnic']['skip']:
            if self.init_config.get('vm_network_check_enable'):
                self._inspect_network(insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations)

        # Memory utilization
        # (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
        try:
            # memoryStats values are in KiB; divide twice by 1024 for GB.
            mem_stats = inst.memoryStats()
            mem_metrics = {'mem.free_gb': float(mem_stats['unused']) / 1024 / 1024,
                           'mem.swap_used_gb': float(mem_stats['swap_out']) / 1024 / 1024,
                           'mem.total_gb': float(mem_stats['available']) / 1024 / 1024,
                           'mem.used_gb': float(mem_stats['available'] - mem_stats['unused']) / 1024 / 1024,
                           'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100}
            for name in mem_metrics:
                self.gauge(name, mem_metrics[name], dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                self.gauge("vm.{0}".format(name), mem_metrics[name],
                           dimensions=dims_operations)
            # resident is reported in MB by the inspector; publish GB.
            memory_info = insp.inspect_memory_resident(inst)
            self.gauge('vm.mem.resident_gb', float(memory_info.resident) / 1024, dimensions=dims_operations)
        except KeyError:
            # Missing keys in memoryStats() mean the balloon driver is
            # not running in the guest; memory metrics are unavailable.
            self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name,
                                                                                           instance_cache.get(inst_name)['hostname'].encode('utf8')))
        # Test instance's remote responsiveness (ping check) if possible;
        # each ping runs asynchronously in the thread pool and the results
        # are gathered at the end of this check.
        if (self.init_config.get('vm_ping_check_enable')) and self.init_config.get('ping_check') and 'network' in instance_cache.get(inst_name):
            for net in instance_cache.get(inst_name)['network']:
                ping_args = [dims_customer, dims_operations, inst_name, instance_cache, net]
                ping_results.append(self.pool.apply_async(self._run_ping, ping_args))

    # Save these metrics for the next collector invocation
    self._update_metric_cache(metric_cache, math.ceil(time.time() - time_start))

    # Publish aggregate metrics
    for gauge in agg_gauges:
        self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)

    # Publish aggregate VM counts
    self._gauge_agg_alive_counts(agg_alive_counts, vm_count, dims_base)

    # Check results of ping tests
    self._check_ping_results(ping_results)
|
def _calculate_rate(self, current_value, cache_value, time_diff):
|
|
"""Calculate rate based on current, cache value and time_diff."""
|
|
try:
|
|
rate_value = (current_value - cache_value) / time_diff
|
|
except ZeroDivisionError as e:
|
|
self.log.error("Time difference between current time and "
|
|
"last_update time is 0 . {0}".format(e))
|
|
#
|
|
# Being extra safe here, in case we divide by zero
|
|
# just skip this reading with check below.
|
|
#
|
|
rate_value = -1
|
|
return rate_value
|
|
|
|
def _gauge_agg_alive_counts(self, agg_alive_counts, vm_count, dims_base):
    """Publish the per-host VM total plus per-state counts/percentages.

    :param agg_alive_counts: dict of alive-state code -> VM count
    :param vm_count: total number of VMs counted on this host
    :param dims_base: base dimensions for the aggregate gauges
    """
    count_pfx = "nova.vm."
    # One percent of the total, used to turn counts into percentages.
    one_percent = (float(vm_count) / 100)
    self.gauge(count_pfx + 'total_count', vm_count, dimensions=dims_base)

    for state, state_count in agg_alive_counts.items():
        metric_base = count_pfx + DOM_ALIVE_NAMES[state]
        self.gauge(metric_base + "_count",
                   state_count,
                   dimensions=dims_base)
        # Guard against division by zero when no VMs are present.
        perc_value = state_count / one_percent if one_percent != 0 else 0
        self.gauge(metric_base + "_perc", perc_value, dimensions=dims_base)
|
def _update_dims_with_metadata(self, instance_cache, inst_name, dim_operations):
|
|
"""Update operations dimensions with metadata."""
|
|
dims = dim_operations
|
|
if self.init_config.get('metadata'):
|
|
for metadata in self.init_config.get('metadata'):
|
|
if 'vm_name' == metadata:
|
|
metadata_value = (instance_cache.get(inst_name).
|
|
get('hostname'))
|
|
else:
|
|
metadata_value = (instance_cache.get(inst_name).
|
|
get(metadata))
|
|
if metadata_value:
|
|
dims[metadata] = metadata_value
|
|
return dims
|
|
|
|
def _get_this_host_aggregate(self, nova_client):
|
|
"""Determine the host aggregate for this hypervisor."""
|
|
host_agg_cfg_re = self.init_config.get('host_aggregate_re', None)
|
|
if not host_agg_cfg_re:
|
|
return
|
|
|
|
try:
|
|
agg_re = re.compile(host_agg_cfg_re)
|
|
aggs = nova_client.aggregates.list()
|
|
for idx, agg in enumerate(aggs):
|
|
if re.match(agg_re, aggs[idx].name) and self.hostname in aggs[idx].hosts:
|
|
self._host_aggregate = str(aggs[idx].name)
|
|
#
|
|
# Not expecting multiple matches, if we've got a match we're done.
|
|
#
|
|
break
|
|
|
|
except Exception as e:
|
|
msg = "Failed to list host aggregates, won't publish aggregate dimension: '{0}'"
|
|
self.log.error(msg.format(e))
|