#!/bin/env python
# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Monasca Agent interface for libvirt metrics"""

import os
import stat
import subprocess
import time
import yaml

from calendar import timegm
from datetime import datetime
from distutils.version import LooseVersion

from monasca_agent.collector.checks import AgentCheck
from monasca_agent.collector.virt import inspector
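
# A minimal sketch of the plugin configuration this check reads, assuming the
# usual monasca-agent conf.d-style YAML layout.  All values below are
# illustrative placeholders rather than recommended settings; the key names
# match the init_config.get() calls in this file.
#
#     init_config:
#         cache_dir: /dev/shm
#         nova_refresh: 14400          # seconds before the nova cache is rebuilt
#         vm_probation: 300            # seconds to ignore newly created VMs
#         ping_check: ping -c 1 -W 1   # the VM's private IP is appended
#         admin_user: admin
#         admin_password: password
#         admin_tenant_name: admin
#         identity_uri: http://keystone.example.com:35357/v2.0
#         region_name: RegionOne
#     instances:
#         - name: libvirt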


class LibvirtCheck(AgentCheck):
    """Inherit Agent class and gather libvirt metrics"""

    def __init__(self, name, init_config, agent_config):
        AgentCheck.__init__(self, name, init_config, agent_config)
        self.instance_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                    'libvirt_instances.yaml')
        self.metric_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                  'libvirt_metrics.yaml')

    def _test_vm_probation(self, created):
        """Test whether a VM was created within the probation period.

        Convert the ISO-8601 creation timestamp into the number of seconds
        elapsed since creation and compare that against the configured
        vm_probation.  Return the number of seconds this VM will remain in
        probation; a negative result means probation has expired.
        """
        dt = datetime.strptime(created, '%Y-%m-%dT%H:%M:%SZ')
        created_sec = (time.time() - timegm(dt.timetuple()))
        probation_time = self.init_config.get('vm_probation') - created_sec
        return int(probation_time)
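
    # Worked example (illustrative): with vm_probation configured as 300 and a
    # VM created 120 seconds ago, _test_vm_probation() returns roughly 180 and
    # check() keeps skipping the VM; once the return value goes negative,
    # metrics are collected normally.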

    def _update_instance_cache(self):
        """Collect instance_id, project_id, and AZ for all instance UUIDs
        """
        # novaclient module versions were renamed in version 2.22
        try:
            from novaclient.v2 import client
        except ImportError:
            from novaclient.v1_1 import client

        id_cache = {}
        # Get a list of all instances from the Nova API
        nova_client = client.Client(self.init_config.get('admin_user'),
                                    self.init_config.get('admin_password'),
                                    self.init_config.get('admin_tenant_name'),
                                    self.init_config.get('identity_uri'),
                                    service_type="compute",
                                    region_name=self.init_config.get('region_name'))

        instances = nova_client.servers.list(search_opts={'all_tenants': 1})
        for instance in instances:
            inst_name = instance.__getattr__('OS-EXT-SRV-ATTR:instance_name')
            inst_az = instance.__getattr__('OS-EXT-AZ:availability_zone')
            id_cache[inst_name] = {'instance_uuid': instance.id,
                                   'hostname': instance.name,
                                   'zone': inst_az,
                                   'created': instance.created,
                                   'tenant_id': instance.tenant_id,
                                   'vcpus': nova_client.flavors.get(instance.flavor['id']).vcpus,
                                   'ram': nova_client.flavors.get(instance.flavor['id']).ram,
                                   'disk': nova_client.flavors.get(instance.flavor['id']).disk}
            # Try to add private_ip to id_cache[inst_name].  This may fail on ERROR'ed VMs.
            try:
                id_cache[inst_name]['private_ip'] = instance.addresses['private'][0]['addr']
            except KeyError:
                pass

        id_cache['last_update'] = int(time.time())

        # Write the updated cache
        try:
            with open(self.instance_cache_file, 'w') as cache_yaml:
                yaml.safe_dump(id_cache, cache_yaml)
            if stat.S_IMODE(os.stat(self.instance_cache_file).st_mode) != 0o600:
                os.chmod(self.instance_cache_file, 0o600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.instance_cache_file, e))

        return id_cache
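
    # For reference, a single entry written to libvirt_instances.yaml looks
    # roughly like the following (all values are placeholders):
    #
    #     instance-00000042:
    #         created: '2014-10-01T12:00:00Z'
    #         disk: 20
    #         hostname: testvm01
    #         instance_uuid: <nova server UUID>
    #         private_ip: 10.0.0.4
    #         ram: 2048
    #         tenant_id: <project UUID>
    #         vcpus: 2
    #         zone: nova
    #     last_update: 1412164800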

    def _load_instance_cache(self):
        """Load the cache of instance names to IDs.

        If the cache does not yet exist, return an empty one.
        """
        instance_cache = {}
        try:
            with open(self.instance_cache_file, 'r') as cache_yaml:
                instance_cache = yaml.safe_load(cache_yaml)

            # Is it time to force a refresh of this data?
            if self.init_config.get('nova_refresh') is not None:
                time_diff = time.time() - instance_cache['last_update']
                if time_diff > self.init_config.get('nova_refresh'):
                    self._update_instance_cache()
        except IOError:
            # The file may not exist yet, and that's OK.  Build it now.
            instance_cache = self._update_instance_cache()

        return instance_cache

    def _load_metric_cache(self):
        """Load the counter metrics from the previous collection iteration
        """
        metric_cache = {}
        try:
            with open(self.metric_cache_file, 'r') as cache_yaml:
                metric_cache = yaml.safe_load(cache_yaml)
        except IOError:
            # The file may not exist yet.
            pass

        return metric_cache

    def _update_metric_cache(self, metric_cache):
        try:
            with open(self.metric_cache_file, 'w') as cache_yaml:
                yaml.safe_dump(metric_cache, cache_yaml)
            if stat.S_IMODE(os.stat(self.metric_cache_file).st_mode) != 0o600:
                os.chmod(self.metric_cache_file, 0o600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.metric_cache_file, e))

    def check(self, instance):
        """Gather VM metrics for each instance"""

        # Load metric cache
        metric_cache = self._load_metric_cache()

        # Load the nova-obtained instance data cache
        instance_cache = self._load_instance_cache()

        # Build dimensions for both the customer and for operations
        dims_base = self._set_dimensions({'service': 'compute', 'component': 'vm'}, instance)

        # Define aggregate gauges, gauge name to metric name
        agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                      'ram': 'nova.vm.mem.total_allocated_mb',
                      'disk': 'nova.vm.disk.total_allocated_gb'}
        agg_values = {}
        for gauge in agg_gauges.keys():
            agg_values[gauge] = 0

        insp = inspector.get_hypervisor_inspector()
        for inst in insp._get_connection().listAllDomains():
            # Verify that this instance exists in the cache.  Add if necessary.
            inst_name = inst.name()
            if inst_name not in instance_cache:
                instance_cache = self._update_instance_cache()
            if inst_name not in metric_cache:
                metric_cache[inst_name] = {}

            # Build customer dimensions
            try:
                dims_customer = dims_base.copy()
                dims_customer['resource_id'] = instance_cache.get(inst_name)['instance_uuid']
                dims_customer['zone'] = instance_cache.get(inst_name)['zone']
                # Add dimensions that would be helpful for operations
                dims_operations = dims_customer.copy()
                dims_operations['tenant_id'] = instance_cache.get(inst_name)['tenant_id']
                # Remove the customer 'hostname' dimension; it will be replaced by the VM name
                del(dims_customer['hostname'])
            except TypeError:
                # Nova can potentially get into a state where it can't see an
                # instance, but libvirt can.  This would cause TypeErrors as
                # incomplete data is cached for this instance.  Log and skip.
                self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
                continue

            # Skip instances that are inactive
            if inst.isActive() == 0:
                detail = 'Instance is not active'
                self.gauge('host_alive_status', 2, dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'],
                           value_meta={'detail': detail})
                self.gauge('vm.host_alive_status', 2, dimensions=dims_operations,
                           value_meta={'detail': detail})
                continue

            # Skip instances created within the probation period
            vm_probation_remaining = self._test_vm_probation(instance_cache.get(inst_name)['created'])
            if vm_probation_remaining >= 0:
                self.log.info("Libvirt: {0} in probation for another {1} seconds".format(instance_cache.get(inst_name)['hostname'],
                                                                                         vm_probation_remaining))
                continue

            # Test instance's general responsiveness (ping check) if so configured
            if self.init_config.get('ping_check') and 'private_ip' in instance_cache.get(inst_name):
                detail = 'Ping check OK'
                ping_cmd = self.init_config.get('ping_check').split()
                ping_cmd.append(instance_cache.get(inst_name)['private_ip'])
                with open(os.devnull, "w") as fnull:
                    try:
                        res = subprocess.call(ping_cmd,
                                              stdout=fnull,
                                              stderr=fnull)
                        if res > 0:
                            detail = 'Host failed ping check'
                        self.gauge('host_alive_status', res, dimensions=dims_customer,
                                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                                   hostname=instance_cache.get(inst_name)['hostname'],
                                   value_meta={'detail': detail})
                        self.gauge('vm.host_alive_status', res, dimensions=dims_operations,
                                   value_meta={'detail': detail})
                        # Do not attempt to process any more metrics for offline hosts
                        if res > 0:
                            continue
                    except OSError as e:
                        self.log.warn("OS error running '{0}' returned {1}".format(ping_cmd, e))

            # Accumulate aggregate data
            for gauge in agg_gauges:
                if gauge in instance_cache.get(inst_name):
                    agg_values[gauge] += instance_cache.get(inst_name)[gauge]
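
            # The block below derives CPU utilization by dividing the growth in
            # cumulative guest CPU time (nanoseconds) by elapsed wall-clock time.
            # Illustrative numbers: 15e9 ns of CPU time over a 30-second interval
            # gives (15e9 / (30 * 1e9)) * 100 = 50 percent.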
            # CPU utilization percentage
            sample_time = float("{:9f}".format(time.time()))
            if 'cpu.time' in metric_cache[inst_name]:
                # I have a prior value, so calculate the rate & push the metric
                cpu_diff = insp.inspect_cpus(inst).time - metric_cache[inst_name]['cpu.time']['value']
                time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
                # cpu.time is expressed in nanoseconds, so convert the elapsed
                # wall-clock time to nanoseconds and express the result as a percentage
                rate = (cpu_diff / (time_diff * 1000000000)) * 100
                self.gauge('cpu.utilization_perc', int(round(rate, 0)),
                           dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
                           dimensions=dims_operations)
            metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                                   'value': insp.inspect_cpus(inst).time}

            # Disk activity
            for disk in insp.inspect_disks(inst):
                sample_time = time.time()
                disk_dimensions = {'device': disk[0].device}
                for metric in disk[1]._fields:
                    metric_name = "io.{0}".format(metric)
                    if metric_name not in metric_cache[inst_name]:
                        metric_cache[inst_name][metric_name] = {}

                    value = int(disk[1].__getattribute__(metric))
                    if disk[0].device in metric_cache[inst_name][metric_name]:
                        time_diff = sample_time - metric_cache[inst_name][metric_name][disk[0].device]['timestamp']
                        val_diff = value - metric_cache[inst_name][metric_name][disk[0].device]['value']
                        # Normalize the counter delta to a per-second rate over
                        # the elapsed interval
                        rate_value = val_diff / time_diff
                        # Change the metric name to a rate, e.g. "io.read_requests"
                        # becomes "io.read_ops_sec"
                        rate_name = "{0}_sec".format(metric_name.replace('requests', 'ops'))

                        # Customer
                        this_dimensions = disk_dimensions.copy()
                        this_dimensions.update(dims_customer)
                        self.gauge(rate_name, rate_value, dimensions=this_dimensions,
                                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                                   hostname=instance_cache.get(inst_name)['hostname'])

                        # Operations (metric name prefixed with "vm.")
                        this_dimensions = disk_dimensions.copy()
                        this_dimensions.update(dims_operations)
                        self.gauge("vm.{0}".format(rate_name), rate_value,
                                   dimensions=this_dimensions)

                    # Save this metric to the cache
                    metric_cache[inst_name][metric_name][disk[0].device] = {
                        'timestamp': sample_time,
                        'value': value}

            # Disk utilization
            # TODO(dschroeder)

            # Memory utilization
            # TODO(dschroeder)

            # Network activity
            for vnic in insp.inspect_vnics(inst):
                sample_time = time.time()
                vnic_dimensions = {'device': vnic[0].name}
                for metric in vnic[1]._fields:
                    metric_name = "net.{0}".format(metric)
                    if metric_name not in metric_cache[inst_name]:
                        metric_cache[inst_name][metric_name] = {}

                    value = int(vnic[1].__getattribute__(metric))
                    if vnic[0].name in metric_cache[inst_name][metric_name]:
                        time_diff = sample_time - metric_cache[inst_name][metric_name][vnic[0].name]['timestamp']
                        val_diff = value - metric_cache[inst_name][metric_name][vnic[0].name]['value']
                        # Normalize the counter delta to a per-second rate over
                        # the elapsed interval
                        rate_value = val_diff / time_diff
                        # Change the metric name to a rate, e.g. "net.rx_bytes"
                        # becomes "net.rx_bytes_sec"
                        rate_name = "{0}_sec".format(metric_name)
                        # Rename "tx" to "out" and "rx" to "in"
                        rate_name = rate_name.replace("tx", "out")
                        rate_name = rate_name.replace("rx", "in")

                        # Customer
                        this_dimensions = vnic_dimensions.copy()
                        this_dimensions.update(dims_customer)
                        self.gauge(rate_name, rate_value,
                                   dimensions=this_dimensions,
                                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                                   hostname=instance_cache.get(inst_name)['hostname'])

                        # Operations (metric name prefixed with "vm.")
                        this_dimensions = vnic_dimensions.copy()
                        this_dimensions.update(dims_operations)
                        self.gauge("vm.{0}".format(rate_name), rate_value,
                                   dimensions=this_dimensions)

                    # Save this metric to the cache
                    metric_cache[inst_name][metric_name][vnic[0].name] = {
                        'timestamp': sample_time,
                        'value': value}

        # Save these metrics for the next collector invocation
        self._update_metric_cache(metric_cache)

        # Publish aggregate metrics
        for gauge in agg_gauges:
            self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)