#!/bin/env python
# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Monasca Agent interface for libvirt metrics"""

import os
import stat
import subprocess
import time
import yaml

from calendar import timegm
from datetime import datetime
from distutils.version import LooseVersion

from monasca_agent.collector.checks import AgentCheck
from monasca_agent.collector.virt import inspector
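
# A minimal sketch of the plugin configuration this check reads, assuming the
# usual monasca-agent conf.d-style YAML layout.  All values below are
# illustrative placeholders rather than recommended settings; the key names
# match the init_config.get() calls in this file.
#
#     init_config:
#         cache_dir: /dev/shm
#         nova_refresh: 14400          # seconds before the nova cache is rebuilt
#         vm_probation: 300            # seconds to ignore newly created VMs
#         ping_check: ping -c 1 -W 1   # the VM's private IP is appended
#         admin_user: admin
#         admin_password: password
#         admin_tenant_name: admin
#         identity_uri: http://keystone.example.com:35357/v2.0
#         region_name: RegionOne
#     instances:
#         - name: libvirt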


class LibvirtCheck(AgentCheck):
    """Inherit Agent class and gather libvirt metrics"""

    def __init__(self, name, init_config, agent_config):
        AgentCheck.__init__(self, name, init_config, agent_config)
        self.instance_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                    'libvirt_instances.yaml')
        self.metric_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                  'libvirt_metrics.yaml')

    def _test_vm_probation(self, created):
        """Test whether a VM was created within the probation period.

        Convert the ISO-8601 creation timestamp into the number of seconds
        elapsed since creation and compare that against the configured
        vm_probation.  Return the number of seconds this VM will remain in
        probation; a negative result means probation has expired.
        """
        dt = datetime.strptime(created, '%Y-%m-%dT%H:%M:%SZ')
        created_sec = (time.time() - timegm(dt.timetuple()))
        probation_time = self.init_config.get('vm_probation') - created_sec
        return int(probation_time)
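
    # Worked example (illustrative): with vm_probation configured as 300 and a
    # VM created 120 seconds ago, _test_vm_probation() returns roughly 180 and
    # check() keeps skipping the VM; once the return value goes negative,
    # metrics are collected normally.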

    def _update_instance_cache(self):
        """Collect instance_id, project_id, and AZ for all instance UUIDs
        """
        # novaclient module versions were renamed in version 2.22
        try:
            from novaclient.v2 import client
        except ImportError:
            from novaclient.v1_1 import client

        id_cache = {}
        # Get a list of all instances from the Nova API
        nova_client = client.Client(self.init_config.get('admin_user'),
                                    self.init_config.get('admin_password'),
                                    self.init_config.get('admin_tenant_name'),
                                    self.init_config.get('identity_uri'),
                                    service_type="compute",
                                    region_name=self.init_config.get('region_name'))

        instances = nova_client.servers.list(search_opts={'all_tenants': 1})
        for instance in instances:
            inst_name = instance.__getattr__('OS-EXT-SRV-ATTR:instance_name')
            inst_az = instance.__getattr__('OS-EXT-AZ:availability_zone')
            id_cache[inst_name] = {'instance_uuid': instance.id,
                                   'hostname': instance.name,
                                   'zone': inst_az,
                                   'created': instance.created,
                                   'tenant_id': instance.tenant_id,
                                   'vcpus': nova_client.flavors.get(instance.flavor['id']).vcpus,
                                   'ram': nova_client.flavors.get(instance.flavor['id']).ram,
                                   'disk': nova_client.flavors.get(instance.flavor['id']).disk}
            # Try to add private_ip to id_cache[inst_name].  This may fail on ERROR'ed VMs.
            try:
                id_cache[inst_name]['private_ip'] = instance.addresses['private'][0]['addr']
            except KeyError:
                pass

        id_cache['last_update'] = int(time.time())

        # Write the updated cache
        try:
            with open(self.instance_cache_file, 'w') as cache_yaml:
                yaml.safe_dump(id_cache, cache_yaml)
            if stat.S_IMODE(os.stat(self.instance_cache_file).st_mode) != 0o600:
                os.chmod(self.instance_cache_file, 0o600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.instance_cache_file, e))

        return id_cache
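
    # For reference, a single entry written to libvirt_instances.yaml looks
    # roughly like the following (all values are placeholders):
    #
    #     instance-00000042:
    #         created: '2014-10-01T12:00:00Z'
    #         disk: 20
    #         hostname: testvm01
    #         instance_uuid: <nova server UUID>
    #         private_ip: 10.0.0.4
    #         ram: 2048
    #         tenant_id: <project UUID>
    #         vcpus: 2
    #         zone: nova
    #     last_update: 1412164800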

    def _load_instance_cache(self):
        """Load the cache of instance names to IDs.

        If the cache does not yet exist, return an empty one.
        """
        instance_cache = {}
        try:
            with open(self.instance_cache_file, 'r') as cache_yaml:
                instance_cache = yaml.safe_load(cache_yaml)

            # Is it time to force a refresh of this data?
            if self.init_config.get('nova_refresh') is not None:
                time_diff = time.time() - instance_cache['last_update']
                if time_diff > self.init_config.get('nova_refresh'):
                    self._update_instance_cache()
        except IOError:
            # The file may not exist yet, and that's OK.  Build it now.
            instance_cache = self._update_instance_cache()

        return instance_cache

    def _load_metric_cache(self):
        """Load the counter metrics from the previous collection iteration
        """
        metric_cache = {}
        try:
            with open(self.metric_cache_file, 'r') as cache_yaml:
                metric_cache = yaml.safe_load(cache_yaml)
        except IOError:
            # The file may not exist yet.
            pass

        return metric_cache

    def _update_metric_cache(self, metric_cache):
        try:
            with open(self.metric_cache_file, 'w') as cache_yaml:
                yaml.safe_dump(metric_cache, cache_yaml)
            if stat.S_IMODE(os.stat(self.metric_cache_file).st_mode) != 0o600:
                os.chmod(self.metric_cache_file, 0o600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.metric_cache_file, e))

    def check(self, instance):
        """Gather VM metrics for each instance"""

        # Load metric cache
        metric_cache = self._load_metric_cache()

        # Load the nova-obtained instance data cache
        instance_cache = self._load_instance_cache()

        # Build dimensions for both the customer and for operations
        dims_base = self._set_dimensions({'service': 'compute', 'component': 'vm'}, instance)

        # Define aggregate gauges, gauge name to metric name
        agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                      'ram': 'nova.vm.mem.total_allocated_mb',
                      'disk': 'nova.vm.disk.total_allocated_gb'}
        agg_values = {}
        for gauge in agg_gauges.keys():
            agg_values[gauge] = 0

        insp = inspector.get_hypervisor_inspector()
        for inst in insp._get_connection().listAllDomains():
            # Verify that this instance exists in the cache.  Add if necessary.
            inst_name = inst.name()
            if inst_name not in instance_cache:
                instance_cache = self._update_instance_cache()
            if inst_name not in metric_cache:
                metric_cache[inst_name] = {}

            # Build customer dimensions
            try:
                dims_customer = dims_base.copy()
                dims_customer['resource_id'] = instance_cache.get(inst_name)['instance_uuid']
                dims_customer['zone'] = instance_cache.get(inst_name)['zone']
                # Add dimensions that would be helpful for operations
                dims_operations = dims_customer.copy()
                dims_operations['tenant_id'] = instance_cache.get(inst_name)['tenant_id']
                # Remove the customer 'hostname' dimension; it will be replaced by the VM name
                del(dims_customer['hostname'])
            except TypeError:
                # Nova can potentially get into a state where it can't see an
                # instance, but libvirt can.  This would cause TypeErrors as
                # incomplete data is cached for this instance.  Log and skip.
                self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
                continue

            # Skip instances that are inactive
            if inst.isActive() == 0:
                detail = 'Instance is not active'
                self.gauge('host_alive_status', 2, dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'],
                           value_meta={'detail': detail})
                self.gauge('vm.host_alive_status', 2, dimensions=dims_operations,
                           value_meta={'detail': detail})
                continue

            # Skip instances created within the probation period
            vm_probation_remaining = self._test_vm_probation(instance_cache.get(inst_name)['created'])
            if vm_probation_remaining >= 0:
                self.log.info("Libvirt: {0} in probation for another {1} seconds".format(instance_cache.get(inst_name)['hostname'],
                                                                                         vm_probation_remaining))
                continue

            # Test instance's general responsiveness (ping check) if so configured
            if self.init_config.get('ping_check') and 'private_ip' in instance_cache.get(inst_name):
                detail = 'Ping check OK'
                ping_cmd = self.init_config.get('ping_check').split()
                ping_cmd.append(instance_cache.get(inst_name)['private_ip'])
                with open(os.devnull, "w") as fnull:
                    try:
                        res = subprocess.call(ping_cmd,
                                              stdout=fnull,
                                              stderr=fnull)
                        if res > 0:
                            detail = 'Host failed ping check'
                        self.gauge('host_alive_status', res, dimensions=dims_customer,
                                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                                   hostname=instance_cache.get(inst_name)['hostname'],
                                   value_meta={'detail': detail})
                        self.gauge('vm.host_alive_status', res, dimensions=dims_operations,
                                   value_meta={'detail': detail})
                        # Do not attempt to process any more metrics for offline hosts
                        if res > 0:
                            continue
                    except OSError as e:
                        self.log.warn("OS error running '{0}' returned {1}".format(ping_cmd, e))

            # Accumulate aggregate data
            for gauge in agg_gauges:
                if gauge in instance_cache.get(inst_name):
                    agg_values[gauge] += instance_cache.get(inst_name)[gauge]
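
            # The block below derives CPU utilization by dividing the growth in
            # cumulative guest CPU time (nanoseconds) by elapsed wall-clock time.
            # Illustrative numbers: 15e9 ns of CPU time over a 30-second interval
            # gives (15e9 / (30 * 1e9)) * 100 = 50 percent.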
            # CPU utilization percentage
            sample_time = float("{:9f}".format(time.time()))
            if 'cpu.time' in metric_cache[inst_name]:
                # I have a prior value, so calculate the rate & push the metric
                cpu_diff = insp.inspect_cpus(inst).time - metric_cache[inst_name]['cpu.time']['value']
                time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
                # cpu.time is expressed in nanoseconds, so convert the elapsed
                # wall-clock time to nanoseconds and express the result as a percentage
                rate = (cpu_diff / (time_diff * 1000000000)) * 100
                self.gauge('cpu.utilization_perc', int(round(rate, 0)),
                           dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
                           dimensions=dims_operations)
            metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                                   'value': insp.inspect_cpus(inst).time}

            # Disk activity
            for disk in insp.inspect_disks(inst):
                sample_time = time.time()
                disk_dimensions = {'device': disk[0].device}
                for metric in disk[1]._fields:
                    metric_name = "io.{0}".format(metric)
                    if metric_name not in metric_cache[inst_name]:
                        metric_cache[inst_name][metric_name] = {}

                    value = int(disk[1].__getattribute__(metric))
                    if disk[0].device in metric_cache[inst_name][metric_name]:
                        time_diff = sample_time - metric_cache[inst_name][metric_name][disk[0].device]['timestamp']
                        val_diff = value - metric_cache[inst_name][metric_name][disk[0].device]['value']
                        # Normalize the counter delta to a per-second rate over
                        # the elapsed interval
                        rate_value = val_diff / time_diff
                        # Change the metric name to a rate, e.g. "io.read_requests"
                        # becomes "io.read_ops_sec"
                        rate_name = "{0}_sec".format(metric_name.replace('requests', 'ops'))

                        # Customer
                        this_dimensions = disk_dimensions.copy()
                        this_dimensions.update(dims_customer)
                        self.gauge(rate_name, rate_value, dimensions=this_dimensions,
                                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                                   hostname=instance_cache.get(inst_name)['hostname'])

                        # Operations (metric name prefixed with "vm.")
                        this_dimensions = disk_dimensions.copy()
                        this_dimensions.update(dims_operations)
                        self.gauge("vm.{0}".format(rate_name), rate_value,
                                   dimensions=this_dimensions)

                    # Save this metric to the cache
                    metric_cache[inst_name][metric_name][disk[0].device] = {
                        'timestamp': sample_time,
                        'value': value}

            # Disk utilization
            # TODO(dschroeder)

            # Memory utilization
            # TODO(dschroeder)

            # Network activity
            for vnic in insp.inspect_vnics(inst):
                sample_time = time.time()
                vnic_dimensions = {'device': vnic[0].name}
                for metric in vnic[1]._fields:
                    metric_name = "net.{0}".format(metric)
                    if metric_name not in metric_cache[inst_name]:
                        metric_cache[inst_name][metric_name] = {}

                    value = int(vnic[1].__getattribute__(metric))
                    if vnic[0].name in metric_cache[inst_name][metric_name]:
                        time_diff = sample_time - metric_cache[inst_name][metric_name][vnic[0].name]['timestamp']
                        val_diff = value - metric_cache[inst_name][metric_name][vnic[0].name]['value']
                        # Normalize the counter delta to a per-second rate over
                        # the elapsed interval
                        rate_value = val_diff / time_diff
                        # Change the metric name to a rate, e.g. "net.rx_bytes"
                        # becomes "net.rx_bytes_sec"
                        rate_name = "{0}_sec".format(metric_name)
                        # Rename "tx" to "out" and "rx" to "in"
                        rate_name = rate_name.replace("tx", "out")
                        rate_name = rate_name.replace("rx", "in")

                        # Customer
                        this_dimensions = vnic_dimensions.copy()
                        this_dimensions.update(dims_customer)
                        self.gauge(rate_name, rate_value,
                                   dimensions=this_dimensions,
                                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                                   hostname=instance_cache.get(inst_name)['hostname'])

                        # Operations (metric name prefixed with "vm.")
                        this_dimensions = vnic_dimensions.copy()
                        this_dimensions.update(dims_operations)
                        self.gauge("vm.{0}".format(rate_name), rate_value,
                                   dimensions=this_dimensions)

                    # Save this metric to the cache
                    metric_cache[inst_name][metric_name][vnic[0].name] = {
                        'timestamp': sample_time,
                        'value': value}

        # Save these metrics for the next collector invocation
        self._update_metric_cache(metric_cache)

        # Publish aggregate metrics
        for gauge in agg_gauges:
            self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)