From 48e31739c9cd784e796457a5148e4e6746c3e718 Mon Sep 17 00:00:00 2001 From: Johannes Grassler Date: Tue, 21 Nov 2017 16:56:33 +0100 Subject: [PATCH] Added aggregate VM counts to libvirt check This commit adds aggregate metrics to the libvirt check that report the numbers and percentages of VMs by VM state, plus a total of VMs on the host. These aggregate metrics make it easier to visualize VM metrics in Grafana, which is good at aggregating over time but very bad at aggregating over multi dimensional (host, VM) state metrics. Change-Id: I5e553b933bb0ac49ca7fb2e7835ca3cc6aaa7ca3 --- docs/Libvirt.md | 27 +++++++++-- monasca_agent/collector/checks_d/libvirt.py | 53 +++++++++++++++++++-- 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/docs/Libvirt.md b/docs/Libvirt.md index ea19ece0..96a57a33 100644 --- a/docs/Libvirt.md +++ b/docs/Libvirt.md @@ -457,11 +457,28 @@ All metrics include `resource_id` and `zone` (availability zone) dimensions. Be In addition to per-instance metrics, the Libvirt plugin will publish aggregate metrics across all instances. 
-| Name | Description | -| ------------------------------- | -------------------------------------------------- | -| nova.vm.cpu.total_allocated | Total CPUs allocated across all VMs | -| nova.vm.disk.total_allocated_gb | Total Gbytes of disk space allocated to all VMs | -| nova.vm.mem.total_allocated_mb | Total Mbytes of memory allocated to all VMs | +| Name | Description | +| ----------------------------------------- | ----------------------------------------------------------- | +| nova.vm.cpu.total_allocated | Total CPUs allocated across all VMs | +| nova.vm.disk.total_allocated_gb | Total Gbytes of disk space allocated to all VMs | +| nova.vm.mem.total_allocated_mb | Total Mbytes of memory allocated to all VMs | +| nova.vm.total_count | Total number of VMs on host | +| nova.vm.blocked_count | Total number of VMs in state blocked on host | +| nova.vm.blocked_perc | Percentage of VMs in state blocked on host | +| nova.vm.crashed_count | Total number of VMs in state crashed on host | +| nova.vm.crashed_perc | Percentage of VMs in state crashed on host | +| nova.vm.nostate_count | Total number of VMs with no state on host | +| nova.vm.nostate_perc | Percentage of VMs with no state on host | +| nova.vm.paused_count | Total number of VMs in state paused on host | +| nova.vm.paused_perc | Percentage of VMs in state paused on host | +| nova.vm.suspended_count | Total number of VMs in state suspended on host | +| nova.vm.suspended_perc | Percentage of VMs in state suspended on host | +| nova.vm.running_count | Total number of VMs in state running on host | +| nova.vm.running_perc | Percentage of VMs in state running on host | +| nova.vm.shuttingdown_count | Total number of VMs in state shutting down on host | +| nova.vm.shuttingdown_perc | Percentage of VMs in state shutting down on host | +| nova.vm.shutoff_count | Total number of VMs in state shutoff/Nova suspended on host | +| nova.vm.shutoff_perc | Percentage of VMs in state shutoff/Nova suspended on host | Aggregate 
dimensions include hostname and component from the Operations Value column above. diff --git a/monasca_agent/collector/checks_d/libvirt.py b/monasca_agent/collector/checks_d/libvirt.py index 78aea08f..c7acca0f 100644 --- a/monasca_agent/collector/checks_d/libvirt.py +++ b/monasca_agent/collector/checks_d/libvirt.py @@ -48,6 +48,16 @@ DOM_STATES = {libvirt.VIR_DOMAIN_BLOCKED: 'VM is blocked', libvirt.VIR_DOMAIN_PMSUSPENDED: 'VM is in power management (s3) suspend', libvirt.VIR_DOMAIN_SHUTDOWN: 'VM is shutting down', libvirt.VIR_DOMAIN_SHUTOFF: 'VM has been shut off (other reason)'} + +DOM_ALIVE_NAMES = {libvirt.VIR_DOMAIN_BLOCKED: 'blocked', + libvirt.VIR_DOMAIN_CRASHED: 'crashed', + libvirt.VIR_DOMAIN_NONE: 'nostate', + libvirt.VIR_DOMAIN_PAUSED: 'paused', + libvirt.VIR_DOMAIN_PMSUSPENDED: 'suspended', + libvirt.VIR_DOMAIN_RUNNING: 'running', + libvirt.VIR_DOMAIN_SHUTDOWN: 'shuttingdown', + libvirt.VIR_DOMAIN_SHUTOFF: 'shutoff'} # shut off/nova suspend + DOM_SHUTOFF_STATES = {libvirt.VIR_DOMAIN_SHUTOFF_UNKNOWN: 'VM has been shutoff (reason unknown)', libvirt.VIR_DOMAIN_SHUTOFF_SHUTDOWN: 'VM has been shut down', libvirt.VIR_DOMAIN_SHUTOFF_DESTROYED: 'VM has been destroyed (forced off)', @@ -618,7 +628,7 @@ class LibvirtCheck(AgentCheck): self.gauge('vm.health_status', health_status, dimensions=dims_operations) - return dom_status + return inst_state[0] def prepare_run(self): """Check if it is time for measurements to be collected""" @@ -691,6 +701,16 @@ class LibvirtCheck(AgentCheck): # Build dimensions for both the customer and for operations dims_base = self._set_dimensions({'service': 'compute', 'component': 'vm'}, instance) + # Initialize aggregate alive status data structure (separate from + # aggregate gauges because every possible value needs to be counted + # separately) + agg_alive_counts = {} + for code in DOM_ALIVE_NAMES: + agg_alive_counts[code] = 0 + + # Per host total VM count + vm_count = 0 + # Define aggregate gauges, gauge name to metric name 
agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated', 'ram': 'nova.vm.mem.total_allocated_mb', @@ -757,9 +777,15 @@ class LibvirtCheck(AgentCheck): vm_probation_remaining)) continue + vm_dom_state = self._inspect_state(insp, inst, inst_name, + instance_cache, dims_customer, + dims_operations) + + agg_alive_counts[vm_dom_state] += 1 + vm_count += 1 + # Skip further processing on VMs that are not in an active state - if self._inspect_state(insp, inst, inst_name, instance_cache, - dims_customer, dims_operations) != 0: + if vm_dom_state != libvirt.VIR_DOMAIN_RUNNING: continue # Skip the remainder of the checks if alive_only is True in the config @@ -816,6 +842,10 @@ class LibvirtCheck(AgentCheck): for gauge in agg_gauges: self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base) + # Publish aggregate VM counts + + self._gauge_agg_alive_counts(agg_alive_counts, vm_count, dims_base) + # Check results of ping tests self._check_ping_results(ping_results) @@ -833,6 +863,23 @@ class LibvirtCheck(AgentCheck): rate_value = -1 return rate_value + def _gauge_agg_alive_counts(self, agg_alive_counts, vm_count, dims_base): + count_pfx = "nova.vm." + total_frac = (float(vm_count) / 100) + self.gauge(count_pfx + 'total_count', vm_count, dimensions=dims_base) + + for agg in agg_alive_counts: + self.gauge(count_pfx + DOM_ALIVE_NAMES[agg] + "_count", + agg_alive_counts[agg], + dimensions=dims_base) + if total_frac != 0: + self.gauge(count_pfx + DOM_ALIVE_NAMES[agg] + "_perc", + agg_alive_counts[agg] / total_frac, + dimensions=dims_base) + else: + self.gauge(count_pfx + DOM_ALIVE_NAMES[agg] + "_perc", + 0, dimensions=dims_base) + def _update_dims_with_metadata(self, instance_cache, inst_name, dim_operations): """Update operations dimensions with metadata.""" dims = dim_operations