Added aggregate VM counts to libvirt check

This commit adds aggregate metrics to the libvirt check that
report the numbers and percentages of VMs by VM state, plus a
total of VMs on the host. These aggregate metrics make it
easier to visualize VM metrics in Grafana, which is good at
aggregating over time but very bad at aggregating over
multi-dimensional (host, VM) state metrics.

Change-Id: I5e553b933bb0ac49ca7fb2e7835ca3cc6aaa7ca3
This commit is contained in:
Johannes Grassler 2017-11-21 16:56:33 +01:00
parent 80e2d8d0a2
commit 48e31739c9
2 changed files with 72 additions and 8 deletions

View File

@ -457,11 +457,28 @@ All metrics include `resource_id` and `zone` (availability zone) dimensions. Be
In addition to per-instance metrics, the Libvirt plugin will publish aggregate metrics across all instances.
| Name | Description |
| ------------------------------- | -------------------------------------------------- |
| nova.vm.cpu.total_allocated | Total CPUs allocated across all VMs |
| nova.vm.disk.total_allocated_gb | Total Gbytes of disk space allocated to all VMs |
| nova.vm.mem.total_allocated_mb | Total Mbytes of memory allocated to all VMs |
| Name | Description |
| ----------------------------------------- | ----------------------------------------------------------- |
| nova.vm.cpu.total_allocated | Total CPUs allocated across all VMs |
| nova.vm.disk.total_allocated_gb | Total Gbytes of disk space allocated to all VMs |
| nova.vm.mem.total_allocated_mb | Total Mbytes of memory allocated to all VMs |
| nova.vm.total_count | Total number of VMs on host |
| nova.vm.blocked_count | Total number of VMs in state blocked on host |
| nova.vm.blocked_perc | Percentage of VMs in state blocked on host |
| nova.vm.crashed_count | Total number of VMs in state crashed on host |
| nova.vm.crashed_perc | Percentage of VMs in state crashed on host |
| nova.vm.nostate_count | Total number of VMs with no state on host |
| nova.vm.nostate_perc | Percentage of VMs with no state on host |
| nova.vm.paused_count | Total number of VMs in state paused on host |
| nova.vm.paused_perc | Percentage of VMs in state paused on host |
| nova.vm.suspended_count | Total number of VMs in state suspended on host |
| nova.vm.suspended_perc | Percentage of VMs in state suspended on host |
| nova.vm.running_count | Total number of VMs in state running on host |
| nova.vm.running_perc | Percentage of VMs in state running on host |
| nova.vm.shuttingdown_count | Total number of VMs in state shutting down on host |
| nova.vm.shuttingdown_perc | Percentage of VMs in state shutting down on host |
| nova.vm.shutoff_count | Total number of VMs in state shutoff/Nova suspended on host |
| nova.vm.shutoff_perc | Percentage of VMs in state shutoff/Nova suspended on host |
Aggregate dimensions include hostname and component from the Operations Value column above.

View File

@ -48,6 +48,16 @@ DOM_STATES = {libvirt.VIR_DOMAIN_BLOCKED: 'VM is blocked',
libvirt.VIR_DOMAIN_PMSUSPENDED: 'VM is in power management (s3) suspend',
libvirt.VIR_DOMAIN_SHUTDOWN: 'VM is shutting down',
libvirt.VIR_DOMAIN_SHUTOFF: 'VM has been shut off (other reason)'}
# Short per-state name fragments, keyed by libvirt domain state constant.
# _gauge_agg_alive_counts() splices each fragment into the aggregate metric
# names "nova.vm.<fragment>_count" and "nova.vm.<fragment>_perc".
DOM_ALIVE_NAMES = {libvirt.VIR_DOMAIN_BLOCKED: 'blocked',
libvirt.VIR_DOMAIN_CRASHED: 'crashed',
libvirt.VIR_DOMAIN_NONE: 'nostate',
libvirt.VIR_DOMAIN_PAUSED: 'paused',
libvirt.VIR_DOMAIN_PMSUSPENDED: 'suspended',
libvirt.VIR_DOMAIN_RUNNING: 'running',
libvirt.VIR_DOMAIN_SHUTDOWN: 'shuttingdown',
libvirt.VIR_DOMAIN_SHUTOFF: 'shutoff'} # shut off/nova suspend
DOM_SHUTOFF_STATES = {libvirt.VIR_DOMAIN_SHUTOFF_UNKNOWN: 'VM has been shutoff (reason unknown)',
libvirt.VIR_DOMAIN_SHUTOFF_SHUTDOWN: 'VM has been shut down',
libvirt.VIR_DOMAIN_SHUTOFF_DESTROYED: 'VM has been destroyed (forced off)',
@ -618,7 +628,7 @@ class LibvirtCheck(AgentCheck):
self.gauge('vm.health_status', health_status,
dimensions=dims_operations)
return dom_status
return inst_state[0]
def prepare_run(self):
"""Check if it is time for measurements to be collected"""
@ -691,6 +701,16 @@ class LibvirtCheck(AgentCheck):
# Build dimensions for both the customer and for operations
dims_base = self._set_dimensions({'service': 'compute', 'component': 'vm'}, instance)
# Initialize aggregate alive status data structure (separate from
# aggregate gauges because every possible value needs to be counted
# separately)
agg_alive_counts = {}
for code in DOM_ALIVE_NAMES:
agg_alive_counts[code] = 0
# Per host total VM count
vm_count = 0
# Define aggregate gauges, gauge name to metric name
agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
'ram': 'nova.vm.mem.total_allocated_mb',
@ -757,9 +777,15 @@ class LibvirtCheck(AgentCheck):
vm_probation_remaining))
continue
vm_dom_state = self._inspect_state(insp, inst, inst_name,
instance_cache, dims_customer,
dims_operations)
agg_alive_counts[vm_dom_state] += 1
vm_count += 1
# Skip further processing on VMs that are not in an active state
if self._inspect_state(insp, inst, inst_name, instance_cache,
dims_customer, dims_operations) != 0:
if vm_dom_state != libvirt.VIR_DOMAIN_RUNNING:
continue
# Skip the remainder of the checks if alive_only is True in the config
@ -816,6 +842,10 @@ class LibvirtCheck(AgentCheck):
for gauge in agg_gauges:
self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)
# Publish aggregate VM counts
self._gauge_agg_alive_counts(agg_alive_counts, vm_count, dims_base)
# Check results of ping tests
self._check_ping_results(ping_results)
@ -833,6 +863,23 @@ class LibvirtCheck(AgentCheck):
rate_value = -1
return rate_value
def _gauge_agg_alive_counts(self, agg_alive_counts, vm_count, dims_base):
    """Publish the host-wide VM total plus per-state counts and percentages.

    :param agg_alive_counts: mapping of domain state code (a key of
        DOM_ALIVE_NAMES) to the number of VMs counted in that state
    :param vm_count: total number of VMs counted on this host
    :param dims_base: dimensions attached to every gauge emitted here

    For each state code, a "nova.vm.<name>_count" gauge and a
    "nova.vm.<name>_perc" gauge are emitted, after a single
    "nova.vm.total_count" gauge.
    """
    prefix = "nova.vm."
    # One percent of the VM total; dividing a state count by this yields
    # that state's percentage. It is zero when no VMs were counted.
    one_percent = float(vm_count) / 100

    self.gauge(prefix + 'total_count', vm_count, dimensions=dims_base)

    for state_code, state_count in agg_alive_counts.items():
        metric_stem = prefix + DOM_ALIVE_NAMES[state_code]
        self.gauge(metric_stem + "_count",
                   state_count,
                   dimensions=dims_base)
        # With no VMs on the host, report 0 rather than dividing by zero.
        percentage = state_count / one_percent if one_percent != 0 else 0
        self.gauge(metric_stem + "_perc",
                   percentage,
                   dimensions=dims_base)
def _update_dims_with_metadata(self, instance_cache, inst_name, dim_operations):
"""Update operations dimensions with metadata."""
dims = dim_operations