Added aggregate VM counts to libvirt check
This commit adds aggregate metrics to the libvirt check that report the numbers and percentages of VMs by VM state, plus a total of VMs on the host. These aggregate metrics make it easier to visualize VM metrics in Grafana, which is good at aggregating over time but very bad at aggregating over multidimensional (host, VM) state metrics. Change-Id: I5e553b933bb0ac49ca7fb2e7835ca3cc6aaa7ca3
This commit is contained in:
parent
80e2d8d0a2
commit
48e31739c9
|
@ -457,11 +457,28 @@ All metrics include `resource_id` and `zone` (availability zone) dimensions. Be
|
|||
|
||||
In addition to per-instance metrics, the Libvirt plugin will publish aggregate metrics across all instances.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------- | -------------------------------------------------- |
|
||||
| nova.vm.cpu.total_allocated | Total CPUs allocated across all VMs |
|
||||
| nova.vm.disk.total_allocated_gb | Total Gbytes of disk space allocated to all VMs |
|
||||
| nova.vm.mem.total_allocated_mb | Total Mbytes of memory allocated to all VMs |
|
||||
| Name | Description |
|
||||
| ----------------------------------------- | ----------------------------------------------------------- |
|
||||
| nova.vm.cpu.total_allocated | Total CPUs allocated across all VMs |
|
||||
| nova.vm.disk.total_allocated_gb | Total Gbytes of disk space allocated to all VMs |
|
||||
| nova.vm.mem.total_allocated_mb | Total Mbytes of memory allocated to all VMs |
|
||||
| nova.vm.total_count | Total number of VMs on host |
|
||||
| nova.vm.blocked_count | Total number of VMs in state blocked on host |
|
||||
| nova.vm.blocked_perc | Percentage of VMs in state blocked on host |
|
||||
| nova.vm.crashed_count | Total number of VMs in state crashed on host |
|
||||
| nova.vm.crashed_perc | Percentage of VMs in state crashed on host |
|
||||
| nova.vm.nostate_count | Total number of VMs with no state on host |
|
||||
| nova.vm.nostate_perc | Percentage of VMs with no state on host |
|
||||
| nova.vm.paused_count | Total number of VMs in state paused on host |
|
||||
| nova.vm.paused_perc | Percentage of VMs in state paused on host |
|
||||
| nova.vm.suspended_count | Total number of VMs in state suspended on host |
|
||||
| nova.vm.suspended_perc | Percentage of VMs in state suspended on host |
|
||||
| nova.vm.running_count | Total number of VMs in state running on host |
|
||||
| nova.vm.running_perc | Percentage of VMs in state running on host |
|
||||
| nova.vm.shuttingdown_count | Total number of VMs in state shutting down on host |
|
||||
| nova.vm.shuttingdown_perc | Percentage of VMs in state shutting down on host |
|
||||
| nova.vm.shutoff_count | Total number of VMs in state shutoff/Nova suspended on host |
|
||||
| nova.vm.shutoff_perc | Percentage of VMs in state shutoff/Nova suspended on host |
|
||||
|
||||
Aggregate dimensions include hostname and component from the Operations Value column above.
|
||||
|
||||
|
|
|
@ -48,6 +48,16 @@ DOM_STATES = {libvirt.VIR_DOMAIN_BLOCKED: 'VM is blocked',
|
|||
libvirt.VIR_DOMAIN_PMSUSPENDED: 'VM is in power management (s3) suspend',
|
||||
libvirt.VIR_DOMAIN_SHUTDOWN: 'VM is shutting down',
|
||||
libvirt.VIR_DOMAIN_SHUTOFF: 'VM has been shut off (other reason)'}
|
||||
|
||||
DOM_ALIVE_NAMES = {libvirt.VIR_DOMAIN_BLOCKED: 'blocked',
|
||||
libvirt.VIR_DOMAIN_CRASHED: 'crashed',
|
||||
libvirt.VIR_DOMAIN_NONE: 'nostate',
|
||||
libvirt.VIR_DOMAIN_PAUSED: 'paused',
|
||||
libvirt.VIR_DOMAIN_PMSUSPENDED: 'suspended',
|
||||
libvirt.VIR_DOMAIN_RUNNING: 'running',
|
||||
libvirt.VIR_DOMAIN_SHUTDOWN: 'shuttingdown',
|
||||
libvirt.VIR_DOMAIN_SHUTOFF: 'shutoff'} # shut off/nova suspend
|
||||
|
||||
DOM_SHUTOFF_STATES = {libvirt.VIR_DOMAIN_SHUTOFF_UNKNOWN: 'VM has been shutoff (reason unknown)',
|
||||
libvirt.VIR_DOMAIN_SHUTOFF_SHUTDOWN: 'VM has been shut down',
|
||||
libvirt.VIR_DOMAIN_SHUTOFF_DESTROYED: 'VM has been destroyed (forced off)',
|
||||
|
@ -618,7 +628,7 @@ class LibvirtCheck(AgentCheck):
|
|||
self.gauge('vm.health_status', health_status,
|
||||
dimensions=dims_operations)
|
||||
|
||||
return dom_status
|
||||
return inst_state[0]
|
||||
|
||||
def prepare_run(self):
|
||||
"""Check if it is time for measurements to be collected"""
|
||||
|
@ -691,6 +701,16 @@ class LibvirtCheck(AgentCheck):
|
|||
# Build dimensions for both the customer and for operations
|
||||
dims_base = self._set_dimensions({'service': 'compute', 'component': 'vm'}, instance)
|
||||
|
||||
# Initialize aggregate alive status data structure (separate from
|
||||
# aggregate gauges because every possible value needs to be counted
|
||||
# separately)
|
||||
agg_alive_counts = {}
|
||||
for code in DOM_ALIVE_NAMES:
|
||||
agg_alive_counts[code] = 0
|
||||
|
||||
# Per host total VM count
|
||||
vm_count = 0
|
||||
|
||||
# Define aggregate gauges, gauge name to metric name
|
||||
agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
|
||||
'ram': 'nova.vm.mem.total_allocated_mb',
|
||||
|
@ -757,9 +777,15 @@ class LibvirtCheck(AgentCheck):
|
|||
vm_probation_remaining))
|
||||
continue
|
||||
|
||||
vm_dom_state = self._inspect_state(insp, inst, inst_name,
|
||||
instance_cache, dims_customer,
|
||||
dims_operations)
|
||||
|
||||
agg_alive_counts[vm_dom_state] += 1
|
||||
vm_count += 1
|
||||
|
||||
# Skip further processing on VMs that are not in an active state
|
||||
if self._inspect_state(insp, inst, inst_name, instance_cache,
|
||||
dims_customer, dims_operations) != 0:
|
||||
if vm_dom_state != libvirt.VIR_DOMAIN_RUNNING:
|
||||
continue
|
||||
|
||||
# Skip the remainder of the checks if alive_only is True in the config
|
||||
|
@ -816,6 +842,10 @@ class LibvirtCheck(AgentCheck):
|
|||
for gauge in agg_gauges:
|
||||
self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)
|
||||
|
||||
# Publish aggregate VM counts
|
||||
|
||||
self._gauge_agg_alive_counts(agg_alive_counts, vm_count, dims_base)
|
||||
|
||||
# Check results of ping tests
|
||||
self._check_ping_results(ping_results)
|
||||
|
||||
|
@ -833,6 +863,23 @@ class LibvirtCheck(AgentCheck):
|
|||
rate_value = -1
|
||||
return rate_value
|
||||
|
||||
def _gauge_agg_alive_counts(self, agg_alive_counts, vm_count, dims_base):
    """Publish host-level VM totals broken down by domain state.

    Emits ``nova.vm.total_count`` with ``vm_count``, then for every state
    code found in ``agg_alive_counts`` emits ``nova.vm.<state>_count`` and
    ``nova.vm.<state>_perc``, where ``<state>`` is the name that
    ``DOM_ALIVE_NAMES`` maps the code to. Every gauge is sent with
    ``dims_base`` as its dimensions.

    :param agg_alive_counts: mapping of state code to number of VMs counted
                             in that state
    :param vm_count: total number of VMs counted
    :param dims_base: dimensions attached to each published gauge
    """
    prefix = "nova.vm."
    # One percent of the VM total; this is zero when no VMs were counted.
    one_percent = float(vm_count) / 100
    self.gauge(prefix + 'total_count', vm_count, dimensions=dims_base)

    for state_code, state_total in agg_alive_counts.items():
        metric_base = prefix + DOM_ALIVE_NAMES[state_code]
        self.gauge(metric_base + "_count", state_total,
                   dimensions=dims_base)
        # Report 0 percent instead of dividing by zero when the total is 0.
        share = state_total / one_percent if one_percent != 0 else 0
        self.gauge(metric_base + "_perc", share, dimensions=dims_base)
|
||||
|
||||
def _update_dims_with_metadata(self, instance_cache, inst_name, dim_operations):
|
||||
"""Update operations dimensions with metadata."""
|
||||
dims = dim_operations
|
||||
|
|
Loading…
Reference in New Issue