Add some new metrics and update some old ones

Add cpu total cores and used cores
Switch memory metrics to report in GB
Add simpler health metric
Add dropped and errors on network

Change-Id: I5a91dba3a3a22a3e8a25055c22c8037ecdeda6e0
This commit is contained in:
ryan-brandt 2017-08-24 11:00:53 -06:00 committed by Ryan Brandt
parent 2e5c6475e7
commit 4c2f87ce66
4 changed files with 91 additions and 57 deletions

View File

@ -243,50 +243,57 @@ Example cache (pretty-printed excerpt, see next section for complete list of ava
```
## Per-Instance Metrics
| Name | Description | Associated Dimensions |
| -------------------- | -------------------------------------- | ---------------------- |
| cpu.utilization_perc | Overall CPU utilization (percentage) | |
| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | |
| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') |
| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') |
| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') |
| disk.allocation_total| Total Disk allocation across devices for instances | |
| disk.capacity_total | Total Disk capacity across devices for instances | |
| disk.physical_total | Total Disk usage across devices for instances | |
| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | |
| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') |
| io.read_ops | Disk I/O read operations val | 'device' (ie, 'hdd') |
| io.read_bytes | Disk I/O read bytes val | 'device' (ie, 'hdd') |
| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') |
| io.read_bytes_total | Total Disk I/O read bytes across all devices | |
| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | |
| io.read_ops_total | Total Disk I/O read operations across all devices | |
| io.read_ops_total_sec | Total Disk I/O read operations across all devices per sec | |
| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') |
| io.write_ops | Disk I/O write operations val | 'device' (ie, 'hdd') |
| io.write_bytes | Disk I/O write bytes val | 'device' (ie, 'hdd') |
| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') |
| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') |
| io.write_bytes_total | Total Disk I/O write bytes across all devices | |
| io.write_bytes_total_sec | Total Disk I/O Write bytes per second across devices | |
| io.write_ops_total | Total Disk I/O write operations across all devices | |
| io.write_ops_total_sec | Total Disk I/O write operations across all devices per sec | |
| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') |
| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') |
| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') |
| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') |
| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') |
| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') |
| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') |
| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') |
| mem.free_mb | Free memory in Mbytes | |
| mem.total_mb | Total memory in Mbytes | |
| mem.used_mb | Used memory in Mbytes | |
| mem.free_perc | Percent of memory free | |
| mem.swap_used_mb | Used swap space in Mbytes | |
| ping_status | 0 for ping success, 1 for ping failure | |
| cpu.time_ns | Cumulative CPU time (in ns) | |
| mem.resident_mb | Total memory used on host, an Operations-only metric | |
| Name | Description | Associated Dimensions |
| ------------------------- | --------------------------------------------------------- | ---------------------- |
| cpu.total_cores | Total virtual CPUs allocated to the VM | |
| cpu.used_cores | Number of CPU cores used | |
| cpu.utilization_perc | Overall CPU utilization (percentage) | |
| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | |
| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') |
| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') |
| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') |
| disk.allocation_total | Total Disk allocation across devices for instances | |
| disk.capacity_total | Total Disk capacity across devices for instances | |
| disk.physical_total | Total Disk usage across devices for instances | |
| health_status | Reports whether the VM is running (0) or not (1) | |
| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | |
| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') |
| io.read_ops | Disk I/O read operations value | 'device' (ie, 'hdd') |
| io.read_bytes | Disk I/O read bytes value | 'device' (ie, 'hdd') |
| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') |
| io.read_bytes_total | Total Disk I/O read bytes across all devices | |
| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | |
| io.read_ops_total | Total Disk I/O read operations across all devices | |
| io.read_ops_total_sec | Total Disk I/O read operations per second across all devices | |
| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') |
| io.write_ops | Disk I/O write operations value | 'device' (ie, 'hdd') |
| io.write_bytes | Disk I/O write bytes value | 'device' (ie, 'hdd') |
| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') |
| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') |
| io.write_bytes_total | Total Disk I/O write bytes across all devices | |
| io.write_bytes_total_sec | Total Disk I/O write bytes per second across devices | |
| io.write_ops_total | Total Disk I/O write operations across all devices | |
| io.write_ops_total_sec | Total Disk I/O write operations per second across all devices | |
| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') |
| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') |
| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') |
| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') |
| net.in_dropped_sec | Network received packets dropped per second | 'device' (ie, 'vnet0') |
| net.out_dropped_sec | Network transmitted packets dropped per second | 'device' (ie, 'vnet0') |
| net.in_errors_sec | Network received packets with errors per second | 'device' (ie, 'vnet0') |
| net.out_errors_sec | Network transmitted packets with errors per second | 'device' (ie, 'vnet0') |
| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') |
| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') |
| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') |
| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') |
| mem.free_gb | Free memory in Gbytes | |
| mem.total_gb | Total memory in Gbytes | |
| mem.used_gb | Used memory in Gbytes | |
| mem.free_perc | Percent of memory free | |
| mem.swap_used_gb | Used swap space in Gbytes | |
| ping_status | 0 for ping success, 1 for ping failure | |
| cpu.time_ns | Cumulative CPU time (in ns) | |
| mem.resident_gb | Total memory used on host, an Operations-only metric | |
### host_alive_status Codes
| Code | Description | value_meta 'detail' |

View File

@ -401,14 +401,14 @@ class LibvirtCheck(AgentCheck):
cpu_info = insp.inspect_cpus(inst)
if 'cpu.time' in metric_cache[inst_name]:
# I have a prior value, so calculate the raw_perc & push the metric
# I have a prior value, so calculate the used_cores & push the metric
cpu_diff = cpu_info.time - metric_cache[inst_name]['cpu.time']['value']
time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
# Convert time_diff to nanoseconds, and calculate percentage
raw_perc = (cpu_diff / (time_diff * 1000000000)) * 100
used_cores = (cpu_diff / (time_diff * 1000000000))
# Divide by the number of cores to normalize the percentage
normalized_perc = (raw_perc / cpu_info.number)
if raw_perc < 0:
normalized_perc = (used_cores / cpu_info.number) * 100
if used_cores < 0:
# Bad value, save current reading and skip
self.log.warn("Ignoring negative CPU sample for: "
"{0} new cpu time: {1} old cpu time: {2}"
@ -418,7 +418,15 @@ class LibvirtCheck(AgentCheck):
'value': cpu_info.time}
return
self.gauge('cpu.utilization_perc', int(round(raw_perc, 0)),
self.gauge('cpu.total_cores', float(cpu_info.number),
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
self.gauge('cpu.used_cores', float(used_cores),
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
self.gauge('cpu.utilization_perc', int(round(used_cores * 100, 0)),
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
@ -426,7 +434,12 @@ class LibvirtCheck(AgentCheck):
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
self.gauge('vm.cpu.utilization_perc', int(round(raw_perc, 0)),
self.gauge('vm.cpu.total_cores', float(cpu_info.number),
dimensions=dims_operations)
self.gauge('vm.cpu.used_cores', float(used_cores),
dimensions=dims_operations)
self.gauge('vm.cpu.utilization_perc', int(round(used_cores * 100, 0)),
dimensions=dims_operations)
self.gauge('vm.cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
dimensions=dims_operations)
@ -580,6 +593,7 @@ class LibvirtCheck(AgentCheck):
"""
inst_state = inst.state()
dom_status = inst_state[0] - 1
health_status = 0 if dom_status == 0 else 1 # anything other than 'running' is considered unhealthy
metatag = None
if inst_state[0] in DOM_STATES:
@ -597,6 +611,13 @@ class LibvirtCheck(AgentCheck):
dimensions=dims_operations,
value_meta=metatag)
self.gauge('health_status', health_status,
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
self.gauge('vm.health_status', health_status,
dimensions=dims_operations)
return dom_status
def prepare_run(self):
@ -766,10 +787,10 @@ class LibvirtCheck(AgentCheck):
# (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
try:
mem_stats = inst.memoryStats()
mem_metrics = {'mem.free_mb': float(mem_stats['unused']) / 1024,
'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024,
'mem.total_mb': float(mem_stats['available']) / 1024,
'mem.used_mb': float(mem_stats['available'] - mem_stats['unused']) / 1024,
mem_metrics = {'mem.free_gb': float(mem_stats['unused']) / 1024 / 1024,
'mem.swap_used_gb': float(mem_stats['swap_out']) / 1024 / 1024,
'mem.total_gb': float(mem_stats['available']) / 1024 / 1024,
'mem.used_gb': float(mem_stats['available'] - mem_stats['unused']) / 1024 / 1024,
'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100}
for name in mem_metrics:
self.gauge(name, mem_metrics[name], dimensions=dims_customer,
@ -778,7 +799,7 @@ class LibvirtCheck(AgentCheck):
self.gauge("vm.{0}".format(name), mem_metrics[name],
dimensions=dims_operations)
memory_info = insp.inspect_memory_resident(inst)
self.gauge('vm.mem.resident_mb', float(memory_info.resident), dimensions=dims_operations)
self.gauge('vm.mem.resident_gb', float(memory_info.resident) / 1024, dimensions=dims_operations)
except KeyError:
self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name,
instance_cache.get(inst_name)['hostname'].encode('utf8')))

View File

@ -85,7 +85,9 @@ Interface = collections.namedtuple('Interface', ['name', 'mac',
#
InterfaceStats = collections.namedtuple('InterfaceStats',
['rx_bytes', 'rx_packets',
'tx_bytes', 'tx_packets'])
'rx_errors', 'rx_dropped',
'tx_bytes', 'tx_packets',
'tx_errors', 'tx_dropped'])
# Named tuple representing vNIC rate statistics.

View File

@ -147,8 +147,12 @@ class LibvirtInspector(virt_inspector.Inspector):
dom_stats = domain.interfaceStats(name)
stats = virt_inspector.InterfaceStats(rx_bytes=dom_stats[0],
rx_packets=dom_stats[1],
rx_errors=dom_stats[2],
rx_dropped=dom_stats[3],
tx_bytes=dom_stats[4],
tx_packets=dom_stats[5])
tx_packets=dom_stats[5],
tx_errors=dom_stats[6],
tx_dropped=dom_stats[7])
yield (interface, stats)
def inspect_disks(self, instance):