Merge "Add some new metrics and update some old ones"

This commit is contained in:
Jenkins 2017-09-19 14:47:48 +00:00 committed by Gerrit Code Review
commit 1802f3dc78
4 changed files with 91 additions and 57 deletions

View File

@ -243,50 +243,57 @@ Example cache (pretty-printed excerpt, see next section for complete list of ava
```
## Per-Instance Metrics
| Name | Description | Associated Dimensions |
| -------------------- | -------------------------------------- | ---------------------- |
| cpu.utilization_perc | Overall CPU utilization (percentage) | |
| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | |
| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') |
| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') |
| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') |
| disk.allocation_total| Total Disk allocation across devices for instances | |
| disk.capacity_total | Total Disk capacity across devices for instances | |
| disk.physical_total | Total Disk usage across devices for instances | |
| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | |
| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') |
| io.read_ops | Disk I/O read operations val | 'device' (ie, 'hdd') |
| io.read_bytes | Disk I/O read bytes val | 'device' (ie, 'hdd') |
| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') |
| io.read_bytes_total | Total Disk I/O read bytes across all devices | |
| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | |
| io.read_ops_total | Total Disk I/O read operations across all devices | |
| io.read_ops_total_sec | Total Disk I/O read operations across all devices per sec | |
| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') |
| io.write_ops | Disk I/O write operations val | 'device' (ie, 'hdd') |
| io.write_bytes | Disk I/O write bytes val | 'device' (ie, 'hdd') |
| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') |
| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') |
| io.write_bytes_total | Total Disk I/O write bytes across all devices | |
| io.write_bytes_total_sec | Total Disk I/O Write bytes per second across devices | |
| io.write_ops_total | Total Disk I/O write operations across all devices | |
| io.write_ops_total_sec | Total Disk I/O write operations across all devices per sec | |
| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') |
| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') |
| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') |
| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') |
| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') |
| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') |
| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') |
| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') |
| mem.free_mb | Free memory in Mbytes | |
| mem.total_mb | Total memory in Mbytes | |
| mem.used_mb | Used memory in Mbytes | |
| mem.free_perc | Percent of memory free | |
| mem.swap_used_mb | Used swap space in Mbytes | |
| ping_status | 0 for ping success, 1 for ping failure | |
| cpu.time_ns | Cumulative CPU time (in ns) | |
| mem.resident_mb | Total memory used on host, an Operations-only metric | |
| Name | Description | Associated Dimensions |
| ------------------------- | --------------------------------------------------------- | ---------------------- |
| cpu.total_cores | Total virtual cpus allocated to vm | |
| cpu.used_cores | Number of cpu cores used | |
| cpu.utilization_perc | Overall CPU utilization (percentage) | |
| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | |
| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') |
| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') |
| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') |
| disk.allocation_total | Total Disk allocation across devices for instances | |
| disk.capacity_total | Total Disk capacity across devices for instances | |
| disk.physical_total | Total Disk usage across devices for instances | |
| health_status | Reports if vm is running (0) or not (1) | |
| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | |
| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') |
| io.read_ops | Disk I/O read operations value | 'device' (ie, 'hdd') |
| io.read_bytes | Disk I/O read bytes value | 'device' (ie, 'hdd') |
| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') |
| io.read_bytes_total | Total Disk I/O read bytes across all devices | |
| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | |
| io.read_ops_total | Total Disk I/O read operations across all devices | |
| io.read_ops_total_sec | Total Disk I/O read operations per second across all devices | |
| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') |
| io.write_ops | Disk I/O write operations value | 'device' (ie, 'hdd') |
| io.write_bytes | Disk I/O write bytes value | 'device' (ie, 'hdd') |
| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') |
| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') |
| io.write_bytes_total | Total Disk I/O write bytes across all devices | |
| io.write_bytes_total_sec | Total Disk I/O write bytes per second across devices | |
| io.write_ops_total | Total Disk I/O write operations across all devices | |
| io.write_ops_total_sec | Total Disk I/O write operations per second across all devices | |
| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') |
| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') |
| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') |
| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') |
| net.in_dropped_sec | Network received packets dropped per second | 'device' (ie, 'vnet0') |
| net.out_dropped_sec | Network transmitted packets dropped per second | 'device' (ie, 'vnet0') |
| net.in_errors_sec | Network received packets with errors per second | 'device' (ie, 'vnet0') |
| net.out_errors_sec | Network transmitted packets with errors per second | 'device' (ie, 'vnet0') |
| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') |
| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') |
| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') |
| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') |
| mem.free_gb | Free memory in Gbytes | |
| mem.total_gb | Total memory in Gbytes | |
| mem.used_gb | Used memory in Gbytes | |
| mem.free_perc | Percent of memory free | |
| mem.swap_used_gb | Used swap space in Gbytes | |
| ping_status | 0 for ping success, 1 for ping failure | |
| cpu.time_ns | Cumulative CPU time (in ns) | |
| mem.resident_gb | Total memory used on host, an Operations-only metric | |
### host_alive_status Codes
| Code | Description | value_meta 'detail' |

View File

@ -401,14 +401,14 @@ class LibvirtCheck(AgentCheck):
cpu_info = insp.inspect_cpus(inst)
if 'cpu.time' in metric_cache[inst_name]:
# I have a prior value, so calculate the raw_perc & push the metric
# I have a prior value, so calculate the used_cores & push the metric
cpu_diff = cpu_info.time - metric_cache[inst_name]['cpu.time']['value']
time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
# Convert time_diff to nanoseconds, and calculate the number of cores used
raw_perc = (cpu_diff / (time_diff * 1000000000)) * 100
used_cores = (cpu_diff / (time_diff * 1000000000))
# Divide by the number of cores to normalize the percentage
normalized_perc = (raw_perc / cpu_info.number)
if raw_perc < 0:
normalized_perc = (used_cores / cpu_info.number) * 100
if used_cores < 0:
# Bad value, save current reading and skip
self.log.warn("Ignoring negative CPU sample for: "
"{0} new cpu time: {1} old cpu time: {2}"
@ -418,7 +418,15 @@ class LibvirtCheck(AgentCheck):
'value': cpu_info.time}
return
self.gauge('cpu.utilization_perc', int(round(raw_perc, 0)),
self.gauge('cpu.total_cores', float(cpu_info.number),
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
self.gauge('cpu.used_cores', float(used_cores),
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
self.gauge('cpu.utilization_perc', int(round(used_cores * 100, 0)),
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
@ -426,7 +434,12 @@ class LibvirtCheck(AgentCheck):
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
self.gauge('vm.cpu.utilization_perc', int(round(raw_perc, 0)),
self.gauge('vm.cpu.total_cores', float(cpu_info.number),
dimensions=dims_operations)
self.gauge('vm.cpu.used_cores', float(used_cores),
dimensions=dims_operations)
self.gauge('vm.cpu.utilization_perc', int(round(used_cores * 100, 0)),
dimensions=dims_operations)
self.gauge('vm.cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
dimensions=dims_operations)
@ -580,6 +593,7 @@ class LibvirtCheck(AgentCheck):
"""
inst_state = inst.state()
dom_status = inst_state[0] - 1
health_status = 0 if dom_status == 0 else 1 # anything other than 'running' is considered unhealthy
metatag = None
if inst_state[0] in DOM_STATES:
@ -597,6 +611,13 @@ class LibvirtCheck(AgentCheck):
dimensions=dims_operations,
value_meta=metatag)
self.gauge('health_status', health_status,
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
hostname=instance_cache.get(inst_name)['hostname'])
self.gauge('vm.health_status', health_status,
dimensions=dims_operations)
return dom_status
def prepare_run(self):
@ -766,10 +787,10 @@ class LibvirtCheck(AgentCheck):
# (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
try:
mem_stats = inst.memoryStats()
mem_metrics = {'mem.free_mb': float(mem_stats['unused']) / 1024,
'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024,
'mem.total_mb': float(mem_stats['available']) / 1024,
'mem.used_mb': float(mem_stats['available'] - mem_stats['unused']) / 1024,
mem_metrics = {'mem.free_gb': float(mem_stats['unused']) / 1024 / 1024,
'mem.swap_used_gb': float(mem_stats['swap_out']) / 1024 / 1024,
'mem.total_gb': float(mem_stats['available']) / 1024 / 1024,
'mem.used_gb': float(mem_stats['available'] - mem_stats['unused']) / 1024 / 1024,
'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100}
for name in mem_metrics:
self.gauge(name, mem_metrics[name], dimensions=dims_customer,
@ -778,7 +799,7 @@ class LibvirtCheck(AgentCheck):
self.gauge("vm.{0}".format(name), mem_metrics[name],
dimensions=dims_operations)
memory_info = insp.inspect_memory_resident(inst)
self.gauge('vm.mem.resident_mb', float(memory_info.resident), dimensions=dims_operations)
self.gauge('vm.mem.resident_gb', float(memory_info.resident) / 1024, dimensions=dims_operations)
except KeyError:
self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name,
instance_cache.get(inst_name)['hostname'].encode('utf8')))

View File

@ -85,7 +85,9 @@ Interface = collections.namedtuple('Interface', ['name', 'mac',
#
InterfaceStats = collections.namedtuple('InterfaceStats',
['rx_bytes', 'rx_packets',
'tx_bytes', 'tx_packets'])
'rx_errors', 'rx_dropped',
'tx_bytes', 'tx_packets',
'tx_errors', 'tx_dropped'])
# Named tuple representing vNIC rate statistics.

View File

@ -147,8 +147,12 @@ class LibvirtInspector(virt_inspector.Inspector):
dom_stats = domain.interfaceStats(name)
stats = virt_inspector.InterfaceStats(rx_bytes=dom_stats[0],
rx_packets=dom_stats[1],
rx_errors=dom_stats[2],
rx_dropped=dom_stats[3],
tx_bytes=dom_stats[4],
tx_packets=dom_stats[5])
tx_packets=dom_stats[5],
tx_errors=dom_stats[6],
tx_dropped=dom_stats[7])
yield (interface, stats)
def inspect_disks(self, instance):