Merge "Add some new metrics and update some old ones"
This commit is contained in:
commit
1802f3dc78
|
@ -243,50 +243,57 @@ Example cache (pretty-printed excerpt, see next section for complete list of ava
|
|||
```
|
||||
## Per-Instance Metrics
|
||||
|
||||
| Name | Description | Associated Dimensions |
|
||||
| -------------------- | -------------------------------------- | ---------------------- |
|
||||
| cpu.utilization_perc | Overall CPU utilization (percentage) | |
|
||||
| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | |
|
||||
| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') |
|
||||
| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') |
|
||||
| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') |
|
||||
| disk.allocation_total| Total Disk allocation across devices for instances | |
|
||||
| disk.capacity_total | Total Disk capacity across devices for instances | |
|
||||
| disk.physical_total | Total Disk usage across devices for instances | |
|
||||
| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | |
|
||||
| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') |
|
||||
| io.read_ops | Disk I/O read operations val | 'device' (ie, 'hdd') |
|
||||
| io.read_bytes | Disk I/O read bytes val | 'device' (ie, 'hdd') |
|
||||
| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') |
|
||||
| io.read_bytes_total | Total Disk I/O read bytes across all devices | |
|
||||
| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | |
|
||||
| io.read_ops_total | Total Disk I/O read operations across all devices | |
|
||||
| io.read_ops_total_sec | Total Disk I/O read operations across all devices per sec | |
|
||||
| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') |
|
||||
| io.write_ops | Disk I/O write operations val | 'device' (ie, 'hdd') |
|
||||
| io.write_bytes | Disk I/O write bytes val | 'device' (ie, 'hdd') |
|
||||
| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') |
|
||||
| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') |
|
||||
| io.write_bytes_total | Total Disk I/O write bytes across all devices | |
|
||||
| io.write_bytes_total_sec | Total Disk I/O Write bytes per second across devices | |
|
||||
| io.write_ops_total | Total Disk I/O write operations across all devices | |
|
||||
| io.write_ops_total_sec | Total Disk I/O write operations across all devices per sec | |
|
||||
| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') |
|
||||
| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') |
|
||||
| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') |
|
||||
| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') |
|
||||
| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') |
|
||||
| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') |
|
||||
| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') |
|
||||
| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') |
|
||||
| mem.free_mb | Free memory in Mbytes | |
|
||||
| mem.total_mb | Total memory in Mbytes | |
|
||||
| mem.used_mb | Used memory in Mbytes | |
|
||||
| mem.free_perc | Percent of memory free | |
|
||||
| mem.swap_used_mb | Used swap space in Mbytes | |
|
||||
| ping_status | 0 for ping success, 1 for ping failure | |
|
||||
| cpu.time_ns | Cumulative CPU time (in ns) | |
|
||||
| mem.resident_mb | Total memory used on host, an Operations-only metric | |
|
||||
| Name | Description | Associated Dimensions |
|
||||
| ------------------------- | --------------------------------------------------------- | ---------------------- |
|
||||
| cpu.total_cores | Total virtual CPUs allocated to the VM | |
|
||||
| cpu.used_cores | Number of cpu cores used | |
|
||||
| cpu.utilization_perc | Overall CPU utilization (percentage) | |
|
||||
| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | |
|
||||
| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') |
|
||||
| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') |
|
||||
| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') |
|
||||
| disk.allocation_total | Total Disk allocation across devices for instances | |
|
||||
| disk.capacity_total | Total Disk capacity across devices for instances | |
|
||||
| disk.physical_total | Total Disk usage across devices for instances | |
|
||||
| health_status | Reports whether the VM is running (0) or not (1) | |
|
||||
| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | |
|
||||
| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') |
|
||||
| io.read_ops | Disk I/O read operations value | 'device' (ie, 'hdd') |
|
||||
| io.read_bytes | Disk I/O read bytes value | 'device' (ie, 'hdd') |
|
||||
| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') |
|
||||
| io.read_bytes_total | Total Disk I/O read bytes across all devices | |
|
||||
| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | |
|
||||
| io.read_ops_total | Total Disk I/O read operations across all devices | |
|
||||
| io.read_ops_total_sec | Total Disk I/O read operations per second across all devices | |
|
||||
| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') |
|
||||
| io.write_ops | Disk I/O write operations value | 'device' (ie, 'hdd') |
|
||||
| io.write_bytes | Disk I/O write bytes value | 'device' (ie, 'hdd') |
|
||||
| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') |
|
||||
| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') |
|
||||
| io.write_bytes_total | Total Disk I/O write bytes across all devices | |
|
||||
| io.write_bytes_total_sec | Total Disk I/O write bytes per second across devices | |
|
||||
| io.write_ops_total | Total Disk I/O write operations across all devices | |
|
||||
| io.write_ops_total_sec | Total Disk I/O write operations per second across all devices | |
|
||||
| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') |
|
||||
| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') |
|
||||
| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') |
|
||||
| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') |
|
||||
| net.in_dropped_sec | Network received packets dropped per second | 'device' (ie, 'vnet0') |
|
||||
| net.out_dropped_sec | Network transmitted packets dropped per second | 'device' (ie, 'vnet0') |
|
||||
| net.in_errors_sec | Network received packets with errors per second | 'device' (ie, 'vnet0') |
|
||||
| net.out_errors_sec | Network transmitted packets with errors per second | 'device' (ie, 'vnet0') |
|
||||
| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') |
|
||||
| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') |
|
||||
| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') |
|
||||
| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') |
|
||||
| mem.free_gb | Free memory in Gbytes | |
|
||||
| mem.total_gb | Total memory in Gbytes | |
|
||||
| mem.used_gb | Used memory in Gbytes | |
|
||||
| mem.free_perc | Percent of memory free | |
|
||||
| mem.swap_used_gb | Used swap space in Gbytes | |
|
||||
| ping_status | 0 for ping success, 1 for ping failure | |
|
||||
| cpu.time_ns | Cumulative CPU time (in ns) | |
|
||||
| mem.resident_gb | Total memory used on host, an Operations-only metric | |
|
||||
|
||||
### host_alive_status Codes
|
||||
| Code | Description | value_meta 'detail' |
|
||||
|
|
|
@ -401,14 +401,14 @@ class LibvirtCheck(AgentCheck):
|
|||
cpu_info = insp.inspect_cpus(inst)
|
||||
|
||||
if 'cpu.time' in metric_cache[inst_name]:
|
||||
# I have a prior value, so calculate the raw_perc & push the metric
|
||||
# I have a prior value, so calculate the used_cores & push the metric
|
||||
cpu_diff = cpu_info.time - metric_cache[inst_name]['cpu.time']['value']
|
||||
time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
|
||||
# Convert time_diff to nanoseconds, and calculate percentage
|
||||
raw_perc = (cpu_diff / (time_diff * 1000000000)) * 100
|
||||
used_cores = (cpu_diff / (time_diff * 1000000000))
|
||||
# Divide by the number of cores to normalize the percentage
|
||||
normalized_perc = (raw_perc / cpu_info.number)
|
||||
if raw_perc < 0:
|
||||
normalized_perc = (used_cores / cpu_info.number) * 100
|
||||
if used_cores < 0:
|
||||
# Bad value, save current reading and skip
|
||||
self.log.warn("Ignoring negative CPU sample for: "
|
||||
"{0} new cpu time: {1} old cpu time: {2}"
|
||||
|
@ -418,7 +418,15 @@ class LibvirtCheck(AgentCheck):
|
|||
'value': cpu_info.time}
|
||||
return
|
||||
|
||||
self.gauge('cpu.utilization_perc', int(round(raw_perc, 0)),
|
||||
self.gauge('cpu.total_cores', float(cpu_info.number),
|
||||
dimensions=dims_customer,
|
||||
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||
hostname=instance_cache.get(inst_name)['hostname'])
|
||||
self.gauge('cpu.used_cores', float(used_cores),
|
||||
dimensions=dims_customer,
|
||||
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||
hostname=instance_cache.get(inst_name)['hostname'])
|
||||
self.gauge('cpu.utilization_perc', int(round(used_cores * 100, 0)),
|
||||
dimensions=dims_customer,
|
||||
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||
hostname=instance_cache.get(inst_name)['hostname'])
|
||||
|
@ -426,7 +434,12 @@ class LibvirtCheck(AgentCheck):
|
|||
dimensions=dims_customer,
|
||||
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||
hostname=instance_cache.get(inst_name)['hostname'])
|
||||
self.gauge('vm.cpu.utilization_perc', int(round(raw_perc, 0)),
|
||||
|
||||
self.gauge('vm.cpu.total_cores', float(cpu_info.number),
|
||||
dimensions=dims_operations)
|
||||
self.gauge('vm.cpu.used_cores', float(used_cores),
|
||||
dimensions=dims_operations)
|
||||
self.gauge('vm.cpu.utilization_perc', int(round(used_cores * 100, 0)),
|
||||
dimensions=dims_operations)
|
||||
self.gauge('vm.cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
|
||||
dimensions=dims_operations)
|
||||
|
@ -580,6 +593,7 @@ class LibvirtCheck(AgentCheck):
|
|||
"""
|
||||
inst_state = inst.state()
|
||||
dom_status = inst_state[0] - 1
|
||||
health_status = 0 if dom_status == 0 else 1 # anything other than 'running' is considered unhealthy
|
||||
metatag = None
|
||||
|
||||
if inst_state[0] in DOM_STATES:
|
||||
|
@ -597,6 +611,13 @@ class LibvirtCheck(AgentCheck):
|
|||
dimensions=dims_operations,
|
||||
value_meta=metatag)
|
||||
|
||||
self.gauge('health_status', health_status,
|
||||
dimensions=dims_customer,
|
||||
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||
hostname=instance_cache.get(inst_name)['hostname'])
|
||||
self.gauge('vm.health_status', health_status,
|
||||
dimensions=dims_operations)
|
||||
|
||||
return dom_status
|
||||
|
||||
def prepare_run(self):
|
||||
|
@ -766,10 +787,10 @@ class LibvirtCheck(AgentCheck):
|
|||
# (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
|
||||
try:
|
||||
mem_stats = inst.memoryStats()
|
||||
mem_metrics = {'mem.free_mb': float(mem_stats['unused']) / 1024,
|
||||
'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024,
|
||||
'mem.total_mb': float(mem_stats['available']) / 1024,
|
||||
'mem.used_mb': float(mem_stats['available'] - mem_stats['unused']) / 1024,
|
||||
mem_metrics = {'mem.free_gb': float(mem_stats['unused']) / 1024 / 1024,
|
||||
'mem.swap_used_gb': float(mem_stats['swap_out']) / 1024 / 1024,
|
||||
'mem.total_gb': float(mem_stats['available']) / 1024 / 1024,
|
||||
'mem.used_gb': float(mem_stats['available'] - mem_stats['unused']) / 1024 / 1024,
|
||||
'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100}
|
||||
for name in mem_metrics:
|
||||
self.gauge(name, mem_metrics[name], dimensions=dims_customer,
|
||||
|
@ -778,7 +799,7 @@ class LibvirtCheck(AgentCheck):
|
|||
self.gauge("vm.{0}".format(name), mem_metrics[name],
|
||||
dimensions=dims_operations)
|
||||
memory_info = insp.inspect_memory_resident(inst)
|
||||
self.gauge('vm.mem.resident_mb', float(memory_info.resident), dimensions=dims_operations)
|
||||
self.gauge('vm.mem.resident_gb', float(memory_info.resident) / 1024, dimensions=dims_operations)
|
||||
except KeyError:
|
||||
self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name,
|
||||
instance_cache.get(inst_name)['hostname'].encode('utf8')))
|
||||
|
|
|
@ -85,7 +85,9 @@ Interface = collections.namedtuple('Interface', ['name', 'mac',
|
|||
#
|
||||
InterfaceStats = collections.namedtuple('InterfaceStats',
|
||||
['rx_bytes', 'rx_packets',
|
||||
'tx_bytes', 'tx_packets'])
|
||||
'rx_errors', 'rx_dropped',
|
||||
'tx_bytes', 'tx_packets',
|
||||
'tx_errors', 'tx_dropped'])
|
||||
|
||||
|
||||
# Named tuple representing vNIC rate statistics.
|
||||
|
|
|
@ -147,8 +147,12 @@ class LibvirtInspector(virt_inspector.Inspector):
|
|||
dom_stats = domain.interfaceStats(name)
|
||||
stats = virt_inspector.InterfaceStats(rx_bytes=dom_stats[0],
|
||||
rx_packets=dom_stats[1],
|
||||
rx_errors=dom_stats[2],
|
||||
rx_dropped=dom_stats[3],
|
||||
tx_bytes=dom_stats[4],
|
||||
tx_packets=dom_stats[5])
|
||||
tx_packets=dom_stats[5],
|
||||
tx_errors=dom_stats[6],
|
||||
tx_dropped=dom_stats[7])
|
||||
yield (interface, stats)
|
||||
|
||||
def inspect_disks(self, instance):
|
||||
|
|
Loading…
Reference in New Issue