diff --git a/docs/Libvirt.md b/docs/Libvirt.md index d7daf8ff..ea19ece0 100644 --- a/docs/Libvirt.md +++ b/docs/Libvirt.md @@ -243,50 +243,57 @@ Example cache (pretty-printed excerpt, see next section for complete list of ava ``` ## Per-Instance Metrics -| Name | Description | Associated Dimensions | -| -------------------- | -------------------------------------- | ---------------------- | -| cpu.utilization_perc | Overall CPU utilization (percentage) | | -| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | | -| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') | -| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') | -| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') | -| disk.allocation_total| Total Disk allocation across devices for instances | | -| disk.capacity_total | Total Disk capacity across devices for instances | | -| disk.physical_total | Total Disk usage across devices for instances | | -| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | | -| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') | -| io.read_ops | Disk I/O read operations val | 'device' (ie, 'hdd') | -| io.read_bytes | Disk I/O read bytes val | 'device' (ie, 'hdd') | -| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') | -| io.read_bytes_total | Total Disk I/O read bytes across all devices | | -| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | | -| io.read_ops_total | Total Disk I/O read operations across all devices | | -| io.read_ops_total_sec | Total Disk I/O read operations across all devices per sec | | -| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') | -| io.write_ops | Disk I/O write operations val | 'device' (ie, 'hdd') | -| io.write_bytes | Disk I/O write bytes val | 'device' (ie, 'hdd') | -| io.write_bytes_sec | Disk I/O write bytes per second 
| 'device' (ie, 'hdd') | -| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') | -| io.write_bytes_total | Total Disk I/O write bytes across all devices | | -| io.write_bytes_total_sec | Total Disk I/O Write bytes per second across devices | | -| io.write_ops_total | Total Disk I/O write operations across all devices | | -| io.write_ops_total_sec | Total Disk I/O write operations across all devices per sec | | -| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') | -| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') | -| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') | -| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') | -| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') | -| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') | -| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') | -| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') | -| mem.free_mb | Free memory in Mbytes | | -| mem.total_mb | Total memory in Mbytes | | -| mem.used_mb | Used memory in Mbytes | | -| mem.free_perc | Percent of memory free | | -| mem.swap_used_mb | Used swap space in Mbytes | | -| ping_status | 0 for ping success, 1 for ping failure | | -| cpu.time_ns | Cumulative CPU time (in ns) | | -| mem.resident_mb | Total memory used on host, an Operations-only metric | | +| Name | Description | Associated Dimensions | +| ------------------------- | --------------------------------------------------------- | ---------------------- | +| cpu.total_cores | Total virtual cpus allocated to vm | | +| cpu.used_cores | Number of cpu cores used | | +| cpu.utilization_perc | Overall CPU utilization (percentage) | | +| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | | +| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') 
| +| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') | +| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') | +| disk.allocation_total | Total Disk allocation across devices for instances | | +| disk.capacity_total | Total Disk capacity across devices for instances | | +| disk.physical_total | Total Disk usage across devices for instances | | +| health_status | Reports whether the VM is running (0) or not (1) | | +| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | | +| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') | +| io.read_ops | Disk I/O read operations value | 'device' (ie, 'hdd') | +| io.read_bytes | Disk I/O read bytes value | 'device' (ie, 'hdd') | +| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') | +| io.read_bytes_total | Total Disk I/O read bytes across all devices | | +| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | | +| io.read_ops_total | Total Disk I/O read operations across all devices | | +| io.read_ops_total_sec | Total Disk I/O read operations across all devices per sec | | +| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') | +| io.write_ops | Disk I/O write operations value | 'device' (ie, 'hdd') | +| io.write_bytes | Disk I/O write bytes value | 'device' (ie, 'hdd') | +| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') | +| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') | +| io.write_bytes_total | Total Disk I/O write bytes across all devices | | +| io.write_bytes_total_sec | Total Disk I/O write bytes per second across devices | | +| io.write_ops_total | Total Disk I/O write operations across all devices | | +| io.write_ops_total_sec | Total Disk I/O write operations across all devices per sec | | +| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') | +| net.out_packets_sec | Network 
transmitted packets per second | 'device' (ie, 'vnet0') | +| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') | +| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') | +| net.in_dropped_sec | Network received packets dropped per second | 'device' (ie, 'vnet0') | +| net.out_dropped_sec | Network transmitted packets dropped per second | 'device' (ie, 'vnet0') | +| net.in_errors_sec | Network received packets with errors per second | 'device' (ie, 'vnet0') | +| net.out_errors_sec | Network transmitted packets with errors per second | 'device' (ie, 'vnet0') | +| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') | +| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') | +| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') | +| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') | +| mem.free_gb | Free memory in Gbytes | | +| mem.total_gb | Total memory in Gbytes | | +| mem.used_gb | Used memory in Gbytes | | +| mem.free_perc | Percent of memory free | | +| mem.swap_used_gb | Used swap space in Gbytes | | +| ping_status | 0 for ping success, 1 for ping failure | | +| cpu.time_ns | Cumulative CPU time (in ns) | | +| mem.resident_gb | Total memory used on host, an Operations-only metric | | ### host_alive_status Codes | Code | Description | value_meta 'detail' | diff --git a/monasca_agent/collector/checks_d/libvirt.py b/monasca_agent/collector/checks_d/libvirt.py index 53bf7bc8..862c9b22 100644 --- a/monasca_agent/collector/checks_d/libvirt.py +++ b/monasca_agent/collector/checks_d/libvirt.py @@ -401,14 +401,14 @@ class LibvirtCheck(AgentCheck): cpu_info = insp.inspect_cpus(inst) if 'cpu.time' in metric_cache[inst_name]: - # I have a prior value, so calculate the raw_perc & push the metric + # I have a prior value, so calculate the used_cores & push the metric cpu_diff = cpu_info.time - 
metric_cache[inst_name]['cpu.time']['value'] time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp']) # Convert time_diff to nanoseconds, and calculate percentage - raw_perc = (cpu_diff / (time_diff * 1000000000)) * 100 + used_cores = (cpu_diff / (time_diff * 1000000000)) # Divide by the number of cores to normalize the percentage - normalized_perc = (raw_perc / cpu_info.number) - if raw_perc < 0: + normalized_perc = (used_cores / cpu_info.number) * 100 + if used_cores < 0: # Bad value, save current reading and skip self.log.warn("Ignoring negative CPU sample for: " "{0} new cpu time: {1} old cpu time: {2}" @@ -418,7 +418,15 @@ class LibvirtCheck(AgentCheck): 'value': cpu_info.time} return - self.gauge('cpu.utilization_perc', int(round(raw_perc, 0)), + self.gauge('cpu.total_cores', float(cpu_info.number), + dimensions=dims_customer, + delegated_tenant=instance_cache.get(inst_name)['tenant_id'], + hostname=instance_cache.get(inst_name)['hostname']) + self.gauge('cpu.used_cores', float(used_cores), + dimensions=dims_customer, + delegated_tenant=instance_cache.get(inst_name)['tenant_id'], + hostname=instance_cache.get(inst_name)['hostname']) + self.gauge('cpu.utilization_perc', int(round(used_cores * 100, 0)), dimensions=dims_customer, delegated_tenant=instance_cache.get(inst_name)['tenant_id'], hostname=instance_cache.get(inst_name)['hostname']) @@ -426,7 +434,12 @@ class LibvirtCheck(AgentCheck): dimensions=dims_customer, delegated_tenant=instance_cache.get(inst_name)['tenant_id'], hostname=instance_cache.get(inst_name)['hostname']) - self.gauge('vm.cpu.utilization_perc', int(round(raw_perc, 0)), + + self.gauge('vm.cpu.total_cores', float(cpu_info.number), + dimensions=dims_operations) + self.gauge('vm.cpu.used_cores', float(used_cores), + dimensions=dims_operations) + self.gauge('vm.cpu.utilization_perc', int(round(used_cores * 100, 0)), dimensions=dims_operations) self.gauge('vm.cpu.utilization_norm_perc', int(round(normalized_perc, 0)), 
dimensions=dims_operations) @@ -580,6 +593,7 @@ class LibvirtCheck(AgentCheck): """ inst_state = inst.state() dom_status = inst_state[0] - 1 + health_status = 0 if dom_status == 0 else 1 # anything other than 'running' is considered unhealthy metatag = None if inst_state[0] in DOM_STATES: @@ -597,6 +611,13 @@ class LibvirtCheck(AgentCheck): dimensions=dims_operations, value_meta=metatag) + self.gauge('health_status', health_status, + dimensions=dims_customer, + delegated_tenant=instance_cache.get(inst_name)['tenant_id'], + hostname=instance_cache.get(inst_name)['hostname']) + self.gauge('vm.health_status', health_status, + dimensions=dims_operations) + return dom_status def prepare_run(self): @@ -766,10 +787,10 @@ class LibvirtCheck(AgentCheck): # (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON) try: mem_stats = inst.memoryStats() - mem_metrics = {'mem.free_mb': float(mem_stats['unused']) / 1024, - 'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024, - 'mem.total_mb': float(mem_stats['available']) / 1024, - 'mem.used_mb': float(mem_stats['available'] - mem_stats['unused']) / 1024, + mem_metrics = {'mem.free_gb': float(mem_stats['unused']) / 1024 / 1024, + 'mem.swap_used_gb': float(mem_stats['swap_out']) / 1024 / 1024, + 'mem.total_gb': float(mem_stats['available']) / 1024 / 1024, + 'mem.used_gb': float(mem_stats['available'] - mem_stats['unused']) / 1024 / 1024, 'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100} for name in mem_metrics: self.gauge(name, mem_metrics[name], dimensions=dims_customer, @@ -778,7 +799,7 @@ class LibvirtCheck(AgentCheck): self.gauge("vm.{0}".format(name), mem_metrics[name], dimensions=dims_operations) memory_info = insp.inspect_memory_resident(inst) - self.gauge('vm.mem.resident_mb', float(memory_info.resident), dimensions=dims_operations) + self.gauge('vm.mem.resident_gb', float(memory_info.resident) / 1024, dimensions=dims_operations) except KeyError: self.log.debug("Balloon driver not 
active/available on guest {0} ({1})".format(inst_name, instance_cache.get(inst_name)['hostname'].encode('utf8'))) diff --git a/monasca_agent/collector/virt/inspector.py b/monasca_agent/collector/virt/inspector.py index 4dec98f8..92feb021 100644 --- a/monasca_agent/collector/virt/inspector.py +++ b/monasca_agent/collector/virt/inspector.py @@ -85,7 +85,9 @@ Interface = collections.namedtuple('Interface', ['name', 'mac', # InterfaceStats = collections.namedtuple('InterfaceStats', ['rx_bytes', 'rx_packets', - 'tx_bytes', 'tx_packets']) + 'rx_errors', 'rx_dropped', + 'tx_bytes', 'tx_packets', + 'tx_errors', 'tx_dropped']) # Named tuple representing vNIC rate statistics. diff --git a/monasca_agent/collector/virt/libvirt/inspector.py b/monasca_agent/collector/virt/libvirt/inspector.py index 63e6110d..559be05e 100644 --- a/monasca_agent/collector/virt/libvirt/inspector.py +++ b/monasca_agent/collector/virt/libvirt/inspector.py @@ -147,8 +147,12 @@ class LibvirtInspector(virt_inspector.Inspector): dom_stats = domain.interfaceStats(name) stats = virt_inspector.InterfaceStats(rx_bytes=dom_stats[0], rx_packets=dom_stats[1], + rx_errors=dom_stats[2], + rx_dropped=dom_stats[3], tx_bytes=dom_stats[4], - tx_packets=dom_stats[5]) + tx_packets=dom_stats[5], + tx_errors=dom_stats[6], + tx_dropped=dom_stats[7]) yield (interface, stats) def inspect_disks(self, instance):