507 lines
18 KiB
Python
507 lines
18 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
|
|
"""
|
|
What is this ?!
|
|
---------------
|
|
|
|
This script is designed to monitor VMs resource utilization
|
|
|
|
WorkFlow
|
|
--------
|
|
|
|
1) List all domains at the host via libvirt API
|
|
2) Spawn a separate thread for each domain for periodic check of disk usage
|
|
3) Spawn a separate thread for each domain for periodic check of memory usage
|
|
4) Spawn a separate thread for each domain for periodic check of cpu usage
|
|
5) Spawn a separate thread for checking of total numbers for host
|
|
6) Wait and read log messages with stats...
|
|
|
|
How to stop "monitoring"
|
|
------------------------
|
|
|
|
Just call Ctrl+C (KeyboardInterrupt) and the script should gracefully stop
|
|
all the threads and exit.
|
|
|
|
How to configure
|
|
----------------
|
|
|
|
The script accepts one input argument - the patch to a configuration file in
|
|
a JSON format.
|
|
All options are optional (have default values), so configuration file is
|
|
optional as well.
|
|
|
|
Options:
|
|
|
|
* debug
|
|
bool, if True the logger will use DEBUG level or INFO level if False.
|
|
Defaults to False
|
|
* connection
|
|
str, URI of libvirt to connect
|
|
Defaults to "qemu:///system"
|
|
* disk_getinfo_method
|
|
str, The way to obtain an information about the disk. There are 3
|
|
options available:
|
|
- "qemu" - using `qemu-img info` command
|
|
- "virsh" - via pulling volume pools and volumes in them by libvirt
|
|
API. like it is done in `virsh pool-list` and `virsh vol-info`
|
|
commands
|
|
- "guestfs" - mount all the disks and checks the actual size
|
|
(experimental, not checked actually)
|
|
Defaults to "qemu"
|
|
* host_check_interval
|
|
float, The interval in seconds to sleep between checking stats
|
|
Defaults to 5
|
|
* disk_check_interval
|
|
float, The interval in seconds to sleep between updating stats about disk
|
|
usage of a single VM.
|
|
Defaults to 10
|
|
* memory_check_interval
|
|
float, The interval in seconds to sleep between updating stats about ram
|
|
usage of a single VM.
|
|
Defaults to 5
|
|
* cpu_check_interval
|
|
float, The interval in seconds to sleep between updating stats about CPU
|
|
usage of a single VM.
|
|
Defaults to 1
|
|
* host_disk_utilization_alert
|
|
float, the number between 0 to 100. The achievement of the host's disk
|
|
usage to send alert about critical situation
|
|
Defaults to 80
|
|
* vm_disk_utilization_alert
|
|
float, the number between 0 to 100. The achievement of the VM's disk
|
|
usage to send alert about critical situation
|
|
Defaults to host_disk_utilization_alert value
|
|
* host_memory_utilization_alert
|
|
float, the number between 0 to 100. The achievement of the host's RAM
|
|
usage to send alert about critical situation
|
|
Defaults to 80
|
|
* vm_memory_utilization_alert
|
|
float, the number between 0 to 100. The achievement of the VM's RAM
|
|
usage to send alert about critical situation
|
|
Defaults to host_memory_utilization_alert value
|
|
|
|
"""
|
|
|
|
import collections
|
|
import logging
|
|
import sys
|
|
import subprocess
|
|
import time
|
|
import threading
|
|
import xml.etree.ElementTree
|
|
|
|
import json
|
|
import libvirt
|
|
|
|
|
|
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
def set_config_defaults(config):
|
|
"""Setup all default for config options."""
|
|
config.setdefault("debug", False)
|
|
config.setdefault("connection", "qemu:///system")
|
|
config.setdefault("disk_getinfo_method", "qemu")
|
|
# intervals
|
|
config.setdefault("host_check_interval", 5)
|
|
config.setdefault("disk_check_interval", 10)
|
|
config.setdefault("memory_check_interval", 5)
|
|
config.setdefault("cpu_check_interval", 1)
|
|
# alerts
|
|
config.setdefault("host_disk_utilization_alert", 80)
|
|
config.setdefault("vm_disk_utilization_alert",
|
|
config["host_disk_utilization_alert"])
|
|
config.setdefault("host_memory_utilization_alert", 80)
|
|
config.setdefault("vm_memory_utilization_alert",
|
|
config["host_memory_utilization_alert"])
|
|
return config
|
|
|
|
|
|
class Disk(object):
|
|
|
|
_VIRSH_VOLUME_CACHE = {}
|
|
|
|
def __init__(self, vm, dump, connection, config):
|
|
self._conn = connection
|
|
self._config = config
|
|
|
|
self.vm = vm
|
|
self.dump = dump
|
|
self.path = dump.find("source").get("file")
|
|
|
|
# self.target = dump.find("target")
|
|
|
|
def _get_info_from_qemu_img(self):
|
|
output = subprocess.check_output(["qemu-img", "info", self.path])
|
|
allocation = None
|
|
capacity = None
|
|
|
|
for line in output.splitlines():
|
|
if line.startswith("virtual size"):
|
|
# it looks like `virtual size: 4.0G (4294967296 bytes)`
|
|
_w1, size, _w2 = line.rsplit(" ", 2)
|
|
allocation = int(size.replace("(", ""))
|
|
elif line.startswith("disk size"):
|
|
size = line.split(" ")[2]
|
|
try:
|
|
capacity = float(size)
|
|
except ValueError:
|
|
from oslo_utils import strutils
|
|
capacity = strutils.string_to_bytes("%sB" % size,
|
|
return_int=True)
|
|
|
|
if allocation is None or capacity is None:
|
|
raise Exception("Failed to parse output of `qemu-img info %s`." %
|
|
self.path)
|
|
|
|
return capacity, allocation
|
|
|
|
def _get_info_from_virsh_vol_info(self):
|
|
# use the class level cache to not load all pools and volumes for each
|
|
# disk
|
|
cache = self._VIRSH_VOLUME_CACHE
|
|
if self.path not in cache:
|
|
# try to load all volumes
|
|
for pool in self._conn.listAllStoragePools():
|
|
for volume in pool.listAllVolumes():
|
|
cache[self.path] = volume
|
|
|
|
# it should appear after load
|
|
if self.path not in cache:
|
|
raise Exception("Failed to find %s volume." % self.path)
|
|
|
|
_something, capacity, allocation = cache[self.path].info()
|
|
return capacity, allocation
|
|
|
|
def _get_info_from_guestfs(self):
|
|
import guestfs
|
|
|
|
capacity = 0
|
|
allocation = 0
|
|
|
|
g = guestfs.GuestFS()
|
|
g.add_drive_opts(self.path, format="raw", readonly=1)
|
|
g.launch()
|
|
file_systems = g.list_filesystems()
|
|
for fs in file_systems:
|
|
if fs[1] not in ["", "swap", "unknown"]:
|
|
g.mount(fs[0], "/")
|
|
st = g.statvfs("/")
|
|
capacity += (st.f_blocks * st.f_frsize)
|
|
allocation += (st.f_blocks - st.f_bfree) * st.f_frsize
|
|
g.umount_all()
|
|
g.close()
|
|
return capacity, allocation
|
|
|
|
def info(self):
|
|
LOG.debug("Fetching info of %s disk." % self.path)
|
|
|
|
if self._config["disk_getinfo_method"] == "guestfs":
|
|
return self._get_info_from_guestfs()
|
|
elif self._config["disk_getinfo_method"] == "virsh":
|
|
return self._get_info_from_virsh_vol_info()
|
|
else:
|
|
return self._get_info_from_qemu_img()
|
|
|
|
|
|
class VM(object):
|
|
def __init__(self, domain, connection, config):
|
|
self._conn = connection
|
|
self._config = config
|
|
|
|
self.id = domain.ID()
|
|
self.uuid = domain.UUIDString()
|
|
self.name = domain.name()
|
|
self.dump = xml.etree.ElementTree.fromstring(domain.XMLDesc())
|
|
self._disks = None
|
|
|
|
# leave the original object just in case
|
|
self._domain = domain
|
|
|
|
@property
|
|
def disks(self):
|
|
if self._disks is None:
|
|
self._disks = []
|
|
for disk in self.dump.findall(".//disk"):
|
|
if disk.get("device") != "disk" or disk.get("type") != "file":
|
|
continue
|
|
self._disks.append(Disk(self, disk, self._conn, self._config))
|
|
return self._disks
|
|
|
|
def memory_utilization(self):
|
|
try:
|
|
stats = self._domain.memoryStats()
|
|
except libvirt.libvirtError:
|
|
if LOG.level == logging.DEBUG:
|
|
LOG.exception("Failed to retrieve memory info from %s VM." %
|
|
self.uuid)
|
|
return 0, 1
|
|
|
|
total = stats["actual"]
|
|
# "available" key is missed when the VM just begin launching
|
|
used = total - stats.get("available", 0)
|
|
return total, used
|
|
|
|
def cpu_utilization(self):
|
|
try:
|
|
total = self._domain.getCPUStats(total=True)[0]
|
|
except libvirt.libvirtError:
|
|
if LOG.level == logging.DEBUG:
|
|
LOG.exception("Failed to retrieve CPU timings from %s VM." %
|
|
self.uuid)
|
|
return 0
|
|
# The statistics are reported in nanoseconds.
|
|
return total["cpu_time"] / 1000000000.
|
|
|
|
|
|
class Host(object):
|
|
|
|
def __init__(self, config):
|
|
conn = libvirt.openReadOnly(config["connection"])
|
|
if conn is None:
|
|
raise Exception("Failed to open connection to %s." %
|
|
config["connection"])
|
|
self._config = config
|
|
self._conn = conn
|
|
|
|
self.vms = set()
|
|
self._stats = {}
|
|
|
|
self._stop_event = threading.Event()
|
|
|
|
def _vm_disk_utilization(self, vm, interval):
|
|
while not self._stop_event.isSet() and vm.uuid in self.vms:
|
|
total_c = 0
|
|
total_a = 0
|
|
for disk in vm.disks:
|
|
try:
|
|
capacity, allocation = disk.info()
|
|
except:
|
|
if LOG.level == logging.DEBUG:
|
|
LOG.exception("Error occurred while obtaining info "
|
|
"about disk (path=%s ; vm=%s)." %
|
|
(disk.path, vm.name))
|
|
continue
|
|
usage = capacity * 100.0 / allocation
|
|
LOG.debug("%(vm)s uses %(usage).4f%% of the disk %(file)s." % {
|
|
"vm": vm.name,
|
|
"usage": usage,
|
|
"file": disk.path
|
|
})
|
|
|
|
if usage >= self._config["vm_disk_utilization_alert"]:
|
|
LOG.critical("The VM %s uses too much (%.4f%%) of it's "
|
|
"disk %s!" % (vm.name, usage, disk.path))
|
|
|
|
total_c += capacity
|
|
total_a += allocation
|
|
self._stats[vm.uuid]["disks_capacity"] = total_c
|
|
self._stats[vm.uuid]["disks_allocation"] = total_a
|
|
time.sleep(interval)
|
|
|
|
# do not include the stats of turned-off VM
|
|
self._stats[vm.uuid].pop("disks_capacity", None)
|
|
self._stats[vm.uuid].pop("disks_allocation", None)
|
|
|
|
def _vm_memory_utilization(self, vm, interval):
|
|
while not self._stop_event.isSet() and vm.uuid in self.vms:
|
|
total, used = vm.memory_utilization()
|
|
usage = used * 100.0 / total
|
|
LOG.debug("%(vm)s uses %(usage).4f%% of memory." % {
|
|
"vm": vm.name,
|
|
"usage": usage
|
|
})
|
|
if usage >= self._config["vm_memory_utilization_alert"]:
|
|
LOG.critical("The VM %s uses too much (%.4f%%) of it's "
|
|
"memory!" % (vm.name, usage))
|
|
self._stats[vm.uuid]["total_ram"] = total
|
|
self._stats[vm.uuid]["used_ram"] = used
|
|
time.sleep(interval)
|
|
|
|
# do not include the stats of turned-off VM
|
|
self._stats[vm.uuid].pop("total_ram", None)
|
|
self._stats[vm.uuid].pop("used_ram", None)
|
|
|
|
def _vm_cpu_utilization(self, vm, interval):
|
|
self._stats[vm.uuid]["cpu_load"] = collections.deque(maxlen=60)
|
|
cpu_time_0 = None
|
|
while not self._stop_event.isSet() and vm.uuid in self.vms:
|
|
cpu_time = vm.cpu_utilization()
|
|
|
|
if cpu_time_0 is not None:
|
|
usage = (100.0 * (cpu_time - cpu_time_0) / interval)
|
|
LOG.debug("%(vm)s uses %(usage).4f%% of CPU." % {
|
|
"vm": vm.name,
|
|
"usage": usage
|
|
})
|
|
self._stats[vm.uuid]["cpu_load"].append(usage)
|
|
cpu_time_0 = cpu_time
|
|
time.sleep(interval)
|
|
|
|
# do not include the stats of turned-off VM
|
|
self._stats[vm.uuid].pop("cpu_load", None)
|
|
|
|
def _check_resources(self):
|
|
"""Check resources do not exceed their limits.
|
|
|
|
Check Disk, RAM, CPU utilization of the whole host based on the
|
|
stats from VMs and alert if necessary.
|
|
"""
|
|
while not self._stop_event.isSet():
|
|
disks_capacity = sum(
|
|
[s.get("disks_capacity", 0) for s in self._stats.values()])
|
|
disks_allocation = sum(
|
|
[s.get("disks_allocation", 0) for s in self._stats.values()])
|
|
if disks_allocation != 0:
|
|
disk_usage = disks_capacity * 100.0 / disks_allocation
|
|
else:
|
|
# it is not loaded yet or no vms
|
|
disk_usage = 0
|
|
if disk_usage >= self._config["host_disk_utilization_alert"]:
|
|
LOG.critical("Host uses too much (%.4f%%) of it's disk!" %
|
|
disk_usage)
|
|
else:
|
|
LOG.info("Host uses %.4f%% of it's disk." % disk_usage)
|
|
|
|
total_ram = sum(
|
|
[s.get("total_ram", 0) for s in self._stats.values()])
|
|
used_ram = sum(
|
|
[s.get("used_ram", 0) for s in self._stats.values()])
|
|
|
|
if total_ram != 0:
|
|
ram_usage = used_ram * 100.0 / total_ram
|
|
else:
|
|
# it is not loaded yet or no vms
|
|
ram_usage = 0
|
|
|
|
if ram_usage >= self._config["host_memory_utilization_alert"]:
|
|
LOG.critical("Host uses too much (%.4f%%) of it's memory!" %
|
|
ram_usage)
|
|
else:
|
|
LOG.info("Host uses %.4f%% of it's memory." % ram_usage)
|
|
|
|
time.sleep(self._config["host_check_interval"])
|
|
|
|
def _watch_for_vms(self):
|
|
workers = []
|
|
while not self._stop_event.isSet():
|
|
processed = set()
|
|
for domain_id in (self._conn.listDomainsID() or []):
|
|
domain = self._conn.lookupByID(domain_id)
|
|
if domain.UUIDString() not in self.vms:
|
|
LOG.info("Found a new VM (uuid=%s) at the host. Starting "
|
|
"watching for it's resources." %
|
|
domain.UUIDString())
|
|
|
|
vm = VM(domain, self._conn, self._config)
|
|
self.vms.add(vm.uuid)
|
|
self._stats[vm.uuid] = {}
|
|
|
|
disk_t = threading.Thread(
|
|
target=self._vm_disk_utilization,
|
|
kwargs={
|
|
"vm": vm,
|
|
"interval": self._config["disk_check_interval"]})
|
|
disk_t.start()
|
|
workers.append(disk_t)
|
|
|
|
memory_t = threading.Thread(
|
|
target=self._vm_memory_utilization,
|
|
kwargs={
|
|
"vm": vm,
|
|
"interval": self._config["memory_check_interval"]})
|
|
memory_t.start()
|
|
workers.append(memory_t)
|
|
|
|
cpu_t = threading.Thread(
|
|
target=self._vm_cpu_utilization,
|
|
kwargs={
|
|
"vm": vm,
|
|
"interval": self._config["cpu_check_interval"]})
|
|
cpu_t.start()
|
|
workers.append(cpu_t)
|
|
|
|
# sleep a bit to unsync checking different VMs (avoid
|
|
# checking disks of different VMs in the one timeframe)
|
|
time.sleep(0.5)
|
|
|
|
processed.add(domain.UUIDString())
|
|
|
|
for vm in self.vms - processed:
|
|
# stop watching for turned off VMs
|
|
LOG.info("The VM %s is shutdown now. Stop watching for it's "
|
|
"resources." % vm)
|
|
self.vms.remove(vm)
|
|
|
|
time.sleep(1)
|
|
|
|
for worker in workers:
|
|
worker.join()
|
|
|
|
def watch(self):
|
|
|
|
vms_t = threading.Thread(target=self._watch_for_vms)
|
|
vms_t.start()
|
|
|
|
checker_t = threading.Thread(target=self._check_resources)
|
|
checker_t.start()
|
|
|
|
try:
|
|
while True:
|
|
time.sleep(.1)
|
|
except KeyboardInterrupt:
|
|
self._stop_event.set()
|
|
vms_t.join()
|
|
checker_t.join()
|
|
self._conn.close()
|
|
|
|
|
|
def main():
|
|
if len(sys.argv) not in (1, 2):
|
|
print("The script expects one argument - a path to config in json "
|
|
"format.")
|
|
exit(1)
|
|
elif len(sys.argv) == 2:
|
|
if sys.argv[1] in ("--help", "help"):
|
|
print(__doc__)
|
|
exit(0)
|
|
|
|
try:
|
|
with open(sys.argv[1]) as f:
|
|
config = json.loads(f)
|
|
except:
|
|
print("Failed to load json from %s." % sys.argv[1])
|
|
raise
|
|
else:
|
|
config = {}
|
|
config = set_config_defaults(config)
|
|
|
|
handler = logging.StreamHandler()
|
|
handler.setFormatter(
|
|
logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
|
|
LOG.addHandler(handler)
|
|
if config["debug"]:
|
|
LOG.setLevel(logging.DEBUG)
|
|
else:
|
|
LOG.setLevel(logging.INFO)
|
|
|
|
LOG.info("Loaded configuration:\n%s" % json.dumps(config, indent=4))
|
|
|
|
host = Host(config)
|
|
host.watch()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|