nova-libvirt-compare.py: compare Nova state to hypervisor state
Change-Id: I80fb1ea4896fb4ce539fd67da20e34ec658ecdf5
This commit is contained in:
parent
cc3ed99079
commit
24e0bff88c
|
@ -0,0 +1,36 @@
|
|||
# Nova folder
|
||||
|
||||
this folder contains scripts that are related to Nova
|
||||
|
||||
## Compare Nova state to hypervisor state: `nova-libvirt-compare.py`
|
||||
|
||||
This retrieves all instances in a region (or all regions when called
|
||||
with `-a`), then compares that with the libvirt domains running on all
|
||||
hypervisor hosts in that region, and reports any differences.
|
||||
|
||||
### Usage
|
||||
|
||||
usage: nova-libvirt-compare.py [-h] [-a] [-l REMOTE_USER]
|
||||
[--no-note-incomplete]
|
||||
[--blindly-trust-host-keys] [-p PROCESSES] [-v]
|
||||
|
||||
Check for inconsistent state between Nova DB and hypervisors
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-a, --all-regions query all regions
|
||||
-l REMOTE_USER, --remote-user REMOTE_USER
|
||||
SSH remote username for connecting to hypervisors
|
||||
--no-note-incomplete Don't report incomplete instances
|
||||
--blindly-trust-host-keys
|
||||
Accept all SSH host keys. This enables man-in-the-
|
||||
middle attacks!
|
||||
-p PROCESSES, --processes PROCESSES
|
||||
Number of parallel processes connecting to hypervisors
|
||||
-v, --verbose verbose
|
||||
|
||||
### Example
|
||||
|
||||
$ ./nova-libvirt-compare.py
|
||||
Hypervisor zhdk0062.zhdk.cloud.switch.ch should know about bd384f32-5e05-43a5-a66e-fc11693a733b, but doesn't
|
||||
Instance ebd1c623-35c3-4385-998f-10a96ecfbcdf (state BUILD) has no hypervisor
|
|
@ -0,0 +1,373 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright (c) 2016 SWITCH http://www.switch.ch
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Author: Simon Leinen <simon.leinen@switch.ch>
|
||||
# Date: 2016-09-04
|
||||
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import re
|
||||
import paramiko
|
||||
import multiprocessing
|
||||
sys.path.append('../lib')
|
||||
import openstackapi
|
||||
|
||||
|
||||
def get_environ(key, verbose=False):
|
||||
if key not in os.environ:
|
||||
print("ERROR:", key, "not defined in environment")
|
||||
sys.exit(1)
|
||||
if verbose:
|
||||
if 'password' in key.lower():
|
||||
key_value = '*' * len(os.environ[key])
|
||||
else:
|
||||
key_value = os.environ[key]
|
||||
print(u"{}: {}".format(key, key_value))
|
||||
return os.environ[key]
|
||||
|
||||
|
||||
class LibvirtDomainInfo:
|
||||
def __init__(self, uuid):
|
||||
self.uuid = uuid
|
||||
self.info = {}
|
||||
|
||||
class HypervisorInfo:
|
||||
def __init__(self, hostname):
|
||||
self.hostname = hostname
|
||||
self.domains = {}
|
||||
self.errors = []
|
||||
def add_domain(self, dom):
|
||||
self.domains[dom.uuid] = dom
|
||||
|
||||
class ServerInfo:
|
||||
def __init__(self, nova_info):
|
||||
self.nova_info = nova_info
|
||||
|
||||
|
||||
def collect_hypervisor_information(nova, verbose=False,
|
||||
remote_user=None,
|
||||
blindly_trust_host_keys=False,
|
||||
processes=None):
|
||||
"""Collect domain information from libvirt hypervisors in a region
|
||||
|
||||
Arguments:
|
||||
|
||||
nova: nova_client instance for the region
|
||||
verbose: Whether to print messages about harmless actions, default: False
|
||||
remote_user: The user under which SSH tries to connect, default: None
|
||||
blindly_trust_host_keys: Allow MITM attacks, default: False
|
||||
|
||||
This function enumerates the hypervisors for the region, connects
|
||||
to each over SSH, and retrieves information about the libvirt
|
||||
domains running on the respective machine.
|
||||
|
||||
The results are returned as a dictionary that maps domain UUIDs -
|
||||
which should correspond to Nova instance UUIDs - to
|
||||
LibvirtDomainInfo objects which are populated using virsh dominfo.
|
||||
"""
|
||||
hyp = {}
|
||||
hypervisors = nova.hypervisors.list(detailed=True)
|
||||
pool = multiprocessing.Pool(processes=processes)
|
||||
|
||||
if hypervisors:
|
||||
mapped = pool.map(_get_hypervisor_info,
|
||||
map(lambda h: {
|
||||
'hostname': h.hypervisor_hostname,
|
||||
'verbose': verbose,
|
||||
'remote_user': remote_user,
|
||||
'blindly_trust_host_keys': blindly_trust_host_keys,
|
||||
},
|
||||
hypervisors))
|
||||
pool.close()
|
||||
pool.join()
|
||||
for h in mapped:
|
||||
if h.errors:
|
||||
print(u"Error getting domain information from {}".
|
||||
format(h.hostname))
|
||||
for err in h.errors:
|
||||
print(u" {}".format(err))
|
||||
hyp[h.hostname] = h
|
||||
return hyp
|
||||
|
||||
def _get_hypervisor_info(closure):
|
||||
return get_hypervisor_info(
|
||||
closure['hostname'],
|
||||
verbose =closure['verbose'],
|
||||
remote_user =closure['remote_user'],
|
||||
blindly_trust_host_keys=closure['blindly_trust_host_keys'],
|
||||
)
|
||||
|
||||
def get_hypervisor_info(hostname,
|
||||
verbose=False,
|
||||
remote_user=None,
|
||||
blindly_trust_host_keys=False):
|
||||
"""Get domain information from a single libvirt hypervisor
|
||||
|
||||
The results are returned as a HypervisorInfo object.
|
||||
|
||||
Arguments:
|
||||
|
||||
hostname: Hypervisor hostname from Nova.hypervisors.list(detailed=True)
|
||||
verbose: Whether to print messages about harmless actions, default: False
|
||||
remote_user: The user under which SSH tries to connect, default: None
|
||||
blindly_trust_host_keys: Allow MITM attacks, default: False
|
||||
|
||||
This function connects to the given hypervisor over SSH and
|
||||
retrieves information about the libvirt domains known there.
|
||||
"""
|
||||
|
||||
h = HypervisorInfo(hostname)
|
||||
ssh = paramiko.SSHClient()
|
||||
if blindly_trust_host_keys:
|
||||
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
|
||||
try:
|
||||
ssh.connect(h.hostname, username=remote_user)
|
||||
_, stdout, stderr = ssh.exec_command("virsh list --uuid --all")
|
||||
for line in stdout:
|
||||
uuid = line.rstrip()
|
||||
if uuid == '':
|
||||
pass
|
||||
else:
|
||||
if verbose:
|
||||
print("Found virsh instance {} on {}".format(
|
||||
uuid, h.hostname))
|
||||
dom = LibvirtDomainInfo(uuid)
|
||||
h.add_domain(dom)
|
||||
for uuid, dom in h.domains.iteritems():
|
||||
_, stdout, stderr = ssh.exec_command("virsh dominfo {}".format(uuid))
|
||||
dominfo = re.compile("^([^:]*):\s*(.*)$")
|
||||
for line in stdout:
|
||||
if line == "\n":
|
||||
# Ignore stupid trailing empty line
|
||||
pass
|
||||
else:
|
||||
m = dominfo.match(line)
|
||||
if m:
|
||||
dom.info[m.group(1).lower()] = m.group(2)
|
||||
else:
|
||||
h.errors.append(u"Cannot understand line {} in virsh dominfo output".
|
||||
format(line))
|
||||
except paramiko.SSHException as e:
|
||||
h.errors.append(u"Error SSHing to {}:\n {}".
|
||||
format(h.hostname, e.message))
|
||||
except:
|
||||
h.errors.append(u"Unexpected error SSHing to {}:\n {}".
|
||||
format(h.hostname, sys.exc_info()[0]))
|
||||
ssh.close()
|
||||
return h
|
||||
|
||||
def collect_server_information(nova, verbose=False):
|
||||
"""Collect instance information from Nova in a region
|
||||
|
||||
Arguments:
|
||||
|
||||
nova: nova_client instance for the region
|
||||
verbose: Whether to print messages about harmless actions, default: False
|
||||
|
||||
This function enumerates the instances from the Nova compute
|
||||
service in a region.
|
||||
|
||||
The results are returned as a dictionary that maps domain UUIDs -
|
||||
which should correspond to Nova instance UUIDs - to ServerInfo
|
||||
objects which contain server detail information.
|
||||
"""
|
||||
srv = {}
|
||||
|
||||
servers = nova.servers.list(
|
||||
detailed=True,
|
||||
search_opts={'all_tenants': True})
|
||||
|
||||
while servers:
|
||||
last_server = None
|
||||
for server in servers:
|
||||
last_server_id = server.id
|
||||
s = ServerInfo(server)
|
||||
srv[server.id] = s
|
||||
if verbose:
|
||||
print(u"Found server {} on hypervisor {} ({})".format(
|
||||
server.id,
|
||||
server._info['OS-EXT-SRV-ATTR:hypervisor_hostname'],
|
||||
server.status,
|
||||
))
|
||||
servers = nova.servers.list(
|
||||
detailed=True,
|
||||
marker=last_server_id,
|
||||
search_opts={'all_tenants': True})
|
||||
return srv
|
||||
|
||||
def report_server_hypervisor_inconsistencies(srv, hyp, verbose=False, note_incomplete=True):
|
||||
"""Detect and report discrepancies between Nova and hypervisor views
|
||||
|
||||
Arguments:
|
||||
|
||||
srv: Dictionary of Nova instances as returned by collect_server_information()
|
||||
hyp: Dictionary of Hypervisor information as returned by collect_hypervisor_information()
|
||||
verbose: Whether to print messages about harmless actions, default: False
|
||||
note_incomplete: Whether to report instances without hypervisors, default: True
|
||||
|
||||
The following types of discrepancies are detected and reported:
|
||||
|
||||
* an instance exists in Nova, but is not on any hypervisor
|
||||
* an instance exists on a hypervisor, but is unknown to Nova
|
||||
* an instance exists on a hypervisor, but Nova thinks it should be on a different one
|
||||
* an instance has incompatible states between Nova and the hypervisor
|
||||
"""
|
||||
state_mapping = {
|
||||
'ACTIVE': 'running',
|
||||
'SUSPENDED': 'shut off',
|
||||
'SHUTOFF': 'shut off',
|
||||
'PAUSED': 'paused',
|
||||
}
|
||||
for uuid, s in srv.iteritems():
|
||||
nova_status = s.nova_info.status
|
||||
hypervisor_name = s.nova_info._info['OS-EXT-SRV-ATTR:hypervisor_hostname']
|
||||
if hypervisor_name is None:
|
||||
if note_incomplete:
|
||||
print(u"Instance {} (Nova status {}) has no hypervisor".
|
||||
format(uuid, nova_status))
|
||||
elif hypervisor_name not in hyp:
|
||||
print(u"Instance {} (Nova status {}) on unknown hypervisor {}".
|
||||
format(uuid, nova_status, hypervisor_name))
|
||||
else:
|
||||
h = hyp[hypervisor_name]
|
||||
if uuid in h.domains:
|
||||
dom = h.domains[uuid]
|
||||
dom_state = dom.info['state']
|
||||
if verbose:
|
||||
print(u"Instance {} (Nova state {}) hypervisor {} state {}".
|
||||
format(uuid, nova_status, hypervisor_name, dom_state))
|
||||
if nova_status in state_mapping \
|
||||
and dom_state == state_mapping[nova_status]:
|
||||
pass
|
||||
else:
|
||||
print((u"Possible inconsistency: Instance {} (Nova status {})\n"
|
||||
+u" On hypervisor {}, it has state {}").
|
||||
format(uuid, nova_status,
|
||||
hypervisor_name, dom_state))
|
||||
elif h.errors:
|
||||
# It's not worth complaining. The problem is that
|
||||
# there were errors trying to get domain information
|
||||
# from the hypervisor, and that has been signaled
|
||||
# already.
|
||||
pass
|
||||
else:
|
||||
print(u"Hypervisor {} should know about {}, but doesn't".
|
||||
format(hypervisor_name, uuid))
|
||||
for hypervisor_name, h in hyp.iteritems():
|
||||
for uuid, s in h.domains.iteritems():
|
||||
if not uuid in srv:
|
||||
print(u"Hypervisor {} contains unknown instance {}".
|
||||
format(hypervisor_name, uuid))
|
||||
else:
|
||||
nova_srv = srv[uuid]
|
||||
nova_status = nova_srv.nova_info.status
|
||||
nova_hyp_name = nova_srv.nova_info._info['OS-EXT-SRV-ATTR:hypervisor_hostname']
|
||||
if nova_hyp_name == hypervisor_name:
|
||||
pass
|
||||
elif nova_hyp_name not in hyp:
|
||||
print((u"Instance {} (Nova status {}):\n"
|
||||
+u" Found on hypervisor {} (state: {})\n"
|
||||
+u" Should be running on {}, which is not known.").
|
||||
format(uuid, nova_status,
|
||||
hypervisor_name, s.info['state'],
|
||||
nova_hyp_name))
|
||||
else:
|
||||
nova_hyp = hyp[nova_hyp_name]
|
||||
if uuid not in nova_hyp.domains:
|
||||
print((u"Instance {} (Nova status {}):\n"
|
||||
+u" Found on hypervisor {} (state: {})\n"
|
||||
+u" Should be running on {}, but unknown there.").
|
||||
format(uuid, nova_status,
|
||||
hypervisor_name, s.info['state'],
|
||||
nova_hyp_name))
|
||||
else:
|
||||
nova_s = nova_hyp.domains[uuid]
|
||||
print((u"Instance {} (Nova status {}):\n"
|
||||
+u" Found on hypervisor {} (state: {})\n"
|
||||
+u" Should be running on {}, and it is (state: {}).").
|
||||
format(uuid, nova_status,
|
||||
hypervisor_name, s.info['state'],
|
||||
nova_hyp_name, nova_s.info['state']))
|
||||
|
||||
def main():
|
||||
"""Check for state inconsistencies between Nova DB and hypervisors
|
||||
|
||||
Go through nova-compute hosts, and check if the status of
|
||||
VMs running there corresponds to the state of the Nova database.
|
||||
|
||||
Only supports libvirt hypervisors.
|
||||
|
||||
Requires SSH access to all hypervisor hosts. The remote user on
|
||||
the hypervisor must have sufficient privileges to run "virsh".
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Check for inconsistent state between Nova DB and hypervisors")
|
||||
parser.add_argument(
|
||||
'-a', '--all-regions', help='query all regions', action='store_true')
|
||||
parser.add_argument(
|
||||
'-l', '--remote-user', type=str,
|
||||
help='SSH remote username for connecting to hypervisors')
|
||||
parser.add_argument(
|
||||
'--no-note-incomplete', help='Don\'t report incomplete instances', action='store_true')
|
||||
parser.add_argument(
|
||||
'-b', '--blindly-trust-host-keys', help='Accept all SSH host keys. This enables man-in-the-middle attacks!', action='store_true')
|
||||
parser.add_argument(
|
||||
'-p', '--processes', type=int, default=20,
|
||||
help='Number of parallel processes connecting to hypervisors')
|
||||
parser.add_argument('-v', '--verbose', help='verbose', action='store_true')
|
||||
if len(sys.argv) < 1:
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
args = parser.parse_args()
|
||||
|
||||
# get OS_* environment variables
|
||||
os_auth_url = get_environ('OS_AUTH_URL', args.verbose)
|
||||
os_username = get_environ('OS_USERNAME', args.verbose)
|
||||
os_password = get_environ('OS_PASSWORD', args.verbose)
|
||||
os_tenant_name = get_environ('OS_TENANT_NAME', args.verbose)
|
||||
os_region_name = get_environ('OS_REGION_NAME', args.verbose)
|
||||
|
||||
# Openstack clients API
|
||||
api = openstackapi.OpenstackAPI(os_auth_url, os_username, os_password, os_project_name=os_tenant_name)
|
||||
|
||||
# regions to use
|
||||
region_names = [os_region_name]
|
||||
if args.all_regions:
|
||||
# all regions available
|
||||
region_names = api.get_all_regions()
|
||||
|
||||
for region in region_names:
|
||||
# get Nova client for the region
|
||||
nova = api.nova(region)
|
||||
|
||||
hyp = collect_hypervisor_information(
|
||||
nova,
|
||||
verbose=args.verbose,
|
||||
remote_user=args.remote_user,
|
||||
blindly_trust_host_keys=args.blindly_trust_host_keys,
|
||||
processes=args.processes)
|
||||
srv = collect_server_information(nova, verbose=args.verbose)
|
||||
|
||||
report_server_hypervisor_inconsistencies(
|
||||
srv, hyp,
|
||||
verbose=args.verbose,
|
||||
note_incomplete=not args.no_note_incomplete)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
Loading…
Reference in New Issue