osops-tools-contrib/nova/nova-libvirt-compare.py

384 lines
15 KiB
Python
Executable File

#!/usr/bin/env python
#
# Copyright (c) 2016 SWITCH http://www.switch.ch
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Author: Simon Leinen <simon.leinen@switch.ch>
# Date: 2016-09-04
from __future__ import print_function
import sys
import os
import argparse
import re
import paramiko
import multiprocessing
sys.path.append('../lib')
import openstackapi
def get_environ(key, verbose=False):
if key not in os.environ:
print("ERROR:", key, "not defined in environment")
sys.exit(1)
if verbose:
if 'password' in key.lower():
key_value = '*' * len(os.environ[key])
else:
key_value = os.environ[key]
print(u"{}: {}".format(key, key_value))
return os.environ[key]
class LibvirtDomainInfo:
def __init__(self, uuid):
self.uuid = uuid
self.info = {}
class HypervisorInfo:
def __init__(self, hostname):
self.hostname = hostname
self.domains = {}
self.errors = []
def add_domain(self, dom):
self.domains[dom.uuid] = dom
class ServerInfo:
def __init__(self, nova_info):
self.nova_info = nova_info
def collect_hypervisor_information(nova, verbose=False,
remote_user=None,
blindly_trust_host_keys=False,
processes=None):
"""Collect domain information from libvirt hypervisors in a region
Arguments:
nova: nova_client instance for the region
verbose: Whether to print messages about harmless actions, default: False
remote_user: The user under which SSH tries to connect, default: None
blindly_trust_host_keys: Allow MITM attacks, default: False
This function enumerates the hypervisors for the region, connects
to each over SSH, and retrieves information about the libvirt
domains running on the respective machine.
The results are returned as a dictionary that maps domain UUIDs -
which should correspond to Nova instance UUIDs - to
LibvirtDomainInfo objects which are populated using virsh dominfo.
"""
hyp = {}
hypervisors = nova.hypervisors.list(detailed=True)
pool = multiprocessing.Pool(processes=processes)
if hypervisors:
mapped = pool.map(_get_hypervisor_info,
map(lambda h: {
'hostname': h.hypervisor_hostname,
'verbose': verbose,
'remote_user': remote_user,
'blindly_trust_host_keys': blindly_trust_host_keys,
},
hypervisors))
pool.close()
pool.join()
for h in mapped:
if h.errors:
print(u"Error getting domain information from {}".
format(h.hostname))
for err in h.errors:
print(u" {}".format(err))
hyp[h.hostname] = h
return hyp
def _get_hypervisor_info(closure):
return get_hypervisor_info(
closure['hostname'],
verbose =closure['verbose'],
remote_user =closure['remote_user'],
blindly_trust_host_keys=closure['blindly_trust_host_keys'],
)
def get_hypervisor_info(hostname,
verbose=False,
remote_user=None,
blindly_trust_host_keys=False):
"""Get domain information from a single libvirt hypervisor
The results are returned as a HypervisorInfo object.
Arguments:
hostname: Hypervisor hostname from Nova.hypervisors.list(detailed=True)
verbose: Whether to print messages about harmless actions, default: False
remote_user: The user under which SSH tries to connect, default: None
blindly_trust_host_keys: Allow MITM attacks, default: False
This function connects to the given hypervisor over SSH and
retrieves information about the libvirt domains known there.
"""
h = HypervisorInfo(hostname)
ssh = paramiko.SSHClient()
if blindly_trust_host_keys:
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try:
ssh.connect(h.hostname, username=remote_user)
_, stdout, stderr = ssh.exec_command("virsh list --uuid --all")
for line in stdout:
uuid = line.rstrip()
if uuid == '':
pass
else:
if verbose:
print("Found virsh instance {} on {}".format(
uuid, h.hostname))
dom = LibvirtDomainInfo(uuid)
h.add_domain(dom)
for uuid, dom in h.domains.iteritems():
_, stdout, stderr = ssh.exec_command("virsh dominfo {}".format(uuid))
dominfo = re.compile("^([^:]*):\s*(.*)$")
for line in stdout:
if line == "\n":
# Ignore stupid trailing empty line
pass
else:
m = dominfo.match(line)
if m:
dom.info[m.group(1).lower()] = m.group(2)
else:
h.errors.append(u"Cannot understand line {} in virsh dominfo output".
format(line))
except paramiko.SSHException as e:
h.errors.append(u"Error SSHing to {}:\n {}".
format(h.hostname, e.message))
except:
h.errors.append(u"Unexpected error SSHing to {}:\n {}".
format(h.hostname, sys.exc_info()[0]))
ssh.close()
return h
def collect_server_information(nova, verbose=False):
"""Collect instance information from Nova in a region
Arguments:
nova: nova_client instance for the region
verbose: Whether to print messages about harmless actions, default: False
This function enumerates the instances from the Nova compute
service in a region.
The results are returned as a dictionary that maps domain UUIDs -
which should correspond to Nova instance UUIDs - to ServerInfo
objects which contain server detail information.
"""
srv = {}
servers = nova.servers.list(
detailed=True,
search_opts={'all_tenants': True})
while servers:
last_server = None
for server in servers:
last_server_id = server.id
s = ServerInfo(server)
srv[server.id] = s
if verbose:
print(u"Found server {} on hypervisor {} ({})".format(
server.id,
server._info['OS-EXT-SRV-ATTR:hypervisor_hostname'],
server.status,
))
servers = nova.servers.list(
detailed=True,
marker=last_server_id,
search_opts={'all_tenants': True})
return srv
def instance_state_needs_hypervisor(state):
"""Return true if an instance in this state should have a hypervisor.
"""
if state == 'SHELVED_OFFLOADED':
return False
else:
return True
def report_server_hypervisor_inconsistencies(srv, hyp, verbose=False, note_incomplete=True):
"""Detect and report discrepancies between Nova and hypervisor views
Arguments:
srv: Dictionary of Nova instances as returned by collect_server_information()
hyp: Dictionary of Hypervisor information as returned by collect_hypervisor_information()
verbose: Whether to print messages about harmless actions, default: False
note_incomplete: Whether to report instances without hypervisors, default: True
The following types of discrepancies are detected and reported:
* an instance exists in Nova, but is not on any hypervisor
* an instance exists on a hypervisor, but is unknown to Nova
* an instance exists on a hypervisor, but Nova thinks it should be on a different one
* an instance has incompatible states between Nova and the hypervisor
"""
state_mapping = {
'ACTIVE': 'running',
'VERIFY_RESIZE': 'running',
'SUSPENDED': 'shut off',
'SHUTOFF': 'shut off',
'SHELVED': 'shut off',
'PAUSED': 'paused',
}
for uuid, s in srv.iteritems():
nova_status = s.nova_info.status
hypervisor_name = s.nova_info._info['OS-EXT-SRV-ATTR:hypervisor_hostname']
if hypervisor_name is None:
if instance_state_needs_hypervisor(nova_status) and note_incomplete:
print(u"Instance {} (Nova status {}) has no hypervisor".
format(uuid, nova_status))
elif hypervisor_name not in hyp:
print(u"Instance {} (Nova status {}) on unknown hypervisor {}".
format(uuid, nova_status, hypervisor_name))
else:
h = hyp[hypervisor_name]
if uuid in h.domains:
dom = h.domains[uuid]
dom_state = dom.info['state']
if verbose:
print(u"Instance {} (Nova state {}) hypervisor {} state {}".
format(uuid, nova_status, hypervisor_name, dom_state))
if nova_status in state_mapping \
and dom_state == state_mapping[nova_status]:
pass
else:
print((u"Possible inconsistency: Instance {} (Nova status {})\n"
+u" On hypervisor {}, it has state {}").
format(uuid, nova_status,
hypervisor_name, dom_state))
elif h.errors:
# It's not worth complaining. The problem is that
# there were errors trying to get domain information
# from the hypervisor, and that has been signaled
# already.
pass
else:
print(u"Hypervisor {} should know about {}, but doesn't".
format(hypervisor_name, uuid))
for hypervisor_name, h in hyp.iteritems():
for uuid, s in h.domains.iteritems():
if not uuid in srv:
print(u"Hypervisor {} contains unknown instance {}".
format(hypervisor_name, uuid))
else:
nova_srv = srv[uuid]
nova_status = nova_srv.nova_info.status
nova_hyp_name = nova_srv.nova_info._info['OS-EXT-SRV-ATTR:hypervisor_hostname']
if nova_hyp_name == hypervisor_name:
pass
elif nova_hyp_name not in hyp:
print((u"Instance {} (Nova status {}):\n"
+u" Found on hypervisor {} (state: {})\n"
+u" Should be running on {}, which is not known.").
format(uuid, nova_status,
hypervisor_name, s.info['state'],
nova_hyp_name))
else:
nova_hyp = hyp[nova_hyp_name]
if uuid not in nova_hyp.domains:
print((u"Instance {} (Nova status {}):\n"
+u" Found on hypervisor {} (state: {})\n"
+u" Should be running on {}, but unknown there.").
format(uuid, nova_status,
hypervisor_name, s.info['state'],
nova_hyp_name))
else:
nova_s = nova_hyp.domains[uuid]
print((u"Instance {} (Nova status {}):\n"
+u" Found on hypervisor {} (state: {})\n"
+u" Should be running on {}, and it is (state: {}).").
format(uuid, nova_status,
hypervisor_name, s.info['state'],
nova_hyp_name, nova_s.info['state']))
def main():
"""Check for state inconsistencies between Nova DB and hypervisors
Go through nova-compute hosts, and check if the status of
VMs running there corresponds to the state of the Nova database.
Only supports libvirt hypervisors.
Requires SSH access to all hypervisor hosts. The remote user on
the hypervisor must have sufficient privileges to run "virsh".
"""
parser = argparse.ArgumentParser(
description="Check for inconsistent state between Nova DB and hypervisors")
parser.add_argument(
'-a', '--all-regions', help='query all regions', action='store_true')
parser.add_argument(
'-l', '--remote-user', type=str,
help='SSH remote username for connecting to hypervisors')
parser.add_argument(
'--no-note-incomplete', help='Don\'t report incomplete instances', action='store_true')
parser.add_argument(
'-b', '--blindly-trust-host-keys', help='Accept all SSH host keys. This enables man-in-the-middle attacks!', action='store_true')
parser.add_argument(
'-p', '--processes', type=int, default=20,
help='Number of parallel processes connecting to hypervisors')
parser.add_argument('-v', '--verbose', help='verbose', action='store_true')
if len(sys.argv) < 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
# get OS_* environment variables
os_auth_url = get_environ('OS_AUTH_URL', args.verbose)
os_username = get_environ('OS_USERNAME', args.verbose)
os_password = get_environ('OS_PASSWORD', args.verbose)
os_tenant_name = get_environ('OS_TENANT_NAME', args.verbose)
os_region_name = get_environ('OS_REGION_NAME', args.verbose)
# Openstack clients API
api = openstackapi.OpenstackAPI(os_auth_url, os_username, os_password, os_project_name=os_tenant_name)
# regions to use
region_names = [os_region_name]
if args.all_regions:
# all regions available
region_names = api.get_all_regions()
for region in region_names:
# get Nova client for the region
nova = api.nova(region)
hyp = collect_hypervisor_information(
nova,
verbose=args.verbose,
remote_user=args.remote_user,
blindly_trust_host_keys=args.blindly_trust_host_keys,
processes=args.processes)
srv = collect_server_information(nova, verbose=args.verbose)
report_server_hypervisor_inconsistencies(
srv, hyp,
verbose=args.verbose,
note_incomplete=not args.no_note_incomplete)
if __name__ == '__main__':
main()