Add handling for offlined CPUs to the nova libvirt driver.

When a host system has CPUs that are offlined via CPU hotplug, nova
fails to start an instance on the host. Currently the libvirt driver
does not check if the CPUs it selects for running the instance are
online or offline. As a result, CPUs that are offline can become
part of the cpuset that is passed to libvirt. Libvirt presents
the following error in this case:

libvirtError: Invalid value '8-15,24-31' for 'cpuset.cpus': Invalid
argument

With this fix, the nova libvirt driver makes use of the getCPUMap
API in libvirt to determine if CPUs are online or offline. When
selecting a CPU set for running an instance, offline CPUs are
masked out.

Rationale: on server platforms like s390, it is common to have offlined
CPUs on a host as the platform offers capabilities to run multiple host
operating systems (e.g. multiple KVM hypervisors / compute nodes). CPUs
can dynamically be assigned to the different host operating systems, so
it is common to have offlined CPUs on a compute node.

Change-Id: I506ebc9608e17e02d807e5002fe867309c22aafc
Closes-Bug: #1417144
This commit is contained in:
Alexander Schmidt 2015-01-30 11:56:24 +01:00
parent 71d116eaef
commit 0696a5cd5f
4 changed files with 53 additions and 9 deletions

View File

@ -827,6 +827,10 @@ class Connection(object):
def registerCloseCallback(self, cb, opaque):
    """Accept a close callback and ignore it (fake connection no-op)."""
    return None
def getCPUMap(self):
    """Return a spoofed CPU map that reports two CPUs, both online."""
    # Same tuple layout the caller unpacks: (CPU count, per-CPU online
    # flags, number of online CPUs).
    online_flags = [True, True]
    cpu_count = len(online_flags)
    return (cpu_count, online_flags, cpu_count)
def getCapabilities(self):
"""Return spoofed capabilities."""
return '''<capabilities>

View File

@ -1234,7 +1234,9 @@ class LibvirtConnTestCase(test.TestCase):
mock.patch.object(
random, 'choice', side_effect=lambda cells: cells[0]),
mock.patch.object(pci_manager, "get_instance_pci_devs",
return_value=[pci_device])):
return_value=[pci_device]),
mock.patch.object(host.Host, 'get_online_cpus',
return_value=set(range(8)))):
cfg = conn._get_guest_config(instance_ref, [], {}, disk_info)
self.assertIsNone(instance_ref.numa_topology)
self.assertEqual(set([2, 3]), cfg.cpuset)
@ -1279,6 +1281,8 @@ class LibvirtConnTestCase(test.TestCase):
host.Host, "get_capabilities", return_value=caps),
mock.patch.object(
hardware, 'get_vcpu_pin_set', return_value=set([3])),
mock.patch.object(host.Host, 'get_online_cpus',
return_value=set(range(8))),
mock.patch.object(pci_manager, "get_instance_pci_devs",
return_value=[pci_device])):
cfg = conn._get_guest_config(instance_ref, [], {}, disk_info)
@ -1402,9 +1406,12 @@ class LibvirtConnTestCase(test.TestCase):
mock.patch.object(
hardware, 'get_vcpu_pin_set', return_value=set([2, 3])),
mock.patch.object(
random, 'choice', side_effect=lambda cells: cells[0])
random, 'choice', side_effect=lambda cells: cells[0]),
mock.patch.object(host.Host, 'get_online_cpus',
return_value=set(range(8)))
) as (has_min_version_mock, get_host_cap_mock,
get_vcpu_pin_set_mock, choice_mock):
get_vcpu_pin_set_mock, choice_mock,
get_online_cpus_mock):
cfg = drvr._get_guest_config(instance_ref, [], {}, disk_info)
# NOTE(ndipanov): we make sure that pin_set was taken into account
# when choosing viable cells
@ -1498,7 +1505,9 @@ class LibvirtConnTestCase(test.TestCase):
return_value=caps),
mock.patch.object(
hardware, 'get_vcpu_pin_set',
return_value=set([2, 3, 4, 5]))
return_value=set([2, 3, 4, 5])),
mock.patch.object(host.Host, 'get_online_cpus',
return_value=set(range(8))),
):
cfg = drvr._get_guest_config(instance_ref, [], {}, disk_info)
self.assertIsNone(cfg.cpuset)
@ -1575,6 +1584,8 @@ class LibvirtConnTestCase(test.TestCase):
return_value=True),
mock.patch.object(host.Host, "get_capabilities",
return_value=caps),
mock.patch.object(host.Host, 'get_online_cpus',
return_value=set(range(8))),
):
cfg = drvr._get_guest_config(instance_ref, [], {}, disk_info)
self.assertIsNone(cfg.cpuset)
@ -1649,7 +1660,9 @@ class LibvirtConnTestCase(test.TestCase):
mock.patch.object(host.Host, 'has_min_version',
return_value=True),
mock.patch.object(host.Host, "get_capabilities",
return_value=caps)
return_value=caps),
mock.patch.object(host.Host, 'get_online_cpus',
return_value=set(range(8))),
):
cfg = conn._get_guest_config(instance_ref, [], {}, disk_info)
self.assertIsNone(cfg.cpuset)
@ -9543,7 +9556,10 @@ class LibvirtConnTestCase(test.TestCase):
mock.patch.object(host.Host, "get_capabilities",
return_value=caps),
mock.patch.object(
hardware, 'get_vcpu_pin_set', return_value=set([0, 1, 3]))
hardware, 'get_vcpu_pin_set',
return_value=set([0, 1, 3, 4, 5])),
mock.patch.object(host.Host, 'get_online_cpus',
return_value=set([0, 1, 2, 3, 6])),
):
got_topo = drvr._get_host_numa_topology()
got_topo_dict = got_topo._to_dict()

View File

@ -4736,6 +4736,11 @@ class LibvirtDriver(driver.ComputeDriver):
cells = []
allowed_cpus = hardware.get_vcpu_pin_set()
online_cpus = self._host.get_online_cpus()
if allowed_cpus:
allowed_cpus &= online_cpus
else:
allowed_cpus = online_cpus
for cell in topology.cells:
cpuset = set(cpu.id for cpu in cell.cpus)
@ -4744,9 +4749,8 @@ class LibvirtDriver(driver.ComputeDriver):
if cpu.siblings else ()
for cpu in cell.cpus)
))
if allowed_cpus:
cpuset &= allowed_cpus
siblings = [sib & allowed_cpus for sib in siblings]
cpuset &= allowed_cpus
siblings = [sib & allowed_cpus for sib in siblings]
# Filter out singles and empty sibling sets that may be left
siblings = [sib for sib in siblings if len(sib) > 1]

View File

@ -585,6 +585,26 @@ class Host(object):
return doms
def get_online_cpus(self):
    """Return the IDs of the host CPUs that are currently online.

    Only the NUMA code paths call this, and those already require
    libvirt >= 1.0.4; getCPUMap() itself was introduced in
    libvirt 1.0.0.

    :returns: set of online CPU IDs; raises libvirtError on error
    """
    # getCPUMap() is unpacked as (CPU count, per-CPU online flags,
    # online count); the last element is not needed here.
    total, flags, _online = self.get_connection().getCPUMap()
    return set(idx for idx in range(total) if flags[idx])
def get_capabilities(self):
"""Returns the host capabilities information