diff --git a/nova/pci/manager.py b/nova/pci/manager.py index ad015db5109b..ff161e38cafb 100644 --- a/nova/pci/manager.py +++ b/nova/pci/manager.py @@ -163,23 +163,47 @@ class PciDevTracker(object): for existed in self.pci_devs: if existed.address in exist_addrs - new_addrs: + # Remove previously tracked PCI devices that are either + # no longer reported by the hypervisor or have been removed + # from the pci whitelist. try: existed.remove() except exception.PciDeviceInvalidStatus as e: - LOG.warning("Trying to remove device with %(status)s " + LOG.warning("Unable to remove device with %(status)s " "ownership %(instance_uuid)s because of " - "%(pci_exception)s", + "%(pci_exception)s. " + "Check your [pci]passthrough_whitelist " + "configuration to make sure this allocated " + "device is whitelisted. If you have removed " + "the device from the whitelist intentionally " + "or the device is no longer available on the " + "host you will need to delete the server or " + "migrate it to another host to silence this " + "warning.", {'status': existed.status, 'instance_uuid': existed.instance_uuid, 'pci_exception': e.format_message()}) - # Note(yjiang5): remove the device by force so that - # db entry is cleaned in next sync. - existed.status = fields.PciDeviceStatus.REMOVED + # NOTE(sean-k-mooney): the device may not be tracked for + # two reasons: first the device could have been removed + # from the host or second the whitelist could have been + # updated. While force removing may seam reasonable, if + # the device is allocated to a vm, force removing the + # device entry from the resource tracker can prevent the vm + # from rebooting. If the PCI device was removed due to an + # update to the PCI whitelist which was later reverted, + # removing the entry from the database and adding it back + # later may lead to the scheduler incorrectly selecting + # this host and the ResourceTracker assigning the PCI + # device to a second vm. To prevent this bug we skip + # deleting the device from the db in this iteration and + # will try again on the next sync. + continue else: # Note(yjiang5): no need to update stats if an assigned # device is hot removed. self.stats.remove_device(existed) else: + # Update tracked devices. new_value = next((dev for dev in devices if dev['address'] == existed.address)) new_value['compute_node_id'] = self.node_id @@ -202,6 +226,7 @@ class PciDevTracker(object): else: existed.update_device(new_value) + # Track newly discovered devices. for dev in [dev for dev in devices if dev['address'] in new_addrs - exist_addrs]: dev['compute_node_id'] = self.node_id diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py index 5471bbc07765..5dea8f888640 100644 --- a/nova/tests/unit/pci/test_manager.py +++ b/nova/tests/unit/pci/test_manager.py @@ -312,6 +312,53 @@ class PciDevTrackerTestCase(test.NoDBTestCase): self.assertEqual(vfs, pf.child_devices) self.assertEqual(vfs[0].parent_device, pf) + def test_set_hvdev_remove_tree_maintained_with_allocations(self): + # Make sure the device tree is properly maintained when there are + # devices removed from the system that are allocated to vms. + + all_devs = fake_db_devs_tree[:] + self._create_tracker(all_devs) + # we start with 3 devices + self.assertEqual( + 3, + len([dev for dev in self.tracker.pci_devs + if dev.status != fields.PciDeviceStatus.REMOVED])) + # we then allocate one device + pci_requests_obj = self._create_pci_requests_object( + [{'count': 1, 'spec': [{'vendor_id': 'v2'}]}]) + # NOTE(sean-k-mooney): context, pci request, numa topology + claimed_dev = self.tracker.claim_instance( + mock.sentinel.context, pci_requests_obj, None)[0] + + self.tracker._set_hvdevs(all_devs) + # and assert that no devices were removed + self.assertEqual( + 0, + len([dev for dev in self.tracker.pci_devs + if dev.status == fields.PciDeviceStatus.REMOVED])) + # we then try to remove the allocated device from the set reported + # by the driver. + fake_pci_devs = [dev for dev in all_devs + if dev['address'] != claimed_dev.address] + with mock.patch("nova.pci.manager.LOG.warning") as log: + self.tracker._set_hvdevs(fake_pci_devs) + log.assert_called_once() + args = log.call_args_list[0][0] # args of first call + self.assertIn('Unable to remove device with', args[0]) + # and assert no devices are removed from the tracker + self.assertEqual( + 0, + len([dev for dev in self.tracker.pci_devs + if dev.status == fields.PciDeviceStatus.REMOVED])) + # free the device that was allocated and update tracker again + self.tracker._free_device(claimed_dev) + self.tracker._set_hvdevs(fake_pci_devs) + # and assert that one device is removed from the tracker + self.assertEqual( + 1, + len([dev for dev in self.tracker.pci_devs + if dev.status == fields.PciDeviceStatus.REMOVED])) + def test_set_hvdev_changed_stal(self): pci_requests_obj = self._create_pci_requests_object( [{'count': 1, 'spec': [{'vendor_id': 'v1'}]}])