From fb61e864b98dde4955ab29f7a39c9935fbd5ffd0 Mon Sep 17 00:00:00 2001
From: Matt Riedemann
Date: Fri, 28 Sep 2018 11:18:14 -0400
Subject: [PATCH] Fix InstanceNotFound during _destroy_evacuated_instances

The _destroy_evacuated_instances method on compute startup tries to
clean up guests on the hypervisor and allocations held against that
compute node resource provider by evacuated instances, but doesn't
take into account that those evacuated instances could have been
deleted in the meantime, which leads to a lazy-load InstanceNotFound
error that kills the startup of the compute service.

This change does two things in the _destroy_evacuated_instances
method:

1. Loads the evacuated instances with a read_deleted='yes' context
   when calling _get_instances_on_driver(). This should be fine since
   _get_instances_on_driver() is already returning deleted instances
   anyway (InstanceList.get_by_filters defaults to read deleted
   instances unless the filters tell it otherwise - which we don't
   in this case).

   This is needed so that things like driver.destroy() don't raise
   InstanceNotFound while lazy-loading fields on the instance.

2. Skips the call to remove_allocation_from_compute() if the
   evacuated instance is already deleted. If the instance is already
   deleted, its allocations should have been cleaned up by its
   hosting compute service (or the API).

The functional regression test is updated to show the bug is now
fixed.

Conflicts:
      nova/compute/manager.py

NOTE(mriedem): The conflict is due to not having change
I1073faca6760bff3da0aaf3e8357bd8e64854be3 in Pike.

Change-Id: I1f4b3540dd453650f94333b36d7504ba164192f7
Closes-Bug: #1794996
(cherry picked from commit 05cd8d128211adbbfb3cf5d626034ccd0f75a452)
(cherry picked from commit 0208d64397731afa829bc08cd7b3b6494f0f05d5)
(cherry picked from commit 6c7e53e21059f80325d728cf7dee2766da7a9471)
---
 nova/compute/manager.py                     | 32 +++++++++++++------
 .../regressions/test_bug_1794996.py         | 10 ++----
 nova/tests/unit/compute/test_compute.py     | 21 ++++++++----
 nova/tests/unit/compute/test_compute_mgr.py | 10 ++++--
 4 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 608db7306112..bd582a40419e 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -647,7 +647,14 @@ class ComputeManager(manager.Manager):
             return
         evacuations = {mig.instance_uuid: mig for mig in evacuations}
 
-        local_instances = self._get_instances_on_driver(context)
+        # The instances might be deleted in which case we need to avoid
+        # InstanceNotFound being raised from lazy-loading fields on the
+        # instances while cleaning up this host.
+        read_deleted_context = context.elevated(read_deleted='yes')
+        # TODO(mriedem): We could optimize by pre-loading the joined fields
+        # we know we'll use, like info_cache and flavor. We can also replace
+        # this with a generic solution: https://review.openstack.org/575190/
+        local_instances = self._get_instances_on_driver(read_deleted_context)
         evacuated = [inst for inst in local_instances
                      if inst.uuid in evacuations]
 
@@ -692,15 +699,20 @@ class ComputeManager(manager.Manager):
                 continue
             cn_uuid = compute_nodes[migration.source_node]
 
-            my_resources = scheduler_utils.resources_from_flavor(
-                instance, instance.flavor)
-            res = self.reportclient.remove_provider_from_instance_allocation(
-                instance.uuid, cn_uuid, instance.user_id,
-                instance.project_id, my_resources)
-            if not res:
-                LOG.error("Failed to clean allocation of evacuated instance "
-                          "on the source node %s",
-                          cn_uuid, instance=instance)
+            # If the instance was deleted in the interim, assume its
+            # allocations were properly cleaned up (either by its hosting
+            # compute service or the API).
+            if not instance.deleted:
+                my_resources = scheduler_utils.resources_from_flavor(
+                    instance, instance.flavor)
+                res = self.reportclient.\
+                    remove_provider_from_instance_allocation(
+                        instance.uuid, cn_uuid, instance.user_id,
+                        instance.project_id, my_resources)
+                if not res:
+                    LOG.error("Failed to clean allocation of evacuated "
+                              "instance on the source node %s",
+                              cn_uuid, instance=instance)
 
             migration.status = 'completed'
             migration.save()
diff --git a/nova/tests/functional/regressions/test_bug_1794996.py b/nova/tests/functional/regressions/test_bug_1794996.py
index f6b20691edb7..f2133738d2db 100644
--- a/nova/tests/functional/regressions/test_bug_1794996.py
+++ b/nova/tests/functional/regressions/test_bug_1794996.py
@@ -12,7 +12,6 @@
 from oslo_log import log as logging
 
-from nova import exception
 from nova.tests.functional import test_servers
 
 LOG = logging.getLogger(__name__)
 
@@ -149,7 +148,8 @@ class TestEvacuateDeleteServerRestartOriginalCompute(
         instance. Before the bug is fixed, the original compute fails to start
         because lazy-loading the instance.flavor on the deleted instance,
         which is needed to cleanup allocations from the source host, raises
-        InstanceNotFound.
+        InstanceNotFound. After the bug is fixed, the original source host
+        compute service starts up.
         """
         source_hostname = self.compute1.host
         dest_hostname = self.compute2.host
@@ -197,8 +197,4 @@ class TestEvacuateDeleteServerRestartOriginalCompute(
         self._delete_and_check_allocations(server)
 
         # restart the source compute
-        # FIXME(mriedem): This is bug 1794996 where we try to lazy-load the
-        # instance.flavor from the deleted instance which causes the compute
-        # startup to fail.
-        self.assertRaises(exception.InstanceNotFound,
-                          self.restart_compute_service, self.compute1)
+        self.restart_compute_service(self.compute1)
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index e67476123cb1..b78adb5bfe9d 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -7170,6 +7170,7 @@ class ComputeTestCase(BaseTestCase,
         instance = self._create_fake_instance_obj({'host': 'someotherhost'})
         self.compute._instance_update(self.context, instance, vcpus=4)
 
+    @mock.patch('nova.context.RequestContext.elevated')
     @mock.patch.object(compute_manager.ComputeManager,
                        '_get_instances_on_driver')
     @mock.patch.object(compute_manager.ComputeManager,
@@ -7181,8 +7182,10 @@ class ComputeTestCase(BaseTestCase,
     @mock.patch('nova.objects.Migration.save')
     def test_destroy_evacuated_instance_on_shared_storage(self, mock_save,
             mock_get_filter, mock_destroy, mock_is_inst, mock_get_blk,
-            mock_get_inst):
+            mock_get_inst, mock_elevated):
         fake_context = context.get_admin_context()
+        read_deleted_context = fake_context.elevated(read_deleted='yes')
+        mock_elevated.return_value = read_deleted_context
 
         # instances in central db
         instances = [
@@ -7216,7 +7219,7 @@ class ComputeTestCase(BaseTestCase,
             {'source_compute': self.compute.host,
              'status': ['accepted', 'done'],
              'migration_type': 'evacuation'})
-        mock_get_inst.assert_called_once_with(fake_context)
+        mock_get_inst.assert_called_once_with(read_deleted_context)
         mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
         mock_get_blk.assert_called_once_with(fake_context, evacuated_instance)
         mock_is_inst.assert_called_once_with(fake_context, evacuated_instance)
@@ -7224,6 +7227,7 @@ class ComputeTestCase(BaseTestCase,
                                              'fake_network_info',
                                              'fake_bdi', False)
 
+    @mock.patch('nova.context.RequestContext.elevated')
     @mock.patch.object(compute_manager.ComputeManager,
                        '_get_instances_on_driver')
     @mock.patch.object(compute_manager.ComputeManager,
@@ -7239,8 +7243,10 @@ class ComputeTestCase(BaseTestCase,
     @mock.patch('nova.objects.Migration.save')
     def test_destroy_evacuated_instance_with_disks(self, mock_save,
             mock_get_filter, mock_destroy, mock_check_clean, mock_check,
-            mock_check_local, mock_get_blk, mock_get_drv):
+            mock_check_local, mock_get_blk, mock_get_drv, mock_elevated):
         fake_context = context.get_admin_context()
+        read_deleted_context = fake_context.elevated(read_deleted='yes')
+        mock_elevated.return_value = read_deleted_context
 
         # instances in central db
         instances = [
@@ -7271,7 +7277,7 @@ class ComputeTestCase(BaseTestCase,
                 return_value='fake_network_info') as mock_get_nw:
             self.compute._destroy_evacuated_instances(fake_context)
 
-        mock_get_drv.assert_called_once_with(fake_context)
+        mock_get_drv.assert_called_once_with(read_deleted_context)
         mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
         mock_get_blk.assert_called_once_with(fake_context, evacuated_instance)
         mock_check_local.assert_called_once_with(fake_context,
@@ -7285,6 +7291,7 @@ class ComputeTestCase(BaseTestCase,
                                              'fake_network_info',
                                              'fake-bdi', True)
 
+    @mock.patch('nova.context.RequestContext.elevated')
     @mock.patch.object(compute_manager.ComputeManager,
                        '_get_instances_on_driver')
     @mock.patch.object(compute_manager.ComputeManager,
@@ -7299,8 +7306,10 @@ class ComputeTestCase(BaseTestCase,
     @mock.patch('nova.objects.Migration.save')
     def test_destroy_evacuated_instance_not_implemented(self, mock_save,
             mock_get_filter, mock_destroy, mock_check_clean, mock_check,
-            mock_check_local, mock_get_blk, mock_get_inst):
+            mock_check_local, mock_get_blk, mock_get_inst, mock_elevated):
         fake_context = context.get_admin_context()
+        read_deleted_context = fake_context.elevated(read_deleted='yes')
+        mock_elevated.return_value = read_deleted_context
 
         # instances in central db
         instances = [
@@ -7330,7 +7339,7 @@ class ComputeTestCase(BaseTestCase,
                 return_value='fake_network_info') as mock_get_nw:
             self.compute._destroy_evacuated_instances(fake_context)
 
-        mock_get_inst.assert_called_once_with(fake_context)
+        mock_get_inst.assert_called_once_with(read_deleted_context)
         mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
         mock_get_blk.assert_called_once_with(fake_context, evacuated_instance)
         mock_check_local.assert_called_once_with(fake_context,
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 8223ef452460..6503dd989654 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -656,6 +656,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.compute.init_virt_events()
         self.assertFalse(mock_register.called)
 
+    @mock.patch('nova.context.RequestContext.elevated')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.scheduler.utils.resources_from_flavor')
     @mock.patch.object(manager.ComputeManager, '_get_instances_on_driver')
@@ -670,10 +671,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
     def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
             mock_temp_mut, mock_init_host, mock_destroy, mock_host_get,
             mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_resources,
-            mock_get_node):
+            mock_get_node, mock_elevated):
         our_host = self.compute.host
         not_our_host = 'not-' + our_host
+        read_deleted_context = self.context.elevated(read_deleted='yes')
+        mock_elevated.return_value = read_deleted_context
 
         deleted_instance = fake_instance.fake_instance_obj(
             self.context, host=not_our_host, uuid=uuids.deleted_instance)
         migration = objects.Migration(instance_uuid=deleted_instance.uuid)
@@ -709,7 +712,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             expected_attrs=['info_cache', 'metadata'])
         mock_init_virt.assert_called_once_with()
         mock_temp_mut.assert_called_once_with(self.context, read_deleted='yes')
-        mock_get_inst.assert_called_once_with(self.context)
+        mock_get_inst.assert_called_once_with(read_deleted_context)
         mock_get_net.assert_called_once_with(self.context, deleted_instance)
 
         # ensure driver.destroy is called so that driver may
@@ -3329,6 +3332,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         instance_2.host = 'not-' + our_host
         instance_2.user_id = uuids.user_id
         instance_2.project_id = uuids.project_id
+        instance_2.deleted = False
 
         # Only instance 2 has a migration record
         migration = objects.Migration(instance_uuid=instance_2.uuid)
@@ -3386,6 +3390,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         instance_1.host = 'not-' + our_host
         instance_1.user_id = uuids.user_id
         instance_1.project_id = uuids.project_id
+        instance_1.deleted = False
         instance_2 = objects.Instance(self.context, flavor=flavor)
         instance_2.uuid = uuids.instance_2
         instance_2.task_state = None
@@ -3393,6 +3398,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         instance_2.host = 'not-' + our_host
         instance_2.user_id = uuids.user_id
         instance_2.project_id = uuids.project_id
+        instance_2.deleted = False
 
         migration_1 = objects.Migration(instance_uuid=instance_1.uuid)
         # Consider the migration successful but the node was deleted while the