Fix InstanceNotFound during _destroy_evacuated_instances
The _destroy_evacuated_instances method on compute
startup tries to cleanup guests on the hypervisor and
allocations held against that compute node resource
provider by evacuated instances, but doesn't take into
account that those evacuated instances could have been
deleted in the meantime which leads to a lazy-load
InstanceNotFound error that kills the startup of the
compute service.
This change does two things in the _destroy_evacuated_instances
method:
1. Loads the evacuated instances with a read_deleted='yes'
context when calling _get_instances_on_driver(). This
should be fine since _get_instances_on_driver() is already
returning deleted instances anyway (InstanceList.get_by_filters
defaults to read deleted instances unless the filters tell
it otherwise - which we don't in this case). This is needed
so that things like driver.destroy() don't raise
InstanceNotFound while lazy-loading fields on the instance.
2. Skips the call to remove_allocation_from_compute() if the
evacuated instance is already deleted. If the instance is
already deleted, its allocations should have been cleaned
up by its hosting compute service (or the API).
The functional regression test is updated to show the bug is
now fixed.
Conflicts:
nova/compute/manager.py
nova/tests/unit/compute/test_compute.py
NOTE(mriedem): The conflicts are due to not having change
I2af45a9540e7ccd60ace80d9fcadc79972da7df7 in Rocky.
Change-Id: I1f4b3540dd453650f94333b36d7504ba164192f7
Closes-Bug: #1794996
(cherry picked from commit 05cd8d1282)
This commit is contained in:
parent
83d74dbbb6
commit
0208d64397
|
@ -664,7 +664,14 @@ class ComputeManager(manager.Manager):
|
|||
return
|
||||
evacuations = {mig.instance_uuid: mig for mig in evacuations}
|
||||
|
||||
local_instances = self._get_instances_on_driver(context)
|
||||
# The instances might be deleted in which case we need to avoid
|
||||
# InstanceNotFound being raised from lazy-loading fields on the
|
||||
# instances while cleaning up this host.
|
||||
read_deleted_context = context.elevated(read_deleted='yes')
|
||||
# TODO(mriedem): We could optimize by pre-loading the joined fields
|
||||
# we know we'll use, like info_cache and flavor. We can also replace
|
||||
# this with a generic solution: https://review.openstack.org/575190/
|
||||
local_instances = self._get_instances_on_driver(read_deleted_context)
|
||||
evacuated = [inst for inst in local_instances
|
||||
if inst.uuid in evacuations]
|
||||
|
||||
|
@ -709,8 +716,12 @@ class ComputeManager(manager.Manager):
|
|||
continue
|
||||
cn_uuid = compute_nodes[migration.source_node]
|
||||
|
||||
if not scheduler_utils.remove_allocation_from_compute(
|
||||
context, instance, cn_uuid, self.reportclient):
|
||||
# If the instance was deleted in the interim, assume its
|
||||
# allocations were properly cleaned up (either by its hosting
|
||||
# compute service or the API).
|
||||
if (not instance.deleted and
|
||||
not scheduler_utils.remove_allocation_from_compute(
|
||||
context, instance, cn_uuid, self.reportclient)):
|
||||
LOG.error("Failed to clean allocation of evacuated instance "
|
||||
"on the source node %s",
|
||||
cn_uuid, instance=instance)
|
||||
|
|
|
@ -12,7 +12,6 @@
|
|||
|
||||
from oslo_log import log as logging
|
||||
|
||||
from nova import exception
|
||||
from nova.tests.functional import integrated_helpers
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
@ -128,7 +127,8 @@ class TestEvacuateDeleteServerRestartOriginalCompute(
|
|||
instance. Before the bug is fixed, the original compute fails to start
|
||||
because lazy-loading the instance.flavor on the deleted instance,
|
||||
which is needed to cleanup allocations from the source host, raises
|
||||
InstanceNotFound.
|
||||
InstanceNotFound. After the bug is fixed, the original source host
|
||||
compute service starts up.
|
||||
"""
|
||||
source_hostname = self.compute1.host
|
||||
dest_hostname = self.compute2.host
|
||||
|
@ -176,8 +176,4 @@ class TestEvacuateDeleteServerRestartOriginalCompute(
|
|||
self._delete_and_check_allocations(server)
|
||||
|
||||
# restart the source compute
|
||||
# FIXME(mriedem): This is bug 1794996 where we try to lazy-load the
|
||||
# instance.flavor from the deleted instance which causes the compute
|
||||
# startup to fail.
|
||||
self.assertRaises(exception.InstanceNotFound,
|
||||
self.restart_compute_service, self.compute1)
|
||||
self.restart_compute_service(self.compute1)
|
||||
|
|
|
@ -7497,6 +7497,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
instance = self._create_fake_instance_obj({'host': 'someotherhost'})
|
||||
self.compute._instance_update(self.context, instance, vcpus=4)
|
||||
|
||||
@mock.patch('nova.context.RequestContext.elevated')
|
||||
@mock.patch.object(compute_manager.ComputeManager,
|
||||
'_get_instances_on_driver')
|
||||
@mock.patch.object(compute_manager.ComputeManager,
|
||||
|
@ -7508,8 +7509,10 @@ class ComputeTestCase(BaseTestCase,
|
|||
@mock.patch('nova.objects.Migration.save')
|
||||
def test_destroy_evacuated_instance_on_shared_storage(self, mock_save,
|
||||
mock_get_filter, mock_destroy, mock_is_inst, mock_get_blk,
|
||||
mock_get_inst):
|
||||
mock_get_inst, mock_elevated):
|
||||
fake_context = context.get_admin_context()
|
||||
read_deleted_context = fake_context.elevated(read_deleted='yes')
|
||||
mock_elevated.return_value = read_deleted_context
|
||||
|
||||
# instances in central db
|
||||
instances = [
|
||||
|
@ -7546,7 +7549,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
'pre-migrating',
|
||||
'done'],
|
||||
'migration_type': 'evacuation'})
|
||||
mock_get_inst.assert_called_once_with(fake_context)
|
||||
mock_get_inst.assert_called_once_with(read_deleted_context)
|
||||
mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
|
||||
mock_get_blk.assert_called_once_with(fake_context, evacuated_instance)
|
||||
mock_is_inst.assert_called_once_with(fake_context, evacuated_instance)
|
||||
|
@ -7554,6 +7557,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
'fake_network_info',
|
||||
'fake_bdi', False)
|
||||
|
||||
@mock.patch('nova.context.RequestContext.elevated')
|
||||
@mock.patch.object(compute_manager.ComputeManager,
|
||||
'_get_instances_on_driver')
|
||||
@mock.patch.object(compute_manager.ComputeManager,
|
||||
|
@ -7569,8 +7573,10 @@ class ComputeTestCase(BaseTestCase,
|
|||
@mock.patch('nova.objects.Migration.save')
|
||||
def test_destroy_evacuated_instance_with_disks(self, mock_save,
|
||||
mock_get_filter, mock_destroy, mock_check_clean, mock_check,
|
||||
mock_check_local, mock_get_blk, mock_get_drv):
|
||||
mock_check_local, mock_get_blk, mock_get_drv, mock_elevated):
|
||||
fake_context = context.get_admin_context()
|
||||
read_deleted_context = fake_context.elevated(read_deleted='yes')
|
||||
mock_elevated.return_value = read_deleted_context
|
||||
|
||||
# instances in central db
|
||||
instances = [
|
||||
|
@ -7601,7 +7607,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
return_value='fake_network_info') as mock_get_nw:
|
||||
self.compute._destroy_evacuated_instances(fake_context)
|
||||
|
||||
mock_get_drv.assert_called_once_with(fake_context)
|
||||
mock_get_drv.assert_called_once_with(read_deleted_context)
|
||||
mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
|
||||
mock_get_blk.assert_called_once_with(fake_context, evacuated_instance)
|
||||
mock_check_local.assert_called_once_with(fake_context,
|
||||
|
@ -7615,6 +7621,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
'fake_network_info', 'fake-bdi',
|
||||
True)
|
||||
|
||||
@mock.patch('nova.context.RequestContext.elevated')
|
||||
@mock.patch.object(compute_manager.ComputeManager,
|
||||
'_get_instances_on_driver')
|
||||
@mock.patch.object(compute_manager.ComputeManager,
|
||||
|
@ -7629,8 +7636,10 @@ class ComputeTestCase(BaseTestCase,
|
|||
@mock.patch('nova.objects.Migration.save')
|
||||
def test_destroy_evacuated_instance_not_implemented(self, mock_save,
|
||||
mock_get_filter, mock_destroy, mock_check_clean, mock_check,
|
||||
mock_check_local, mock_get_blk, mock_get_inst):
|
||||
mock_check_local, mock_get_blk, mock_get_inst, mock_elevated):
|
||||
fake_context = context.get_admin_context()
|
||||
read_deleted_context = fake_context.elevated(read_deleted='yes')
|
||||
mock_elevated.return_value = read_deleted_context
|
||||
|
||||
# instances in central db
|
||||
instances = [
|
||||
|
@ -7660,7 +7669,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
return_value='fake_network_info') as mock_get_nw:
|
||||
self.compute._destroy_evacuated_instances(fake_context)
|
||||
|
||||
mock_get_inst.assert_called_once_with(fake_context)
|
||||
mock_get_inst.assert_called_once_with(read_deleted_context)
|
||||
mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
|
||||
mock_get_blk.assert_called_once_with(fake_context, evacuated_instance)
|
||||
mock_check_local.assert_called_once_with(fake_context,
|
||||
|
|
|
@ -799,6 +799,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
self.compute.init_virt_events()
|
||||
self.assertFalse(mock_register.called)
|
||||
|
||||
@mock.patch('nova.context.RequestContext.elevated')
|
||||
@mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
|
||||
@mock.patch('nova.scheduler.utils.resources_from_flavor')
|
||||
@mock.patch.object(manager.ComputeManager, '_get_instances_on_driver')
|
||||
|
@ -813,10 +814,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
|
||||
mock_temp_mut, mock_init_host, mock_destroy, mock_host_get,
|
||||
mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_resources,
|
||||
mock_get_node):
|
||||
mock_get_node, mock_elevated):
|
||||
our_host = self.compute.host
|
||||
not_our_host = 'not-' + our_host
|
||||
|
||||
read_deleted_context = self.context.elevated(read_deleted='yes')
|
||||
mock_elevated.return_value = read_deleted_context
|
||||
deleted_instance = fake_instance.fake_instance_obj(
|
||||
self.context, host=not_our_host, uuid=uuids.deleted_instance)
|
||||
migration = objects.Migration(instance_uuid=deleted_instance.uuid)
|
||||
|
@ -852,7 +855,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
expected_attrs=['info_cache', 'metadata'])
|
||||
mock_init_virt.assert_called_once_with()
|
||||
mock_temp_mut.assert_called_once_with(self.context, read_deleted='yes')
|
||||
mock_get_inst.assert_called_once_with(self.context)
|
||||
mock_get_inst.assert_called_once_with(read_deleted_context)
|
||||
mock_get_net.assert_called_once_with(self.context, deleted_instance)
|
||||
|
||||
# ensure driver.destroy is called so that driver may
|
||||
|
@ -3760,6 +3763,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance_2.host = 'not-' + our_host
|
||||
instance_2.user_id = uuids.user_id
|
||||
instance_2.project_id = uuids.project_id
|
||||
instance_2.deleted = False
|
||||
|
||||
# Only instance 2 has a migration record
|
||||
migration = objects.Migration(instance_uuid=instance_2.uuid)
|
||||
|
@ -3817,6 +3821,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance_1.host = 'not-' + our_host
|
||||
instance_1.user_id = uuids.user_id
|
||||
instance_1.project_id = uuids.project_id
|
||||
instance_1.deleted = False
|
||||
instance_2 = objects.Instance(self.context, flavor=flavor)
|
||||
instance_2.uuid = uuids.instance_2
|
||||
instance_2.task_state = None
|
||||
|
@ -3824,6 +3829,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance_2.host = 'not-' + our_host
|
||||
instance_2.user_id = uuids.user_id
|
||||
instance_2.project_id = uuids.project_id
|
||||
instance_2.deleted = False
|
||||
|
||||
migration_1 = objects.Migration(instance_uuid=instance_1.uuid)
|
||||
# Consider the migration successful but the node was deleted while the
|
||||
|
|
Loading…
Reference in New Issue