compute: Cleans up allocations after failed resize

During cold resize, the ComputeManager's prep_resize calls the
rpcapi.ComputeAPI's resize_instance method, which will then do an
RPC cast (async).

Because the RPC cast is asynchronous, the exception branch in prep_resize
will not be executed if the cold resize failed, the allocations will not
be cleaned up, and the instance will not be rescheduled.

This patch adds allocation cleanup in the resize_instance and finish_resize
methods.

Change-Id: I2d9ab06b485f76550dbbff46f79f40ff4c97d12f
Closes-Bug: #1749215
(cherry picked from commit caf167862d)
This commit is contained in:
Claudiu Belu 2018-02-11 10:07:52 -08:00
parent a4a53bfa31
commit 5039511840
4 changed files with 46 additions and 7 deletions

View File

@ -4213,6 +4213,15 @@ class ComputeManager(manager.Manager):
This is initiated from the destination host's ``prep_resize`` routine
and runs on the source host.
"""
try:
self._resize_instance(context, instance, image, migration,
instance_type, clean_shutdown)
except Exception:
with excutils.save_and_reraise_exception():
self._revert_allocation(context, instance, migration)
def _resize_instance(self, context, instance, image,
migration, instance_type, clean_shutdown):
with self._error_out_instance_on_exception(context, instance), \
errors_out_migration_ctxt(migration):
network_info = self.network_api.get_instance_nw_info(context,
@ -4437,6 +4446,20 @@ class ComputeManager(manager.Manager):
Sets up the newly transferred disk and turns on the instance at its
new host machine.
"""
try:
self._finish_resize_helper(context, disk_info, image, instance,
migration)
except Exception:
with excutils.save_and_reraise_exception():
self._revert_allocation(context, instance, migration)
def _finish_resize_helper(self, context, disk_info, image, instance,
migration):
"""Completes the migration process.
The caller must revert the instance's allocations if the migration
process failed.
"""
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)

View File

@ -2728,9 +2728,10 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertIn(source_rp_uuid, allocations)
def test_resize_to_same_host_prep_resize_fails(self):
def _test_resize_to_same_host_instance_fails(self, failing_method,
event_name):
"""Tests that when we resize to the same host and resize fails in
the prep_resize method, we cleanup the allocations before rescheduling.
the given method, we cleanup the allocations before rescheduling.
"""
# make sure that the test only uses a single host
compute2_service_id = self.admin_api.get_services(
@ -2742,16 +2743,17 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
server = self._boot_and_check_allocations(self.flavor1, hostname)
def fake_prep_resize(*args, **kwargs):
def fake_resize_method(*args, **kwargs):
# Ensure the allocations are doubled now before we fail.
usages = self._get_provider_usages(rp_uuid)
self.assertFlavorsMatchAllocation(
self.flavor1, self.flavor2, usages)
raise test.TestingException('Simulated _prep_resize failure.')
raise test.TestingException('Simulated resize failure.')
# Yes this isn't great in a functional test, but it's simple.
self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
fake_prep_resize)
self.stub_out(
'nova.compute.manager.ComputeManager.%s' % failing_method,
fake_resize_method)
self.flags(allow_resize_to_same_host=True)
resize_req = {
@ -2762,7 +2764,7 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
self.api.post_server_action(server['id'], resize_req)
self._wait_for_action_fail_completion(
server, instance_actions.RESIZE, 'compute_prep_resize')
server, instance_actions.RESIZE, event_name)
# Ensure the allocation records still exist on the host.
source_rp_uuid = self._get_provider_uuid_by_host(hostname)
@ -2771,6 +2773,18 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
# allocation which just leaves us with the original flavor.
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
def test_resize_to_same_host_prep_resize_fails(self):
self._test_resize_to_same_host_instance_fails(
'_prep_resize', 'compute_prep_resize')
def test_resize_instance_fails_allocation_cleanup(self):
self._test_resize_to_same_host_instance_fails(
'_resize_instance', 'compute_resize_instance')
def test_finish_resize_fails_allocation_cleanup(self):
self._test_resize_to_same_host_instance_fails(
'_finish_resize', 'compute_finish_resize')
def _test_resize_reschedule_uses_host_lists(self, fails, num_alts=None):
"""Test that when a resize attempt fails, the retry comes from the
supplied host_list, and does not call the scheduler.

View File

@ -4622,6 +4622,7 @@ class ComputeTestCase(BaseTestCase,
# ensure that task_state is reverted after a failed operation.
migration = objects.Migration(context=self.context.elevated())
migration.instance_uuid = 'b48316c5-71e8-45e4-9884-6c78055b9b13'
migration.uuid = mock.sentinel.uuid
migration.new_instance_type_id = '1'
instance_type = objects.Flavor()

View File

@ -5994,6 +5994,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
expected_attrs=['metadata', 'system_metadata', 'info_cache'])
self.migration = objects.Migration(
context=self.context.elevated(),
uuid=mock.sentinel.uuid,
instance_uuid=self.instance.uuid,
new_instance_type_id=7,
dest_compute=None,