diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index fe25ab156e43..5b40f0f787ad 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -3875,7 +3875,31 @@ class ComputeManager(manager.Manager):
                           instance=instance)
                 return
 
-            self._confirm_resize(context, instance, migration=migration)
+            with self._error_out_instance_on_exception(context, instance):
+                old_instance_type = instance.old_flavor
+                try:
+                    self._confirm_resize(
+                        context, instance, migration=migration)
+                except Exception:
+                    # Something failed when cleaning up the source host so
+                    # log a traceback and leave a hint about hard rebooting
+                    # the server to correct its state in the DB.
+                    with excutils.save_and_reraise_exception(logger=LOG):
+                        LOG.exception(
+                            'Confirm resize failed on source host %s. '
+                            'Resource allocations in the placement service '
+                            'will be removed regardless because the instance '
+                            'is now on the destination host %s. You can try '
+                            'hard rebooting the instance to correct its '
+                            'state.', self.host, migration.dest_compute,
+                            instance=instance)
+                finally:
+                    # Whether an error occurred or not, at this point the
+                    # instance is on the dest host so to avoid leaking
+                    # allocations in placement, delete them here.
+                    self._delete_allocation_after_move(
+                        context, instance, migration, old_instance_type,
+                        migration.source_node)
 
         do_confirm_resize(context, instance, migration.id)
 
@@ -3887,63 +3911,59 @@ class ComputeManager(manager.Manager):
                self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
                phase=fields.NotificationPhase.START)
 
-        with self._error_out_instance_on_exception(context, instance):
-            # NOTE(danms): delete stashed migration information
-            old_instance_type = instance.old_flavor
-            instance.old_flavor = None
-            instance.new_flavor = None
-            instance.system_metadata.pop('old_vm_state', None)
-            instance.save()
+        # NOTE(danms): delete stashed migration information
+        old_instance_type = instance.old_flavor
+        instance.old_flavor = None
+        instance.new_flavor = None
+        instance.system_metadata.pop('old_vm_state', None)
+        instance.save()
 
-            # NOTE(tr3buchet): tear down networks on source host
-            self.network_api.setup_networks_on_host(context, instance,
-                               migration.source_compute, teardown=True)
+        # NOTE(tr3buchet): tear down networks on source host
+        self.network_api.setup_networks_on_host(context, instance,
+                           migration.source_compute, teardown=True)
 
-            network_info = self.network_api.get_instance_nw_info(context,
-                                                                 instance)
-            # TODO(mriedem): Get BDMs here and pass them to the driver.
-            self.driver.confirm_migration(context, migration, instance,
-                                          network_info)
+        network_info = self.network_api.get_instance_nw_info(context,
+                                                             instance)
+        # TODO(mriedem): Get BDMs here and pass them to the driver.
+        self.driver.confirm_migration(context, migration, instance,
+                                      network_info)
 
-            migration.status = 'confirmed'
-            with migration.obj_as_admin():
-                migration.save()
+        migration.status = 'confirmed'
+        with migration.obj_as_admin():
+            migration.save()
 
-            rt = self._get_resource_tracker()
-            rt.drop_move_claim(context, instance, migration.source_node,
-                               old_instance_type, prefix='old_')
-            self._delete_allocation_after_move(context, instance, migration,
-                                               old_instance_type,
-                                               migration.source_node)
-            instance.drop_migration_context()
+        rt = self._get_resource_tracker()
+        rt.drop_move_claim(context, instance, migration.source_node,
+                           old_instance_type, prefix='old_')
+        instance.drop_migration_context()
 
-            # NOTE(mriedem): The old_vm_state could be STOPPED but the user
-            # might have manually powered up the instance to confirm the
-            # resize/migrate, so we need to check the current power state
-            # on the instance and set the vm_state appropriately. We default
-            # to ACTIVE because if the power state is not SHUTDOWN, we
-            # assume _sync_instance_power_state will clean it up.
-            p_state = instance.power_state
-            vm_state = None
-            if p_state == power_state.SHUTDOWN:
-                vm_state = vm_states.STOPPED
-                LOG.debug("Resized/migrated instance is powered off. "
-                          "Setting vm_state to '%s'.", vm_state,
-                          instance=instance)
-            else:
-                vm_state = vm_states.ACTIVE
+        # NOTE(mriedem): The old_vm_state could be STOPPED but the user
+        # might have manually powered up the instance to confirm the
+        # resize/migrate, so we need to check the current power state
+        # on the instance and set the vm_state appropriately. We default
+        # to ACTIVE because if the power state is not SHUTDOWN, we
+        # assume _sync_instance_power_state will clean it up.
+        p_state = instance.power_state
+        vm_state = None
+        if p_state == power_state.SHUTDOWN:
+            vm_state = vm_states.STOPPED
+            LOG.debug("Resized/migrated instance is powered off. "
+                      "Setting vm_state to '%s'.", vm_state,
+                      instance=instance)
+        else:
+            vm_state = vm_states.ACTIVE
 
-            instance.vm_state = vm_state
-            instance.task_state = None
-            instance.save(expected_task_state=[None, task_states.DELETING,
-                                               task_states.SOFT_DELETING])
+        instance.vm_state = vm_state
+        instance.task_state = None
+        instance.save(expected_task_state=[None, task_states.DELETING,
+                                           task_states.SOFT_DELETING])
 
-            self._notify_about_instance_usage(
-                context, instance, "resize.confirm.end",
-                network_info=network_info)
-            compute_utils.notify_about_instance_action(context, instance,
-                   self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
-                   phase=fields.NotificationPhase.END)
+        self._notify_about_instance_usage(
+            context, instance, "resize.confirm.end",
+            network_info=network_info)
+        compute_utils.notify_about_instance_action(context, instance,
+               self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
+               phase=fields.NotificationPhase.END)
 
     def _delete_allocation_after_move(self, context, instance, migration,
                                       flavor, nodename):
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 9c0c60b1dae4..e07d880ff6b2 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -6636,6 +6636,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
             dest_node=None,
             dest_host=None,
             source_compute='source_compute',
+            source_node='source_node',
             status='migrating')
         self.migration.save = mock.MagicMock()
         self.useFixture(fixtures.SpawnIsSynchronousFixture())
@@ -6995,6 +6996,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
             do_finish_revert_resize()
 
     def test_confirm_resize_deletes_allocations(self):
+        @mock.patch('nova.objects.Instance.get_by_uuid')
+        @mock.patch('nova.objects.Migration.get_by_id')
         @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
@@ -7005,12 +7008,15 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         @mock.patch.object(self.instance, 'save')
         def do_confirm_resize(mock_save, mock_drop, mock_delete, mock_get_rt,
                               mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save):
+                              mock_mig_save, mock_mig_get, mock_inst_get):
             self.instance.migration_context = objects.MigrationContext()
             self.migration.source_compute = self.instance['host']
             self.migration.source_node = self.instance['node']
-            self.compute._confirm_resize(self.context, self.instance,
-                                         self.migration)
+            self.migration.status = 'confirming'
+            mock_mig_get.return_value = self.migration
+            mock_inst_get.return_value = self.instance
+            self.compute.confirm_resize(self.context, self.instance,
+                                        self.migration)
             mock_delete.assert_called_once_with(self.context, self.instance,
                                                 self.migration,
                                                 self.instance.old_flavor,
@@ -7066,9 +7072,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         with test.nested(
             mock.patch.object(self.compute, 'network_api'),
             mock.patch.object(self.compute.driver, 'confirm_migration',
-                              side_effect=error)
+                              side_effect=error),
+            mock.patch.object(self.compute, '_delete_allocation_after_move')
         ) as (
-            network_api, confirm_migration
+            network_api, confirm_migration, delete_allocation
         ):
             self.assertRaises(exception.HypervisorUnavailable,
                               self.compute.confirm_resize,
@@ -7082,6 +7089,11 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
             self.assertEqual(2, instance_save.call_count)
             # The migration.status should have been saved.
             self.migration.save.assert_called_once_with()
+            # Allocations should always be cleaned up even if cleaning up the
+            # source host fails.
+            delete_allocation.assert_called_once_with(
+                self.context, self.instance, self.migration,
+                self.instance.old_flavor, self.migration.source_node)
             # Assert other mocks we care less about.
             notify_usage.assert_called_once()
             notify_action.assert_called_once()
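
The key behavioral change in manager.py above is the try/except/finally ordering in confirm_resize: the allocation cleanup in the finally block runs whether or not the source-host cleanup raises, and the original exception still propagates after being logged. A minimal standalone sketch of that control flow follows; the names confirm, cleanup_source_host, delete_allocations and log_error are placeholders for illustration, not nova's real helpers.

    def confirm(cleanup_source_host, delete_allocations, log_error):
        # Placeholder sketch of the pattern used in the diff above.
        try:
            cleanup_source_host()
        except Exception:
            # Log a hint for the operator, then let the error propagate.
            log_error('confirm failed on the source host; allocations are '
                      'removed anyway because the instance is now on the '
                      'destination host')
            raise
        finally:
            # Runs on both success and failure, before any re-raised
            # exception leaves this function.
            delete_allocations()

Calling confirm with a cleanup_source_host that raises still invokes delete_allocations exactly once, which is what the updated test asserts via delete_allocation.assert_called_once_with while still expecting HypervisorUnavailable to be raised.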