From 408ef8f84a698f764aa5d769d6d01fd9340de2e5 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Mon, 25 Mar 2019 13:16:42 -0400 Subject: [PATCH] Error out migration when confirm_resize fails If anything fails and raises an exception during confirm_resize, the migration status is stuck in "confirming" status even though the instance status may be "ERROR". This change adds the errors_out_migration decorator to the confirm_resize method to make sure the migration status is "error" if an error is raised. In bug 1821594 it was the driver.confirm_migration method that raised some exception, so a unit test is added here which simulates a similar scenario. This only partially closes the bug because we are still leaking allocations on the source node resource provider since _delete_allocation_after_move is not called. That will be dealt with in a separate patch. Change-Id: Ic7d78ad43a2bad7f932c22c98944accbbed9e9e2 Partial-Bug: #1821594 --- nova/compute/manager.py | 1 + nova/tests/functional/test_servers.py | 3 +- nova/tests/unit/compute/test_compute_mgr.py | 48 +++++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 33b40baf461c..de95a3650d25 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -3924,6 +3924,7 @@ class ComputeManager(manager.Manager): @wrap_exception() @wrap_instance_event(prefix='compute') + @errors_out_migration @wrap_instance_fault def confirm_resize(self, context, instance, migration): """Confirms a migration/resize and deletes the 'old' instance. diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index fded2120e88a..1825a9437c62 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -5147,8 +5147,7 @@ class ConsumerGenerationConflictTest( self.assertEqual('migration', migrations[0]['migration_type']) self.assertEqual(server['id'], migrations[0]['instance_uuid']) self.assertEqual(source_hostname, migrations[0]['source_compute']) - # NOTE(gibi): it might be better to mark the migration as error - self.assertEqual('confirmed', migrations[0]['status']) + self.assertEqual('error', migrations[0]['status']) # NOTE(gibi): Nova leaks the allocation held by the migration_uuid even # after the instance is deleted. At least nova logs a fat ERROR. diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 321b7e9bba8d..04c9e9465736 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -6833,6 +6833,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, expected_attrs=['metadata', 'system_metadata', 'info_cache']) self.migration = objects.Migration( context=self.context.elevated(), + id=1, uuid=uuids.migration_uuid, instance_uuid=self.instance.uuid, new_instance_type_id=7, @@ -7215,6 +7216,53 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, do_confirm_resize() + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + @mock.patch('nova.objects.Migration.get_by_id') + @mock.patch('nova.objects.Instance.get_by_uuid') + @mock.patch('nova.compute.utils.notify_about_instance_usage') + @mock.patch('nova.compute.utils.notify_about_instance_action') + @mock.patch('nova.objects.Instance.save') + def test_confirm_resize_driver_confirm_migration_fails( + self, instance_save, notify_action, notify_usage, + instance_get_by_uuid, migration_get_by_id, add_fault): + """Tests the scenario that driver.confirm_migration raises some error + to make sure the error is properly handled, like the instance and + migration status is set to 'error'. + """ + self.migration.status = 'confirming' + migration_get_by_id.return_value = self.migration + instance_get_by_uuid.return_value = self.instance + + error = exception.HypervisorUnavailable( + host=self.migration.source_compute) + with test.nested( + mock.patch.object(self.compute, 'network_api'), + mock.patch.object(self.compute.driver, 'confirm_migration', + side_effect=error) + ) as ( + network_api, confirm_migration + ): + self.assertRaises(exception.HypervisorUnavailable, + self.compute.confirm_resize, + self.context, self.instance, self.migration) + # Make sure the instance is in ERROR status. + self.assertEqual(vm_states.ERROR, self.instance.vm_state) + # Make sure the migration is in error status. + self.assertEqual('error', self.migration.status) + # Instance.save is called twice, once to clear the resize metadata + # and once to set the instance to ERROR status. + self.assertEqual(2, instance_save.call_count) + # The migration.status should have been saved. + self.migration.save.assert_called_once_with() + # Assert other mocks we care less about. + notify_usage.assert_called_once() + notify_action.assert_called_once() + add_fault.assert_called_once() + confirm_migration.assert_called_once() + network_api.setup_networks_on_host.assert_called_once() + instance_get_by_uuid.assert_called_once() + migration_get_by_id.assert_called_once() + def test_delete_allocation_after_move_confirm_by_migration(self): with mock.patch.object(self.compute, 'reportclient') as mock_report: mock_report.delete_allocation_for_instance.return_value = True