Stop failed live-migrates getting stuck migrating
When there are failures in driver.cleanup, we are seeing live-migrations that get stuck in the live-migrating state. While there has been a patch to stop the cause listed in the bug this closes, there are other failures (such as a token timeout when talking to cinder or neutron) that could trigger this same failure mode. Hitting an error this late in live-migration should be a very rare event, so it's best to just put the instance and migration into an error state, which helps alert both the operator and the API user to the failure that has occurred. Closes-Bug: #1662626 Change-Id: Idfdce9e7dd8106af01db0358ada15737cb846395
This commit is contained in:
parent
b706155888
commit
b56f8fc2d1
|
@ -5379,12 +5379,16 @@ class ComputeManager(manager.Manager):
|
|||
self._rollback_live_migration,
|
||||
block_migration, migrate_data)
|
||||
except Exception:
|
||||
# Executing live migration
|
||||
# live_migration might raise exceptions, but
|
||||
# nothing must be recovered in this version.
|
||||
LOG.exception(_LE('Live migration failed.'), instance=instance)
|
||||
with excutils.save_and_reraise_exception():
|
||||
# Put instance and migration into error state,
|
||||
# as it's almost certainly too late to rollback
|
||||
self._set_migration_status(migration, 'error')
|
||||
# first refresh instance as it may have got updated by
|
||||
# post_live_migration_at_destination
|
||||
instance.refresh()
|
||||
self._set_instance_obj_error_state(context, instance,
|
||||
clean_task_state=True)
|
||||
|
||||
@wrap_exception()
|
||||
@wrap_instance_event(prefix='compute')
|
||||
|
|
|
@ -5961,6 +5961,49 @@ class ComputeTestCase(BaseTestCase):
|
|||
mock_post.assert_called_once_with(c, instance, False, dest)
|
||||
mock_clear.assert_called_once_with(mock.ANY)
|
||||
|
||||
@mock.patch.object(compute_rpcapi.ComputeAPI, 'pre_live_migration')
@mock.patch.object(compute_rpcapi.ComputeAPI,
                   'post_live_migration_at_destination')
@mock.patch.object(compute_manager.InstanceEvents,
                   'clear_events_for_instance')
@mock.patch.object(compute_utils, 'EventReporter')
@mock.patch('nova.objects.Migration.save')
def test_live_migration_handles_errors_correctly(self, mock_save,
        mock_event, mock_clear, mock_post, mock_pre):
    # Verify that when driver.cleanup raises during live_migration(),
    # the exception propagates and both the instance and the migration
    # end up in an error state.

    # Test data: an instance on this compute host plus the migrate data
    # handed back by the mocked pre_live_migration call.
    admin_ctxt = context.get_admin_context()
    dest_host = 'desthost'
    instance = self._create_fake_instance_obj(context=admin_ctxt)
    instance.host = self.compute.host
    migration = objects.Migration()

    migrate_data = migrate_data_obj.LibvirtLiveMigrateData(
        is_shared_instance_path=False,
        is_shared_block_storage=False)
    mock_pre.return_value = migrate_data

    # Run the migration with driver.cleanup rigged to fail.
    with mock.patch.object(self.compute.driver, 'cleanup',
                           side_effect=test.TestingException):
        self.assertRaises(test.TestingException,
                          self.compute.live_migration,
                          admin_ctxt, dest_host, instance, False,
                          migration, migrate_data)

    # The instance and migration objects must reflect the failure.
    self.assertEqual(vm_states.ERROR, instance.vm_state)
    self.assertIsNone(instance.task_state)
    self.assertEqual("error", migration.status)

    # pre_live_migration was invoked exactly once, and the instance
    # events were never cleared.
    mock_pre.assert_called_once_with(admin_ctxt, instance, False, None,
                                     dest_host, migrate_data)
    self.assertEqual(0, mock_clear.call_count)

    # cleanup
    instance.destroy()
|
||||
|
||||
@mock.patch.object(fake.FakeDriver, 'unfilter_instance')
|
||||
@mock.patch.object(compute_rpcapi.ComputeAPI,
|
||||
'post_live_migration_at_destination')
|
||||
|
|
Loading…
Reference in New Issue