diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index d3405f1ef4d0..f953013f804c 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -6020,6 +6020,53 @@ class ComputeManager(manager.Manager):
         LOG.debug('pre_live_migration result data is %s', migrate_data)
         return migrate_data
 
+    @staticmethod
+    def _neutron_failed_live_migration_callback(event_name, instance):
+        msg = ('Neutron reported failure during live migration '
+               'with %(event)s for instance %(uuid)s')
+        msg_args = {'event': event_name, 'uuid': instance.uuid}
+        if CONF.vif_plugging_is_fatal:
+            raise exception.VirtualInterfacePlugException(msg % msg_args)
+        LOG.error(msg, msg_args)
+
+    @staticmethod
+    def _get_neutron_events_for_live_migration(instance):
+        # We don't generate events if CONF.vif_plugging_timeout=0
+        # or if waiting during live migration is disabled,
+        # meaning that the operator disabled using them.
+        if (CONF.vif_plugging_timeout and utils.is_neutron() and
+                CONF.compute.live_migration_wait_for_vif_plug):
+            return [('network-vif-plugged', vif['id'])
+                    for vif in instance.get_network_info()]
+        else:
+            return []
+
+    def _cleanup_pre_live_migration(self, context, dest, instance,
+                                    migration, migrate_data):
+        """Helper method for when pre_live_migration fails
+
+        Sets the migration status to "error" and rolls back the live migration
+        setup on the destination host.
+
+        :param context: The user request context.
+        :type context: nova.context.RequestContext
+        :param dest: The live migration destination hostname.
+        :type dest: str
+        :param instance: The instance being live migrated.
+        :type instance: nova.objects.Instance
+        :param migration: The migration record tracking this live migration.
+        :type migration: nova.objects.Migration
+        :param migrate_data: Data about the live migration, populated from
+                             the destination host.
+        :type migrate_data: Subclass of nova.objects.LiveMigrateData
+        """
+        self._set_migration_status(migration, 'error')
+        # Make sure we set this for _rollback_live_migration()
+        # so it can find it, as expected if it was called later
+        migrate_data.migration = migration
+        self._rollback_live_migration(context, instance, dest,
+                                      migrate_data)
+
     def _do_live_migration(self, context, dest, instance, block_migration,
                            migration, migrate_data):
         # NOTE(danms): We should enhance the RT to account for migrations
@@ -6028,6 +6075,7 @@ class ComputeManager(manager.Manager):
         # reporting
         self._set_migration_status(migration, 'preparing')
 
+        events = self._get_neutron_events_for_live_migration(instance)
         try:
             if ('block_migration' in migrate_data and
                     migrate_data.block_migration):
@@ -6038,19 +6086,37 @@ class ComputeManager(manager.Manager):
             else:
                 disk = None
 
-            migrate_data = self.compute_rpcapi.pre_live_migration(
-                context, instance,
-                block_migration, disk, dest, migrate_data)
+            deadline = CONF.vif_plugging_timeout
+            error_cb = self._neutron_failed_live_migration_callback
+            # In order to avoid a race with the vif plugging that the virt
+            # driver does on the destination host, we register our events
+            # to wait for before calling pre_live_migration.
+            with self.virtapi.wait_for_instance_event(
+                    instance, events, deadline=deadline,
+                    error_callback=error_cb):
+                migrate_data = self.compute_rpcapi.pre_live_migration(
+                    context, instance,
+                    block_migration, disk, dest, migrate_data)
+        except exception.VirtualInterfacePlugException:
+            with excutils.save_and_reraise_exception():
+                LOG.exception('Failed waiting for network virtual interfaces '
+                              'to be plugged on the destination host %s.',
+                              dest, instance=instance)
+                self._cleanup_pre_live_migration(
+                    context, dest, instance, migration, migrate_data)
+        except eventlet.timeout.Timeout:
+            msg = 'Timed out waiting for events: %s'
+            LOG.warning(msg, events, instance=instance)
+            if CONF.vif_plugging_is_fatal:
+                self._cleanup_pre_live_migration(
+                    context, dest, instance, migration, migrate_data)
+                raise exception.MigrationError(reason=msg % events)
         except Exception:
             with excutils.save_and_reraise_exception():
                 LOG.exception('Pre live migration failed at %s',
                               dest, instance=instance)
-                self._set_migration_status(migration, 'error')
-                # Make sure we set this for _rollback_live_migration()
-                # so it can find it, as expected if it was called later
-                migrate_data.migration = migration
-                self._rollback_live_migration(context, instance, dest,
-                                              migrate_data)
+                self._cleanup_pre_live_migration(
+                    context, dest, instance, migration, migrate_data)
 
         self._set_migration_status(migration, 'running')
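Reviewer note, not part of the patch: the comment in the hunk above is the crux of the change, so here is a condensed, hypothetical sketch of the ordering it relies on. The names wait_for_dest_vif_plugging and trigger_plug are made up; trigger_plug stands in for the pre_live_migration RPC call that causes the destination virt driver to plug the VIFs.

    # Illustration only -- a simplified stand-in for the logic added to
    # ComputeManager._do_live_migration above.
    import eventlet

    def wait_for_dest_vif_plugging(virtapi, instance, trigger_plug,
                                   deadline=300, is_fatal=True):
        # One (name, tag) pair per VIF, matching the external events that
        # neutron sends to nova when a port is plugged.
        events = [('network-vif-plugged', vif['id'])
                  for vif in instance.get_network_info()]
        try:
            # Register the events *before* starting the work that makes
            # neutron send them; a fast backend could otherwise emit the
            # event before anyone is listening and we would block until
            # the deadline expires.
            with virtapi.wait_for_instance_event(instance, events,
                                                 deadline=deadline):
                trigger_plug()
        except eventlet.timeout.Timeout:
            if is_fatal:
                raise
            # vif_plugging_is_fatal=False: continue the migration without
            # confirmation that the destination VIFs are wired up.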
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 38921f54dae8..6cae12383f72 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -657,7 +657,56 @@ node.
 Possible values:
 
 * Any positive integer in seconds.
-""")
+"""),
+    cfg.BoolOpt('live_migration_wait_for_vif_plug',
+                # TODO(mriedem): Change to default=True starting in Stein.
+                default=False,
+                help="""
+Determine if the source compute host should wait for a ``network-vif-plugged``
+event from the (neutron) networking service before starting the actual transfer
+of the guest to the destination compute host.
+
+If you set this option the same on all of your compute hosts, which you should
+do if you use the same networking backend universally, you do not have to
+worry about this.
+
+Before starting the transfer of the guest, some setup occurs on the destination
+compute host, including plugging virtual interfaces. Depending on the
+networking backend **on the destination host**, a ``network-vif-plugged``
+event may be triggered and then received on the source compute host, and the
+source compute service can wait for that event to ensure networking is set up
+on the destination host before starting the guest transfer in the hypervisor.
+
+By default, this is False for two reasons:
+
+1. Backward compatibility: deployments should test this out and ensure it works
+   for them before enabling it.
+
+2. The compute service cannot reliably determine which types of virtual
+   interfaces (``port.binding:vif_type``) will send ``network-vif-plugged``
+   events without an accompanying port ``binding:host_id`` change.
+   Open vSwitch and linuxbridge should be OK, but OpenDaylight is at least
+   one known backend that will not currently work in this case; see bug
+   https://launchpad.net/bugs/1755890 for more details.
+
+Possible values:
+
+* True: wait for ``network-vif-plugged`` events before starting guest transfer
+* False: do not wait for ``network-vif-plugged`` events before starting guest
+  transfer (this is how things have always worked before this option
+  was introduced)
+
+Related options:
+
+* [DEFAULT]/vif_plugging_is_fatal: if ``live_migration_wait_for_vif_plug`` is
+  True and ``vif_plugging_timeout`` is greater than 0, and a timeout is
+  reached, the live migration process will fail with an error but the guest
+  transfer will not have started to the destination host
+* [DEFAULT]/vif_plugging_timeout: if ``live_migration_wait_for_vif_plug`` is
+  True, this controls the amount of time to wait before timing out and either
+  failing if ``vif_plugging_is_fatal`` is True, or simply continuing with the
+  live migration
+"""),
 ]
 
 interval_opts = [
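Usage illustration, not part of the patch: once every compute host is running this code and the networking backend is known to send ``network-vif-plugged`` on live migration, an operator could enable the behaviour with a nova.conf stanza like the following. The [DEFAULT] values shown are the existing defaults and are listed only because the new option interacts with them.

    [DEFAULT]
    # Existing options; values shown are the current defaults.
    vif_plugging_is_fatal = true
    vif_plugging_timeout = 300

    [compute]
    # New in this patch; defaults to false for backward compatibility.
    live_migration_wait_for_vif_plug = true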
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index d838df7ede32..66b387c044c2 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -6116,15 +6116,16 @@ class ComputeTestCase(BaseTestCase,
         fake_notifier.NOTIFICATIONS = []
         migrate_data = objects.LibvirtLiveMigrateData(
             is_shared_instance_path=False)
-        mock_pre.return_value = None
+        mock_pre.return_value = migrate_data
 
         with mock.patch.object(self.compute.network_api,
                                'setup_networks_on_host') as mock_setup:
+            self.flags(live_migration_wait_for_vif_plug=True, group='compute')
             ret = self.compute.pre_live_migration(c, instance=instance,
                                                   block_migration=False,
                                                   disk=None,
                                                   migrate_data=migrate_data)
-        self.assertIsNone(ret)
+        self.assertIs(migrate_data, ret)
         self.assertEqual(len(fake_notifier.NOTIFICATIONS), 2)
         msg = fake_notifier.NOTIFICATIONS[0]
         self.assertEqual(msg.event_type,
@@ -6171,7 +6172,9 @@ class ComputeTestCase(BaseTestCase,
 
         instance = self._create_fake_instance_obj(
             {'host': 'src_host',
-             'task_state': task_states.MIGRATING})
+             'task_state': task_states.MIGRATING,
+             'info_cache': objects.InstanceInfoCache(
+                 network_info=network_model.NetworkInfo([]))})
         updated_instance = self._create_fake_instance_obj(
             {'host': 'fake-dest-host'})
         dest_host = updated_instance['host']
@@ -6256,7 +6259,9 @@ class ComputeTestCase(BaseTestCase,
         # Confirm live_migration() works as expected correctly.
         # creating instance testdata
         c = context.get_admin_context()
-        instance = self._create_fake_instance_obj(context=c)
+        params = {'info_cache': objects.InstanceInfoCache(
+            network_info=network_model.NetworkInfo([]))}
+        instance = self._create_fake_instance_obj(params=params, context=c)
         instance.host = self.compute.host
 
         dest = 'desthost'
@@ -6310,7 +6315,9 @@ class ComputeTestCase(BaseTestCase,
         # Confirm live_migration() works as expected correctly.
         # creating instance testdata
         c = context.get_admin_context()
-        instance = self._create_fake_instance_obj(context=c)
+        params = {'info_cache': objects.InstanceInfoCache(
+            network_info=network_model.NetworkInfo([]))}
+        instance = self._create_fake_instance_obj(params=params, context=c)
         instance.host = self.compute.host
 
         dest = 'desthost'
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index ceaf72dda1ca..2578973a693e 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -20,6 +20,7 @@ import time
 from cinderclient import exceptions as cinder_exception
 from cursive import exception as cursive_exception
 from eventlet import event as eventlet_event
+from eventlet import timeout as eventlet_timeout
 import mock
 import netaddr
 from oslo_log import log as logging
@@ -7009,6 +7010,132 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
                 new_attachment_id)
         _test()
 
+    def test_get_neutron_events_for_live_migration_empty(self):
+        """Tests the various ways that _get_neutron_events_for_live_migration
+        will return an empty list.
+        """
+        nw = network_model.NetworkInfo([network_model.VIF(uuids.port1)])
+        # 1. no timeout
+        self.flags(vif_plugging_timeout=0)
+        self.assertEqual(
+            [], self.compute._get_neutron_events_for_live_migration(nw))
+        # 2. not neutron
+        self.flags(vif_plugging_timeout=300, use_neutron=False)
+        self.assertEqual(
+            [], self.compute._get_neutron_events_for_live_migration(nw))
+        # 3. no VIFs
+        self.flags(vif_plugging_timeout=300, use_neutron=True)
+        self.assertEqual(
+            [], self.compute._get_neutron_events_for_live_migration([]))
+
+    @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration')
+    @mock.patch('nova.compute.manager.ComputeManager._post_live_migration')
+    def test_live_migration_wait_vif_plugged(
+            self, mock_post_live_mig, mock_pre_live_mig):
+        """Tests the happy path of waiting for network-vif-plugged events from
+        neutron when so configured.
+        """
+        self.flags(live_migration_wait_for_vif_plug=True, group='compute')
+        migrate_data = objects.LibvirtLiveMigrateData()
+        mock_pre_live_mig.return_value = migrate_data
+        self.instance.info_cache = objects.InstanceInfoCache(
+            network_info=network_model.NetworkInfo([
+                network_model.VIF(uuids.port1), network_model.VIF(uuids.port2)
+            ]))
+        with mock.patch.object(self.compute.virtapi,
+                               'wait_for_instance_event') as wait_for_event:
+            self.compute._do_live_migration(
+                self.context, 'dest-host', self.instance, None, self.migration,
+                migrate_data)
+            self.assertEqual(2, len(wait_for_event.call_args[0][1]))
+            self.assertEqual(CONF.vif_plugging_timeout,
+                             wait_for_event.call_args[1]['deadline'])
+        mock_pre_live_mig.assert_called_once_with(
+            self.context, self.instance, None, None, 'dest-host',
+            migrate_data)
+
+    @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration')
+    @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration')
+    def test_live_migration_wait_vif_plugged_vif_plug_error(
+            self, mock_rollback_live_mig, mock_pre_live_mig):
+        """Tests the scenario where wait_for_instance_event fails with
+        VirtualInterfacePlugException.
+ """ + self.flags(live_migration_wait_for_vif_plug=True, group='compute') + migrate_data = objects.LibvirtLiveMigrateData() + mock_pre_live_mig.return_value = migrate_data + self.instance.info_cache = objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([ + network_model.VIF(uuids.port1)])) + with mock.patch.object( + self.compute.virtapi, + 'wait_for_instance_event') as wait_for_event: + wait_for_event.return_value.__enter__.side_effect = ( + exception.VirtualInterfacePlugException()) + self.assertRaises( + exception.VirtualInterfacePlugException, + self.compute._do_live_migration, self.context, 'dest-host', + self.instance, None, self.migration, migrate_data) + self.assertEqual('error', self.migration.status) + mock_rollback_live_mig.assert_called_once_with( + self.context, self.instance, 'dest-host', migrate_data) + + @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') + @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration') + def test_live_migration_wait_vif_plugged_timeout_error( + self, mock_rollback_live_mig, mock_pre_live_mig): + """Tests the scenario where wait_for_instance_event raises an + eventlet Timeout exception and we're configured such that vif plugging + failures are fatal (which is the default). + """ + self.flags(live_migration_wait_for_vif_plug=True, group='compute') + migrate_data = objects.LibvirtLiveMigrateData() + mock_pre_live_mig.return_value = migrate_data + self.instance.info_cache = objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([ + network_model.VIF(uuids.port1)])) + with mock.patch.object( + self.compute.virtapi, + 'wait_for_instance_event') as wait_for_event: + wait_for_event.return_value.__enter__.side_effect = ( + eventlet_timeout.Timeout()) + ex = self.assertRaises( + exception.MigrationError, self.compute._do_live_migration, + self.context, 'dest-host', self.instance, None, + self.migration, migrate_data) + self.assertIn('Timed out waiting for events', six.text_type(ex)) + self.assertEqual('error', self.migration.status) + mock_rollback_live_mig.assert_called_once_with( + self.context, self.instance, 'dest-host', migrate_data) + + @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') + @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration') + @mock.patch('nova.compute.manager.ComputeManager._post_live_migration') + def test_live_migration_wait_vif_plugged_timeout_non_fatal( + self, mock_post_live_mig, mock_rollback_live_mig, + mock_pre_live_mig): + """Tests the scenario where wait_for_instance_event raises an + eventlet Timeout exception and we're configured such that vif plugging + failures are NOT fatal. 
+ """ + self.flags(live_migration_wait_for_vif_plug=True, group='compute') + self.flags(vif_plugging_is_fatal=False) + migrate_data = objects.LibvirtLiveMigrateData() + mock_pre_live_mig.return_value = migrate_data + self.instance.info_cache = objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([ + network_model.VIF(uuids.port1)])) + with mock.patch.object( + self.compute.virtapi, + 'wait_for_instance_event') as wait_for_event: + wait_for_event.return_value.__enter__.side_effect = ( + eventlet_timeout.Timeout()) + self.compute._do_live_migration( + self.context, 'dest-host', self.instance, None, + self.migration, migrate_data) + self.assertEqual('running', self.migration.status) + mock_rollback_live_mig.assert_not_called() + def test_live_migration_force_complete_succeeded(self): migration = objects.Migration() migration.status = 'running' diff --git a/releasenotes/notes/live_migration_wait_for_vif_plug-c9dcb034067890d8.yaml b/releasenotes/notes/live_migration_wait_for_vif_plug-c9dcb034067890d8.yaml new file mode 100644 index 000000000000..dcb38fc50f62 --- /dev/null +++ b/releasenotes/notes/live_migration_wait_for_vif_plug-c9dcb034067890d8.yaml @@ -0,0 +1,19 @@ +--- +other: + - | + A new configuration option, ``[compute]/live_migration_wait_for_vif_plug``, + has been added which can be used to configure compute services to wait + for network interface plugging to complete on the destination host before + starting the guest transfer on the source host during live migration. + + If you set this option the same on all of your compute hosts, which you + should do if you use the same networking backend universally, you do not + have to worry about this. + + This is disabled by default for backward compatibilty and because the + compute service cannot reliably determine which types of virtual + interfaces (``port.binding:vif_type``) will send ``network-vif-plugged`` + events without an accompanying port ``binding:host_id`` change. + Open vSwitch and linuxbridge should be OK, but OpenDaylight is at least + one known backend that will not currently work in this case, see bug + https://launchpad.net/bugs/1755890 for more details.