From bfe89fec4669df6f4ac48dbb56fde3db0a24cbac Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Fri, 30 Mar 2018 17:16:08 -0400 Subject: [PATCH] Wait for network-vif-plugged before starting live migration This adds a new config option which is read on the source host during pre_live_migration which can be used to determine if it should wait for a "network-vif-plugged" event due to VIFs being plugged on the destination host. This helps us to avoid the guest transfer at all if vif plugging failed on the dest host, which we just wouldn't find out until post live migration and then we have to rollback. The option is disabled by default for backward compatibility and also because certain networking backends, like OpenDaylight, are known to not send network-vif-plugged events unless the port host binding information changes, which for live migration doesn't happen until after the guest is transferred to the destination host. Related to blueprint neutron-new-port-binding-api Related-Bug: #1786346 NOTE(danms): Stable-only changes to this patch from master include removing the RPC-related communication from the destination to the source node. As such, the new option is read on the source node so the conf option help and release note are updated. This is OK before Rocky since we don't claim support to live migrate between different networking backends (vif types), so operators would need to set the option universally, or at least have host aggregates in place if they are using different network types. Conflicts: nova/conf/compute.py nova/tests/unit/objects/test_migrate_data.py Change-Id: I0f3ab6604d8b79bdb75cf67571e359cfecc039d8 (cherry picked from commit 5aadff75c3ac4f2019838600df6580481a96db0f) --- nova/compute/manager.py | 84 ++++++++++-- nova/conf/compute.py | 51 ++++++- nova/tests/unit/compute/test_compute.py | 17 ++- nova/tests/unit/compute/test_compute_mgr.py | 127 ++++++++++++++++++ ...on_wait_for_vif_plug-c9dcb034067890d8.yaml | 19 +++ 5 files changed, 283 insertions(+), 15 deletions(-) create mode 100644 releasenotes/notes/live_migration_wait_for_vif_plug-c9dcb034067890d8.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index d3405f1ef4d0..f953013f804c 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -6020,6 +6020,53 @@ class ComputeManager(manager.Manager): LOG.debug('pre_live_migration result data is %s', migrate_data) return migrate_data + @staticmethod + def _neutron_failed_live_migration_callback(event_name, instance): + msg = ('Neutron reported failure during live migration ' + 'with %(event)s for instance %(uuid)s') + msg_args = {'event': event_name, 'uuid': instance.uuid} + if CONF.vif_plugging_is_fatal: + raise exception.VirtualInterfacePlugException(msg % msg_args) + LOG.error(msg, msg_args) + + @staticmethod + def _get_neutron_events_for_live_migration(instance): + # We don't generate events if CONF.vif_plugging_timeout=0 + # or if waiting during live migration is disabled, + # meaning that the operator disabled using them. + if (CONF.vif_plugging_timeout and utils.is_neutron() and + CONF.compute.live_migration_wait_for_vif_plug): + return [('network-vif-plugged', vif['id']) + for vif in instance.get_network_info()] + else: + return [] + + def _cleanup_pre_live_migration(self, context, dest, instance, + migration, migrate_data): + """Helper method for when pre_live_migration fails + + Sets the migration status to "error" and rolls back the live migration + setup on the destination host. + + :param context: The user request context. 
+ :type context: nova.context.RequestContext + :param dest: The live migration destination hostname. + :type dest: str + :param instance: The instance being live migrated. + :type instance: nova.objects.Instance + :param migration: The migration record tracking this live migration. + :type migration: nova.objects.Migration + :param migrate_data: Data about the live migration, populated from + the destination host. + :type migrate_data: Subclass of nova.objects.LiveMigrateData + """ + self._set_migration_status(migration, 'error') + # Make sure we set this for _rollback_live_migration() + # so it can find it, as expected if it was called later + migrate_data.migration = migration + self._rollback_live_migration(context, instance, dest, + migrate_data) + def _do_live_migration(self, context, dest, instance, block_migration, migration, migrate_data): # NOTE(danms): We should enhance the RT to account for migrations @@ -6028,6 +6075,7 @@ class ComputeManager(manager.Manager): # reporting self._set_migration_status(migration, 'preparing') + events = self._get_neutron_events_for_live_migration(instance) try: if ('block_migration' in migrate_data and migrate_data.block_migration): @@ -6038,19 +6086,37 @@ class ComputeManager(manager.Manager): else: disk = None - migrate_data = self.compute_rpcapi.pre_live_migration( - context, instance, - block_migration, disk, dest, migrate_data) + deadline = CONF.vif_plugging_timeout + error_cb = self._neutron_failed_live_migration_callback + # In order to avoid a race with the vif plugging that the virt + # driver does on the destination host, we register our events + # to wait for before calling pre_live_migration. + with self.virtapi.wait_for_instance_event( + instance, events, deadline=deadline, + error_callback=error_cb): + migrate_data = self.compute_rpcapi.pre_live_migration( + context, instance, + block_migration, disk, dest, migrate_data) + except exception.VirtualInterfacePlugException: + with excutils.save_and_reraise_exception(): + LOG.exception('Failed waiting for network virtual interfaces ' + 'to be plugged on the destination host %s.', + dest, instance=instance) + self._cleanup_pre_live_migration( + context, dest, instance, migration, migrate_data) + except eventlet.timeout.Timeout: + msg = 'Timed out waiting for events: %s' + LOG.warning(msg, events, instance=instance) + if CONF.vif_plugging_is_fatal: + self._cleanup_pre_live_migration( + context, dest, instance, migration, migrate_data) + raise exception.MigrationError(reason=msg % events) except Exception: with excutils.save_and_reraise_exception(): LOG.exception('Pre live migration failed at %s', dest, instance=instance) - self._set_migration_status(migration, 'error') - # Make sure we set this for _rollback_live_migration() - # so it can find it, as expected if it was called later - migrate_data.migration = migration - self._rollback_live_migration(context, instance, dest, - migrate_data) + self._cleanup_pre_live_migration( + context, dest, instance, migration, migrate_data) self._set_migration_status(migration, 'running') diff --git a/nova/conf/compute.py b/nova/conf/compute.py index 38921f54dae8..6cae12383f72 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -657,7 +657,56 @@ node. Possible values: * Any positive integer in seconds. -""") +"""), + cfg.BoolOpt('live_migration_wait_for_vif_plug', + # TODO(mriedem): Change to default=True starting in Stein. 
+ default=False, + help=""" +Determine if the source compute host should wait for a ``network-vif-plugged`` +event from the (neutron) networking service before starting the actual transfer +of the guest to the destination compute host. + +If you set this option the same on all of your compute hosts, which you should +do if you use the same networking backend universally, you do not have to +worry about this. + +Before starting the transfer of the guest, some setup occurs on the destination +compute host, including plugging virtual interfaces. Depending on the +networking backend **on the destination host**, a ``network-vif-plugged`` +event may be triggered and then received on the source compute host and the +source compute can wait for that event to ensure networking is set up on the +destination host before starting the guest transfer in the hypervisor. + +By default, this is False for two reasons: + +1. Backward compatibility: deployments should test this out and ensure it works + for them before enabling it. + +2. The compute service cannot reliably determine which types of virtual + interfaces (``port.binding:vif_type``) will send ``network-vif-plugged`` + events without an accompanying port ``binding:host_id`` change. + Open vSwitch and linuxbridge should be OK, but OpenDaylight is at least + one known backend that will not currently work in this case, see bug + https://launchpad.net/bugs/1755890 for more details. + +Possible values: + +* True: wait for ``network-vif-plugged`` events before starting guest transfer +* False: do not wait for ``network-vif-plugged`` events before starting guest + transfer (this is how things have always worked before this option + was introduced) + +Related options: + +* [DEFAULT]/vif_plugging_is_fatal: if ``live_migration_wait_for_vif_plug`` is + True and ``vif_plugging_timeout`` is greater than 0, and a timeout is + reached, the live migration process will fail with an error but the guest + transfer will not have started to the destination host +* [DEFAULT]/vif_plugging_timeout: if ``live_migration_wait_for_vif_plug`` is + True, this controls the amount of time to wait before timing out and either + failing if ``vif_plugging_is_fatal`` is True, or simply continuing with the + live migration +"""), ] interval_opts = [ diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index d838df7ede32..66b387c044c2 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -6116,15 +6116,16 @@ class ComputeTestCase(BaseTestCase, fake_notifier.NOTIFICATIONS = [] migrate_data = objects.LibvirtLiveMigrateData( is_shared_instance_path=False) - mock_pre.return_value = None + mock_pre.return_value = migrate_data with mock.patch.object(self.compute.network_api, 'setup_networks_on_host') as mock_setup: + self.flags(live_migration_wait_for_vif_plug=True, group='compute') ret = self.compute.pre_live_migration(c, instance=instance, block_migration=False, disk=None, migrate_data=migrate_data) - self.assertIsNone(ret) + self.assertIs(migrate_data, ret) self.assertEqual(len(fake_notifier.NOTIFICATIONS), 2) msg = fake_notifier.NOTIFICATIONS[0] self.assertEqual(msg.event_type, @@ -6171,7 +6172,9 @@ class ComputeTestCase(BaseTestCase, instance = self._create_fake_instance_obj( {'host': 'src_host', - 'task_state': task_states.MIGRATING}) + 'task_state': task_states.MIGRATING, + 'info_cache': objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([]))}) updated_instance = 
self._create_fake_instance_obj( {'host': 'fake-dest-host'}) dest_host = updated_instance['host'] @@ -6256,7 +6259,9 @@ class ComputeTestCase(BaseTestCase, # Confirm live_migration() works as expected correctly. # creating instance testdata c = context.get_admin_context() - instance = self._create_fake_instance_obj(context=c) + params = {'info_cache': objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([]))} + instance = self._create_fake_instance_obj(params=params, context=c) instance.host = self.compute.host dest = 'desthost' @@ -6310,7 +6315,9 @@ class ComputeTestCase(BaseTestCase, # Confirm live_migration() works as expected correctly. # creating instance testdata c = context.get_admin_context() - instance = self._create_fake_instance_obj(context=c) + params = {'info_cache': objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([]))} + instance = self._create_fake_instance_obj(params=params, context=c) instance.host = self.compute.host dest = 'desthost' diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index ceaf72dda1ca..2578973a693e 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -20,6 +20,7 @@ import time from cinderclient import exceptions as cinder_exception from cursive import exception as cursive_exception from eventlet import event as eventlet_event +from eventlet import timeout as eventlet_timeout import mock import netaddr from oslo_log import log as logging @@ -7009,6 +7010,132 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase): new_attachment_id) _test() + def test_get_neutron_events_for_live_migration_empty(self): + """Tests the various ways that _get_neutron_events_for_live_migration + will return an empty list. + """ + nw = network_model.NetworkInfo([network_model.VIF(uuids.port1)]) + # 1. no timeout + self.flags(vif_plugging_timeout=0) + self.assertEqual( + [], self.compute._get_neutron_events_for_live_migration(nw)) + # 2. not neutron + self.flags(vif_plugging_timeout=300, use_neutron=False) + self.assertEqual( + [], self.compute._get_neutron_events_for_live_migration(nw)) + # 3. no VIFs + self.flags(vif_plugging_timeout=300, use_neutron=True) + self.assertEqual( + [], self.compute._get_neutron_events_for_live_migration([])) + + @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') + @mock.patch('nova.compute.manager.ComputeManager._post_live_migration') + def test_live_migration_wait_vif_plugged( + self, mock_post_live_mig, mock_pre_live_mig): + """Tests the happy path of waiting for network-vif-plugged events from + neutron when so configured. 
+ """ + self.flags(live_migration_wait_for_vif_plug=True, group='compute') + migrate_data = objects.LibvirtLiveMigrateData() + mock_pre_live_mig.return_value = migrate_data + self.instance.info_cache = objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([ + network_model.VIF(uuids.port1), network_model.VIF(uuids.port2) + ])) + with mock.patch.object(self.compute.virtapi, + 'wait_for_instance_event') as wait_for_event: + self.compute._do_live_migration( + self.context, 'dest-host', self.instance, None, self.migration, + migrate_data) + self.assertEqual(2, len(wait_for_event.call_args[0][1])) + self.assertEqual(CONF.vif_plugging_timeout, + wait_for_event.call_args[1]['deadline']) + mock_pre_live_mig.assert_called_once_with( + self.context, self.instance, None, None, 'dest-host', + migrate_data) + + @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') + @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration') + def test_live_migration_wait_vif_plugged_vif_plug_error( + self, mock_rollback_live_mig, mock_pre_live_mig): + """Tests the scenario where wait_for_instance_event fails with + VirtualInterfacePlugException. + """ + self.flags(live_migration_wait_for_vif_plug=True, group='compute') + migrate_data = objects.LibvirtLiveMigrateData() + mock_pre_live_mig.return_value = migrate_data + self.instance.info_cache = objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([ + network_model.VIF(uuids.port1)])) + with mock.patch.object( + self.compute.virtapi, + 'wait_for_instance_event') as wait_for_event: + wait_for_event.return_value.__enter__.side_effect = ( + exception.VirtualInterfacePlugException()) + self.assertRaises( + exception.VirtualInterfacePlugException, + self.compute._do_live_migration, self.context, 'dest-host', + self.instance, None, self.migration, migrate_data) + self.assertEqual('error', self.migration.status) + mock_rollback_live_mig.assert_called_once_with( + self.context, self.instance, 'dest-host', migrate_data) + + @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') + @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration') + def test_live_migration_wait_vif_plugged_timeout_error( + self, mock_rollback_live_mig, mock_pre_live_mig): + """Tests the scenario where wait_for_instance_event raises an + eventlet Timeout exception and we're configured such that vif plugging + failures are fatal (which is the default). 
+ """ + self.flags(live_migration_wait_for_vif_plug=True, group='compute') + migrate_data = objects.LibvirtLiveMigrateData() + mock_pre_live_mig.return_value = migrate_data + self.instance.info_cache = objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([ + network_model.VIF(uuids.port1)])) + with mock.patch.object( + self.compute.virtapi, + 'wait_for_instance_event') as wait_for_event: + wait_for_event.return_value.__enter__.side_effect = ( + eventlet_timeout.Timeout()) + ex = self.assertRaises( + exception.MigrationError, self.compute._do_live_migration, + self.context, 'dest-host', self.instance, None, + self.migration, migrate_data) + self.assertIn('Timed out waiting for events', six.text_type(ex)) + self.assertEqual('error', self.migration.status) + mock_rollback_live_mig.assert_called_once_with( + self.context, self.instance, 'dest-host', migrate_data) + + @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') + @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration') + @mock.patch('nova.compute.manager.ComputeManager._post_live_migration') + def test_live_migration_wait_vif_plugged_timeout_non_fatal( + self, mock_post_live_mig, mock_rollback_live_mig, + mock_pre_live_mig): + """Tests the scenario where wait_for_instance_event raises an + eventlet Timeout exception and we're configured such that vif plugging + failures are NOT fatal. + """ + self.flags(live_migration_wait_for_vif_plug=True, group='compute') + self.flags(vif_plugging_is_fatal=False) + migrate_data = objects.LibvirtLiveMigrateData() + mock_pre_live_mig.return_value = migrate_data + self.instance.info_cache = objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([ + network_model.VIF(uuids.port1)])) + with mock.patch.object( + self.compute.virtapi, + 'wait_for_instance_event') as wait_for_event: + wait_for_event.return_value.__enter__.side_effect = ( + eventlet_timeout.Timeout()) + self.compute._do_live_migration( + self.context, 'dest-host', self.instance, None, + self.migration, migrate_data) + self.assertEqual('running', self.migration.status) + mock_rollback_live_mig.assert_not_called() + def test_live_migration_force_complete_succeeded(self): migration = objects.Migration() migration.status = 'running' diff --git a/releasenotes/notes/live_migration_wait_for_vif_plug-c9dcb034067890d8.yaml b/releasenotes/notes/live_migration_wait_for_vif_plug-c9dcb034067890d8.yaml new file mode 100644 index 000000000000..dcb38fc50f62 --- /dev/null +++ b/releasenotes/notes/live_migration_wait_for_vif_plug-c9dcb034067890d8.yaml @@ -0,0 +1,19 @@ +--- +other: + - | + A new configuration option, ``[compute]/live_migration_wait_for_vif_plug``, + has been added which can be used to configure compute services to wait + for network interface plugging to complete on the destination host before + starting the guest transfer on the source host during live migration. + + If you set this option the same on all of your compute hosts, which you + should do if you use the same networking backend universally, you do not + have to worry about this. + + This is disabled by default for backward compatibilty and because the + compute service cannot reliably determine which types of virtual + interfaces (``port.binding:vif_type``) will send ``network-vif-plugged`` + events without an accompanying port ``binding:host_id`` change. 
+ Open vSwitch and linuxbridge should be OK, but OpenDaylight is at least
+ one known backend that will not currently work in this case; see bug
+ https://launchpad.net/bugs/1755890 for more details.