From 1f48d3d83b4d5f6f9cd96ee06d2fc005635c1ff9 Mon Sep 17 00:00:00 2001 From: Sivasathurappan Radhakrishnan Date: Thu, 16 Feb 2017 12:51:45 +0000 Subject: [PATCH] Port binding based on events during live migration Currently the port binding call is made at the destination compute in the post live migration phase. This may cause a network outage during post-copy as the virtual CPUs are paused immediately at the source and unpaused at the destination by transferring a minimum set of pages. The following domain life cycle events are emitted in this order during post-copy: * VIR_DOMAIN_EVENT_STARTED(destination) * VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY(source)--migration entered post-copy mode * VIR_DOMAIN_EVENT_RESUMED_POSTCOPY(destination)--guest is running on the destination host while some of its memory pages still remain on the source host. * VIR_DOMAIN_EVENT_RESUMED_MIGRATED(destination) * VIR_DOMAIN_EVENT_STOPPED_MIGRATED(source)--migration finished successfully and the destination host holds a complete guest state. In this change, dest host port binding activation is done when the following events are emitted at the source for post-copy and pre-copy: * VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY * VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED This reduces the network outage during live migration as the network switch is done right before the VM resumes at the destination. 
Co-Authored-By: Matt Riedemann Change-Id: Ic5cab99944df9e501ba2032eb96911c36304494d Closes-Bug: #1605016 --- nova/compute/manager.py | 49 ++++++++++++--- nova/tests/unit/compute/test_compute_mgr.py | 70 +++++++++++++++++++-- nova/tests/unit/virt/libvirt/test_host.py | 40 ++++++++++++ nova/virt/event.py | 5 ++ nova/virt/libvirt/host.py | 15 ++++- 5 files changed, 165 insertions(+), 14 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 8369ef858024..1c15dcaebc06 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1051,22 +1051,27 @@ class ComputeManager(manager.Manager): {'state': event.get_name()}, instance_uuid=event.get_instance_uuid()) context = nova.context.get_admin_context(read_deleted='yes') + # Join on info_cache since that's needed in migrate_instance_start. instance = objects.Instance.get_by_uuid(context, event.get_instance_uuid(), - expected_attrs=[]) + expected_attrs=['info_cache']) vm_power_state = None - if event.get_transition() == virtevent.EVENT_LIFECYCLE_STOPPED: + event_transition = event.get_transition() + if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED: vm_power_state = power_state.SHUTDOWN - elif event.get_transition() == virtevent.EVENT_LIFECYCLE_STARTED: + elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED: vm_power_state = power_state.RUNNING - elif event.get_transition() == virtevent.EVENT_LIFECYCLE_PAUSED: + elif event_transition in ( + virtevent.EVENT_LIFECYCLE_PAUSED, + virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED, + virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED): vm_power_state = power_state.PAUSED - elif event.get_transition() == virtevent.EVENT_LIFECYCLE_RESUMED: + elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED: vm_power_state = power_state.RUNNING - elif event.get_transition() == virtevent.EVENT_LIFECYCLE_SUSPENDED: + elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED: vm_power_state = power_state.SUSPENDED else: - LOG.warning("Unexpected power 
state %d", event.get_transition()) + LOG.warning("Unexpected lifecycle event: %d", event_transition) # Note(lpetrut): The event may be delayed, thus not reflecting # the current instance power state. In that case, ignore the event. @@ -1087,6 +1092,36 @@ class ComputeManager(manager.Manager): instance, vm_power_state) + # The following checks are for live migration. We want to activate + # the port binding for the destination host before the live migration + # is resumed on the destination host in order to reduce network + # downtime. Otherwise the ports are bound to the destination host + # in post_live_migration_at_destination. + migrate_finish_statuses = { + # This happens on the source node and indicates live migration + # entered post-copy mode. + virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)', + # Suspended for offline migration. + virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running' + } + if (instance.task_state == task_states.MIGRATING and + event_transition in migrate_finish_statuses): + status = migrate_finish_statuses[event_transition] + try: + migration = objects.Migration.get_by_instance_and_status( + context, instance.uuid, status) + LOG.debug('Binding ports to destination host: %s', + migration.dest_compute, instance=instance) + # For neutron, migrate_instance_start will activate the + # destination host port bindings, if there are any created by + # conductor before live migration started. + self.network_api.migrate_instance_start( + context, instance, migration) + except exception.MigrationNotFoundByStatus: + LOG.warning("Unable to find migration record with status " + "'%s' for instance. 
Port binding will happen in " + "post live migration.", status, instance=instance) + def handle_events(self, event): if isinstance(event, virtevent.LifecycleEvent): try: diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index d23417d8ffce..6b9b87efe7a8 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -92,24 +92,49 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): @mock.patch.object(manager.ComputeManager, '_get_power_state') @mock.patch.object(manager.ComputeManager, '_sync_instance_power_state') @mock.patch.object(objects.Instance, 'get_by_uuid') - def _test_handle_lifecycle_event(self, mock_get, mock_sync, - mock_get_power_state, transition, - event_pwr_state, current_pwr_state): + @mock.patch.object(objects.Migration, 'get_by_instance_and_status') + @mock.patch.object(nova.network.neutronv2.api.API, + 'migrate_instance_start') + def _test_handle_lifecycle_event(self, migrate_instance_start, + mock_get_migration, mock_get, + mock_sync, mock_get_power_state, + transition, event_pwr_state, + current_pwr_state): event = mock.Mock() - event.get_instance_uuid.return_value = mock.sentinel.uuid + mock_get.return_value = fake_instance.fake_instance_obj(self.context, + task_state=task_states.MIGRATING) event.get_transition.return_value = transition mock_get_power_state.return_value = current_pwr_state self.compute.handle_lifecycle_event(event) + mock_get.assert_called_once_with( + test.MatchType(context.RequestContext), + event.get_instance_uuid.return_value, + expected_attrs=['info_cache']) - mock_get.assert_called_with(mock.ANY, mock.sentinel.uuid, - expected_attrs=[]) if event_pwr_state == current_pwr_state: mock_sync.assert_called_with(mock.ANY, mock_get.return_value, event_pwr_state) else: self.assertFalse(mock_sync.called) + migrate_finish_statuses = { + virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)', + 
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running' + } + if transition in migrate_finish_statuses: + mock_get_migration.assert_called_with( + test.MatchType(context.RequestContext), + mock_get.return_value.uuid, + migrate_finish_statuses[transition]) + migrate_instance_start.assert_called_once_with( + test.MatchType(context.RequestContext), + mock_get.return_value, + mock_get_migration.return_value) + else: + mock_get_migration.assert_not_called() + migrate_instance_start.assert_not_called() + def test_handle_lifecycle_event(self): event_map = {virtevent.EVENT_LIFECYCLE_STOPPED: power_state.SHUTDOWN, virtevent.EVENT_LIFECYCLE_STARTED: power_state.RUNNING, @@ -117,6 +142,10 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): virtevent.EVENT_LIFECYCLE_RESUMED: power_state.RUNNING, virtevent.EVENT_LIFECYCLE_SUSPENDED: power_state.SUSPENDED, + virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: + power_state.PAUSED, + virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: + power_state.PAUSED, } for transition, pwr_state in event_map.items(): @@ -130,6 +159,35 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): event_pwr_state=power_state.SHUTDOWN, current_pwr_state=power_state.RUNNING) + @mock.patch('nova.objects.Instance.get_by_uuid') + @mock.patch('nova.compute.manager.ComputeManager.' + '_sync_instance_power_state') + @mock.patch('nova.objects.Migration.get_by_instance_and_status', + side_effect=exception.MigrationNotFoundByStatus( + instance_id=uuids.instance, status='running (post-copy)')) + def test_handle_lifecycle_event_postcopy_migration_not_found( + self, mock_get_migration, mock_sync, mock_get_instance): + """Tests a EVENT_LIFECYCLE_POSTCOPY_STARTED scenario where the + migration record is not found by the expected status. 
+ """ + inst = fake_instance.fake_instance_obj( + self.context, uuid=uuids.instance, + task_state=task_states.MIGRATING) + mock_get_instance.return_value = inst + event = virtevent.LifecycleEvent( + uuids.instance, virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED) + with mock.patch.object(self.compute, '_get_power_state', + return_value=power_state.PAUSED): + with mock.patch.object(self.compute.network_api, + 'migrate_instance_finish') as mig_finish: + self.compute.handle_lifecycle_event(event) + # Since we failed to find the migration record, we shouldn't call + # migrate_instance_finish. + mig_finish.assert_not_called() + mock_get_migration.assert_called_once_with( + test.MatchType(context.RequestContext), uuids.instance, + 'running (post-copy)') + @mock.patch('nova.compute.utils.notify_about_instance_action') def test_delete_instance_info_cache_delete_ordering(self, mock_notify): call_tracker = mock.Mock() diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py index 97d16b72d7ea..f138c259ef9d 100644 --- a/nova/tests/unit/virt/libvirt/test_host.py +++ b/nova/tests/unit/virt/libvirt/test_host.py @@ -192,6 +192,46 @@ class HostTestCase(test.NoDBTestCase): self.assertEqual(got_events[0].transition, event.EVENT_LIFECYCLE_STOPPED) + def test_event_lifecycle_callback_suspended_old_libvirt(self): + """Tests the suspended lifecycle event with libvirt before post-copy + """ + hostimpl = mock.MagicMock() + conn = mock.MagicMock() + fake_dom_xml = """ + + cef19ce0-0ca2-11df-855d-b19fbce37686 + + """ + dom = fakelibvirt.Domain(conn, fake_dom_xml, running=True) + VIR_DOMAIN_EVENT_SUSPENDED_PAUSED = 0 + host.Host._event_lifecycle_callback( + conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED, + detail=VIR_DOMAIN_EVENT_SUSPENDED_PAUSED, opaque=hostimpl) + expected_event = hostimpl._queue_event.call_args[0][0] + self.assertEqual(event.EVENT_LIFECYCLE_PAUSED, + expected_event.transition) + + def 
test_event_lifecycle_callback_suspended_postcopy(self): + """Tests the suspended lifecycle event with libvirt with post-copy""" + hostimpl = mock.MagicMock() + conn = mock.MagicMock() + fake_dom_xml = """ + + cef19ce0-0ca2-11df-855d-b19fbce37686 + + """ + dom = fakelibvirt.Domain(conn, fake_dom_xml, running=True) + VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY = 7 + with mock.patch.object(host.libvirt, + 'VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY', new=7, + create=True): + host.Host._event_lifecycle_callback( + conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED, + detail=VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY, opaque=hostimpl) + expected_event = hostimpl._queue_event.call_args[0][0] + self.assertEqual(event.EVENT_LIFECYCLE_POSTCOPY_STARTED, + expected_event.transition) + def test_event_emit_delayed_call_delayed(self): ev = event.LifecycleEvent( "cef19ce0-0ca2-11df-855d-b19fbce37686", diff --git a/nova/virt/event.py b/nova/virt/event.py index 3b065bcb4226..6e4e01edc66c 100644 --- a/nova/virt/event.py +++ b/nova/virt/event.py @@ -29,6 +29,9 @@ EVENT_LIFECYCLE_STOPPED = 1 EVENT_LIFECYCLE_PAUSED = 2 EVENT_LIFECYCLE_RESUMED = 3 EVENT_LIFECYCLE_SUSPENDED = 4 +EVENT_LIFECYCLE_POSTCOPY_STARTED = 5 +EVENT_LIFECYCLE_MIGRATION_COMPLETED = 6 + NAMES = { EVENT_LIFECYCLE_STARTED: _('Started'), @@ -36,6 +39,8 @@ NAMES = { EVENT_LIFECYCLE_PAUSED: _('Paused'), EVENT_LIFECYCLE_RESUMED: _('Resumed'), EVENT_LIFECYCLE_SUSPENDED: _('Suspended'), + EVENT_LIFECYCLE_POSTCOPY_STARTED: _('Postcopy started'), + EVENT_LIFECYCLE_MIGRATION_COMPLETED: _('Migration completed'), } diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py index 1c6916a01081..c0ea726bcdc7 100644 --- a/nova/virt/libvirt/host.py +++ b/nova/virt/libvirt/host.py @@ -170,7 +170,20 @@ class Host(object): elif event == libvirt.VIR_DOMAIN_EVENT_STARTED: transition = virtevent.EVENT_LIFECYCLE_STARTED elif event == libvirt.VIR_DOMAIN_EVENT_SUSPENDED: - transition = virtevent.EVENT_LIFECYCLE_PAUSED + # NOTE(siva_krishnan): We have to 
check if + # VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY and + # VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED exist since the current + # minimum version of libvirt (1.2.9) don't have those attributes. + # This check can be removed once MIN_LIBVIRT_VERSION is bumped to + # at least 1.3.3. + if (hasattr(libvirt, 'VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY') and + detail == libvirt.VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY): + transition = virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED + elif (hasattr(libvirt, 'VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED') and + detail == libvirt.VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED): + transition = virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED + else: + transition = virtevent.EVENT_LIFECYCLE_PAUSED elif event == libvirt.VIR_DOMAIN_EVENT_RESUMED: transition = virtevent.EVENT_LIFECYCLE_RESUMED