diff --git a/doc/source/admin/live-migration-usage.rst b/doc/source/admin/live-migration-usage.rst index 85bbbafc20e2..4ad048d057a5 100644 --- a/doc/source/admin/live-migration-usage.rst +++ b/doc/source/admin/live-migration-usage.rst @@ -218,9 +218,25 @@ What to do when the migration times out During the migration process, the instance may write to a memory page after that page has been copied to the destination. When that happens, the same page has to be copied again. The instance may write to memory pages faster than they -can be copied, so that the migration cannot complete. The Compute service will -cancel it when the ``live_migration_completion_timeout``, a configuration -parameter, is reached. +can be copied, so that the migration cannot complete. There are two optional +actions, controlled by +:oslo.config:option:`libvirt.live_migration_timeout_action`, which can be +taken against a VM after +:oslo.config:option:`libvirt.live_migration_completion_timeout` is reached: + +1. ``abort`` (default): The live migration operation will be cancelled after + the completion timeout is reached. This is similar to using API + ``DELETE /servers/{server_id}/migrations/{migration_id}``. + +2. ``force_complete``: The compute service will either pause the VM or trigger + post-copy depending on whether post-copy is enabled and available + (:oslo.config:option:`libvirt.live_migration_permit_post_copy` is set to + ``True``). This is similar to using API + ``POST /servers/{server_id}/migrations/{migration_id}/action (force_complete)``. + +You can also read the +:oslo.config:option:`libvirt.live_migration_timeout_action` +configuration option help for more details. The following remarks assume the KVM/Libvirt hypervisor. diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py index 68bfc11abdf9..70f7c0b076f2 100644 --- a/nova/conf/libvirt.py +++ b/nova/conf/libvirt.py @@ -375,6 +375,7 @@ transferred, with lower bound of a minimum of 2 GiB per device.
"""), cfg.IntOpt('live_migration_completion_timeout', default=800, + min=0, mutable=True, help=""" Time to wait, in seconds, for migration to successfully complete transferring @@ -407,6 +408,23 @@ Set to 0 to disable timeouts. This is deprecated, and now disabled by default because we have found serious bugs in this feature that caused false live-migration timeout failures. This feature will be removed or replaced in a future release. +"""), + cfg.StrOpt('live_migration_timeout_action', + default='abort', + choices=('abort', 'force_complete'), + mutable=True, + help=""" +This option will be used to determine what action will be taken against a +VM after ``live_migration_completion_timeout`` expires. By default, the live +migrate operation will be aborted after completion timeout. If it is set to +``force_complete``, the compute service will either pause the VM or trigger +post-copy depending on if post copy is enabled and available +(``live_migration_permit_post_copy`` is set to True). + +Related options: + +* live_migration_completion_timeout +* live_migration_permit_post_copy """), cfg.BoolOpt('live_migration_permit_post_copy', default=False, @@ -418,7 +436,12 @@ needs to be transferred. Post-copy requires libvirt>=1.3.3 and QEMU>=2.5.0. When permitted, post-copy mode will be automatically activated if a live-migration memory copy iteration does not make percentage increase of at -least 10% over the last iteration. +least 10% over the last iteration, or will be automatically activated if +we reach the timeout defined by ``live_migration_completion_timeout`` and +``live_migration_timeout_action`` is set to 'force_complete'. Note if you +change to no timeout or choose to use 'abort', +i.e. ``live_migration_completion_timeout = 0``, then there will be no +automatic switch to post-copy. The live-migration force complete API also uses post-copy when permitted. 
If post-copy mode is not available, force complete falls back to pausing the VM @@ -430,7 +453,8 @@ details, please see the Administration guide. Related options: - * live_migration_permit_auto_converge +* live_migration_permit_auto_converge +* live_migration_timeout_action """), cfg.BoolOpt('live_migration_permit_auto_converge', default=False, diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 559f7025ee2f..b7aba9d742db 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -11456,6 +11456,38 @@ class LibvirtConnTestCase(test.NoDBTestCase, self.EXPECT_SUCCESS, expected_switch=True) + @mock.patch.object(libvirt_driver.LibvirtDriver, + "_is_post_copy_enabled") + def test_live_migration_monitor_force_complete_postcopy(self, + mock_postcopy_enabled): + self.flags(live_migration_completion_timeout=40, + live_migration_timeout_action='force_complete', + group='libvirt') + mock_postcopy_enabled.return_value = True + + # Each one of these fake times is used for time.time() + # when a new domain_info_records entry is consumed. 
+ fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320] + + domain_info_records = [ + libvirt_guest.JobInfo( + type=fakelibvirt.VIR_DOMAIN_JOB_NONE), + libvirt_guest.JobInfo( + type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED), + libvirt_guest.JobInfo( + type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED), + libvirt_guest.JobInfo( + type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED), + "thread-finish", + "domain-stop", + libvirt_guest.JobInfo( + type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED), + ] + + self._test_live_migration_monitoring(domain_info_records, fake_times, + self.EXPECT_SUCCESS, + expected_switch=True) + @mock.patch.object(host.Host, "get_connection") @mock.patch.object(utils, "spawn") @mock.patch.object(libvirt_driver.LibvirtDriver, "_live_migration_monitor") diff --git a/nova/tests/unit/virt/libvirt/test_migration.py b/nova/tests/unit/virt/libvirt/test_migration.py index 7dbdc3cbef2c..4c2fedf548f7 100644 --- a/nova/tests/unit/virt/libvirt/test_migration.py +++ b/nova/tests/unit/virt/libvirt/test_migration.py @@ -976,51 +976,33 @@ class MigrationMonitorTestCase(test.NoDBTestCase): def test_live_migration_abort_stuck(self): # Progress time exceeds progress timeout - self.assertTrue(migration.should_abort(self.instance, - 5000, - 1000, 2000, - 4500, 9000, - "running")) + self.assertTrue(migration.should_trigger_timeout_action( + self.instance, 5000, 1000, 2000, 4500, 9000, "running")) def test_live_migration_abort_no_prog_timeout(self): # Progress timeout is disabled - self.assertFalse(migration.should_abort(self.instance, - 5000, - 1000, 0, - 4500, 9000, - "running")) + self.assertFalse(migration.should_trigger_timeout_action( + self.instance, 5000, 1000, 0, 4500, 9000, "running")) def test_live_migration_abort_not_stuck(self): # Progress time is less than progress timeout - self.assertFalse(migration.should_abort(self.instance, - 5000, - 4500, 2000, - 4500, 9000, - "running")) + self.assertFalse(migration.should_trigger_timeout_action( + self.instance, 5000, 4500, 2000, 4500, 
9000, "running")) def test_live_migration_abort_too_long(self): # Elapsed time is over completion timeout - self.assertTrue(migration.should_abort(self.instance, - 5000, - 4500, 2000, - 4500, 2000, - "running")) + self.assertTrue(migration.should_trigger_timeout_action( + self.instance, 5000, 4500, 2000, 4500, 2000, "running")) def test_live_migration_abort_no_comp_timeout(self): # Completion timeout is disabled - self.assertFalse(migration.should_abort(self.instance, - 5000, - 4500, 2000, - 4500, 0, - "running")) + self.assertFalse(migration.should_trigger_timeout_action( + self.instance, 5000, 4500, 2000, 4500, 0, "running")) def test_live_migration_abort_still_working(self): # Elapsed time is less than completion timeout - self.assertFalse(migration.should_abort(self.instance, - 5000, - 4500, 2000, - 4500, 9000, - "running")) + self.assertFalse(migration.should_trigger_timeout_action( + self.instance, 5000, 4500, 2000, 4500, 9000, "running")) def test_live_migration_postcopy_switch(self): # Migration progress is not fast enough diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 16a018a31842..df67c4c5417b 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -7384,18 +7384,28 @@ class LibvirtDriver(driver.ComputeDriver): progress_timeout = CONF.libvirt.live_migration_progress_timeout completion_timeout = int( CONF.libvirt.live_migration_completion_timeout * data_gb) - if libvirt_migrate.should_abort(instance, now, progress_time, - progress_timeout, elapsed, - completion_timeout, - migration.status): - try: - guest.abort_job() - except libvirt.libvirtError as e: - LOG.warning("Failed to abort migration %s", - encodeutils.exception_to_unicode(e), - instance=instance) - self._clear_empty_migration(instance) - raise + # NOTE(yikun): Check the completion timeout to determine + # whether to trigger the timeout action; the two choices are + # ``abort`` (default) or ``force_complete``.
If the action is + # set to ``force_complete``, the post-copy will be triggered + # if available else the VM will be suspended, otherwise the + # live migrate operation will be aborted. + if libvirt_migrate.should_trigger_timeout_action( + instance, now, progress_time, progress_timeout, + elapsed, completion_timeout, migration.status): + timeout_act = CONF.libvirt.live_migration_timeout_action + if timeout_act == 'force_complete': + self.live_migration_force_complete(instance) + else: + # timeout action is 'abort' + try: + guest.abort_job() + except libvirt.libvirtError as e: + LOG.warning("Failed to abort migration %s", + encodeutils.exception_to_unicode(e), + instance=instance) + self._clear_empty_migration(instance) + raise if (is_post_copy_enabled and libvirt_migrate.should_switch_to_postcopy( diff --git a/nova/virt/libvirt/migration.py b/nova/virt/libvirt/migration.py index de4fd5f8b701..daed5d3791f3 100644 --- a/nova/virt/libvirt/migration.py +++ b/nova/virt/libvirt/migration.py @@ -376,11 +376,11 @@ def find_job_type(guest, instance): return libvirt.VIR_DOMAIN_JOB_FAILED -def should_abort(instance, now, - progress_time, progress_timeout, - elapsed, completion_timeout, - migration_status): - """Determine if the migration should be aborted +def should_trigger_timeout_action(instance, now, + progress_time, progress_timeout, + elapsed, completion_timeout, + migration_status): + """Determine if the migration timeout action should be triggered :param instance: a nova.objects.Instance :param now: current time in secs since epoch @@ -391,12 +391,18 @@ def should_abort(instance, now, :param migration_status: current status of the migration Check the progress and completion timeouts to determine if either - of them have been hit, and should thus cause migration to be aborted + of them have been hit, and should thus cause migration timeout action to + be triggered. 
- Avoid migration to be aborted if it is running in post-copy mode + Avoid aborting the migration or triggering post-copy again if it is + already running in post-copy mode - :returns: True if migration should be aborted, False otherwise + :returns: True if the migration completion timeout action should be + performed, False otherwise """ + if not completion_timeout: + return False + if migration_status == 'running (post-copy)': return False @@ -406,8 +412,7 @@ def should_abort(instance, now, (now - progress_time), instance=instance) return True - if (completion_timeout != 0 and - elapsed > completion_timeout): + if elapsed > completion_timeout: LOG.warning("Live migration not completed after %d sec", completion_timeout, instance=instance) return True diff --git a/releasenotes/notes/live-migration-force-after-timeout-54f2a4b631d295bb.yaml b/releasenotes/notes/live-migration-force-after-timeout-54f2a4b631d295bb.yaml new file mode 100644 index 000000000000..3e46ce026981 --- /dev/null +++ b/releasenotes/notes/live-migration-force-after-timeout-54f2a4b631d295bb.yaml @@ -0,0 +1,15 @@ +--- +features: + - | + A new configuration option ``[libvirt]/live_migration_timeout_action`` + is added. This new option will have choices ``abort`` (default) + or ``force_complete``. This option will determine what actions will be + taken against a VM after ``live_migration_completion_timeout`` expires. + Currently nova just aborts the live migrate operation after completion + timeout expires. By default, we keep the same behavior of aborting after + completion timeout. ``force_complete`` will either pause the VM or trigger + post-copy depending on if post copy is enabled and available. + + The ``[libvirt]/live_migration_completion_timeout`` option now has a + minimum value of 0 and will raise a ValueError if the configured + value is less than that minimum.