Merge "libvirt: add live migration timeout action"

This commit is contained in:
Zuul 2018-12-15 19:33:02 +00:00 committed by Gerrit Code Review
commit 35ee7edd94
7 changed files with 141 additions and 57 deletions

View File

@ -218,9 +218,25 @@ What to do when the migration times out
During the migration process, the instance may write to a memory page after
that page has been copied to the destination. When that happens, the same page
has to be copied again. The instance may write to memory pages faster than they
can be copied, so that the migration cannot complete. The Compute service will
cancel it when the ``live_migration_completion_timeout``, a configuration
parameter, is reached.
can be copied, so that the migration cannot complete. There are two optional
actions, controlled by
:oslo.config:option:`libvirt.live_migration_timeout_action`, which can be
taken against a VM after
:oslo.config:option:`libvirt.live_migration_completion_timeout` is reached:
1. ``abort`` (default): The live migration operation will be cancelled after
the completion timeout is reached. This is similar to using API
``DELETE /servers/{server_id}/migrations/{migration_id}``.
2. ``force_complete``: The compute service will either pause the VM or trigger
post-copy depending on whether post-copy is enabled and available
(:oslo.config:option:`libvirt.live_migration_permit_post_copy` is set to
``True``). This is similar to using API
``POST /servers/{server_id}/migrations/{migration_id}/action (force_complete)``.
You can also read the
:oslo.config:option:`libvirt.live_migration_timeout_action`
configuration option help for more details.
The following remarks assume the KVM/Libvirt hypervisor.

View File

@ -375,6 +375,7 @@ transferred, with lower bound of a minimum of 2 GiB per device.
"""),
cfg.IntOpt('live_migration_completion_timeout',
default=800,
min=0,
mutable=True,
help="""
Time to wait, in seconds, for migration to successfully complete transferring
@ -407,6 +408,23 @@ Set to 0 to disable timeouts.
This is deprecated, and now disabled by default because we have found serious
bugs in this feature that caused false live-migration timeout failures. This
feature will be removed or replaced in a future release.
"""),
cfg.StrOpt('live_migration_timeout_action',
default='abort',
choices=('abort', 'force_complete'),
mutable=True,
help="""
This option will be used to determine what action will be taken against a
VM after ``live_migration_completion_timeout`` expires. By default, the live
migrate operation will be aborted after completion timeout. If it is set to
``force_complete``, the compute service will either pause the VM or trigger
post-copy depending on if post copy is enabled and available
(``live_migration_permit_post_copy`` is set to True).
Related options:
* live_migration_completion_timeout
* live_migration_permit_post_copy
"""),
cfg.BoolOpt('live_migration_permit_post_copy',
default=False,
@ -418,7 +436,12 @@ needs to be transferred. Post-copy requires libvirt>=1.3.3 and QEMU>=2.5.0.
When permitted, post-copy mode will be automatically activated if a
live-migration memory copy iteration does not make percentage increase of at
least 10% over the last iteration.
least 10% over the last iteration, or will be automatically activated if
we reach the timeout defined by ``live_migration_completion_timeout`` and
``live_migration_timeout_action`` is set to 'force_complete'. Note if you
change to no timeout or choose to use 'abort',
i.e. ``live_migration_completion_timeout = 0``, then there will be no
automatic switch to post-copy.
The live-migration force complete API also uses post-copy when permitted. If
post-copy mode is not available, force complete falls back to pausing the VM
@ -430,7 +453,8 @@ details, please see the Administration guide.
Related options:
* live_migration_permit_auto_converge
* live_migration_permit_auto_converge
* live_migration_timeout_action
"""),
cfg.BoolOpt('live_migration_permit_auto_converge',
default=False,

View File

@ -11457,6 +11457,38 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self.EXPECT_SUCCESS,
expected_switch=True)
# Exercise the migration monitor with the timeout action set to
# 'force_complete' while post-copy is reported as enabled; the test expects
# the migration to succeed and a switch to be recorded (expected_switch=True).
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_monitor_force_complete_postcopy(self,
mock_postcopy_enabled):
# Use a short completion timeout and select the 'force_complete' action
# instead of the default 'abort'.
self.flags(live_migration_completion_timeout=40,
live_migration_timeout_action='force_complete',
group='libvirt')
# Make the driver's post-copy capability check report True.
mock_postcopy_enabled.return_value = True
# Each one of these fake times is used for time.time()
# when a new domain_info_records entry is consumed.
# NOTE(review): the clock advances by 40 per record, presumably so the
# completion timeout is exceeded while the job is still unbounded --
# confirm against _test_live_migration_monitoring.
fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320]
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
]
self._test_live_migration_monitoring(domain_info_records, fake_times,
self.EXPECT_SUCCESS,
expected_switch=True)
@mock.patch.object(host.Host, "get_connection")
@mock.patch.object(utils, "spawn")
@mock.patch.object(libvirt_driver.LibvirtDriver, "_live_migration_monitor")

View File

@ -976,51 +976,33 @@ class MigrationMonitorTestCase(test.NoDBTestCase):
def test_live_migration_abort_stuck(self):
# Progress time exceeds progress timeout
self.assertTrue(migration.should_abort(self.instance,
5000,
1000, 2000,
4500, 9000,
"running"))
self.assertTrue(migration.should_trigger_timeout_action(
self.instance, 5000, 1000, 2000, 4500, 9000, "running"))
def test_live_migration_abort_no_prog_timeout(self):
# Progress timeout is disabled
self.assertFalse(migration.should_abort(self.instance,
5000,
1000, 0,
4500, 9000,
"running"))
self.assertFalse(migration.should_trigger_timeout_action(
self.instance, 5000, 1000, 0, 4500, 9000, "running"))
def test_live_migration_abort_not_stuck(self):
# Progress time is less than progress timeout
self.assertFalse(migration.should_abort(self.instance,
5000,
4500, 2000,
4500, 9000,
"running"))
self.assertFalse(migration.should_trigger_timeout_action(
self.instance, 5000, 4500, 2000, 4500, 9000, "running"))
def test_live_migration_abort_too_long(self):
# Elapsed time is over completion timeout
self.assertTrue(migration.should_abort(self.instance,
5000,
4500, 2000,
4500, 2000,
"running"))
self.assertTrue(migration.should_trigger_timeout_action(
self.instance, 5000, 4500, 2000, 4500, 2000, "running"))
def test_live_migration_abort_no_comp_timeout(self):
# Completion timeout is disabled
self.assertFalse(migration.should_abort(self.instance,
5000,
4500, 2000,
4500, 0,
"running"))
self.assertFalse(migration.should_trigger_timeout_action(
self.instance, 5000, 4500, 2000, 4500, 0, "running"))
def test_live_migration_abort_still_working(self):
# Elapsed time is less than completion timeout
self.assertFalse(migration.should_abort(self.instance,
5000,
4500, 2000,
4500, 9000,
"running"))
self.assertFalse(migration.should_trigger_timeout_action(
self.instance, 5000, 4500, 2000, 4500, 9000, "running"))
def test_live_migration_postcopy_switch(self):
# Migration progress is not fast enough

View File

@ -7379,18 +7379,28 @@ class LibvirtDriver(driver.ComputeDriver):
progress_timeout = CONF.libvirt.live_migration_progress_timeout
completion_timeout = int(
CONF.libvirt.live_migration_completion_timeout * data_gb)
if libvirt_migrate.should_abort(instance, now, progress_time,
progress_timeout, elapsed,
completion_timeout,
migration.status):
try:
guest.abort_job()
except libvirt.libvirtError as e:
LOG.warning("Failed to abort migration %s",
encodeutils.exception_to_unicode(e),
instance=instance)
self._clear_empty_migration(instance)
raise
# NOTE(yikun): Check the completion timeout to determine whether the
# timeout action should be triggered. There are two choices:
# ``abort`` (default) or ``force_complete``. If the action is
# set to ``force_complete``, post-copy will be triggered if it is
# available, otherwise the VM will be paused; if the action is
# ``abort``, the live migrate operation will be aborted.
if libvirt_migrate.should_trigger_timeout_action(
instance, now, progress_time, progress_timeout,
elapsed, completion_timeout, migration.status):
timeout_act = CONF.libvirt.live_migration_timeout_action
if timeout_act == 'force_complete':
self.live_migration_force_complete(instance)
else:
# timeout action is 'abort'
try:
guest.abort_job()
except libvirt.libvirtError as e:
LOG.warning("Failed to abort migration %s",
encodeutils.exception_to_unicode(e),
instance=instance)
self._clear_empty_migration(instance)
raise
if (is_post_copy_enabled and
libvirt_migrate.should_switch_to_postcopy(

View File

@ -376,11 +376,11 @@ def find_job_type(guest, instance):
return libvirt.VIR_DOMAIN_JOB_FAILED
def should_abort(instance, now,
progress_time, progress_timeout,
elapsed, completion_timeout,
migration_status):
"""Determine if the migration should be aborted
def should_trigger_timeout_action(instance, now,
progress_time, progress_timeout,
elapsed, completion_timeout,
migration_status):
"""Determine if the migration timeout action should be triggered
:param instance: a nova.objects.Instance
:param now: current time in secs since epoch
@ -391,12 +391,18 @@ def should_abort(instance, now,
:param migration_status: current status of the migration
Check the progress and completion timeouts to determine if either
of them have been hit, and should thus cause migration to be aborted
of them have been hit, and should thus cause migration timeout action to
be triggered.
Avoid migration to be aborted if it is running in post-copy mode
Avoid aborting the migration or triggering post-copy again if it is
already running in post-copy mode
:returns: True if migration should be aborted, False otherwise
:returns: True if the migration completion timeout action should be
performed, False otherwise
"""
if not completion_timeout:
return False
if migration_status == 'running (post-copy)':
return False
@ -406,8 +412,7 @@ def should_abort(instance, now,
(now - progress_time), instance=instance)
return True
if (completion_timeout != 0 and
elapsed > completion_timeout):
if elapsed > completion_timeout:
LOG.warning("Live migration not completed after %d sec",
completion_timeout, instance=instance)
return True

View File

@ -0,0 +1,15 @@
---
features:
- |
A new configuration option ``[libvirt]/live_migration_timeout_action``
is added. This new option will have choices ``abort`` (default)
or ``force_complete``. This option will determine what actions will be
taken against a VM after ``live_migration_completion_timeout`` expires.
Previously, nova always aborted the live migrate operation after the
completion timeout expired. By default, the same behavior of aborting
after the completion timeout is kept. ``force_complete`` will either pause
the VM or trigger post-copy, depending on whether post-copy is enabled and
available.
The ``[libvirt]/live_migration_completion_timeout`` option now has a
minimum value of 0, and a ValueError will be raised if it is configured
with a value lower than that minimum.