Add reproducer for bug #1894095
As with the resize/cold migration confirm/resize flow, the rolling back of live migrations can be racy due to us doing various things without the benefit of locks. Add a reproducer demonstrating this. Change-Id: I276d2bbce4a0390a6cfd597bb51481cc53eca918 Signed-off-by: Stephen Finucane <stephenfin@redhat.com> Related-Bug: #1894095 Related-Bug: #1879878
This commit is contained in:
parent
d01972b272
commit
99850a0481
|
@ -19,6 +19,7 @@ from oslo_config import cfg
|
|||
from oslo_log import log as logging
|
||||
|
||||
from nova.compute import manager as compute_manager
|
||||
from nova.compute import resource_tracker as rt
|
||||
from nova import context
|
||||
from nova import objects
|
||||
from nova import test
|
||||
|
@ -207,7 +208,7 @@ class NUMALiveMigrationPositiveTests(NUMALiveMigrationPositiveBase):
|
|||
dom.complete_job()
|
||||
self.migrate_stub_ran = True
|
||||
|
||||
def _test(self, pin_dest=False):
|
||||
def _test(self, pin_dest):
|
||||
"""Live migrate the server on host_a to host_b.
|
||||
"""
|
||||
# Make sure instances initially land on "overlapping" CPUs on both
|
||||
|
@ -254,7 +255,7 @@ class NUMALiveMigrationPositiveTests(NUMALiveMigrationPositiveBase):
|
|||
# direction here.
|
||||
|
||||
def test_numa_live_migration(self):
|
||||
self._test()
|
||||
self._test(pin_dest=False)
|
||||
|
||||
def test_numa_live_migration_dest_pinned(self):
|
||||
self._test(pin_dest=True)
|
||||
|
@ -298,7 +299,7 @@ class NUMALiveMigrationPositiveTests(NUMALiveMigrationPositiveBase):
|
|||
self.useFixture(fixtures.MonkeyPatch(
|
||||
'nova.compute.manager.ComputeManager.live_migration',
|
||||
live_migration))
|
||||
self._test()
|
||||
self._test(pin_dest=False)
|
||||
self.assertTrue(self.live_migration_ran)
|
||||
|
||||
|
||||
|
@ -358,6 +359,9 @@ class NUMALiveMigrationRollbackTests(NUMALiveMigrationPositiveBase):
|
|||
self.assertEqual('host_a', self.get_host(self.server_a['id']))
|
||||
self.assertIsNone(self._get_migration_context(self.server_a['id']))
|
||||
|
||||
def _test_rollback(self, pin_dest=False):
|
||||
self._test(pin_dest)
|
||||
|
||||
# Check consumed and pinned CPUs. Things should be as they were before
|
||||
# the live migration, with CPUs 0,1 consumed on both hosts by the 2
|
||||
# servers.
|
||||
|
@ -367,10 +371,77 @@ class NUMALiveMigrationRollbackTests(NUMALiveMigrationPositiveBase):
|
|||
[0, 1], [0, 1])
|
||||
|
||||
def test_rollback(self):
|
||||
self._test()
|
||||
self._test_rollback()
|
||||
|
||||
def test_rollback_pinned_dest(self):
|
||||
self._test(pin_dest=True)
|
||||
self._test_rollback(pin_dest=True)
|
||||
|
||||
def _test_bug_1894095(self, pre_drop_race=False, post_drop_race=False):
|
||||
"""Reproducer for bug #1894095 under live migration.
|
||||
|
||||
Demonstrate the possibility of races caused by running the resource
|
||||
tracker's periodic task between marking a migration as failed and
|
||||
dropping the claim for that migration on the destination host.
|
||||
"""
|
||||
|
||||
orig_drop_move_claim = rt.ResourceTracker.drop_move_claim
|
||||
|
||||
def drop_move_claim(*args, **kwargs):
|
||||
"""Run periodics after marking the migration confirmed, simulating
|
||||
a race between the doing this and actually dropping the claim.
|
||||
"""
|
||||
|
||||
# check the usage, which should show usage on both hosts: server_a
|
||||
# on host_a, and server_b plus server_a's migration on host_b
|
||||
self._assert_host_consumed_cpus('host_a', [0, 1])
|
||||
self._assert_host_consumed_cpus('host_b', [0, 1, 2, 3])
|
||||
|
||||
if pre_drop_race:
|
||||
self._run_periodics()
|
||||
# FIXME(stephenfin): This is picking up server_a's "destination
|
||||
# CPUs", intended for host_b, on host_a
|
||||
self._assert_host_consumed_cpus('host_a', [2, 3])
|
||||
self._assert_host_consumed_cpus('host_b', [0, 1, 2, 3])
|
||||
# self._assert_host_consumed_cpus('host_a', [0, 1])
|
||||
# self._assert_host_consumed_cpus('host_b', [0, 1, 2, 3])
|
||||
|
||||
result = orig_drop_move_claim(*args, **kwargs)
|
||||
|
||||
if pre_drop_race:
|
||||
# FIXME(stephenfin): host_a's pinning information is still
|
||||
# incorrect, which is expected since dropping the move claim
|
||||
# makes no changes to the source host
|
||||
self._assert_host_consumed_cpus('host_a', [2, 3])
|
||||
else:
|
||||
self._assert_host_consumed_cpus('host_a', [0, 1])
|
||||
self._assert_host_consumed_cpus('host_b', [0, 1])
|
||||
|
||||
if post_drop_race:
|
||||
self._run_periodics()
|
||||
# FIXME(stephenfin): host_a is using the wrong pinned CPUs and
|
||||
# host_b has regained its previously dropped allocation
|
||||
self._assert_host_consumed_cpus('host_a', [2, 3])
|
||||
self._assert_host_consumed_cpus('host_b', [0, 1, 2, 3])
|
||||
# self._assert_host_consumed_cpus('host_a', [0, 1])
|
||||
# self._assert_host_consumed_cpus('host_b', [0, 1])
|
||||
|
||||
return result
|
||||
|
||||
self.useFixture(fixtures.MonkeyPatch(
|
||||
'nova.compute.resource_tracker.ResourceTracker.drop_move_claim',
|
||||
drop_move_claim))
|
||||
|
||||
self._test()
|
||||
|
||||
self._run_periodics()
|
||||
self._assert_host_consumed_cpus('host_a', [0, 1])
|
||||
self._assert_host_consumed_cpus('host_b', [0, 1])
|
||||
|
||||
def test_bug_1894095_pre_drop(self):
|
||||
self._test_bug_1894095(pre_drop_race=True)
|
||||
|
||||
def test_bug_1894095_post_drop(self):
|
||||
self._test_bug_1894095(post_drop_race=True)
|
||||
|
||||
|
||||
class NUMALiveMigrationLegacyBase(NUMALiveMigrationPositiveBase):
|
||||
|
|
Loading…
Reference in New Issue