From a57800d3825ef2fb833d8024c6f7e5c550f55e2f Mon Sep 17 00:00:00 2001 From: Stephen Finucane Date: Fri, 21 Aug 2020 16:54:16 +0100 Subject: [PATCH] Move confirm resize under semaphore The 'ResourceTracker.update_available_resource' periodic task builds usage information for the current host by inspecting instances and in-progress migrations, combining the two. Specifically, it finds all instances that are not in the 'DELETED' or 'SHELVED_OFFLOADED' state, calculates the usage from these, then finds all in-progress migrations for the host that don't have an associated instance (to prevent double accounting) and includes the usage for these. In addition to the periodic task, the 'ResourceTracker' class has a number of helper functions to make or drop claims for the inventory generated by the 'update_available_resource' periodic task as part of the various instance operations. These helpers naturally assume that when making a claim for a particular instance or migration, there shouldn't already be resources allocated for same. Conversely, when dropping claims, the resources should currently be allocated. However, the check for *active* instances and *in-progress* migrations in the periodic task means we have to be careful in how we make changes to a given instance or migration record. Running the periodic task between such an operation and an attempt to make or drop a claim can result in TOCTOU-like races. This generally isn't an issue: we use the 'COMPUTE_RESOURCE_SEMAPHORE' semaphore to prevent the periodic task running while we're claiming resources in helpers like 'ResourceTracker.instance_claim' and we make our changes to the instances and migrations within this context. There is one exception though: the 'drop_move_claim' helper. 
This function is used when dropping a claim for either a cold migration, a resize or a live migration, and will drop usage from either the source host (based on the "old" flavor) for a resize confirm or the destination host (based on the "new" flavor) for a resize revert or live migration rollback. Unfortunately, while the function itself is wrapped in the semaphore, no changes to the state of the instance or migration in question are protected by it. Consider the confirm resize case, which we're addressing here. If we mark the migration as 'confirmed' before running 'drop_move_claim', then the periodic task running between these steps will not account for the usage on the source since the migration is allegedly 'confirmed'. The call to 'drop_move_claim' will then result in the tracker dropping usage that we're no longer accounting for. This "set migration status before dropping usage" is the current behaviour for both same-cell and cross-cell resize, via the 'ComputeManager.confirm_resize' and 'ComputeManager.confirm_snapshot_based_resize_at_source' functions, respectively. We could reverse those calls and run 'drop_move_claim' before marking the migration as 'confirmed', but while our usage will be momentarily correct, the periodic task running between these steps will re-add the usage we just dropped since the migration isn't yet 'confirmed'. The correct solution is to close this gap between setting the migration status and dropping the move claim to zero. We do this by putting both operations behind the 'COMPUTE_RESOURCE_SEMAPHORE', just like the claim operations. 
Change-Id: I26b050c402f5721fc490126e9becb643af9279b4 Signed-off-by: Stephen Finucane Partial-Bug: #1879878 --- nova/compute/manager.py | 23 ++++----------- nova/compute/resource_tracker.py | 22 +++++++++++++++ nova/conductor/tasks/cross_cell_migrate.py | 2 +- .../regressions/test_bug_1879878.py | 16 ++++------- nova/tests/unit/compute/test_compute.py | 6 +--- nova/tests/unit/compute/test_compute_mgr.py | 28 ++++++------------- .../unit/compute/test_resource_tracker.py | 22 +++++++++------ 7 files changed, 56 insertions(+), 63 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 075c5d8f89de..b6d8fe0ee437 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -4359,6 +4359,8 @@ class ComputeManager(manager.Manager): # NOTE(tr3buchet): tear down networks on source host self.network_api.setup_networks_on_host(context, instance, migration.source_compute, teardown=True) + + # TODO(stephenfin): These next three calls should be bundled network_info = self.network_api.get_instance_nw_info(context, instance) @@ -4372,17 +4374,8 @@ class ComputeManager(manager.Manager): self.driver.confirm_migration(context, migration, instance, network_info) - migration.status = 'confirmed' - migration.save() - - # NOTE(mriedem): drop_move_claim relies on - # instance.migration_context so make sure to not call - # instance.drop_migration_context() until after drop_move_claim - # is called. - self.rt.drop_move_claim( - context, instance, migration.source_node, instance.old_flavor, - prefix='old_') - instance.drop_migration_context() + # Free up the old_flavor usage from the resource tracker for this host. 
+ self.rt.drop_move_claim_at_source(context, instance, migration) # NOTE(mriedem): The old_vm_state could be STOPPED but the user # might have manually powered up the instance to confirm the @@ -4540,13 +4533,7 @@ class ComputeManager(manager.Manager): self._delete_volume_attachments(ctxt, instance.get_bdms()) # Free up the old_flavor usage from the resource tracker for this host. - self.rt.drop_move_claim( - ctxt, instance, migration.source_node, instance.old_flavor, - prefix='old_') - instance.drop_migration_context() - - migration.status = 'confirmed' - migration.save() + self.rt.drop_move_claim_at_source(ctxt, instance, migration) def _confirm_snapshot_based_resize_delete_port_bindings( self, ctxt, instance): diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 1074879738c1..1037163f0f42 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -551,9 +551,31 @@ class ResourceTracker(object): dev_pools_obj = self.pci_tracker.stats.to_device_pools_obj() self.compute_nodes[nodename].pci_device_pools = dev_pools_obj + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) + def drop_move_claim_at_source(self, context, instance, migration): + """Drop a move claim after confirming a resize or cold migration.""" + migration.status = 'confirmed' + migration.save() + + self._drop_move_claim( + context, instance, migration.source_node, instance.old_flavor, + prefix='old_') + + # NOTE(stephenfin): Unsetting this is unnecessary for cross-cell + # resize, since the source and dest instance objects are different and + # the source instance will be deleted soon. It's easier to just do it + # though. 
+ instance.drop_migration_context() + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) def drop_move_claim(self, context, instance, nodename, instance_type=None, prefix='new_'): + self._drop_move_claim( + context, instance, nodename, instance_type, prefix='new_') + + def _drop_move_claim( + self, context, instance, nodename, instance_type=None, prefix='new_', + ): """Remove usage for an incoming/outgoing migration. :param context: Security context. diff --git a/nova/conductor/tasks/cross_cell_migrate.py b/nova/conductor/tasks/cross_cell_migrate.py index ba0994c81a2d..18f9a999bcbf 100644 --- a/nova/conductor/tasks/cross_cell_migrate.py +++ b/nova/conductor/tasks/cross_cell_migrate.py @@ -981,7 +981,7 @@ class ConfirmResizeTask(base.TaskBase): """ LOG.debug('Updating migration and instance status in target cell DB.', instance=self.instance) - # Complete the migration confirmation. + # Update the target cell migration. self.migration.status = 'confirmed' self.migration.save() # Update the target cell instance. 
diff --git a/nova/tests/functional/regressions/test_bug_1879878.py b/nova/tests/functional/regressions/test_bug_1879878.py index d6f2b1168876..5f5f12b90d0a 100644 --- a/nova/tests/functional/regressions/test_bug_1879878.py +++ b/nova/tests/functional/regressions/test_bug_1879878.py @@ -61,7 +61,7 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase): self.assertUsage(src_host, 1) self.assertUsage(dst_host, 0) - orig_drop_claim = rt.ResourceTracker.drop_move_claim + orig_drop_claim = rt.ResourceTracker.drop_move_claim_at_source def fake_drop_move_claim(*args, **kwargs): # run periodics after marking the migration confirmed, simulating a @@ -74,15 +74,14 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase): if drop_race: self._run_periodics() - # FIXME(stephenfin): the periodic should not have dropped the - # records for the src yet - self.assertUsage(src_host, 0) - self.assertUsage(dst_host, 1) + self.assertUsage(src_host, 1) + self.assertUsage(dst_host, 1) return orig_drop_claim(*args, **kwargs) self.stub_out( - 'nova.compute.resource_tracker.ResourceTracker.drop_move_claim', + 'nova.compute.resource_tracker.ResourceTracker.' 
+ 'drop_move_claim_at_source', fake_drop_move_claim, ) @@ -98,10 +97,7 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase): # migration is now confirmed so we should once again only have usage on # one host - # FIXME(stephenfin): Our usage here should be 0 and 1 for source and - # dest respectively when confirming, but that won't happen until we run - # the periodic and rebuild our inventory from scratch - self.assertUsage(src_host, -1 if drop_race else 0) + self.assertUsage(src_host, 0) self.assertUsage(dst_host, 1) # running periodics shouldn't change things diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 6da6ef50f59a..69a9e0b97cb2 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -8141,14 +8141,10 @@ class ComputeTestCase(BaseTestCase, instance.new_flavor = new_type instance.migration_context = objects.MigrationContext() - def fake_drop_move_claim(*args, **kwargs): - pass - def fake_setup_networks_on_host(self, *args, **kwargs): pass - self._mock_rt( - drop_move_claim=mock.Mock(side_effect=fake_drop_move_claim)) + self._mock_rt() with test.nested( mock.patch.object(self.compute.network_api, diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 8fbc3a1e97e0..7eba83177ed7 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -8777,14 +8777,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, mock_mig_save, mock_mig_get, mock_inst_get, mock_delete_scheduler_info): - def fake_drop_move_claim(*args, **kwargs): - # RT.drop_move_claim must be called before - # instance.drop_migration_context. - mock_drop.assert_not_called() - - mock_rt = self._mock_rt() - # Enforce order of drop_move_claim/drop_migration_context calls. 
- mock_rt.drop_move_claim.side_effect = fake_drop_move_claim + self._mock_rt() self.instance.migration_context = objects.MigrationContext( new_pci_devices=None, old_pci_devices=None) @@ -11569,22 +11562,22 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, '_confirm_snapshot_based_resize_delete_port_bindings') @mock.patch('nova.compute.manager.ComputeManager.' '_delete_volume_attachments') - @mock.patch('nova.objects.Instance.drop_migration_context') def test_confirm_snapshot_based_resize_at_source( - self, mock_drop_mig_ctx, mock_delete_vols, mock_delete_bindings, + self, mock_delete_vols, mock_delete_bindings, mock_delete_allocs, mock_get_bdms): """Happy path test for confirm_snapshot_based_resize_at_source.""" self.instance.old_flavor = objects.Flavor() with test.nested( - mock.patch.object(self.compute, 'network_api'), - mock.patch.object(self.compute.driver, 'cleanup'), - mock.patch.object(self.compute.rt, 'drop_move_claim') + mock.patch.object(self.compute, 'network_api'), + mock.patch.object(self.compute.driver, 'cleanup'), + mock.patch.object(self.compute.rt, 'drop_move_claim_at_source') ) as ( - mock_network_api, mock_cleanup, mock_drop_claim + mock_network_api, mock_cleanup, mock_drop_claim, ): # Run the code. self.compute.confirm_snapshot_based_resize_at_source( self.context, self.instance, self.migration) + # Assert the mocks. mock_network_api.get_instance_nw_info.assert_called_once_with( self.context, self.instance) @@ -11600,12 +11593,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.context, mock_get_bdms.return_value) # Move claim and migration context were dropped. mock_drop_claim.assert_called_once_with( - self.context, self.instance, self.migration.source_node, - self.instance.old_flavor, prefix='old_') - mock_drop_mig_ctx.assert_called_once_with() - # The migration was updated. 
- self.assertEqual('confirmed', self.migration.status) - self.migration.save.assert_called_once_with() + self.context, self.instance, self.migration) mock_delete_allocs.assert_called_once_with( self.context, self.instance, self.migration) diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index 62bf0b8ead9a..3676d4dd4246 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -2541,6 +2541,7 @@ class TestResize(BaseTestCase): mig_context_obj.new_resources = objects.ResourceList( objects=[self.resource_1, self.resource_2]) instance.migration_context = mig_context_obj + instance.system_metadata = {} migration = objects.Migration( id=3, @@ -2582,15 +2583,18 @@ class TestResize(BaseTestCase): len(self.rt.assigned_resources[cn.uuid][rc])) # Confirm or revert resize - if revert: - flavor = new_flavor - prefix = 'new_' - else: - flavor = old_flavor - prefix = 'old_' - - self.rt.drop_move_claim(ctx, instance, _NODENAME, flavor, - prefix=prefix) + with test.nested( + mock.patch('nova.objects.Migration.save'), + mock.patch('nova.objects.Instance.drop_migration_context'), + ): + if revert: + flavor = new_flavor + prefix = 'new_' + self.rt.drop_move_claim( + ctx, instance, _NODENAME, flavor, prefix=prefix) + else: # confirm + flavor = old_flavor + self.rt.drop_move_claim_at_source(ctx, instance, migration) expected = compute_update_usage(expected, flavor, sign=-1) self.assertTrue(obj_base.obj_equal_prims(