diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 65dcab883c47..fd43b5c7a3bb 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -26,6 +26,7 @@ import os_resource_classes as orc
 import os_traits
 from oslo_log import log as logging
 from oslo_serialization import jsonutils
+from oslo_utils import excutils
 import retrying

 from nova.compute import claims
@@ -1052,12 +1053,23 @@ class ResourceTracker(object):

     def _update(self, context, compute_node, startup=False):
         """Update partial stats locally and populate them to Scheduler."""
+        # _resource_change will update self.old_resources if it detects changes
+        # but we want to restore those if compute_node.save() fails.
+        nodename = compute_node.hypervisor_hostname
+        old_compute = self.old_resources[nodename]
         if self._resource_change(compute_node):
             # If the compute_node's resource changed, update to DB.
             # NOTE(jianghuaw): Once we completely move to use get_inventory()
             # for all resource provider's inv data. We can remove this check.
             # At the moment we still need this check and save compute_node.
-            compute_node.save()
+            try:
+                compute_node.save()
+            except Exception:
+                # Restore the previous state in self.old_resources so that on
+                # the next trip through here _resource_change does not have
+                # stale data to compare.
+                with excutils.save_and_reraise_exception(logger=LOG):
+                    self.old_resources[nodename] = old_compute

         self._update_to_placement(context, compute_node, startup)

diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index 061a29905ea0..bd10dee023db 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -1655,6 +1655,33 @@ class TestUpdateComputeNode(BaseTestCase):
         self.assertIn('Unable to find services table record for nova-compute',
                       mock_log_error.call_args[0][0])

+    def test_update_compute_node_save_fails_restores_old_resources(self):
+        """Tests the scenario that compute_node.save() fails and the
+        old_resources value for the node is restored to its previous value
+        from before _resource_change updated it.
+        """
+        self._setup_rt()
+        orig_compute = _COMPUTE_NODE_FIXTURES[0].obj_clone()
+        # Pretend the ComputeNode was just created in the DB but not yet saved
+        # with the free_disk_gb field.
+        delattr(orig_compute, 'free_disk_gb')
+        nodename = orig_compute.hypervisor_hostname
+        self.rt.old_resources[nodename] = orig_compute
+        # Now have an updated compute node with free_disk_gb set which should
+        # make _resource_change modify old_resources and return True.
+        updated_compute = _COMPUTE_NODE_FIXTURES[0].obj_clone()
+        ctxt = context.get_admin_context()
+        # Mock ComputeNode.save() to trigger some failure (realistically this
+        # could be a DBConnectionError).
+        with mock.patch.object(updated_compute, 'save',
+                               side_effect=test.TestingException('db error')):
+            self.assertRaises(test.TestingException,
+                              self.rt._update,
+                              ctxt, updated_compute, startup=True)
+        # Make sure that the old_resources entry for the node has not changed
+        # from the original.
+        self.assertTrue(self.rt._resource_change(updated_compute))
+
     def test_copy_resources_no_update_allocation_ratios(self):
         """Tests that a ComputeNode object's allocation ratio fields are
         not set if the configured allocation ratio values are default None.
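
For context (not part of the patch): the fix relies on oslo.utils'
excutils.save_and_reraise_exception, a context manager used inside an except
block that lets cleanup code run and then re-raises the original exception.
Below is a minimal sketch of that pattern; the names (restore_cache_on_failure,
obj, cache, key, previous) are made up for illustration and do not appear in
Nova.

    from oslo_utils import excutils


    def restore_cache_on_failure(obj, cache, key, previous):
        try:
            obj.save()
        except Exception:
            with excutils.save_and_reraise_exception():
                # Undo the optimistic cache update; the original exception
                # still propagates to the caller after this block exits.
                cache[key] = previous

This is the same shape as the _update change above: old_resources is updated
optimistically by _resource_change, and the except block rolls that back when
compute_node.save() fails so the next periodic update still sees a real change.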