From e279ac404673784f33610cad55288e2e60649f88 Mon Sep 17 00:00:00 2001
From: Matt Riedemann
Date: Tue, 31 Jul 2018 17:26:47 -0400
Subject: [PATCH] Make ResourceTracker.stats node-specific

As of change I6827137f35c0cb4f9fc4c6f753d9a035326ed01b in Ocata, the
ResourceTracker manages multiple compute nodes via its "compute_nodes"
variable, but the "stats" variable was still being shared across all
nodes, which leads to leaking stats across nodes in an ironic
deployment where a single nova-compute service host is managing
multiple ironic instances (nodes).

This change makes ResourceTracker.stats node-specific which fixes
the ironic leak but also allows us to remove the stats deepcopy
while iterating over instances which should improve performance
for single-node deployments with potentially a large number of
instances, i.e. vCenter.

Conflicts:
    nova/compute/manager.py

NOTE(mriedem): The conflict was due to not having change
Iae904afb6cb4fcea8bb27741d774ffbe986a5fb4 in Pike.

NOTE(mriedem): The check for node is None was moved from
_do_build_and_run_instance to build_and_run_instance because the
functional tests are using the ChanceScheduler since we don't have
change I12de2e195022593ea2a3e2894f2c3b5226930d4f and the
ChanceScheduler does not return the nodename during
select_destinations so the compute has to get it from the driver.

Change-Id: I0b9e5b711878fa47ba90e43c0b41437b57cf8ef6
Closes-Bug: #1784705
Closes-Bug: #1777422
(cherry picked from commit b5b7d86bb04f92d21cf954cd6b3463c9fcc637e6)
(cherry picked from commit 7d99f5753f992bd4b58b0aaa7f83f71fb044776f)
---
 nova/compute/manager.py              | 22 ++++++------
 nova/compute/resource_tracker.py     | 35 +++++++++++++------
 .../compute/test_resource_tracker.py | 14 +++-----
 3 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index cec2874a5709..88e0f6bc9bad 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1708,16 +1708,16 @@ class ComputeManager(manager.Manager):
 
         return block_device_info
 
-    def _build_failed(self):
+    def _build_failed(self, node):
         if CONF.compute.consecutive_build_service_disable_threshold:
             rt = self._get_resource_tracker()
             # NOTE(danms): Update our counter, but wait for the next
             # update_available_resource() periodic to flush it to the DB
-            rt.stats.build_failed()
+            rt.build_failed(node)
 
-    def _build_succeeded(self):
+    def _build_succeeded(self, node):
         rt = self._get_resource_tracker()
-        rt.stats.build_succeeded()
+        rt.build_succeeded(node)
 
     @wrap_exception()
     @reverts_task_state
@@ -1728,6 +1728,11 @@ class ComputeManager(manager.Manager):
                                security_groups=None, block_device_mapping=None,
                                node=None, limits=None):
 
+        if node is None:
+            node = self.driver.get_available_nodes(refresh=True)[0]
+            LOG.debug('No node specified, defaulting to %s', node,
+                      instance=instance)
+
         @utils.synchronized(instance.uuid)
         def _locked_do_build_and_run_instance(*args, **kwargs):
             # NOTE(danms): We grab the semaphore with the instance uuid
@@ -1766,9 +1771,9 @@ class ComputeManager(manager.Manager):
                     rt.reportclient.delete_allocation_for_instance(
                         instance.uuid)
 
-                self._build_failed()
+                self._build_failed(node)
             else:
-                self._build_succeeded()
+                self._build_succeeded(node)
 
         # NOTE(danms): We spawn here to return the RPC worker thread back to
         # the pool. Since what follows could take a really long time, we don't
@@ -1827,11 +1832,6 @@ class ComputeManager(manager.Manager):
         if limits is None:
             limits = {}
 
-        if node is None:
-            node = self.driver.get_available_nodes(refresh=True)[0]
-            LOG.debug('No node specified, defaulting to %s', node,
-                      instance=instance)
-
         try:
             with timeutils.StopWatch() as timer:
                 self._build_and_run_instance(context, instance, image,
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index f5c5f3ba6421..5d946333a412 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -26,7 +26,7 @@ from oslo_serialization import jsonutils
 
 from nova.compute import claims
 from nova.compute import monitors
-from nova.compute import stats
+from nova.compute import stats as compute_stats
 from nova.compute import task_states
 from nova.compute import utils as compute_utils
 from nova.compute import vm_states
@@ -136,7 +136,8 @@ class ResourceTracker(object):
         self.pci_tracker = None
         # Dict of objects.ComputeNode objects, keyed by nodename
         self.compute_nodes = {}
-        self.stats = stats.Stats()
+        # Dict of Stats objects, keyed by nodename
+        self.stats = collections.defaultdict(compute_stats.Stats)
         self.tracked_instances = {}
         self.tracked_migrations = {}
         monitor_handler = monitors.MonitorHandler(self)
@@ -627,16 +628,18 @@ class ResourceTracker(object):
 
     def _copy_resources(self, compute_node, resources):
         """Copy resource values to supplied compute_node."""
+        nodename = resources['hypervisor_hostname']
+        stats = self.stats[nodename]
         # purge old stats and init with anything passed in by the driver
         # NOTE(danms): Preserve 'failed_builds' across the stats clearing,
         # as that is not part of resources
         # TODO(danms): Stop doing this when we get a column to store this
         # directly
-        prev_failed_builds = self.stats.get('failed_builds', 0)
-        self.stats.clear()
-        self.stats['failed_builds'] = prev_failed_builds
-        self.stats.digest_stats(resources.get('stats'))
-        compute_node.stats = copy.deepcopy(self.stats)
+        prev_failed_builds = stats.get('failed_builds', 0)
+        stats.clear()
+        stats['failed_builds'] = prev_failed_builds
+        stats.digest_stats(resources.get('stats'))
+        compute_node.stats = stats
 
         # update the allocation ratios for the related ComputeNode object
         compute_node.ram_allocation_ratio = self.ram_allocation_ratio
@@ -920,7 +923,8 @@ class ResourceTracker(object):
 
         cn.free_ram_mb = cn.memory_mb - cn.memory_mb_used
         cn.free_disk_gb = cn.local_gb - cn.local_gb_used
-        cn.running_vms = self.stats.num_instances
+        stats = self.stats[nodename]
+        cn.running_vms = stats.num_instances
 
         # Calculate the numa usage
         free = sign == -1
@@ -1082,8 +1086,9 @@ class ResourceTracker(object):
             sign = -1
 
         cn = self.compute_nodes[nodename]
-        self.stats.update_stats_for_instance(instance, is_removed_instance)
-        cn.stats = copy.deepcopy(self.stats)
+        stats = self.stats[nodename]
+        stats.update_stats_for_instance(instance, is_removed_instance)
+        cn.stats = stats
 
         # if it's a new or deleted instance:
         if is_new_instance or is_removed_instance:
@@ -1099,7 +1104,7 @@ class ResourceTracker(object):
             self._update_usage(self._get_usage_dict(instance), nodename,
                                sign=sign)
 
-        cn.current_workload = self.stats.calculate_workload()
+        cn.current_workload = stats.calculate_workload()
         if self.pci_tracker:
             obj = self.pci_tracker.stats.to_device_pools_obj()
             cn.pci_device_pools = obj
@@ -1428,3 +1433,11 @@ class ResourceTracker(object):
             if key in updates:
                 usage[key] = updates[key]
         return usage
+
+    def build_failed(self, nodename):
+        """Increments the failed_builds stats for the given node."""
+        self.stats[nodename].build_failed()
+
+    def build_succeeded(self, nodename):
+        """Resets the failed_builds stats for the given node."""
+        self.stats[nodename].build_succeeded()
diff --git a/nova/tests/functional/compute/test_resource_tracker.py b/nova/tests/functional/compute/test_resource_tracker.py
index e205bb7b4ee8..d156feb26af3 100644
--- a/nova/tests/functional/compute/test_resource_tracker.py
+++ b/nova/tests/functional/compute/test_resource_tracker.py
@@ -459,19 +459,14 @@ class IronicResourceTrackerTest(test.TestCase):
             self.rt.update_available_resource(self.ctx, nodename)
 
         self.assertEqual(3, len(self.rt.compute_nodes))
+        self.assertEqual(3, len(self.rt.stats))
 
-        def _assert_stats(bad_node=None):
+        def _assert_stats():
             # Make sure each compute node has a unique set of stats and
             # they don't accumulate across nodes.
             for _cn in self.rt.compute_nodes.values():
                 node_stats_key = 'node:%s' % _cn.hypervisor_hostname
-                if bad_node == _cn.hypervisor_hostname:
-                    # FIXME(mriedem): This is bug 1784705 where the
-                    # compute node has lost its stats and is getting
-                    # stats for a different node.
-                    self.assertNotIn(node_stats_key, _cn.stats)
-                else:
-                    self.assertIn(node_stats_key, _cn.stats)
+                self.assertIn(node_stats_key, _cn.stats)
                 node_stat_count = 0
                 for stat in _cn.stats:
                     if stat.startswith('node:'):
@@ -485,5 +480,4 @@ class IronicResourceTrackerTest(test.TestCase):
         cn1_nodename = cn1_obj.hypervisor_hostname
         inst = self.instances[uuids.instance1]
         with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
-            # FIXME(mriedem): Remove bad_node once bug 1784705 is fixed.
-            _assert_stats(bad_node=cn1_nodename)
+            _assert_stats()