Make ResourceTracker.stats node-specific
As of change I6827137f35c0cb4f9fc4c6f753d9a035326ed01b in Ocata, the ResourceTracker manages multiple compute nodes via its "compute_nodes" variable, but the "stats" variable was still being shared across all nodes, which leads to leaking stats across nodes in an ironic deployment where a single nova-compute service host is managing multiple ironic instances (nodes). This change makes ResourceTracker.stats node-specific which fixes the ironic leak but also allows us to remove the stats deepcopy while iterating over instances which should improve performance for single-node deployments with potentially a large number of instances, i.e. vCenter. Conflicts: nova/compute/manager.py nova/compute/resource_tracker.py NOTE(mriedem): The conflicts are due to not having change I02b7cd87d399d487dd1d650540f503a70bc27749 in Ocata which added the consecutive failed builds stats counting. As a result, the Ocata cherry pick does not need to move the "if node is None" check in _do_build_and_run_instance like in Pike. Change-Id: I0b9e5b711878fa47ba90e43c0b41437b57cf8ef6 Closes-Bug: #1784705 Closes-Bug: #1777422 (cherry picked from commit b5b7d86bb0) (cherry picked from commit 7d99f5753f) (cherry picked from commit e279ac4046)
This commit is contained in:
parent
d4179e3e96
commit
2f6ea7b871
|
@ -26,7 +26,7 @@ from oslo_serialization import jsonutils
|
|||
|
||||
from nova.compute import claims
|
||||
from nova.compute import monitors
|
||||
from nova.compute import stats
|
||||
from nova.compute import stats as compute_stats
|
||||
from nova.compute import task_states
|
||||
from nova.compute import vm_states
|
||||
import nova.conf
|
||||
|
@ -88,7 +88,8 @@ class ResourceTracker(object):
|
|||
self.pci_tracker = None
|
||||
# Dict of objects.ComputeNode objects, keyed by nodename
|
||||
self.compute_nodes = {}
|
||||
self.stats = stats.Stats()
|
||||
# Dict of Stats objects, keyed by nodename
|
||||
self.stats = collections.defaultdict(compute_stats.Stats)
|
||||
self.tracked_instances = {}
|
||||
self.tracked_migrations = {}
|
||||
monitor_handler = monitors.MonitorHandler(self)
|
||||
|
@ -477,10 +478,12 @@ class ResourceTracker(object):
|
|||
|
||||
def _copy_resources(self, compute_node, resources):
|
||||
"""Copy resource values to supplied compute_node."""
|
||||
nodename = resources['hypervisor_hostname']
|
||||
stats = self.stats[nodename]
|
||||
# purge old stats and init with anything passed in by the driver
|
||||
self.stats.clear()
|
||||
self.stats.digest_stats(resources.get('stats'))
|
||||
compute_node.stats = copy.deepcopy(self.stats)
|
||||
stats.clear()
|
||||
stats.digest_stats(resources.get('stats'))
|
||||
compute_node.stats = stats
|
||||
|
||||
# update the allocation ratios for the related ComputeNode object
|
||||
compute_node.ram_allocation_ratio = self.ram_allocation_ratio
|
||||
|
@ -746,7 +749,8 @@ class ResourceTracker(object):
|
|||
cn.free_ram_mb = cn.memory_mb - cn.memory_mb_used
|
||||
cn.free_disk_gb = cn.local_gb - cn.local_gb_used
|
||||
|
||||
cn.running_vms = self.stats.num_instances
|
||||
stats = self.stats[nodename]
|
||||
cn.running_vms = stats.num_instances
|
||||
|
||||
# Calculate the numa usage
|
||||
free = sign == -1
|
||||
|
@ -908,8 +912,9 @@ class ResourceTracker(object):
|
|||
sign = -1
|
||||
|
||||
cn = self.compute_nodes[nodename]
|
||||
self.stats.update_stats_for_instance(instance, is_removed_instance)
|
||||
cn.stats = copy.deepcopy(self.stats)
|
||||
stats = self.stats[nodename]
|
||||
stats.update_stats_for_instance(instance, is_removed_instance)
|
||||
cn.stats = stats
|
||||
|
||||
# if it's a new or deleted instance:
|
||||
if is_new_instance or is_removed_instance:
|
||||
|
@ -923,7 +928,7 @@ class ResourceTracker(object):
|
|||
self._update_usage(self._get_usage_dict(instance), nodename,
|
||||
sign=sign)
|
||||
|
||||
cn.current_workload = self.stats.calculate_workload()
|
||||
cn.current_workload = stats.calculate_workload()
|
||||
if self.pci_tracker:
|
||||
obj = self.pci_tracker.stats.to_device_pools_obj()
|
||||
cn.pci_device_pools = obj
|
||||
|
|
|
@ -223,19 +223,14 @@ class IronicResourceTrackerTest(test.TestCase):
|
|||
self.rt.update_available_resource(self.ctx, nodename)
|
||||
|
||||
self.assertEqual(3, len(self.rt.compute_nodes))
|
||||
self.assertEqual(3, len(self.rt.stats))
|
||||
|
||||
def _assert_stats(bad_node=None):
|
||||
def _assert_stats():
|
||||
# Make sure each compute node has a unique set of stats and
|
||||
# they don't accumulate across nodes.
|
||||
for _cn in self.rt.compute_nodes.values():
|
||||
node_stats_key = 'node:%s' % _cn.hypervisor_hostname
|
||||
if bad_node == _cn.hypervisor_hostname:
|
||||
# FIXME(mriedem): This is bug 1784705 where the
|
||||
# compute node has lost its stats and is getting
|
||||
# stats for a different node.
|
||||
self.assertNotIn(node_stats_key, _cn.stats)
|
||||
else:
|
||||
self.assertIn(node_stats_key, _cn.stats)
|
||||
self.assertIn(node_stats_key, _cn.stats)
|
||||
node_stat_count = 0
|
||||
for stat in _cn.stats:
|
||||
if stat.startswith('node:'):
|
||||
|
@ -249,5 +244,4 @@ class IronicResourceTrackerTest(test.TestCase):
|
|||
cn1_nodename = cn1_obj.hypervisor_hostname
|
||||
inst = self.instances[uuids.instance1]
|
||||
with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
|
||||
# FIXME(mriedem): Remove bad_node once bug 1784705 is fixed.
|
||||
_assert_stats(bad_node=cn1_nodename)
|
||||
_assert_stats()
|
||||
|
|
Loading…
Reference in New Issue