Make ResourceTracker.stats node-specific
As of change I6827137f35c0cb4f9fc4c6f753d9a035326ed01b in Ocata, the
ResourceTracker manages multiple compute nodes via its "compute_nodes"
variable, but the "stats" variable was still shared across all nodes.
This leaks stats across nodes in an ironic deployment where a single
nova-compute service host manages multiple ironic instances (nodes).

This change makes ResourceTracker.stats node-specific, which fixes the
ironic leak and also lets us remove the stats deepcopy while iterating
over instances, which should improve performance for single-node
deployments with a potentially large number of instances, e.g. vCenter.

Change-Id: I0b9e5b711878fa47ba90e43c0b41437b57cf8ef6
Closes-Bug: #1784705
Closes-Bug: #1777422
parent 31e6e715e0
commit b5b7d86bb0
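The core of the change is visible in the ResourceTracker hunks below: the single shared Stats object becomes a dict of Stats keyed by nodename, and ComputeManager reports build results per node through new build_failed()/build_succeeded() helpers on the tracker. The following is a minimal, self-contained sketch of that pattern; the MiniStats and MiniTracker names are invented for illustration (they are not Nova APIs), and the failed_builds counter behavior is inferred from the diff.

import collections


class MiniStats(dict):
    """Toy stand-in for nova.compute.stats.Stats: a dict of counters."""

    def build_failed(self):
        # Increment this node's consecutive-failed-builds counter.
        self['failed_builds'] = self.get('failed_builds', 0) + 1

    def build_succeeded(self):
        # A successful build resets the consecutive-failure counter.
        self['failed_builds'] = 0


class MiniTracker(object):
    """Toy stand-in for ResourceTracker showing the per-node stats dict."""

    def __init__(self):
        # One Stats object per node, created lazily on first access,
        # instead of a single instance shared by every node.
        self.stats = collections.defaultdict(MiniStats)

    def build_failed(self, nodename):
        self.stats[nodename].build_failed()

    def build_succeeded(self, nodename):
        self.stats[nodename].build_succeeded()


if __name__ == '__main__':
    rt = MiniTracker()
    rt.build_failed('ironic-node-1')
    rt.build_failed('ironic-node-1')
    rt.build_succeeded('ironic-node-2')
    # Failures on node 1 no longer leak into node 2's stats.
    print(rt.stats['ironic-node-1'])  # {'failed_builds': 2}
    print(rt.stats['ironic-node-2'])  # {'failed_builds': 0}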
@@ -1775,16 +1775,16 @@ class ComputeManager(manager.Manager):

         return block_device_info

-    def _build_failed(self):
+    def _build_failed(self, node):
         if CONF.compute.consecutive_build_service_disable_threshold:
             rt = self._get_resource_tracker()
             # NOTE(danms): Update our counter, but wait for the next
             # update_available_resource() periodic to flush it to the DB
-            rt.stats.build_failed()
+            rt.build_failed(node)

-    def _build_succeeded(self):
+    def _build_succeeded(self, node):
         rt = self._get_resource_tracker()
-        rt.stats.build_succeeded()
+        rt.build_succeeded(node)

     @wrap_exception()
     @reverts_task_state
@@ -1834,9 +1834,9 @@ class ComputeManager(manager.Manager):

             if result in (build_results.FAILED,
                           build_results.RESCHEDULED):
-                self._build_failed()
+                self._build_failed(node)
             else:
-                self._build_succeeded()
+                self._build_succeeded(node)

         # NOTE(danms): We spawn here to return the RPC worker thread back to
         # the pool. Since what follows could take a really long time, we don't
@@ -27,7 +27,7 @@ from oslo_serialization import jsonutils

 from nova.compute import claims
 from nova.compute import monitors
-from nova.compute import stats
+from nova.compute import stats as compute_stats
 from nova.compute import task_states
 from nova.compute import utils as compute_utils
 from nova.compute import vm_states
@@ -137,7 +137,8 @@ class ResourceTracker(object):
         self.pci_tracker = None
         # Dict of objects.ComputeNode objects, keyed by nodename
         self.compute_nodes = {}
-        self.stats = stats.Stats()
+        # Dict of Stats objects, keyed by nodename
+        self.stats = collections.defaultdict(compute_stats.Stats)
         self.tracked_instances = {}
         self.tracked_migrations = {}
         self.is_bfv = {}  # dict, keyed by instance uuid, to is_bfv boolean
@@ -605,16 +606,18 @@ class ResourceTracker(object):

     def _copy_resources(self, compute_node, resources):
         """Copy resource values to supplied compute_node."""
+        nodename = resources['hypervisor_hostname']
+        stats = self.stats[nodename]
         # purge old stats and init with anything passed in by the driver
         # NOTE(danms): Preserve 'failed_builds' across the stats clearing,
         # as that is not part of resources
         # TODO(danms): Stop doing this when we get a column to store this
         # directly
-        prev_failed_builds = self.stats.get('failed_builds', 0)
-        self.stats.clear()
-        self.stats['failed_builds'] = prev_failed_builds
-        self.stats.digest_stats(resources.get('stats'))
-        compute_node.stats = copy.deepcopy(self.stats)
+        prev_failed_builds = stats.get('failed_builds', 0)
+        stats.clear()
+        stats['failed_builds'] = prev_failed_builds
+        stats.digest_stats(resources.get('stats'))
+        compute_node.stats = stats

         # update the allocation ratios for the related ComputeNode object
         compute_node.ram_allocation_ratio = self.ram_allocation_ratio
@@ -948,7 +951,8 @@ class ResourceTracker(object):
         cn.free_ram_mb = cn.memory_mb - cn.memory_mb_used
         cn.free_disk_gb = cn.local_gb - cn.local_gb_used

-        cn.running_vms = self.stats.num_instances
+        stats = self.stats[nodename]
+        cn.running_vms = stats.num_instances

         # Calculate the numa usage
         free = sign == -1
@@ -1126,8 +1130,9 @@ class ResourceTracker(object):
             sign = -1

         cn = self.compute_nodes[nodename]
-        self.stats.update_stats_for_instance(instance, is_removed_instance)
-        cn.stats = copy.deepcopy(self.stats)
+        stats = self.stats[nodename]
+        stats.update_stats_for_instance(instance, is_removed_instance)
+        cn.stats = stats

         # if it's a new or deleted instance:
         if is_new_instance or is_removed_instance:
@@ -1149,7 +1154,7 @@ class ResourceTracker(object):
             if is_removed_instance and uuid in self.is_bfv:
                 del self.is_bfv[uuid]

-        cn.current_workload = self.stats.calculate_workload()
+        cn.current_workload = stats.calculate_workload()
         if self.pci_tracker:
             obj = self.pci_tracker.stats.to_device_pools_obj()
             cn.pci_device_pools = obj
@@ -1489,3 +1494,11 @@ class ResourceTracker(object):
             if key in updates:
                 usage[key] = updates[key]
         return usage
+
+    def build_failed(self, nodename):
+        """Increments the failed_builds stats for the given node."""
+        self.stats[nodename].build_failed()
+
+    def build_succeeded(self, nodename):
+        """Resets the failed_builds stats for the given node."""
+        self.stats[nodename].build_succeeded()
@@ -455,19 +455,14 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
             self.rt.update_available_resource(self.ctx, nodename)

         self.assertEqual(3, len(self.rt.compute_nodes))
+        self.assertEqual(3, len(self.rt.stats))

-        def _assert_stats(bad_node=None):
+        def _assert_stats():
             # Make sure each compute node has a unique set of stats and
             # they don't accumulate across nodes.
             for _cn in self.rt.compute_nodes.values():
                 node_stats_key = 'node:%s' % _cn.hypervisor_hostname
-                if bad_node == _cn.hypervisor_hostname:
-                    # FIXME(mriedem): This is bug 1784705 where the
-                    # compute node has lost its stats and is getting
-                    # stats for a different node.
-                    self.assertNotIn(node_stats_key, _cn.stats)
-                else:
-                    self.assertIn(node_stats_key, _cn.stats)
+                self.assertIn(node_stats_key, _cn.stats)
                 node_stat_count = 0
                 for stat in _cn.stats:
                     if stat.startswith('node:'):
@@ -481,5 +476,4 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
         cn1_nodename = cn1_obj.hypervisor_hostname
         inst = self.instances[uuids.instance1]
         with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
-            # FIXME(mriedem): Remove bad_node once bug 1784705 is fixed.
-            _assert_stats(bad_node=cn1_nodename)
+            _assert_stats()