Add recreate test for RT.stats bug 1784705

With change I6827137f35c0cb4f9fc4c6f753d9a035326ed01b in
Ocata, we changed the ComputeManager to manage a single
ResourceTracker, and that single ResourceTracker in turn
manages multiple compute nodes. The only time a single
nova-compute service hosts multiple compute nodes is for
ironic where there is a compute node per instance. The
problem is that the ResourceTracker.stats variable, unlike
the ResourceTracker.compute_nodes variable, is not
node-specific, so it is possible for node stats to leak
across nodes depending on how the stats are used (and copied).

This change adds a functional recreate test to show the issue
before it's fixed. The fixture setup had to be tweaked a
bit to avoid modifying class variables by reference between
test cases.
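As a rough illustration of that reference-sharing pitfall
(hypothetical names, not the nova fixtures themselves): a class-level
fixture object is shared by every test method, so mutating it in one
test leaks into the next unless each test works on its own copy,
which is what the obj_clone() calls below provide.

    class SomeTest(object):
        # Class attribute: every test method sees the same dict.
        FIXTURE = {'name': 'flavor1'}

        def without_clone(self):
            flavor = self.FIXTURE       # same object, not a copy
            flavor['name'] = 'mutated'  # later tests now see 'mutated'

        def with_clone(self):
            flavor = dict(self.FIXTURE)  # analogous to obj_clone()
            flavor['name'] = 'mutated'   # class attribute is untouched

    t = SomeTest()
    t.without_clone()
    assert SomeTest.FIXTURE['name'] == 'mutated'  # leaked onto the class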

Change-Id: Icc5f615baa1042347ec1699eb84ba0670445b995
Related-Bug: #1784705
Matt Riedemann, 2018-07-31 16:00:28 -04:00
commit fc05626d43 (parent 5164517ce7)
1 changed file with 99 additions and 2 deletions


@@ -20,6 +20,7 @@ from nova import conf
from nova import context
from nova import objects
from nova import rc_fields as fields
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import test_report_client as test_base
from nova.tests import uuidsentinel as uuids
from nova.virt import driver as virt_driver
@@ -151,10 +152,13 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
driver.update_provider_tree.side_effect = NotImplementedError
self.driver_mock = driver
self.rt = resource_tracker.ResourceTracker(COMPUTE_HOST, driver)
self.create_fixtures()
self.instances = self.create_fixtures()
def create_fixtures(self):
for flavor in self.FLAVOR_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
flavor = flavor.obj_clone()
flavor._context = self.ctx
flavor.obj_set_defaults()
flavor.create()
@@ -163,14 +167,23 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
# data before adding integration for Ironic baremetal nodes with the
# placement API...
for cn in self.COMPUTE_NODE_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
cn = cn.obj_clone()
cn._context = self.ctx
cn.obj_set_defaults()
cn.create()
instances = {}
for instance in self.INSTANCE_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
instance = instance.obj_clone()
instance._context = self.ctx
instance.obj_set_defaults()
instance.create()
instances[instance.uuid] = instance
return instances
def placement_get_inventory(self, rp_uuid):
url = '/resource_providers/%s/inventories' % rp_uuid
@@ -288,7 +301,7 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
# RT's instance_claim().
cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
cn1_nodename = cn1_obj.hypervisor_hostname
inst = self.INSTANCE_FIXTURES[uuids.instance1]
inst = self.instances[uuids.instance1]
# Since we're pike, the scheduler would have created our
# allocation for us. So, we can use our old update routine
# here to mimic that before we go do the compute RT claim,
@@ -386,3 +399,87 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
# request a single amount of that custom resource class, we will
# modify the allocation/claim to consume only the custom resource
# class and not the VCPU, MEMORY_MB and DISK_GB.
@mock.patch('nova.compute.utils.is_volume_backed_instance',
new=mock.Mock(return_value=False))
@mock.patch('nova.objects.compute_node.ComputeNode.save', new=mock.Mock())
def test_node_stats_isolation(self):
"""Regression test for bug 1784705 introduced in Ocata.
The ResourceTracker.stats field is meant to track per-node stats
so this test registers three compute nodes with a single RT where
each node has unique stats, and then makes sure that after updating
usage for an instance, the nodes still have their unique stats and
nothing is leaked from node to node.
"""
self.useFixture(nova_fixtures.PlacementFixture())
# Before the resource tracker is "initialized", we shouldn't have
# any compute nodes or stats in the RT's cache...
self.assertEqual(0, len(self.rt.compute_nodes))
self.assertEqual(0, len(self.rt.stats))
# Now "initialize" the resource tracker. This is what
# nova.compute.manager.ComputeManager does when "initializing" the
# nova-compute service. Do this in a predictable order so cn1 is
# first and cn3 is last.
for cn in sorted(self.COMPUTE_NODE_FIXTURES.values(),
key=lambda _cn: _cn.hypervisor_hostname):
nodename = cn.hypervisor_hostname
# Fake that each compute node has unique extra specs stats and
# the RT makes sure those are unique per node.
stats = {'node:%s' % nodename: nodename}
self.driver_mock.get_available_resource.return_value = {
'hypervisor_hostname': nodename,
'hypervisor_type': 'ironic',
'hypervisor_version': 0,
'vcpus': cn.vcpus,
'vcpus_used': cn.vcpus_used,
'memory_mb': cn.memory_mb,
'memory_mb_used': cn.memory_mb_used,
'local_gb': cn.local_gb,
'local_gb_used': cn.local_gb_used,
'numa_topology': None,
'resource_class': None, # Act like admin hasn't set yet...
'stats': stats,
}
self.driver_mock.get_inventory.return_value = {
'CUSTOM_SMALL_IRON': {
'total': 1,
'reserved': 0,
'min_unit': 1,
'max_unit': 1,
'step_size': 1,
'allocation_ratio': 1.0,
},
}
self.rt.update_available_resource(self.ctx, nodename)
self.assertEqual(3, len(self.rt.compute_nodes))
def _assert_stats(bad_node=None):
# Make sure each compute node has a unique set of stats and
# they don't accumulate across nodes.
for _cn in self.rt.compute_nodes.values():
node_stats_key = 'node:%s' % _cn.hypervisor_hostname
if bad_node == _cn.hypervisor_hostname:
# FIXME(mriedem): This is bug 1784705 where the
# compute node has lost its stats and is getting
# stats for a different node.
self.assertNotIn(node_stats_key, _cn.stats)
else:
self.assertIn(node_stats_key, _cn.stats)
node_stat_count = 0
for stat in _cn.stats:
if stat.startswith('node:'):
node_stat_count += 1
self.assertEqual(1, node_stat_count, _cn.stats)
_assert_stats()
# Now "spawn" an instance to the first compute node by calling the
# RT's instance_claim().
cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
cn1_nodename = cn1_obj.hypervisor_hostname
inst = self.instances[uuids.instance1]
with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
# FIXME(mriedem): Remove bad_node once bug 1784705 is fixed.
_assert_stats(bad_node=cn1_nodename)
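
Once bug 1784705 is fixed, the bad_node special case above can be
dropped and _assert_stats() should pass unconditionally, with every
node keeping exactly its own stat even after the claim. Purely as an
illustration of the property the test asserts (not the actual fix),
node-isolated stats could be kept per nodename, for example:

    # Illustration only: keep stats per nodename so a claim against one
    # node cannot alter another node's stats.
    stats_by_node = {}

    def node_stats(nodename):
        # Each node gets its own independent mapping.
        return stats_by_node.setdefault(nodename, {})

    node_stats('cn1')['node:cn1'] = 'cn1'
    node_stats('cn2')['node:cn2'] = 'cn2'
    assert 'node:cn1' not in node_stats('cn2')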