Add recreate test for RT.stats bug 1784705
With change I6827137f35c0cb4f9fc4c6f753d9a035326ed01b in Ocata, we changed
the ComputeManager to manage a single ResourceTracker, and that single
ResourceTracker manages multiple compute nodes. The only time a single
nova-compute service hosts multiple compute nodes is for ironic, where
there is a compute node per instance.

The problem is that the ResourceTracker.stats variable, unlike the
ResourceTracker.compute_nodes variable, is not node-specific, so it is
possible for node stats to leak across nodes based on how the stats are
used (and copied).

This change adds a functional recreate test to show the issue before it
is fixed. The fixture setup had to be tweaked a bit to avoid modifying
class variables by reference between test cases.

Change-Id: Icc5f615baa1042347ec1699eb84ba0670445b995
Related-Bug: #1784705
commit fc05626d43
parent 5164517ce7
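
Before the diff, a note on the mechanics: the leak described above is a
plain shared-mutable-state problem. Below is a minimal sketch (toy code,
not nova's actual Stats class; ToyTracker and its methods are invented
for illustration) of how a single tracker-level stats mapping, copied
into each node's record, bleeds keys across nodes:

    import copy


    class ToyTracker(object):
        def __init__(self):
            self.stats = {}          # one mapping shared by every node (the bug)
            self.compute_nodes = {}  # per-node records, keyed by nodename

        def update_available_resource(self, nodename, driver_stats):
            # The shared mapping is updated but never reset per node, so keys
            # reported by previously-updated nodes are still in it here.
            self.stats.update(driver_stats)
            self.compute_nodes[nodename] = {'stats': copy.deepcopy(self.stats)}


    rt = ToyTracker()
    rt.update_available_resource('cn1', {'node:cn1': 'cn1'})
    rt.update_available_resource('cn2', {'node:cn2': 'cn2'})
    # cn2's record now also carries cn1's stat, the cross-node leak:
    assert 'node:cn1' in rt.compute_nodes['cn2']['stats']

This is the behavior the new test below pins down with its per-node
'node:<nodename>' stats keys.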
@@ -20,6 +20,7 @@ from nova import conf
 from nova import context
 from nova import objects
 from nova import rc_fields as fields
+from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import test_report_client as test_base
 from nova.tests import uuidsentinel as uuids
 from nova.virt import driver as virt_driver
@@ -151,10 +152,13 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
         driver.update_provider_tree.side_effect = NotImplementedError
         self.driver_mock = driver
         self.rt = resource_tracker.ResourceTracker(COMPUTE_HOST, driver)
-        self.create_fixtures()
+        self.instances = self.create_fixtures()

     def create_fixtures(self):
         for flavor in self.FLAVOR_FIXTURES.values():
+            # Clone the object so the class variable isn't
+            # modified by reference.
+            flavor = flavor.obj_clone()
             flavor._context = self.ctx
             flavor.obj_set_defaults()
             flavor.create()
@@ -163,14 +167,23 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
         # data before adding integration for Ironic baremetal nodes with the
         # placement API...
         for cn in self.COMPUTE_NODE_FIXTURES.values():
+            # Clone the object so the class variable isn't
+            # modified by reference.
+            cn = cn.obj_clone()
             cn._context = self.ctx
             cn.obj_set_defaults()
             cn.create()

+        instances = {}
         for instance in self.INSTANCE_FIXTURES.values():
+            # Clone the object so the class variable isn't
+            # modified by reference.
+            instance = instance.obj_clone()
             instance._context = self.ctx
             instance.obj_set_defaults()
             instance.create()
+            instances[instance.uuid] = instance
+        return instances

     def placement_get_inventory(self, rp_uuid):
         url = '/resource_providers/%s/inventories' % rp_uuid
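
The obj_clone() calls above are the fixture-setup tweak mentioned in the
commit message. Here is a minimal sketch (toy code, not the real
oslo.versionedobjects fixtures; copy.deepcopy stands in for obj_clone())
of the by-reference problem they avoid:

    import copy


    class FakeTestCase(object):
        # Class variable: one dict shared by every instance of the class.
        FLAVOR_FIXTURES = {'flavor1': {'name': 'flavor1'}}

        def create_fixtures_leaky(self, ctx):
            for flavor in self.FLAVOR_FIXTURES.values():
                flavor['_context'] = ctx  # mutates the shared class variable!

        def create_fixtures_safe(self, ctx):
            for flavor in self.FLAVOR_FIXTURES.values():
                flavor = copy.deepcopy(flavor)  # stands in for obj_clone()
                flavor['_context'] = ctx        # only the per-test copy changes


    tc1, tc2 = FakeTestCase(), FakeTestCase()
    tc1.create_fixtures_leaky('ctx-1')
    # A later test case now sees the earlier test's mutation:
    assert tc2.FLAVOR_FIXTURES['flavor1']['_context'] == 'ctx-1'

This is also why the next hunk pulls instance1 from self.instances (the
per-test clones returned by create_fixtures()) instead of reading
INSTANCE_FIXTURES directly.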
@@ -288,7 +301,7 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
         # RT's instance_claim().
         cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
         cn1_nodename = cn1_obj.hypervisor_hostname
-        inst = self.INSTANCE_FIXTURES[uuids.instance1]
+        inst = self.instances[uuids.instance1]
         # Since we're pike, the scheduler would have created our
         # allocation for us. So, we can use our old update routine
         # here to mimic that before we go do the compute RT claim,
@@ -386,3 +399,87 @@ class IronicResourceTrackerTest(test_base.SchedulerReportClientTestBase):
         # request a single amount of that custom resource class, we will
         # modify the allocation/claim to consume only the custom resource
         # class and not the VCPU, MEMORY_MB and DISK_GB.
+
+    @mock.patch('nova.compute.utils.is_volume_backed_instance',
+                new=mock.Mock(return_value=False))
+    @mock.patch('nova.objects.compute_node.ComputeNode.save', new=mock.Mock())
+    def test_node_stats_isolation(self):
+        """Regression test for bug 1784705 introduced in Ocata.
+
+        The ResourceTracker.stats field is meant to track per-node stats
+        so this test registers three compute nodes with a single RT where
+        each node has unique stats, and then makes sure that after updating
+        usage for an instance, the nodes still have their unique stats and
+        nothing is leaked from node to node.
+        """
+        self.useFixture(nova_fixtures.PlacementFixture())
+        # Before the resource tracker is "initialized", we shouldn't have
+        # any compute nodes or stats in the RT's cache...
+        self.assertEqual(0, len(self.rt.compute_nodes))
+        self.assertEqual(0, len(self.rt.stats))
+
+        # Now "initialize" the resource tracker. This is what
+        # nova.compute.manager.ComputeManager does when "initializing" the
+        # nova-compute service. Do this in a predictable order so cn1 is
+        # first and cn3 is last.
+        for cn in sorted(self.COMPUTE_NODE_FIXTURES.values(),
+                         key=lambda _cn: _cn.hypervisor_hostname):
+            nodename = cn.hypervisor_hostname
+            # Fake that each compute node has unique extra specs stats and
+            # the RT makes sure those are unique per node.
+            stats = {'node:%s' % nodename: nodename}
+            self.driver_mock.get_available_resource.return_value = {
+                'hypervisor_hostname': nodename,
+                'hypervisor_type': 'ironic',
+                'hypervisor_version': 0,
+                'vcpus': cn.vcpus,
+                'vcpus_used': cn.vcpus_used,
+                'memory_mb': cn.memory_mb,
+                'memory_mb_used': cn.memory_mb_used,
+                'local_gb': cn.local_gb,
+                'local_gb_used': cn.local_gb_used,
+                'numa_topology': None,
+                'resource_class': None,  # Act like admin hasn't set yet...
+                'stats': stats,
+            }
+            self.driver_mock.get_inventory.return_value = {
+                'CUSTOM_SMALL_IRON': {
+                    'total': 1,
+                    'reserved': 0,
+                    'min_unit': 1,
+                    'max_unit': 1,
+                    'step_size': 1,
+                    'allocation_ratio': 1.0,
+                },
+            }
+            self.rt.update_available_resource(self.ctx, nodename)
+
+        self.assertEqual(3, len(self.rt.compute_nodes))
+
+        def _assert_stats(bad_node=None):
+            # Make sure each compute node has a unique set of stats and
+            # they don't accumulate across nodes.
+            for _cn in self.rt.compute_nodes.values():
+                node_stats_key = 'node:%s' % _cn.hypervisor_hostname
+                if bad_node == _cn.hypervisor_hostname:
+                    # FIXME(mriedem): This is bug 1784705 where the
+                    # compute node has lost its stats and is getting
+                    # stats for a different node.
+                    self.assertNotIn(node_stats_key, _cn.stats)
+                else:
+                    self.assertIn(node_stats_key, _cn.stats)
+                node_stat_count = 0
+                for stat in _cn.stats:
+                    if stat.startswith('node:'):
+                        node_stat_count += 1
+                self.assertEqual(1, node_stat_count, _cn.stats)
+        _assert_stats()
+
+        # Now "spawn" an instance to the first compute node by calling the
+        # RT's instance_claim().
+        cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
+        cn1_nodename = cn1_obj.hypervisor_hostname
+        inst = self.instances[uuids.instance1]
+        with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
+            # FIXME(mriedem): Remove bad_node once bug 1784705 is fixed.
+            _assert_stats(bad_node=cn1_nodename)
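
To reproduce the failure locally, the new test can be run on its own
through nova's functional tox environment. Assuming the test lives in
nova/tests/functional/compute/test_resource_tracker.py (the module that
defines IronicResourceTrackerTest), the invocation would be something
like:

    tox -e functional -- nova.tests.functional.compute.test_resource_tracker.IronicResourceTrackerTest.test_node_stats_isolation

Note the test passes as written because the FIXME'd assertions codify the
buggy behavior; per the FIXME comments, the bad_node handling should be
removed once bug 1784705 is fixed.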