Add recreate test for RT.stats bug 1784705

With change I6827137f35c0cb4f9fc4c6f753d9a035326ed01b in
Ocata, we changed the ComputeManager to manage a single
ResourceTracker and that single ResourceTracker will
manage multiple compute nodes. The only time a single
nova-compute service hosts multiple compute nodes is for
ironic where there is a compute node per instance. The
problem is the ResourceTracker.stats variable, unlike the
ResourceTracker.compute_nodes variable, is not node-specific
so it's possible for node stats to leak across nodes based
on how the stats are used (and copied).

This change adds a functional recreate test to show the issue
before it's fixed. The fixture setup had to be tweaked a
bit to avoid modifying class variables by reference between
test cases.

Conflicts:
      nova/tests/functional/compute/test_resource_tracker.py

NOTE(mriedem): Conflicts are due to not having change
I433700e833f97c0fec946dafc2cdda9d49e1100b or change
I075785abcd4f4a8e180959daeadf215b9cd175c8 in Queens.

Change-Id: Icc5f615baa1042347ec1699eb84ba0670445b995
Related-Bug: #1784705
(cherry picked from commit fc05626d43)
This commit is contained in:
Matt Riedemann 2018-07-31 16:00:28 -04:00
parent f07dd4436a
commit cc8167a191
1 changed files with 99 additions and 2 deletions

View File

@ -23,6 +23,7 @@ from nova import context
from nova import objects
from nova.objects import fields
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional.api.openstack.placement import test_report_client
from nova.tests import uuidsentinel as uuids
@ -150,10 +151,13 @@ class IronicResourceTrackerTest(test.TestCase):
self.rt.scheduler_client.reportclient = self.report_client
self.rt.reportclient = self.report_client
self.url = 'http://localhost/placement'
self.create_fixtures()
self.instances = self.create_fixtures()
def create_fixtures(self):
for flavor in self.FLAVOR_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
flavor = flavor.obj_clone()
flavor._context = self.ctx
flavor.obj_set_defaults()
flavor.create()
@ -162,14 +166,23 @@ class IronicResourceTrackerTest(test.TestCase):
# data before adding integration for Ironic baremetal nodes with the
# placement API...
for cn in self.COMPUTE_NODE_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
cn = cn.obj_clone()
cn._context = self.ctx
cn.obj_set_defaults()
cn.create()
instances = {}
for instance in self.INSTANCE_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
instance = instance.obj_clone()
instance._context = self.ctx
instance.obj_set_defaults()
instance.create()
instances[instance.uuid] = instance
return instances
def placement_get_inventory(self, rp_uuid):
url = '/resource_providers/%s/inventories' % rp_uuid
@ -293,7 +306,7 @@ class IronicResourceTrackerTest(test.TestCase):
# RT's instance_claim().
cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
cn1_nodename = cn1_obj.hypervisor_hostname
inst = self.INSTANCE_FIXTURES[uuids.instance1]
inst = self.instances[uuids.instance1]
# Since we're pike, the scheduler would have created our
# allocation for us. So, we can use our old update routine
# here to mimic that before we go do the compute RT claim,
@ -391,3 +404,87 @@ class IronicResourceTrackerTest(test.TestCase):
# request a single amount of that custom resource class, we will
# modify the allocation/claim to consume only the custom resource
# class and not the VCPU, MEMORY_MB and DISK_GB.
@mock.patch('nova.compute.utils.is_volume_backed_instance',
new=mock.Mock(return_value=False))
@mock.patch('nova.objects.compute_node.ComputeNode.save', new=mock.Mock())
def test_node_stats_isolation(self):
"""Regression test for bug 1784705 introduced in Ocata.
The ResourceTracker.stats field is meant to track per-node stats
so this test registers three compute nodes with a single RT where
each node has unique stats, and then makes sure that after updating
usage for an instance, the nodes still have their unique stats and
nothing is leaked from node to node.
"""
self.useFixture(nova_fixtures.PlacementFixture())
# Before the resource tracker is "initialized", we shouldn't have
# any compute nodes or stats in the RT's cache...
self.assertEqual(0, len(self.rt.compute_nodes))
self.assertEqual(0, len(self.rt.stats))
# Now "initialize" the resource tracker. This is what
# nova.compute.manager.ComputeManager does when "initializing" the
# nova-compute service. Do this in a predictable order so cn1 is
# first and cn3 is last.
for cn in sorted(self.COMPUTE_NODE_FIXTURES.values(),
key=lambda _cn: _cn.hypervisor_hostname):
nodename = cn.hypervisor_hostname
# Fake that each compute node has unique extra specs stats and
# the RT makes sure those are unique per node.
stats = {'node:%s' % nodename: nodename}
self.driver_mock.get_available_resource.return_value = {
'hypervisor_hostname': nodename,
'hypervisor_type': 'ironic',
'hypervisor_version': 0,
'vcpus': cn.vcpus,
'vcpus_used': cn.vcpus_used,
'memory_mb': cn.memory_mb,
'memory_mb_used': cn.memory_mb_used,
'local_gb': cn.local_gb,
'local_gb_used': cn.local_gb_used,
'numa_topology': None,
'resource_class': None, # Act like admin hasn't set yet...
'stats': stats,
}
self.driver_mock.get_inventory.return_value = {
'CUSTOM_SMALL_IRON': {
'total': 1,
'reserved': 0,
'min_unit': 1,
'max_unit': 1,
'step_size': 1,
'allocation_ratio': 1.0,
},
}
self.rt.update_available_resource(self.ctx, nodename)
self.assertEqual(3, len(self.rt.compute_nodes))
def _assert_stats(bad_node=None):
# Make sure each compute node has a unique set of stats and
# they don't accumulate across nodes.
for _cn in self.rt.compute_nodes.values():
node_stats_key = 'node:%s' % _cn.hypervisor_hostname
if bad_node == _cn.hypervisor_hostname:
# FIXME(mriedem): This is bug 1784705 where the
# compute node has lost its stats and is getting
# stats for a different node.
self.assertNotIn(node_stats_key, _cn.stats)
else:
self.assertIn(node_stats_key, _cn.stats)
node_stat_count = 0
for stat in _cn.stats:
if stat.startswith('node:'):
node_stat_count += 1
self.assertEqual(1, node_stat_count, _cn.stats)
_assert_stats()
# Now "spawn" an instance to the first compute node by calling the
# RT's instance_claim().
cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
cn1_nodename = cn1_obj.hypervisor_hostname
inst = self.instances[uuids.instance1]
with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
# FIXME(mriedem): Remove bad_node once bug 1784705 is fixed.
_assert_stats(bad_node=cn1_nodename)