Add recreate test for RT.stats bug 1784705

With change I6827137f35c0cb4f9fc4c6f753d9a035326ed01b in
Ocata, we changed the ComputeManager to manage a single
ResourceTracker and that single ResourceTracker will
manage multiple compute nodes. The only time a single
nova-compute service hosts multiple compute nodes is for
ironic where there is a compute node per instance. The
problem is the ResourceTracker.stats variable, unlike the
ResourceTracker.compute_nodes variable, is not node-specific
so it's possible for node stats to leak across nodes based
on how the stats are used (and copied).

This change adds a functional recreate test to show the issue
before it's fixed. The fixture setup had to be tweaked a
bit to avoid modifying class variables by reference between
test cases.

Conflicts:
      nova/tests/functional/compute/test_resource_tracker.py

NOTE(mriedem): The conflict is due to this test module not
existing in Ocata. It was introduced in Pike with change
I59be1cbedc99dcbb0ccde089a9f4737305176324 and changes were
made to it over time. In this backport, the basics needed
for the one test case specific to this patch are added and
everything else, like the placement stuff, is removed.

Change-Id: Icc5f615baa1042347ec1699eb84ba0670445b995
Related-Bug: #1784705
(cherry picked from commit fc05626d43)
(cherry picked from commit cc8167a191)
(cherry picked from commit dbce613a9a)
This commit is contained in:
Matt Riedemann 2018-07-31 16:00:28 -04:00
parent 77ece76a70
commit d4179e3e96
2 changed files with 253 additions and 0 deletions

View File

@ -0,0 +1,253 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import mock
from nova.compute import power_state
from nova.compute import resource_tracker
from nova.compute import task_states
from nova.compute import vm_states
from nova import conf
from nova import context
from nova import objects
from nova.objects import fields
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional.api.openstack.placement import test_report_client
from nova.tests import uuidsentinel as uuids
CONF = conf.CONF
VCPU = fields.ResourceClass.VCPU
MEMORY_MB = fields.ResourceClass.MEMORY_MB
DISK_GB = fields.ResourceClass.DISK_GB
COMPUTE_HOST = 'compute-host'
class IronicResourceTrackerTest(test.TestCase):
"""Functional tests for the resource tracker managing multiple nodes like
in the case of an ironic deployment.
"""
FLAVOR_FIXTURES = {
'CUSTOM_SMALL_IRON': objects.Flavor(
name='CUSTOM_SMALL_IRON',
flavorid=42,
vcpus=4,
memory_mb=4096,
root_gb=1024,
swap=0,
ephemeral_gb=0,
extra_specs={},
),
'CUSTOM_BIG_IRON': objects.Flavor(
name='CUSTOM_BIG_IRON',
flavorid=43,
vcpus=16,
memory_mb=65536,
root_gb=1024,
swap=0,
ephemeral_gb=0,
extra_specs={},
),
}
COMPUTE_NODE_FIXTURES = {
uuids.cn1: objects.ComputeNode(
uuid=uuids.cn1,
hypervisor_hostname='cn1',
hypervisor_type='ironic',
hypervisor_version=0,
cpu_info="",
host=COMPUTE_HOST,
vcpus=4,
vcpus_used=0,
cpu_allocation_ratio=1.0,
memory_mb=4096,
memory_mb_used=0,
ram_allocation_ratio=1.0,
local_gb=1024,
local_gb_used=0,
disk_allocation_ratio=1.0,
),
uuids.cn2: objects.ComputeNode(
uuid=uuids.cn2,
hypervisor_hostname='cn2',
hypervisor_type='ironic',
hypervisor_version=0,
cpu_info="",
host=COMPUTE_HOST,
vcpus=4,
vcpus_used=0,
cpu_allocation_ratio=1.0,
memory_mb=4096,
memory_mb_used=0,
ram_allocation_ratio=1.0,
local_gb=1024,
local_gb_used=0,
disk_allocation_ratio=1.0,
),
uuids.cn3: objects.ComputeNode(
uuid=uuids.cn3,
hypervisor_hostname='cn3',
hypervisor_type='ironic',
hypervisor_version=0,
cpu_info="",
host=COMPUTE_HOST,
vcpus=16,
vcpus_used=0,
cpu_allocation_ratio=1.0,
memory_mb=65536,
memory_mb_used=0,
ram_allocation_ratio=1.0,
local_gb=2048,
local_gb_used=0,
disk_allocation_ratio=1.0,
),
}
INSTANCE_FIXTURES = {
uuids.instance1: objects.Instance(
uuid=uuids.instance1,
flavor=FLAVOR_FIXTURES['CUSTOM_SMALL_IRON'],
vm_state=vm_states.BUILDING,
task_state=task_states.SPAWNING,
power_state=power_state.RUNNING,
project_id='project',
user_id=uuids.user,
),
}
def setUp(self):
super(IronicResourceTrackerTest, self).setUp()
self.ctx = context.RequestContext('user', 'project')
self.report_client = test_report_client.NoAuthReportClient()
driver = mock.MagicMock(autospec='nova.virt.driver.ComputeDriver')
driver.node_is_available.return_value = True
self.driver_mock = driver
self.rt = resource_tracker.ResourceTracker(COMPUTE_HOST, driver)
self.rt.scheduler_client.reportclient = self.report_client
self.rt.reportclient = self.report_client
self.instances = self.create_fixtures()
def create_fixtures(self):
for flavor in self.FLAVOR_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
flavor = flavor.obj_clone()
flavor._context = self.ctx
flavor.obj_set_defaults()
flavor.create()
# We create some compute node records in the Nova cell DB to simulate
# data before adding integration for Ironic baremetal nodes with the
# placement API...
for cn in self.COMPUTE_NODE_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
cn = cn.obj_clone()
cn._context = self.ctx
cn.obj_set_defaults()
cn.create()
instances = {}
for instance in self.INSTANCE_FIXTURES.values():
# Clone the object so the class variable isn't
# modified by reference.
instance = instance.obj_clone()
instance._context = self.ctx
instance.obj_set_defaults()
instance.create()
instances[instance.uuid] = instance
return instances
@mock.patch('nova.objects.compute_node.ComputeNode.save', new=mock.Mock())
def test_node_stats_isolation(self):
"""Regression test for bug 1784705 introduced in Ocata.
The ResourceTracker.stats field is meant to track per-node stats
so this test registers three compute nodes with a single RT where
each node has unique stats, and then makes sure that after updating
usage for an instance, the nodes still have their unique stats and
nothing is leaked from node to node.
"""
self.useFixture(nova_fixtures.PlacementFixture())
# Before the resource tracker is "initialized", we shouldn't have
# any compute nodes or stats in the RT's cache...
self.assertEqual(0, len(self.rt.compute_nodes))
self.assertEqual(0, len(self.rt.stats))
# Now "initialize" the resource tracker. This is what
# nova.compute.manager.ComputeManager does when "initializing" the
# nova-compute service. Do this in a predictable order so cn1 is
# first and cn3 is last.
for cn in sorted(self.COMPUTE_NODE_FIXTURES.values(),
key=lambda _cn: _cn.hypervisor_hostname):
nodename = cn.hypervisor_hostname
# Fake that each compute node has unique extra specs stats and
# the RT makes sure those are unique per node.
stats = {'node:%s' % nodename: nodename}
self.driver_mock.get_available_resource.return_value = {
'hypervisor_hostname': nodename,
'hypervisor_type': 'ironic',
'hypervisor_version': 0,
'vcpus': cn.vcpus,
'vcpus_used': cn.vcpus_used,
'memory_mb': cn.memory_mb,
'memory_mb_used': cn.memory_mb_used,
'local_gb': cn.local_gb,
'local_gb_used': cn.local_gb_used,
'numa_topology': None,
'resource_class': None, # Act like admin hasn't set yet...
'stats': stats,
}
self.driver_mock.get_inventory.return_value = {
'CUSTOM_SMALL_IRON': {
'total': 1,
'reserved': 0,
'min_unit': 1,
'max_unit': 1,
'step_size': 1,
'allocation_ratio': 1.0,
},
}
self.rt.update_available_resource(self.ctx, nodename)
self.assertEqual(3, len(self.rt.compute_nodes))
def _assert_stats(bad_node=None):
# Make sure each compute node has a unique set of stats and
# they don't accumulate across nodes.
for _cn in self.rt.compute_nodes.values():
node_stats_key = 'node:%s' % _cn.hypervisor_hostname
if bad_node == _cn.hypervisor_hostname:
# FIXME(mriedem): This is bug 1784705 where the
# compute node has lost its stats and is getting
# stats for a different node.
self.assertNotIn(node_stats_key, _cn.stats)
else:
self.assertIn(node_stats_key, _cn.stats)
node_stat_count = 0
for stat in _cn.stats:
if stat.startswith('node:'):
node_stat_count += 1
self.assertEqual(1, node_stat_count, _cn.stats)
_assert_stats()
# Now "spawn" an instance to the first compute node by calling the
# RT's instance_claim().
cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
cn1_nodename = cn1_obj.hypervisor_hostname
inst = self.instances[uuids.instance1]
with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
# FIXME(mriedem): Remove bad_node once bug 1784705 is fixed.
_assert_stats(bad_node=cn1_nodename)