Merge "Resource tracker compatibility with Ocata and Pike"

This commit is contained in:
Jenkins 2017-08-11 07:06:48 +00:00 committed by Gerrit Code Review
commit 4cd6a3a1b4
10 changed files with 136 additions and 43 deletions

View File

@ -13,7 +13,7 @@
"disabled_reason": null,
"report_count": 1,
"forced_down": false,
"version": 21,
"version": 22,
"availability_zone": null,
"uuid": "fa69c544-906b-4a6a-a9c6-c1f7a8078c73"
}

View File

@ -52,7 +52,7 @@ def main():
cmd_common.block_db_access('nova-compute')
objects_base.NovaObject.indirection_api = conductor_rpcapi.ConductorAPI()
objects.Service.enable_min_version_cache()
server = service.Service.create(binary='nova-compute',
topic=compute_rpcapi.RPC_TOPIC)
service.serve(server)

View File

@ -1010,7 +1010,7 @@ class ResourceTracker(object):
continue
def _update_usage_from_instance(self, context, instance, nodename,
is_removed=False):
is_removed=False, has_ocata_computes=False):
"""Update usage for a single instance."""
uuid = instance['uuid']
@ -1038,7 +1038,40 @@ class ResourceTracker(object):
self.pci_tracker.update_pci_for_instance(context,
instance,
sign=sign)
self.reportclient.update_instance_allocation(cn, instance, sign)
if has_ocata_computes:
LOG.debug("We're on a Pike compute host in a deployment "
"with Ocata compute hosts. Auto-correcting "
"allocations to handle Ocata-style assumptions.")
self.reportclient.update_instance_allocation(cn, instance,
sign)
else:
# NOTE(jaypipes): We're on a Pike compute host or later in
# a deployment with all compute hosts upgraded to Pike or
# later
#
# If that is the case, then we know that the scheduler will
# have properly created an allocation and that the compute
# hosts have not attempted to overwrite allocations
# **during the periodic update_available_resource() call**.
# However, Pike compute hosts may still rework an
# allocation for an instance during a move operation's
# confirm_resize() on the source host, which will remove the
# source resource provider from any allocation for the
# instance.
#
# In Queens and beyond, the scheduler will understand when
# a move operation has been requested and instead of
# creating a doubled-up allocation that contains both the
# source and destination host, the scheduler will take the
# original allocation (against the source host) and change
# the consumer ID of that allocation to be the migration
# UUID and not the instance UUID. The scheduler will
# allocate the resources for the destination host to the
# instance UUID.
LOG.debug("We're on a Pike compute host in a deployment "
"with all Pike compute hosts. Skipping "
"auto-correction of allocations.")
# new instance, update compute node resource usage:
self._update_usage(self._get_usage_dict(instance), nodename,
sign=sign)
@ -1068,9 +1101,44 @@ class ResourceTracker(object):
cn.current_workload = 0
cn.running_vms = 0
# NOTE(jaypipes): In Pike, we need to be tolerant of Ocata compute
# nodes that overwrite placement allocations to look like what the
# resource tracker *thinks* is correct. When an instance is
# migrated from an Ocata compute node to a Pike compute node, the
# Pike scheduler will have created a "doubled-up" allocation that
# contains allocated resources against both the source and
# destination hosts. The Ocata source compute host, during its
# update_available_resource() periodic call, will find the instance
# in its list of known instances and will call
# update_instance_allocation() in the report client. That call will
# pull the allocations for the instance UUID, which will contain
# both the source and destination host providers in the allocation
# set. Seeing that this differs from what the Ocata source host
# thinks the allocation should be, the Ocata host will overwrite
# the allocation so that it only contains resources against itself.
#
# Therefore, here we need Pike compute hosts to "correct" the
# improper healing that the Ocata source host did during its
# periodic interval. When the instance is fully migrated
# to the Pike compute host, the Ocata compute host will find an
# allocation that refers to itself for an instance it no longer
# controls and will *delete* all allocations that refer to that
# instance UUID, assuming that the instance has been deleted. We
# need the destination Pike compute host to recreate that
# allocation to refer to its own resource provider UUID.
#
# For instances that migrate from a Pike compute node to either a
# Pike or a Queens compute host, we do NOT want the Pike compute
# host to be "healing" allocation information. Instead, we rely on
# the Pike scheduler to properly create allocations during
# scheduling.
compute_version = objects.Service.get_minimum_version(
context, 'nova-compute')
has_ocata_computes = compute_version < 22
for instance in instances:
if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
self._update_usage_from_instance(context, instance, nodename)
self._update_usage_from_instance(context, instance, nodename,
has_ocata_computes=has_ocata_computes)
self._remove_deleted_instances_allocations(context, cn)
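
Condensed for readability, the gate added above reduces to the small, self-contained sketch below. Only the threshold (the new service version 22) and the heal/skip split come from this patch; FakeDeployment and the function names are illustrative stand-ins, not Nova code.

from dataclasses import dataclass

PIKE_ALLOCATION_MARKER = 22  # the SERVICE_VERSION marker added by this change


@dataclass
class FakeDeployment:
    # Stand-in for objects.Service.get_minimum_version(context, 'nova-compute')
    min_compute_service_version: int


def needs_allocation_healing(deployment: FakeDeployment) -> bool:
    # Healing (re-writing the instance allocation from the periodic task)
    # is only needed while an Ocata compute (service version < 22) may
    # still have overwritten the scheduler-created allocation.
    return deployment.min_compute_service_version < PIKE_ALLOCATION_MARKER


if __name__ == "__main__":
    assert needs_allocation_healing(FakeDeployment(21))      # mixed Ocata/Pike
    assert not needs_allocation_healing(FakeDeployment(22))  # all Pike or later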
@ -1141,7 +1209,6 @@ class ResourceTracker(object):
"There are allocations remaining against the source "
"host that might need to be removed: %s.",
instance_uuid, instance.host, instance.node, alloc)
continue
def _find_orphaned_instances(self):
"""Given the set of instances and migrations already account for

View File

@ -32,7 +32,7 @@ LOG = logging.getLogger(__name__)
# NOTE(danms): This is the global service version counter
SERVICE_VERSION = 21
SERVICE_VERSION = 22
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@ -110,6 +110,9 @@ SERVICE_VERSION_HISTORY = (
{'compute_rpc': '4.16'},
# Version 21: Compute RPC version 4.17
{'compute_rpc': '4.17'},
# Version 22: A marker for the behaviour change of auto-healing code on the
# compute host regarding allocations against an instance
{'compute_rpc': '4.17'},
)
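
Version 22 deliberately keeps compute_rpc at 4.17: it is a pure behaviour marker, consumed on the compute side via objects.Service.get_minimum_version(), as the resource tracker hunk above shows. A minimal sketch of that consumer pattern, assuming it runs inside a Nova tree with a valid request context (illustrative only, not part of this patch):

from nova import objects


def all_computes_are_pike_or_later(context):
    # 22 is the marker version introduced above; anything lower means at
    # least one Ocata-era nova-compute service is still registered.
    min_version = objects.Service.get_minimum_version(context, 'nova-compute')
    return min_version >= 22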

View File

@ -260,6 +260,7 @@ class TestCase(testtools.TestCase):
self._base_test_obj_backup = copy.copy(
objects_base.NovaObjectRegistry._registry._obj_classes)
self.addCleanup(self._restore_obj_registry)
objects.Service.clear_min_version_cache()
# NOTE(danms): Reset the cached list of cells
from nova.compute import api

View File

@ -290,12 +290,17 @@ class IronicResourceTrackerTest(test.TestCase):
self.assertEqual(3, len(inv))
# Now "spawn" an instance to the first compute node by calling the
# RT's instance_claim(), which should, in the case of an Ironic
# instance, grab the full compute node for the instance and write
# allocation records for VCPU, MEMORY_MB, and DISK_GB
# RT's instance_claim().
cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
cn1_nodename = cn1_obj.hypervisor_hostname
inst = self.INSTANCE_FIXTURES[uuids.instance1]
# Since we're on Pike, the scheduler would have created our
# allocation for us, so we use our old update routine here to
# mimic that before performing the compute RT claim and running
# the checks below.
self.rt.reportclient.update_instance_allocation(cn1_obj,
inst,
1)
with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
pass

View File

@ -41,7 +41,8 @@ class TestLocalDeleteAllocations(test.TestCase,
self.start_service('conductor')
self.start_service('consoleauth')
self.flags(group='scheduler', driver='chance_scheduler')
self.flags(enabled_filters=['RetryFilter', 'ComputeFilter'],
group='filter_scheduler')
self.start_service('scheduler')
self.compute = self.start_service('compute')

View File

@ -1081,11 +1081,7 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.useFixture(policy_fixture.RealPolicyFixture())
self.useFixture(nova_fixtures.NeutronFixture(self))
# NOTE(gibi): After fix 1707071 we need to set the service version to
# Pike to test Pike-only interactions. We need separate tests for
# Ocata - Pike interactions.
# self.useFixture(nova_fixtures.AllServicesCurrent())
self.useFixture(nova_fixtures.AllServicesCurrent())
placement = self.useFixture(nova_fixtures.PlacementFixture())
self.placement_api = placement.api
@ -1308,7 +1304,6 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
source_allocation = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(old_flavor, source_allocation)
self.assertEqual(2, len(allocations))
dest_allocation = allocations[dest_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(new_flavor, dest_allocation)
@ -1394,6 +1389,7 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
# destination host
allocations = self._get_allocations_by_server_uuid(server['id'])
# and the server allocates only from the target host
self.assertEqual(1, len(allocations))
source_usages = self._get_provider_usages(source_rp_uuid)
@ -1407,6 +1403,12 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
'The source host %s still has usages after the '
'resize has been confirmed' % source_hostname)
# and the target host allocation should be according to the new flavor
self.assertFlavorMatchesAllocation(self.flavor2, dest_usages)
dest_allocation = allocations[dest_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor2, dest_allocation)
self._run_periodics()
# Check we're still accurate after running the periodics
@ -1462,29 +1464,13 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
usages = self._get_provider_usages(rp_uuid)
# NOTE(gibi): This is bug 1707071 where the compute "healing" periodic
# tramples on the doubled allocations created in the scheduler.
self.assertFlavorMatchesAllocation(new_flavor, usages)
# NOTE(gibi): After fixing bug 1707252 the following is expected
# self.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
# usages['VCPU'])
# self.assertEqual(old_flavor['ram'] + new_flavor['ram'],
# usages['MEMORY_MB'])
# self.assertEqual(old_flavor['disk'] + new_flavor['disk'],
# usages['DISK_GB'])
self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
# NOTE(gibi): After fixing bug 1707252 the following is expected
# self.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
# allocation['VCPU'])
# self.assertEqual(old_flavor['ram'] + new_flavor['ram'],
# allocation['MEMORY_MB'])
# self.assertEqual(old_flavor['disk'] + new_flavor['disk'],
# allocation['DISK_GB'])
self.assertFlavorsMatchAllocation(old_flavor, new_flavor, allocation)
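
The removed inline expectations above spell out what the new assertFlavorsMatchAllocation helper checks; its implementation is not shown in this diff. A plausible sketch based solely on those removed asserts (plain asserts, purely illustrative):

def assert_flavors_match_allocation(old_flavor, new_flavor, allocation):
    # Until the resize is confirmed or reverted, the doubled-up allocation
    # should carry the sum of the old and new flavors' resources.
    assert old_flavor['vcpus'] + new_flavor['vcpus'] == allocation['VCPU']
    assert old_flavor['ram'] + new_flavor['ram'] == allocation['MEMORY_MB']
    assert old_flavor['disk'] + new_flavor['disk'] == allocation['DISK_GB']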
def test_resize_revert_same_host(self):
# make sure that the test only uses a single host

View File

@ -5643,6 +5643,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
self.instance, self.instance.node)
self.assertIsNotNone(self.instance.migration_context)
@mock.patch('nova.objects.Service.get_minimum_version',
return_value=22)
@mock.patch.object(self.compute, "_notify_about_instance_usage")
@mock.patch.object(self.compute, "_set_instance_info")
@mock.patch.object(self.instance, 'save')
@ -5665,7 +5667,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
mock_mig_save,
mock_inst_save,
mock_set,
mock_notify):
mock_notify,
mock_version):
self.compute.finish_revert_resize(context=self.context,
instance=self.instance,
reservations=None,

View File

@ -31,6 +31,7 @@ from nova.objects import base as obj_base
from nova.objects import fields as obj_fields
from nova.objects import pci_device
from nova.pci import manager as pci_manager
from nova.scheduler import utils as sched_utils
from nova import test
from nova.tests.unit.objects import test_pci_device as fake_pci_device
from nova.tests import uuidsentinel as uuids
@ -467,7 +468,9 @@ class BaseTestCase(test.NoDBTestCase):
class TestUpdateAvailableResources(BaseTestCase):
def _update_available_resources(self):
@mock.patch('nova.objects.Service.get_minimum_version',
return_value=22)
def _update_available_resources(self, version_mock):
# We test RT._update separately. Since the complexity
# of the update_available_resource() function is high enough as
# it is, we just want to focus here on testing the resources
@ -1688,6 +1691,8 @@ class TestInstanceClaim(BaseTestCase):
class TestResize(BaseTestCase):
@mock.patch('nova.compute.utils.is_volume_backed_instance',
return_value=False)
@mock.patch('nova.objects.Service.get_minimum_version',
return_value=22)
@mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
return_value=objects.InstancePCIRequests(requests=[]))
@mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@ -1698,7 +1703,7 @@ class TestResize(BaseTestCase):
@mock.patch('nova.objects.ComputeNode.save')
def test_resize_claim_same_host(self, save_mock, get_mock, migr_mock,
get_cn_mock, pci_mock, instance_pci_mock,
is_bfv_mock):
version_mock, is_bfv_mock):
# Resize an existing instance from its current flavor (instance type
# 1) to a new flavor (instance type 2) and verify that the compute
# node's resources are appropriately updated to account for the new
@ -1788,6 +1793,8 @@ class TestResize(BaseTestCase):
@mock.patch('nova.compute.utils.is_volume_backed_instance',
return_value=False)
@mock.patch('nova.objects.Service.get_minimum_version',
return_value=22)
@mock.patch('nova.objects.InstancePCIRequests.get_by_instance_uuid',
return_value=objects.InstancePCIRequests(requests=[]))
@mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
@ -1809,8 +1816,10 @@ class TestResize(BaseTestCase):
pci_get_by_compute_node_mock,
pci_get_by_instance_mock,
pci_get_by_instance_uuid_mock,
version_mock,
is_bfv_mock,
revert=False):
self.flags(reserved_host_disk_mb=0,
reserved_host_memory_mb=0)
virt_resources = copy.deepcopy(_VIRT_DRIVER_AVAIL_RESOURCES)
@ -1893,12 +1902,23 @@ class TestResize(BaseTestCase):
ignore=['stats']
))
def test_instance_build_resize_confirm(self):
self._test_instance_build_resize()
cn = self.rt.compute_nodes[_NODENAME]
cn_uuid = cn.uuid
rc = self.sched_client_mock.reportclient
remove_method = rc.remove_provider_from_instance_allocation
expected_resources = sched_utils.resources_from_flavor(instance,
flavor)
remove_method.assert_called_once_with(instance.uuid, cn_uuid,
instance.user_id, instance.project_id, expected_resources)
def test_instance_build_resize_revert(self):
self._test_instance_build_resize(revert=True)
def test_instance_build_resize_confirm(self):
self._test_instance_build_resize()
@mock.patch('nova.objects.Service.get_minimum_version',
return_value=22)
@mock.patch('nova.pci.stats.PciDeviceStats.support_requests',
return_value=True)
@mock.patch('nova.objects.PciDevice.save')
@ -1911,7 +1931,7 @@ class TestResize(BaseTestCase):
@mock.patch('nova.objects.ComputeNode.save')
def test_resize_claim_dest_host_with_pci(self, save_mock, get_mock,
migr_mock, get_cn_mock, pci_mock, pci_req_mock, pci_claim_mock,
pci_dev_save_mock, pci_supports_mock):
pci_dev_save_mock, pci_supports_mock, version_mock):
# Starting from an empty destination compute node, perform a resize
# operation for an instance containing SR-IOV PCI devices on the
# original host.
@ -2038,7 +2058,12 @@ class TestResize(BaseTestCase):
self.rt.drop_move_claim(ctx, instance, _NODENAME)
mock_pci_free_device.assert_called_once_with(
pci_dev, mock.ANY)
# Check that we grabbed resources for the right flavor...
mock_resources.assert_called_once_with(instance,
instance.flavor)
@mock.patch('nova.objects.Service.get_minimum_version',
return_value=22)
@mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
return_value=objects.InstancePCIRequests(requests=[]))
@mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@ -2048,7 +2073,7 @@ class TestResize(BaseTestCase):
@mock.patch('nova.objects.InstanceList.get_by_host_and_node')
@mock.patch('nova.objects.ComputeNode.save')
def test_resize_claim_two_instances(self, save_mock, get_mock, migr_mock,
get_cn_mock, pci_mock, instance_pci_mock):
get_cn_mock, pci_mock, instance_pci_mock, version_mock):
# Issue two resize claims against a destination host with no prior
# instances on it and validate that the accounting for resources is
# correct.
@ -2163,6 +2188,8 @@ class TestResize(BaseTestCase):
class TestRebuild(BaseTestCase):
@mock.patch('nova.objects.Service.get_minimum_version',
return_value=22)
@mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
return_value=objects.InstancePCIRequests(requests=[]))
@mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@ -2172,7 +2199,7 @@ class TestRebuild(BaseTestCase):
@mock.patch('nova.objects.InstanceList.get_by_host_and_node')
@mock.patch('nova.objects.ComputeNode.save')
def test_rebuild_claim(self, save_mock, get_mock, migr_mock, get_cn_mock,
pci_mock, instance_pci_mock):
pci_mock, instance_pci_mock, version_mock):
# Rebuild an instance, emulating an evacuate command issued against the
# original instance. The rebuild operation uses the resource tracker's
# _move_claim() method, but unlike with resize_claim(), rebuild_claim()