SchedulerReportClient.update_from_provider_tree

Once the resource tracker has asked the compute driver to
update_provider_tree, it needs to flush any changes back to placement.

This change set introduces update_from_provider_tree to
SchedulerReportClient.  This method accepts the ProviderTree as modified
by ComputeDriver.update_provider_tree, compares it to what the report
client has in its cache, and flushes any changes back to the placement
service.
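
For orientation, here is a minimal sketch (not part of this change) of how
the two halves are intended to fit together. The driver-side hook and its
exact signature follow the blueprint and are assumptions here; only
update_from_provider_tree's signature comes from this patch.

    # Hypothetical glue code, for illustration only.
    from nova.compute import provider_tree

    def sync_node_to_placement(reportclient, driver, context, nodename,
                               node_uuid):
        # Seed a tree with the root compute node provider; the virt driver
        # edits it in place (children, inventory, traits, aggregates).
        tree = provider_tree.ProviderTree()
        tree.new_root(nodename, node_uuid, None)
        driver.update_provider_tree(tree, nodename)  # assumed signature

        # Compare the edited tree against the report client's local cache
        # and flush only the differences to placement.
        reportclient.update_from_provider_tree(context, tree)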

Change-Id: I9064a2598d773a814269995eed8862d093d9100e
blueprint: update-provider-tree
Co-Authored-By: Radoslav Gerganov <rgerganov@vmware.com>
Depends-On: https://review.openstack.org/536545
Eric Fried 2018-01-16 11:41:13 -06:00 committed by naichuans
parent c683518ecb
commit 32fdf52958
4 changed files with 365 additions and 120 deletions


@@ -2139,6 +2139,11 @@ class ResourceProviderUpdateFailed(NovaException):
"%(error)s")
class ResourceProviderSyncFailed(NovaException):
msg_fmt = _("Failed to synchronize the placement service with resource "
"provider information supplied by the compute host.")
class PlacementAPIConflict(NovaException):
"""Any 409 error from placement APIs should use (a subclass of) this
exception.


@@ -13,6 +13,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import contextlib
import copy
import functools
import re
@@ -696,12 +697,25 @@ class SchedulerReportClient(object):
# NOTE(efried): We currently have no code path where we need to set the
# parent_provider_uuid on a previously-parent-less provider - so we do
# NOT handle that scenario here.
if self._provider_tree.exists(uuid):
# If we had the requested provider locally, refresh it and its
# descendants, but only if stale.
for u in self._provider_tree.get_provider_uuids(uuid):
self._refresh_associations(context, u, force=False)
return uuid
# TODO(efried): Reinstate this optimization if possible.
# For now, this is removed due to the following:
# - update_provider_tree adds a child with some bogus inventory (bad
# resource class) or trait (invalid trait name).
# - update_from_provider_tree creates the child in placement and adds
# it to the cache, then attempts to add the bogus inventory/trait.
# The latter fails, so update_from_provider_tree invalidates the
# cache entry by removing the child from the cache.
# - Ordinarily, we would rely on the code below (_get_providers_in_tree
# and _provider_tree.populate_from_iterable) to restore the child to
# the cache on the next iteration. BUT since the root is still
# present in the cache, the commented-out block will cause that part
# of this method to be skipped.
# if self._provider_tree.exists(uuid):
# # If we had the requested provider locally, refresh it and its
# # descendants, but only if stale.
# for u in self._provider_tree.get_provider_uuids(uuid):
# self._refresh_associations(context, u, force=False)
# return uuid
# We don't have it locally; check placement or create it.
created_rp = None
@@ -719,6 +733,9 @@ class SchedulerReportClient(object):
# At this point, the whole tree exists in the local cache.
for rp_to_refresh in rps_to_refresh:
# NOTE(efried): _refresh_associations doesn't refresh inventory
# (yet) - see that method's docstring for the why.
self._refresh_and_get_inventory(context, rp_to_refresh['uuid'])
self._refresh_associations(
context, rp_to_refresh['uuid'],
generation=rp_to_refresh.get('generation'), force=True)
@@ -1377,6 +1394,122 @@ class SchedulerReportClient(object):
# when we invoke the DELETE. See bug #1746374.
self._update_inventory(context, compute_node.uuid, inv_data)
def update_from_provider_tree(self, context, new_tree):
"""Flush changes from a specified ProviderTree back to placement.
The specified ProviderTree is compared against the local cache. Any
changes are flushed back to the placement service. Upon successful
completion, the local cache should reflect the specified ProviderTree.
This method is best-effort and not atomic. When exceptions are raised,
it is possible that some of the changes have been flushed back, leaving
the placement database in an inconsistent state. This should be
recoverable through subsequent calls.
:param context: The security context
:param new_tree: A ProviderTree instance representing the desired state
of providers in placement.
:raises: ResourceProviderSyncFailed if any errors were encountered
attempting to perform the necessary API operations.
"""
# NOTE(efried): We currently do not handle the "rename" case. This is
# where new_tree contains a provider named Y whose UUID already exists
# but is named X. Today the only way the consumer could accomplish
# this is by deleting the provider and recreating it with the new name.
@contextlib.contextmanager
def catch_all(rp_uuid):
"""Convert all "expected" exceptions from placement API helpers to
True or False. Saves having to do try/except for every helper call
below.
"""
class Status(object):
success = True
s = Status()
# TODO(efried): Make a base exception class from which all these
# can inherit.
helper_exceptions = (
exception.InvalidResourceClass,
exception.InventoryInUse,
exception.ResourceProviderAggregateRetrievalFailed,
exception.ResourceProviderDeletionFailed,
exception.ResourceProviderInUse,
exception.ResourceProviderRetrievalFailed,
exception.ResourceProviderTraitRetrievalFailed,
exception.ResourceProviderUpdateConflict,
exception.ResourceProviderUpdateFailed,
exception.TraitCreationFailed,
exception.TraitRetrievalFailed,
)
try:
yield s
except helper_exceptions:
s.success = False
# Invalidate the caches
try:
self._provider_tree.remove(rp_uuid)
except ValueError:
pass
self.association_refresh_time.pop(rp_uuid, None)
# Overall indicator of success. Will be set to False on any exception.
success = True
# Helper methods herein will be updating the local cache (this is
# intentional) so we need to grab up front any data we need to operate
# on in its "original" form.
old_tree = self._provider_tree
old_uuids = old_tree.get_provider_uuids()
new_uuids = new_tree.get_provider_uuids()
# Do provider deletion first, since it has the best chance of failing
# for non-generation-conflict reasons (i.e. allocations).
uuids_to_remove = set(old_uuids) - set(new_uuids)
# We have to do deletions in bottom-up order, so we don't error
# attempting to delete a parent who still has children.
for uuid in reversed(old_uuids):
if uuid not in uuids_to_remove:
continue
with catch_all(uuid) as status:
self._delete_provider(uuid)
success = success and status.success
# Now create (or load) any "new" providers
uuids_to_add = set(new_uuids) - set(old_uuids)
# We have to do additions in top-down order, so we don't error
# attempting to create a child before its parent exists.
for uuid in new_uuids:
if uuid not in uuids_to_add:
continue
provider = new_tree.data(uuid)
with catch_all(uuid) as status:
self._ensure_resource_provider(
context, uuid, name=provider.name,
parent_provider_uuid=provider.parent_uuid)
success = success and status.success
# At this point the local cache should have all the same providers as
# new_tree. Whether we added them or not, walk through and diff/flush
# inventories, traits, and aggregates as necessary (the helper methods
# are set up to check and short out when the relevant property does not
# differ from what's in the cache).
# If we encounter any error and remove a provider from the cache, all
# its descendants are also removed, and set_*_for_provider methods on
# it wouldn't be able to get started. Walking the tree in bottom-up
# order ensures we at least try to process all of the providers.
for uuid in reversed(new_uuids):
pd = new_tree.data(uuid)
with catch_all(pd.uuid) as status:
self._set_inventory_for_provider(
context, pd.uuid, pd.inventory)
self.set_aggregates_for_provider(
context, pd.uuid, pd.aggregates)
self.set_traits_for_provider(context, pd.uuid, pd.traits)
success = success and status.success
if not success:
raise exception.ResourceProviderSyncFailed()
@safe_connect
def get_allocations_for_consumer(self, context, consumer):
url = '/allocations/%s' % consumer

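A side note on the error-handling shape above: catch_all rolls per-provider
failures into a single success flag instead of aborting on the first error,
and evicts the failed provider from the local cache so the next sync
re-reads it from placement. A self-contained sketch of that pattern, with
illustrative names that are not nova code:

    import contextlib

    class _Status(object):
        success = True

    @contextlib.contextmanager
    def best_effort(failed, key, expected=(ValueError,)):
        # Yield a status object; if the body raises an "expected" error,
        # swallow it, mark this step failed, and remember which key failed.
        status = _Status()
        try:
            yield status
        except expected:
            status.success = False
            failed.append(key)

    def sync_all(steps):
        # steps: iterable of (name, callable). Run every step, then report
        # a single aggregate failure at the end.
        failed = []
        ok = True
        for name, action in steps:
            with best_effort(failed, name) as status:
                action()
            ok = ok and status.success
        if not ok:
            raise RuntimeError('sync failed for: %s' % ', '.join(failed))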

@@ -18,6 +18,7 @@ import requests
from wsgi_intercept import interceptor
from nova.api.openstack.placement import deploy
from nova.compute import provider_tree
from nova import conf
from nova import context
# TODO(cdent): This points to the nova, not placement, exception for
@@ -690,3 +691,206 @@ class SchedulerReportClientTests(test.TestCase):
inv,
self.client._get_inventory(
self.context, uuids.cn)['inventories'])
def test_update_from_provider_tree(self):
"""A "realistic" walk through the lifecycle of a compute node provider
tree.
"""
# NOTE(efried): We can use the same ProviderTree throughout, since
# update_from_provider_tree doesn't change it.
new_tree = provider_tree.ProviderTree()
def assert_ptrees_equal():
uuids = set(self.client._provider_tree.get_provider_uuids())
self.assertEqual(uuids, set(new_tree.get_provider_uuids()))
for uuid in uuids:
cdata = self.client._provider_tree.data(uuid)
ndata = new_tree.data(uuid)
self.assertEqual(ndata.name, cdata.name)
self.assertEqual(ndata.parent_uuid, cdata.parent_uuid)
self.assertFalse(
new_tree.has_inventory_changed(uuid, cdata.inventory))
self.assertFalse(
new_tree.have_traits_changed(uuid, cdata.traits))
self.assertFalse(
new_tree.have_aggregates_changed(uuid, cdata.aggregates))
# To begin with, the cache should be empty
self.assertEqual([], self.client._provider_tree.get_provider_uuids())
# When new_tree is empty, it's a no-op.
# Do this outside the interceptor to prove no API calls are made.
self.client.update_from_provider_tree(self.context, new_tree)
assert_ptrees_equal()
with self._interceptor():
# Populate with a provider with no inventories, aggregates, traits
new_tree.new_root('root', uuids.root, None)
self.client.update_from_provider_tree(self.context, new_tree)
assert_ptrees_equal()
# Throw in some more providers, in various spots in the tree, with
# some sub-properties
new_tree.new_child('child1', uuids.root, uuid=uuids.child1)
new_tree.update_aggregates('child1', [uuids.agg1, uuids.agg2])
new_tree.new_child('grandchild1_1', uuids.child1, uuid=uuids.gc1_1)
new_tree.update_traits(uuids.gc1_1, ['CUSTOM_PHYSNET_2'])
new_tree.new_root('ssp', uuids.ssp, None)
new_tree.update_inventory('ssp', {
fields.ResourceClass.DISK_GB: {
'total': 100,
'reserved': 1,
'min_unit': 1,
'max_unit': 10,
'step_size': 2,
'allocation_ratio': 10.0,
},
}, None)
self.client.update_from_provider_tree(self.context, new_tree)
assert_ptrees_equal()
# Swizzle properties
# Give the root some everything
new_tree.update_inventory(uuids.root, {
fields.ResourceClass.VCPU: {
'total': 10,
'reserved': 0,
'min_unit': 1,
'max_unit': 2,
'step_size': 1,
'allocation_ratio': 10.0,
},
fields.ResourceClass.MEMORY_MB: {
'total': 1048576,
'reserved': 2048,
'min_unit': 1024,
'max_unit': 131072,
'step_size': 1024,
'allocation_ratio': 1.0,
},
}, None)
new_tree.update_aggregates(uuids.root, [uuids.agg1])
new_tree.update_traits(uuids.root, ['HW_CPU_X86_AVX',
'HW_CPU_X86_AVX2'])
# Take away the child's aggregates
new_tree.update_aggregates(uuids.child1, [])
# Grandchild gets some inventory
ipv4_inv = {
fields.ResourceClass.IPV4_ADDRESS: {
'total': 128,
'reserved': 0,
'min_unit': 1,
'max_unit': 8,
'step_size': 1,
'allocation_ratio': 1.0,
},
}
new_tree.update_inventory('grandchild1_1', ipv4_inv, None)
# Shared storage provider gets traits
new_tree.update_traits('ssp', set(['MISC_SHARES_VIA_AGGREGATE',
'STORAGE_DISK_SSD']))
self.client.update_from_provider_tree(self.context, new_tree)
assert_ptrees_equal()
# Let's go for some error scenarios.
# Add inventory in an invalid resource class
new_tree.update_inventory(
'grandchild1_1',
dict(ipv4_inv,
MOTSUC_BANDWIDTH={
'total': 1250000,
'reserved': 10000,
'min_unit': 5000,
'max_unit': 250000,
'step_size': 5000,
'allocation_ratio': 8.0,
}), None)
self.assertRaises(
exception.ResourceProviderSyncFailed,
self.client.update_from_provider_tree, self.context, new_tree)
# The inventory update didn't get synced...
self.assertIsNone(self.client._get_inventory(
self.context, uuids.grandchild1_1))
# ...and the grandchild was removed from the cache
self.assertFalse(
self.client._provider_tree.exists('grandchild1_1'))
# Fix that problem so we can try the next one
new_tree.update_inventory(
'grandchild1_1',
dict(ipv4_inv,
CUSTOM_BANDWIDTH={
'total': 1250000,
'reserved': 10000,
'min_unit': 5000,
'max_unit': 250000,
'step_size': 5000,
'allocation_ratio': 8.0,
}), None)
# Add a bogus trait
new_tree.update_traits(uuids.root, ['HW_CPU_X86_AVX',
'HW_CPU_X86_AVX2',
'MOTSUC_FOO'])
self.assertRaises(
exception.ResourceProviderSyncFailed,
self.client.update_from_provider_tree, self.context, new_tree)
# Placement didn't get updated
self.assertEqual(set(['HW_CPU_X86_AVX', 'HW_CPU_X86_AVX2']),
self.client._get_provider_traits(self.context,
uuids.root))
# ...and the root was removed from the cache
self.assertFalse(self.client._provider_tree.exists(uuids.root))
# Fix that problem
new_tree.update_traits(uuids.root, ['HW_CPU_X86_AVX',
'HW_CPU_X86_AVX2',
'CUSTOM_FOO'])
# Now the sync should work
self.client.update_from_provider_tree(self.context, new_tree)
assert_ptrees_equal()
# Let's cause a conflict error by doing an "out-of-band" update
gen = self.client._provider_tree.data(uuids.ssp).generation
self.assertTrue(self.client.put(
'/resource_providers/%s/traits' % uuids.ssp,
{'resource_provider_generation': gen,
'traits': ['MISC_SHARES_VIA_AGGREGATE', 'STORAGE_DISK_HDD']},
version='1.6'))
# Now if we try to modify the traits, we should fail and invalidate
# the cache...
new_tree.update_traits(uuids.ssp, ['MISC_SHARES_VIA_AGGREGATE',
'STORAGE_DISK_SSD',
'CUSTOM_FAST'])
self.assertRaises(
exception.ResourceProviderSyncFailed,
self.client.update_from_provider_tree, self.context, new_tree)
# ...but the next iteration will refresh the cache with the latest
# generation and so the next attempt should succeed.
self.client.update_from_provider_tree(self.context, new_tree)
# The out-of-band change is blown away, as it should be.
assert_ptrees_equal()
# Let's delete some stuff
new_tree.remove(uuids.ssp)
self.assertFalse(new_tree.exists('ssp'))
new_tree.remove('child1')
self.assertFalse(new_tree.exists('child1'))
# Removing a node removes its descendants too
self.assertFalse(new_tree.exists('grandchild1_1'))
self.client.update_from_provider_tree(self.context, new_tree)
assert_ptrees_equal()
# Remove the last provider
new_tree.remove(uuids.root)
self.assertEqual([], new_tree.get_provider_uuids())
self.client.update_from_provider_tree(self.context, new_tree)
assert_ptrees_equal()
# Having removed the providers this way, they ought to be gone
# from placement
for uuid in (uuids.root, uuids.child1, uuids.grandchild1_1,
uuids.ssp):
resp = self.client.get('/resource_providers/%s' % uuid)
self.assertEqual(404, resp.status_code)
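
The conflict scenario exercised above (an "out-of-band" update bumping the
provider generation) is handled on the caller side simply by letting the
failed sync invalidate the cache and syncing again. A hypothetical caller,
mirroring the test sequence; the retry policy is an assumption, not part of
this change:

    from nova import exception

    def sync_with_retry(reportclient, context, tree):
        try:
            reportclient.update_from_provider_tree(context, tree)
        except exception.ResourceProviderSyncFailed:
            # The conflicting provider was evicted from the local cache, so
            # retrying re-fetches it (and its current generation) from
            # placement before flushing the desired state again.
            reportclient.update_from_provider_tree(context, tree)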


@@ -1176,83 +1176,7 @@ class TestProviderOperations(SchedulerReportClientTestCase):
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_create_resource_provider')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_get_provider_aggregates')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_get_provider_traits')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_get_sharing_providers')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_get_providers_in_tree')
def test_ensure_resource_provider_exists_in_cache(self, get_rpt_mock,
get_shr_mock, get_trait_mock, get_agg_mock, create_rp_mock):
# Override the client object's cache to contain a resource provider
# object for the compute host and check that
# _ensure_resource_provider() doesn't call _get_resource_provider() or
# _create_resource_provider()
cn = self.compute_node
self.client._provider_tree.new_root(
cn.hypervisor_hostname,
cn.uuid,
1,
)
get_agg_mock.side_effect = [
set([uuids.agg1, uuids.agg2]),
set([uuids.agg1, uuids.agg3]),
set([uuids.agg2]),
]
get_trait_mock.side_effect = [
set(['CUSTOM_GOLD', 'CUSTOM_SILVER']),
set(),
set(['CUSTOM_BRONZE'])
]
get_shr_mock.return_value = [
{
'uuid': uuids.shr1,
'name': 'sharing1',
'generation': 1,
},
{
'uuid': uuids.shr2,
'name': 'sharing2',
'generation': 2,
},
]
self.client._ensure_resource_provider(self.context, cn.uuid)
get_shr_mock.assert_called_once_with(
self.context, set([uuids.agg1, uuids.agg2]))
self.assertTrue(self.client._provider_tree.exists(uuids.shr1))
self.assertTrue(self.client._provider_tree.exists(uuids.shr2))
# _get_provider_aggregates and _traits were called thrice: once for the
# compute RP and once for each of the sharing RPs.
expected_calls = [mock.call(self.context, uuid)
for uuid in (cn.uuid, uuids.shr1, uuids.shr2)]
get_agg_mock.assert_has_calls(expected_calls)
get_trait_mock.assert_has_calls(expected_calls)
# The compute RP is associated with aggs 1 and 2
self.assertFalse(self.client._provider_tree.have_aggregates_changed(
uuids.compute_node, [uuids.agg1, uuids.agg2]))
# The first sharing RP is associated with agg1 and agg3
self.assertFalse(self.client._provider_tree.have_aggregates_changed(
uuids.shr1, [uuids.agg1, uuids.agg3]))
# And the second with just agg2
self.assertFalse(self.client._provider_tree.have_aggregates_changed(
uuids.shr2, [uuids.agg2]))
# The compute RP has gold and silver traits
self.assertFalse(self.client._provider_tree.have_traits_changed(
uuids.compute_node, ['CUSTOM_GOLD', 'CUSTOM_SILVER']))
# The first sharing RP has none
self.assertFalse(self.client._provider_tree.have_traits_changed(
uuids.shr1, []))
# The second has bronze
self.assertFalse(self.client._provider_tree.have_traits_changed(
uuids.shr2, ['CUSTOM_BRONZE']))
# These were not called because we had the root provider in the cache.
self.assertFalse(get_rpt_mock.called)
self.assertFalse(create_rp_mock.called)
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_create_resource_provider')
'_get_inventory')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_get_provider_aggregates')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
@@ -1262,7 +1186,7 @@ class TestProviderOperations(SchedulerReportClientTestCase):
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_get_providers_in_tree')
def test_ensure_resource_provider_get(self, get_rpt_mock, get_shr_mock,
get_trait_mock, get_agg_mock, create_rp_mock):
get_trait_mock, get_agg_mock, get_inv_mock, create_rp_mock):
# No resource provider exists in the client's cache, so validate that
# if we get the resource provider from the placement API that we don't
# try to create the resource provider.
@@ -1272,6 +1196,7 @@ class TestProviderOperations(SchedulerReportClientTestCase):
'generation': 1,
}]
get_inv_mock.return_value = None
get_agg_mock.return_value = set([uuids.agg1])
get_trait_mock.return_value = set(['CUSTOM_GOLD'])
get_shr_mock.return_value = []
@@ -1334,11 +1259,15 @@ class TestProviderOperations(SchedulerReportClientTestCase):
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_create_resource_provider')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_refresh_and_get_inventory')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_refresh_associations')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_get_providers_in_tree')
def test_ensure_resource_provider_create(self, get_rpt_mock, refresh_mock,
def test_ensure_resource_provider_create(self, get_rpt_mock,
refresh_inv_mock,
refresh_assoc_mock,
create_rp_mock):
# No resource provider exists in the client's cache and no resource
# provider was returned from the placement API, so verify that in this
@@ -1358,7 +1287,8 @@ class TestProviderOperations(SchedulerReportClientTestCase):
aggregates=set(), traits=set())
# We don't refresh for a just-created provider
refresh_mock.assert_not_called()
refresh_inv_mock.assert_not_called()
refresh_assoc_mock.assert_not_called()
get_rpt_mock.assert_called_once_with(self.context, uuids.compute_node)
create_rp_mock.assert_called_once_with(
self.context,
@@ -1370,16 +1300,6 @@ class TestProviderOperations(SchedulerReportClientTestCase):
create_rp_mock.reset_mock()
self.assertEqual(
uuids.compute_node,
self.client._ensure_resource_provider(self.context,
uuids.compute_node))
self._validate_provider(uuids.compute_node, name='compute-name',
generation=1, parent_uuid=None)
# Shouldn't be called now that provider is in cache...
self.assertFalse(create_rp_mock.called)
# Validate the path where we specify a name (don't default to the UUID)
self.client._ensure_resource_provider(
self.context, uuids.cn2, 'a-name')
@@ -1441,33 +1361,14 @@ class TestProviderOperations(SchedulerReportClientTestCase):
set([uuids.root, uuids.child1, uuids.child2, uuids.grandchild]),
set(self.client._provider_tree.get_provider_uuids()))
@mock.patch('nova.compute.provider_tree.ProviderTree.exists')
@mock.patch('nova.compute.provider_tree.ProviderTree.get_provider_uuids')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_refresh_associations')
def test_ensure_resource_provider_refresh_local(self, mock_refresh,
mock_gpu, mock_exists):
"""Make sure refreshes are called with the appropriate UUIDs and flags
when the local cache already has the provider in it.
"""
mock_exists.return_value = True
tree_uuids = [uuids.root, uuids.one, uuids.two]
mock_gpu.return_value = tree_uuids
self.assertEqual(uuids.root,
self.client._ensure_resource_provider(self.context,
uuids.root))
mock_exists.assert_called_once_with(uuids.root)
mock_gpu.assert_called_once_with(uuids.root)
mock_refresh.assert_has_calls(
[mock.call(self.context, uuid, force=False)
for uuid in tree_uuids])
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_get_providers_in_tree')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_refresh_and_get_inventory')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'_refresh_associations')
def test_ensure_resource_provider_refresh_fetch(self, mock_refresh,
mock_gpit):
def test_ensure_resource_provider_refresh_fetch(self, mock_ref_assoc,
mock_ref_inv, mock_gpit):
"""Make sure refreshes are called with the appropriate UUIDs and flags
when we fetch the provider tree from placement.
"""
@@ -1478,7 +1379,9 @@ class TestProviderOperations(SchedulerReportClientTestCase):
self.client._ensure_resource_provider(self.context,
uuids.root))
mock_gpit.assert_called_once_with(self.context, uuids.root)
mock_refresh.assert_has_calls(
mock_ref_inv.assert_has_calls([mock.call(self.context, uuid)
for uuid in tree_uuids])
mock_ref_assoc.assert_has_calls(
[mock.call(self.context, uuid, generation=42, force=True)
for uuid in tree_uuids])
self.assertEqual(tree_uuids,