claim resources in placement API during schedule()

Adds logic to call the placement API's PUT /allocations/{consumer_uuid}
when selecting hosts in the filter scheduler's _schedule() method.

We attempt to claim resources only if the scheduler driver uses
allocation candidates (i.e. it isn't the caching scheduler) and the
conductor has passed in a list of instance UUIDs (otherwise, there is
no way to allocate in the placement API).

Change-Id: Ifc5cf482209e4f6f4e3e39b24389bd3563d86444
blueprint: placement-claims
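
To make those conditions concrete, here is a minimal sketch (illustrative only, not part of the commit) of the per-instance decision the new _schedule() logic makes. The names mirror the diff below (instance_uuids, alloc_reqs_by_rp_uuid, USES_ALLOCATION_CANDIDATES); claim_resources stands in for the scheduler report client's claim call.

# Sketch only: simplified per-instance host selection with claiming.
def pick_host_for_instance(num, hosts, instance_uuids, alloc_reqs_by_rp_uuid,
                           uses_allocation_candidates, claim_resources):
    """Return the host chosen for the num-th instance, claiming in the
    placement API when possible, or None if no host could be claimed."""
    if (instance_uuids is None or not uses_allocation_candidates or
            alloc_reqs_by_rp_uuid is None):
        # Old conductor, caching scheduler, or a placement communication
        # failure: fall back to the first weighed host without claiming.
        return hosts[0]
    instance_uuid = instance_uuids[num]
    for host in hosts:
        alloc_reqs = alloc_reqs_by_rp_uuid.get(host.uuid)
        if not alloc_reqs:
            # This host state was not among the allocation candidates.
            continue
        if claim_resources(instance_uuid, alloc_reqs):
            # Claim succeeded against this host's resource provider.
            return host
    # No host could be claimed; the caller cleans up prior claims and
    # ultimately raises NoValidHost.
    return None
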
Jay Pipes 2017-07-13 17:04:34 -04:00 committed by Dan Smith
parent 9d1db10f84
commit 23c4eb3438
3 changed files with 420 additions and 15 deletions

nova/scheduler/filter_scheduler.py

@@ -28,6 +28,7 @@ import nova.conf
from nova import exception
from nova.i18n import _
from nova import rpc
from nova.scheduler import client
from nova.scheduler import driver
CONF = nova.conf.CONF
@@ -39,12 +40,16 @@ class FilterScheduler(driver.Scheduler):
def __init__(self, *args, **kwargs):
super(FilterScheduler, self).__init__(*args, **kwargs)
self.notifier = rpc.get_notifier('scheduler')
scheduler_client = client.SchedulerClient()
self.placement_client = scheduler_client.reportclient
def select_destinations(self, context, spec_obj, instance_uuids,
alloc_reqs_by_rp_uuid, provider_summaries):
"""Returns a sorted list of HostState objects that satisfy the
supplied request_spec.
These hosts will have already had their resources claimed in Placement.
:param context: The RequestContext object
:param spec_obj: The RequestSpec object
:param instance_uuids: List of UUIDs, one for each value of the spec
@@ -108,6 +113,8 @@ class FilterScheduler(driver.Scheduler):
"""Returns a list of hosts that meet the required specs, ordered by
their fitness.
These hosts will have already had their resources claimed in Placement.
:param context: The RequestContext object
:param spec_obj: The RequestSpec object
:param instance_uuids: List of UUIDs, one for each value of the spec
@@ -145,27 +152,144 @@ class FilterScheduler(driver.Scheduler):
hosts = self._get_all_host_states(elevated, spec_obj,
provider_summaries)
# A list of the instance UUIDs that were successfully claimed against
# in the placement API. If we are not able to successfully claim for
# all involved instances, we use this list to remove those allocations
# before returning
claimed_instance_uuids = []
selected_hosts = []
num_instances = spec_obj.num_instances
for num in range(num_instances):
hosts = self._get_sorted_hosts(spec_obj, hosts, num)
if not hosts:
# NOTE(jaypipes): If we get here, that means not all instances
# in instance_uuids were able to be matched to a selected host.
# So, let's clean up any already-claimed allocations here
# before breaking and returning
self._cleanup_allocations(claimed_instance_uuids)
break
-chosen_host = hosts[0]
if (instance_uuids is None or
not self.USES_ALLOCATION_CANDIDATES or
alloc_reqs_by_rp_uuid is None):
# Unfortunately, we still need to deal with older conductors
# that may not be passing in a list of instance_uuids. In those
# cases, obviously we can't claim resources because we don't
# have instance UUIDs to claim with, so we just grab the first
# host in the list of sorted hosts. In addition to older
# conductors, we need to support the caching scheduler, which
# doesn't use the placement API (and has
# USES_ALLOCATION_CANDIDATES = False) and therefore we skip all
# the claiming logic for that scheduler driver. Finally, if
# there was a problem communicating with the placement API,
# alloc_reqs_by_rp_uuid will be None, so we skip claiming in
# that case as well
claimed_host = hosts[0]
else:
instance_uuid = instance_uuids[num]
LOG.debug("Selected host: %(host)s", {'host': chosen_host})
selected_hosts.append(chosen_host)
# Attempt to claim the resources against one or more resource
# providers, looping over the sorted list of possible hosts
# looking for an allocation request that contains that host's
# resource provider UUID
claimed_host = None
for host in hosts:
cn_uuid = host.uuid
if cn_uuid not in alloc_reqs_by_rp_uuid:
LOG.debug("Found host state %s that wasn't in "
"allocation requests. Skipping.", cn_uuid)
continue
-# Now consume the resources so the filter/weights
-# will change for the next instance.
-chosen_host.consume_from_request(spec_obj)
alloc_reqs = alloc_reqs_by_rp_uuid[cn_uuid]
if self._claim_resources(elevated, spec_obj, instance_uuid,
alloc_reqs):
claimed_host = host
break
if claimed_host is None:
# We weren't able to claim resources in the placement API
# for any of the sorted hosts identified. So, clean up any
# successfully-claimed resources for prior instances in
# this request and return an empty list which will cause
# select_destinations() to raise NoValidHost
LOG.debug("Unable to successfully claim against any host.")
self._cleanup_allocations(claimed_instance_uuids)
return []
claimed_instance_uuids.append(instance_uuid)
LOG.debug("Selected host: %(host)s", {'host': claimed_host})
selected_hosts.append(claimed_host)
# Now consume the resources so the filter/weights will change for
# the next instance.
claimed_host.consume_from_request(spec_obj)
if spec_obj.instance_group is not None:
-spec_obj.instance_group.hosts.append(chosen_host.host)
spec_obj.instance_group.hosts.append(claimed_host.host)
# hosts has to be not part of the updates when saving
spec_obj.instance_group.obj_reset_changes(['hosts'])
return selected_hosts
def _cleanup_allocations(self, instance_uuids):
"""Removes allocations for the supplied instance UUIDs."""
if not instance_uuids:
return
LOG.debug("Cleaning up allocations for %s", instance_uuids)
for uuid in instance_uuids:
self.placement_client.delete_allocation_for_instance(uuid)
def _claim_resources(self, ctx, spec_obj, instance_uuid, alloc_reqs):
"""Given an instance UUID (representing the consumer of resources), the
HostState object for the host that was chosen for the instance, and a
list of allocation request JSON objects, attempt to claim resources for
the instance in the placement API. Returns True if the claim process
was successful, False otherwise.
:param ctx: The RequestContext object
:param spec_obj: The RequestSpec object
:param instance_uuid: The UUID of the consuming instance
:param cn_uuid: UUID of the host to allocate against
:param alloc_reqs: A list of allocation request JSON objects that
allocate against (at least) the compute host
selected by the _schedule() method. These allocation
requests were constructed from a call to the GET
/allocation_candidates placement API call. Each
allocation_request satisfies the original request
for resources and can be supplied as-is (along with
the project and user ID to the placement API's
PUT /allocations/{consumer_uuid} call to claim
resources for the instance
"""
LOG.debug("Attempting to claim resources in the placement API for "
"instance %s", instance_uuid)
project_id = spec_obj.project_id
# NOTE(jaypipes): So, the RequestSpec doesn't store the user_id,
# only the project_id, so we need to grab the user information from
# the context. Perhaps we should consider putting the user ID in
# the spec object?
user_id = ctx.user_id
# TODO(jaypipes): Loop through all allocation requests instead of just
# trying the first one. For now, since we'll likely want to order the
# allocation requests in the future based on information in the
# provider summaries, we'll just try to claim resources using the first
# allocation request
alloc_req = alloc_reqs[0]
claimed = self.placement_client.claim_resources(instance_uuid,
alloc_req, project_id, user_id)
if not claimed:
return False
LOG.debug("Successfully claimed resources for instance %s using "
"allocation request %s", instance_uuid, alloc_req)
return True
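
The TODO above notes that only the first allocation request is tried. A hypothetical follow-up (not part of this commit) could walk the whole list until one claim succeeds; claim_fn below stands in for placement_client.claim_resources(instance_uuid, alloc_req, project_id, user_id).

# Hypothetical variant of the TODO above: try each allocation request in
# turn rather than only the first one.
def claim_first_viable(claim_fn, instance_uuid, alloc_reqs, project_id, user_id):
    """Return the allocation request that was successfully claimed, or None."""
    for alloc_req in alloc_reqs:
        if claim_fn(instance_uuid, alloc_req, project_id, user_id):
            return alloc_req
    return None
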
def _get_sorted_hosts(self, spec_obj, host_states, index):
"""Returns a list of HostState objects that match the required
scheduling constraints for the request spec object and have been sorted

nova/tests/unit/scheduler/test_filter_scheduler.py

@@ -20,6 +20,8 @@ import mock
from nova import exception
from nova import objects
from nova.scheduler import client
from nova.scheduler.client import report
from nova.scheduler import filter_scheduler
from nova.scheduler import host_manager
from nova.scheduler import utils as scheduler_utils
@@ -34,11 +36,27 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
driver_cls = filter_scheduler.FilterScheduler
@mock.patch('nova.scheduler.client.SchedulerClient')
def setUp(self, mock_client):
pc_client = mock.Mock(spec=report.SchedulerReportClient)
sched_client = mock.Mock(spec=client.SchedulerClient)
sched_client.reportclient = pc_client
mock_client.return_value = sched_client
self.placement_client = pc_client
super(FilterSchedulerTestCase, self).setUp()
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
-def test_schedule(self, mock_get_hosts, mock_get_all_states):
def test_schedule_placement_bad_comms(self, mock_get_hosts,
mock_get_all_states, mock_claim):
"""If there was a problem communicating with the Placement service,
alloc_reqs_by_rp_uuid will be None and we need to avoid trying to claim
in the Placement API.
"""
spec_obj = objects.RequestSpec(
num_instances=1,
flavor=objects.Flavor(memory_mb=512,
@@ -50,11 +68,60 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
-host=mock.sentinel.host)
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.return_value = all_host_states
instance_uuids = [uuids.instance]
instance_uuids = None
ctx = mock.Mock()
selected_hosts = self.driver._schedule(ctx, spec_obj,
instance_uuids, None, mock.sentinel.provider_summaries)
mock_get_all_states.assert_called_once_with(
ctx.elevated.return_value, spec_obj,
mock.sentinel.provider_summaries)
mock_get_hosts.assert_called_once_with(spec_obj, all_host_states, 0)
self.assertEqual(len(selected_hosts), 1)
self.assertEqual([host_state], selected_hosts)
# Ensure that we have consumed the resources on the chosen host states
host_state.consume_from_request.assert_called_once_with(spec_obj)
# And ensure we never called _claim_resources()
self.assertFalse(mock_claim.called)
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_old_conductor(self, mock_get_hosts,
mock_get_all_states, mock_claim):
"""Old conductor can call scheduler without the instance_uuids
parameter. When this happens, we need to ensure we do not attempt to
claim resources in the placement API since obviously we need instance
UUIDs to perform those claims.
"""
spec_obj = objects.RequestSpec(
num_instances=1,
flavor=objects.Flavor(memory_mb=512,
root_gb=512,
ephemeral_gb=0,
swap=0,
vcpus=1),
project_id=uuids.project_id,
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.return_value = all_host_states
instance_uuids = None
ctx = mock.Mock()
selected_hosts = self.driver._schedule(ctx, spec_obj,
instance_uuids, mock.sentinel.alloc_reqs_by_rp_uuid,
@@ -71,12 +138,161 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
# Ensure that we have consumed the resources on the chosen host states
host_state.consume_from_request.assert_called_once_with(spec_obj)
# And ensure we never called _claim_resources()
self.assertFalse(mock_claim.called)
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_successful_claim(self, mock_get_hosts,
mock_get_all_states, mock_claim):
spec_obj = objects.RequestSpec(
num_instances=1,
flavor=objects.Flavor(memory_mb=512,
root_gb=512,
ephemeral_gb=0,
swap=0,
vcpus=1),
project_id=uuids.project_id,
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.return_value = all_host_states
mock_claim.return_value = True
instance_uuids = [uuids.instance]
alloc_reqs_by_rp_uuid = {
uuids.cn1: [mock.sentinel.alloc_req],
}
ctx = mock.Mock()
selected_hosts = self.driver._schedule(ctx, spec_obj,
instance_uuids, alloc_reqs_by_rp_uuid,
mock.sentinel.provider_summaries)
mock_get_all_states.assert_called_once_with(
ctx.elevated.return_value, spec_obj,
mock.sentinel.provider_summaries)
mock_get_hosts.assert_called_once_with(spec_obj, all_host_states, 0)
mock_claim.assert_called_once_with(ctx.elevated.return_value, spec_obj,
uuids.instance, [mock.sentinel.alloc_req])
self.assertEqual(len(selected_hosts), 1)
self.assertEqual([host_state], selected_hosts)
# Ensure that we have consumed the resources on the chosen host states
host_state.consume_from_request.assert_called_once_with(spec_obj)
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_cleanup_allocations')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_unsuccessful_claim(self, mock_get_hosts,
mock_get_all_states, mock_claim, mock_cleanup):
"""Tests that we return an empty list if we are unable to successfully
claim resources for the instance
"""
spec_obj = objects.RequestSpec(
num_instances=1,
flavor=objects.Flavor(memory_mb=512,
root_gb=512,
ephemeral_gb=0,
swap=0,
vcpus=1),
project_id=uuids.project_id,
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.return_value = all_host_states
mock_claim.return_value = False
instance_uuids = [uuids.instance]
alloc_reqs_by_rp_uuid = {
uuids.cn1: [mock.sentinel.alloc_req],
}
ctx = mock.Mock()
selected_hosts = self.driver._schedule(ctx, spec_obj,
instance_uuids, alloc_reqs_by_rp_uuid,
mock.sentinel.provider_summaries)
mock_get_all_states.assert_called_once_with(
ctx.elevated.return_value, spec_obj,
mock.sentinel.provider_summaries)
mock_get_hosts.assert_called_once_with(spec_obj, all_host_states, 0)
mock_claim.assert_called_once_with(ctx.elevated.return_value, spec_obj,
uuids.instance, [mock.sentinel.alloc_req])
self.assertEqual([], selected_hosts)
mock_cleanup.assert_called_once_with([])
# Ensure that we have consumed the resources on the chosen host states
self.assertFalse(host_state.consume_from_request.called)
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_cleanup_allocations')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_not_all_instance_clean_claimed(self, mock_get_hosts,
mock_get_all_states, mock_claim, mock_cleanup):
"""Tests that we clean up previously-allocated instances if not all
instances could be scheduled
"""
spec_obj = objects.RequestSpec(
num_instances=2,
flavor=objects.Flavor(memory_mb=512,
root_gb=512,
ephemeral_gb=0,
swap=0,
vcpus=1),
project_id=uuids.project_id,
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.side_effect = [
all_host_states, # first return all the hosts (only one)
[], # then act as if no more hosts were found that meet criteria
]
mock_claim.return_value = True
instance_uuids = [uuids.instance1, uuids.instance2]
alloc_reqs_by_rp_uuid = {
uuids.cn1: [mock.sentinel.alloc_req],
}
ctx = mock.Mock()
self.driver._schedule(ctx, spec_obj, instance_uuids,
alloc_reqs_by_rp_uuid, mock.sentinel.provider_summaries)
# Ensure we cleaned up the first successfully-claimed instance
mock_cleanup.assert_called_once_with([uuids.instance1])
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_instance_group(self, mock_get_hosts,
-mock_get_all_states):
mock_get_all_states, mock_claim):
"""Test that since the request spec object contains an instance group
object, that upon choosing a host in the primary schedule loop,
that we update the request spec's instance group information
@@ -93,10 +309,18 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
project_id=uuids.project_id,
instance_group=ig)
-hs1 = mock.Mock(spec=host_manager.HostState, host='host1')
-hs2 = mock.Mock(spec=host_manager.HostState, host='host2')
hs1 = mock.Mock(spec=host_manager.HostState, host='host1',
uuid=uuids.cn1)
hs2 = mock.Mock(spec=host_manager.HostState, host='host2',
uuid=uuids.cn2)
all_host_states = [hs1, hs2]
mock_get_all_states.return_value = all_host_states
mock_claim.return_value = True
alloc_reqs_by_rp_uuid = {
uuids.cn1: [mock.sentinel.alloc_req_cn1],
uuids.cn2: [mock.sentinel.alloc_req_cn2],
}
# Simulate host 1 and host 2 being randomly returned first by
# _get_sorted_hosts() in the two iterations for each instance in
@@ -107,8 +331,17 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
]
ctx = mock.Mock()
self.driver._schedule(ctx, spec_obj, instance_uuids,
-mock.sentinel.alloc_reqs_by_rp_uuid,
-mock.sentinel.provider_summaries)
alloc_reqs_by_rp_uuid, mock.sentinel.provider_summaries)
# Check that we called _claim_resources() for both the first and second
# host state
claim_calls = [
mock.call(ctx.elevated.return_value, spec_obj,
uuids.instance0, [mock.sentinel.alloc_req_cn2]),
mock.call(ctx.elevated.return_value, spec_obj,
uuids.instance1, [mock.sentinel.alloc_req_cn1]),
]
mock_claim.assert_has_calls(claim_calls)
# Check that _get_sorted_hosts() is called twice and that the
# second time, we pass it the hosts that were returned from
@@ -218,6 +451,40 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
# weighed hosts and thus return [hs1, hs2]
self.assertEqual([hs1, hs2], results)
def test_cleanup_allocations(self):
instance_uuids = []
# Check we don't do anything if there's no instance UUIDs to cleanup
# allocations for
pc = self.placement_client
self.driver._cleanup_allocations(instance_uuids)
self.assertFalse(pc.delete_allocation_for_instance.called)
instance_uuids = [uuids.instance1, uuids.instance2]
self.driver._cleanup_allocations(instance_uuids)
exp_calls = [mock.call(uuids.instance1), mock.call(uuids.instance2)]
pc.delete_allocation_for_instance.assert_has_calls(exp_calls)
def test_claim_resources(self):
"""Tests that when _schedule() calls _claim_resources(), that we
appropriately call the placement client to claim resources for the
instance.
"""
ctx = mock.Mock(user_id=uuids.user_id)
spec_obj = mock.Mock(project_id=uuids.project_id)
instance_uuid = uuids.instance
alloc_reqs = [mock.sentinel.alloc_req]
pc = self.placement_client
pc.claim_resources.return_value = True
res = self.driver._claim_resources(ctx, spec_obj, instance_uuid,
alloc_reqs)
pc.claim_resources.assert_called_once_with(uuids.instance,
mock.sentinel.alloc_req, uuids.project_id, uuids.user_id)
self.assertTrue(res)
def test_add_retry_host(self):
retry = dict(num_attempts=1, hosts=[])
filter_properties = dict(retry=retry)

releasenotes/notes/… (new release note)

@@ -0,0 +1,14 @@
---
other:
- |
The filter scheduler will now attempt to claim a number of
resources in the placement API after determining a list of
potential hosts. We attempt to claim these resources for each instance
in the build request, and if a claim does not succeed, we try this
claim against the next potential host the scheduler selected. This
claim retry process can potentially attempt claims against a large
number of hosts, and we do not limit the number of hosts to attempt
claims against. Claims for resources may fail due to another scheduler
process concurrently claiming resources against the same compute node.
This concurrent resource claim is normal and the retry of a claim
request should be unusual but harmless.
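
For context on what a claim involves: each allocation request returned by GET /allocation_candidates is a JSON body that the scheduler report client sends back, together with the project and user ID, to the placement API's PUT /allocations/{consumer_uuid} call. A rough sketch of that payload at the time of this change (illustrative only; the resource amounts are example values and the exact structure depends on the placement microversion in use):

# Illustrative only: approximate shape of one allocation request and the
# claim body built from it. Values here are examples, not from this commit.
alloc_req = {
    "allocations": [
        {
            "resource_provider": {"uuid": "<compute-node-rp-uuid>"},
            "resources": {"VCPU": 1, "MEMORY_MB": 512, "DISK_GB": 512},
        },
    ],
}

# The claim adds the consumer's project and user and PUTs the body to
# /allocations/{consumer_uuid}; a 204 response means the claim succeeded.
claim_body = dict(alloc_req, project_id="<project-id>", user_id="<user-id>")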