claim resources in placement API during schedule()

Adds logic to call the placement API's PUT /allocations/{consumer_uuid}
when selecting hosts in the filter scheduler's _schedule() method.

We attempt to claim resources only if the scheduler driver uses
allocation candidates (i.e. it isn't the caching scheduler) and the
conductor has passed in a list of instance UUIDs (otherwise, there is
no way to allocate in the placement API).

Change-Id: Ifc5cf482209e4f6f4e3e39b24389bd3563d86444
blueprint: placement-claims
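
To make those conditions concrete, here is a minimal sketch (illustrative only, not part of the commit) of the per-instance decision the new _schedule() logic makes. The names mirror the diff below (instance_uuids, alloc_reqs_by_rp_uuid, USES_ALLOCATION_CANDIDATES); claim_resources stands in for the scheduler report client's claim call.

# Sketch only: simplified per-instance host selection with claiming.
def pick_host_for_instance(num, hosts, instance_uuids, alloc_reqs_by_rp_uuid,
                           uses_allocation_candidates, claim_resources):
    """Return the host chosen for the num-th instance, claiming in the
    placement API when possible, or None if no host could be claimed."""
    if (instance_uuids is None or not uses_allocation_candidates or
            alloc_reqs_by_rp_uuid is None):
        # Old conductor, caching scheduler, or a placement communication
        # failure: fall back to the first weighed host without claiming.
        return hosts[0]
    instance_uuid = instance_uuids[num]
    for host in hosts:
        alloc_reqs = alloc_reqs_by_rp_uuid.get(host.uuid)
        if not alloc_reqs:
            # This host state was not among the allocation candidates.
            continue
        if claim_resources(instance_uuid, alloc_reqs):
            # Claim succeeded against this host's resource provider.
            return host
    # No host could be claimed; the caller cleans up prior claims and
    # ultimately raises NoValidHost.
    return None
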
Jay Pipes 2017-07-13 17:04:34 -04:00 committed by Dan Smith
parent 9d1db10f84
commit 23c4eb3438
3 changed files with 420 additions and 15 deletions

nova/scheduler/filter_scheduler.py

@@ -28,6 +28,7 @@ import nova.conf
from nova import exception
from nova.i18n import _
from nova import rpc
from nova.scheduler import client
from nova.scheduler import driver
CONF = nova.conf.CONF
@@ -39,12 +40,16 @@ class FilterScheduler(driver.Scheduler):
def __init__(self, *args, **kwargs):
super(FilterScheduler, self).__init__(*args, **kwargs)
self.notifier = rpc.get_notifier('scheduler')
scheduler_client = client.SchedulerClient()
self.placement_client = scheduler_client.reportclient
def select_destinations(self, context, spec_obj, instance_uuids,
alloc_reqs_by_rp_uuid, provider_summaries):
"""Returns a sorted list of HostState objects that satisfy the
supplied request_spec.
These hosts will have already had their resources claimed in Placement.
:param context: The RequestContext object
:param spec_obj: The RequestSpec object
:param instance_uuids: List of UUIDs, one for each value of the spec
@@ -108,6 +113,8 @@ class FilterScheduler(driver.Scheduler):
"""Returns a list of hosts that meet the required specs, ordered by
their fitness.
These hosts will have already had their resources claimed in Placement.
:param context: The RequestContext object
:param spec_obj: The RequestSpec object
:param instance_uuids: List of UUIDs, one for each value of the spec
@@ -145,27 +152,144 @@ class FilterScheduler(driver.Scheduler):
hosts = self._get_all_host_states(elevated, spec_obj,
provider_summaries)
# A list of the instance UUIDs that were successfully claimed against
# in the placement API. If we are not able to successfully claim for
# all involved instances, we use this list to remove those allocations
# before returning
claimed_instance_uuids = []
selected_hosts = []
num_instances = spec_obj.num_instances
for num in range(num_instances):
hosts = self._get_sorted_hosts(spec_obj, hosts, num)
if not hosts:
# NOTE(jaypipes): If we get here, that means not all instances
# in instance_uuids were able to be matched to a selected host.
# So, let's clean up any already-claimed allocations here
# before breaking and returning
self._cleanup_allocations(claimed_instance_uuids)
break
-chosen_host = hosts[0]
if (instance_uuids is None or
not self.USES_ALLOCATION_CANDIDATES or
alloc_reqs_by_rp_uuid is None):
# Unfortunately, we still need to deal with older conductors
# that may not be passing in a list of instance_uuids. In those
# cases, obviously we can't claim resources because we don't
# have instance UUIDs to claim with, so we just grab the first
# host in the list of sorted hosts. In addition to older
# conductors, we need to support the caching scheduler, which
# doesn't use the placement API (and has
# USES_ALLOCATION_CANDIDATES = False) and therefore we skip all
# the claiming logic for that scheduler driver. Finally, if
# there was a problem communicating with the placement API,
# alloc_reqs_by_rp_uuid will be None, so we skip claiming in
# that case as well
claimed_host = hosts[0]
else:
instance_uuid = instance_uuids[num]
LOG.debug("Selected host: %(host)s", {'host': chosen_host})
selected_hosts.append(chosen_host)
# Attempt to claim the resources against one or more resource
# providers, looping over the sorted list of possible hosts
# looking for an allocation request that contains that host's
# resource provider UUID
claimed_host = None
for host in hosts:
cn_uuid = host.uuid
if cn_uuid not in alloc_reqs_by_rp_uuid:
LOG.debug("Found host state %s that wasn't in "
"allocation requests. Skipping.", cn_uuid)
continue
-# Now consume the resources so the filter/weights
-# will change for the next instance.
-chosen_host.consume_from_request(spec_obj)
alloc_reqs = alloc_reqs_by_rp_uuid[cn_uuid]
if self._claim_resources(elevated, spec_obj, instance_uuid,
alloc_reqs):
claimed_host = host
break
if claimed_host is None:
# We weren't able to claim resources in the placement API
# for any of the sorted hosts identified. So, clean up any
# successfully-claimed resources for prior instances in
# this request and return an empty list which will cause
# select_destinations() to raise NoValidHost
LOG.debug("Unable to successfully claim against any host.")
self._cleanup_allocations(claimed_instance_uuids)
return []
claimed_instance_uuids.append(instance_uuid)
LOG.debug("Selected host: %(host)s", {'host': claimed_host})
selected_hosts.append(claimed_host)
# Now consume the resources so the filter/weights will change for
# the next instance.
claimed_host.consume_from_request(spec_obj)
if spec_obj.instance_group is not None:
-spec_obj.instance_group.hosts.append(chosen_host.host)
spec_obj.instance_group.hosts.append(claimed_host.host)
# hosts has to be not part of the updates when saving
spec_obj.instance_group.obj_reset_changes(['hosts'])
return selected_hosts
def _cleanup_allocations(self, instance_uuids):
"""Removes allocations for the supplied instance UUIDs."""
if not instance_uuids:
return
LOG.debug("Cleaning up allocations for %s", instance_uuids)
for uuid in instance_uuids:
self.placement_client.delete_allocation_for_instance(uuid)
def _claim_resources(self, ctx, spec_obj, instance_uuid, alloc_reqs):
"""Given an instance UUID (representing the consumer of resources), the
HostState object for the host that was chosen for the instance, and a
list of allocation request JSON objects, attempt to claim resources for
the instance in the placement API. Returns True if the claim process
was successful, False otherwise.
:param ctx: The RequestContext object
:param spec_obj: The RequestSpec object
:param instance_uuid: The UUID of the consuming instance
:param cn_uuid: UUID of the host to allocate against
:param alloc_reqs: A list of allocation request JSON objects that
allocate against (at least) the compute host
selected by the _schedule() method. These allocation
requests were constructed from a call to the GET
/allocation_candidates placement API call. Each
allocation_request satisfies the original request
for resources and can be supplied as-is (along with
the project and user ID to the placement API's
PUT /allocations/{consumer_uuid} call to claim
resources for the instance
"""
LOG.debug("Attempting to claim resources in the placement API for "
"instance %s", instance_uuid)
project_id = spec_obj.project_id
# NOTE(jaypipes): So, the RequestSpec doesn't store the user_id,
# only the project_id, so we need to grab the user information from
# the context. Perhaps we should consider putting the user ID in
# the spec object?
user_id = ctx.user_id
# TODO(jaypipes): Loop through all allocation requests instead of just
# trying the first one. For now, since we'll likely want to order the
# allocation requests in the future based on information in the
# provider summaries, we'll just try to claim resources using the first
# allocation request
alloc_req = alloc_reqs[0]
claimed = self.placement_client.claim_resources(instance_uuid,
alloc_req, project_id, user_id)
if not claimed:
return False
LOG.debug("Successfully claimed resources for instance %s using "
"allocation request %s", instance_uuid, alloc_req)
return True
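
The TODO above notes that only the first allocation request is tried. A hypothetical follow-up (not part of this commit) could walk the whole list until one claim succeeds; claim_fn below stands in for placement_client.claim_resources(instance_uuid, alloc_req, project_id, user_id).

# Hypothetical variant of the TODO above: try each allocation request in
# turn rather than only the first one.
def claim_first_viable(claim_fn, instance_uuid, alloc_reqs, project_id, user_id):
    """Return the allocation request that was successfully claimed, or None."""
    for alloc_req in alloc_reqs:
        if claim_fn(instance_uuid, alloc_req, project_id, user_id):
            return alloc_req
    return None
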
def _get_sorted_hosts(self, spec_obj, host_states, index):
"""Returns a list of HostState objects that match the required
scheduling constraints for the request spec object and have been sorted

nova/tests/unit/scheduler/test_filter_scheduler.py

@@ -20,6 +20,8 @@ import mock
from nova import exception
from nova import objects
from nova.scheduler import client
from nova.scheduler.client import report
from nova.scheduler import filter_scheduler
from nova.scheduler import host_manager
from nova.scheduler import utils as scheduler_utils
@@ -34,11 +36,27 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
driver_cls = filter_scheduler.FilterScheduler
@mock.patch('nova.scheduler.client.SchedulerClient')
def setUp(self, mock_client):
pc_client = mock.Mock(spec=report.SchedulerReportClient)
sched_client = mock.Mock(spec=client.SchedulerClient)
sched_client.reportclient = pc_client
mock_client.return_value = sched_client
self.placement_client = pc_client
super(FilterSchedulerTestCase, self).setUp()
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
-def test_schedule(self, mock_get_hosts, mock_get_all_states):
def test_schedule_placement_bad_comms(self, mock_get_hosts,
mock_get_all_states, mock_claim):
"""If there was a problem communicating with the Placement service,
alloc_reqs_by_rp_uuid will be None and we need to avoid trying to claim
in the Placement API.
"""
spec_obj = objects.RequestSpec(
num_instances=1,
flavor=objects.Flavor(memory_mb=512,
@@ -50,11 +68,60 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
-host=mock.sentinel.host)
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.return_value = all_host_states
instance_uuids = [uuids.instance]
instance_uuids = None
ctx = mock.Mock()
selected_hosts = self.driver._schedule(ctx, spec_obj,
instance_uuids, None, mock.sentinel.provider_summaries)
mock_get_all_states.assert_called_once_with(
ctx.elevated.return_value, spec_obj,
mock.sentinel.provider_summaries)
mock_get_hosts.assert_called_once_with(spec_obj, all_host_states, 0)
self.assertEqual(len(selected_hosts), 1)
self.assertEqual([host_state], selected_hosts)
# Ensure that we have consumed the resources on the chosen host states
host_state.consume_from_request.assert_called_once_with(spec_obj)
# And ensure we never called _claim_resources()
self.assertFalse(mock_claim.called)
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_old_conductor(self, mock_get_hosts,
mock_get_all_states, mock_claim):
"""Old conductor can call scheduler without the instance_uuids
parameter. When this happens, we need to ensure we do not attempt to
claim resources in the placement API since obviously we need instance
UUIDs to perform those claims.
"""
spec_obj = objects.RequestSpec(
num_instances=1,
flavor=objects.Flavor(memory_mb=512,
root_gb=512,
ephemeral_gb=0,
swap=0,
vcpus=1),
project_id=uuids.project_id,
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.return_value = all_host_states
instance_uuids = None
ctx = mock.Mock()
selected_hosts = self.driver._schedule(ctx, spec_obj,
instance_uuids, mock.sentinel.alloc_reqs_by_rp_uuid,
@@ -71,12 +138,161 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
# Ensure that we have consumed the resources on the chosen host states
host_state.consume_from_request.assert_called_once_with(spec_obj)
# And ensure we never called _claim_resources()
self.assertFalse(mock_claim.called)
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_successful_claim(self, mock_get_hosts,
mock_get_all_states, mock_claim):
spec_obj = objects.RequestSpec(
num_instances=1,
flavor=objects.Flavor(memory_mb=512,
root_gb=512,
ephemeral_gb=0,
swap=0,
vcpus=1),
project_id=uuids.project_id,
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.return_value = all_host_states
mock_claim.return_value = True
instance_uuids = [uuids.instance]
alloc_reqs_by_rp_uuid = {
uuids.cn1: [mock.sentinel.alloc_req],
}
ctx = mock.Mock()
selected_hosts = self.driver._schedule(ctx, spec_obj,
instance_uuids, alloc_reqs_by_rp_uuid,
mock.sentinel.provider_summaries)
mock_get_all_states.assert_called_once_with(
ctx.elevated.return_value, spec_obj,
mock.sentinel.provider_summaries)
mock_get_hosts.assert_called_once_with(spec_obj, all_host_states, 0)
mock_claim.assert_called_once_with(ctx.elevated.return_value, spec_obj,
uuids.instance, [mock.sentinel.alloc_req])
self.assertEqual(len(selected_hosts), 1)
self.assertEqual([host_state], selected_hosts)
# Ensure that we have consumed the resources on the chosen host states
host_state.consume_from_request.assert_called_once_with(spec_obj)
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_cleanup_allocations')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_unsuccessful_claim(self, mock_get_hosts,
mock_get_all_states, mock_claim, mock_cleanup):
"""Tests that we return an empty list if we are unable to successfully
claim resources for the instance
"""
spec_obj = objects.RequestSpec(
num_instances=1,
flavor=objects.Flavor(memory_mb=512,
root_gb=512,
ephemeral_gb=0,
swap=0,
vcpus=1),
project_id=uuids.project_id,
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.return_value = all_host_states
mock_claim.return_value = False
instance_uuids = [uuids.instance]
alloc_reqs_by_rp_uuid = {
uuids.cn1: [mock.sentinel.alloc_req],
}
ctx = mock.Mock()
selected_hosts = self.driver._schedule(ctx, spec_obj,
instance_uuids, alloc_reqs_by_rp_uuid,
mock.sentinel.provider_summaries)
mock_get_all_states.assert_called_once_with(
ctx.elevated.return_value, spec_obj,
mock.sentinel.provider_summaries)
mock_get_hosts.assert_called_once_with(spec_obj, all_host_states, 0)
mock_claim.assert_called_once_with(ctx.elevated.return_value, spec_obj,
uuids.instance, [mock.sentinel.alloc_req])
self.assertEqual([], selected_hosts)
mock_cleanup.assert_called_once_with([])
# Ensure that we have consumed the resources on the chosen host states
self.assertFalse(host_state.consume_from_request.called)
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_cleanup_allocations')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_not_all_instance_clean_claimed(self, mock_get_hosts,
mock_get_all_states, mock_claim, mock_cleanup):
"""Tests that we clean up previously-allocated instances if not all
instances could be scheduled
"""
spec_obj = objects.RequestSpec(
num_instances=2,
flavor=objects.Flavor(memory_mb=512,
root_gb=512,
ephemeral_gb=0,
swap=0,
vcpus=1),
project_id=uuids.project_id,
instance_group=None)
host_state = mock.Mock(spec=host_manager.HostState,
host=mock.sentinel.host, uuid=uuids.cn1)
all_host_states = [host_state]
mock_get_all_states.return_value = all_host_states
mock_get_hosts.side_effect = [
all_host_states, # first return all the hosts (only one)
[], # then act as if no more hosts were found that meet criteria
]
mock_claim.return_value = True
instance_uuids = [uuids.instance1, uuids.instance2]
alloc_reqs_by_rp_uuid = {
uuids.cn1: [mock.sentinel.alloc_req],
}
ctx = mock.Mock()
self.driver._schedule(ctx, spec_obj, instance_uuids,
alloc_reqs_by_rp_uuid, mock.sentinel.provider_summaries)
# Ensure we cleaned up the first successfully-claimed instance
mock_cleanup.assert_called_once_with([uuids.instance1])
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_claim_resources')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_all_host_states')
@mock.patch('nova.scheduler.filter_scheduler.FilterScheduler.'
'_get_sorted_hosts')
def test_schedule_instance_group(self, mock_get_hosts,
-mock_get_all_states):
mock_get_all_states, mock_claim):
"""Test that since the request spec object contains an instance group
object, that upon choosing a host in the primary schedule loop,
that we update the request spec's instance group information
@@ -93,10 +309,18 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
project_id=uuids.project_id,
instance_group=ig)
-hs1 = mock.Mock(spec=host_manager.HostState, host='host1')
-hs2 = mock.Mock(spec=host_manager.HostState, host='host2')
hs1 = mock.Mock(spec=host_manager.HostState, host='host1',
uuid=uuids.cn1)
hs2 = mock.Mock(spec=host_manager.HostState, host='host2',
uuid=uuids.cn2)
all_host_states = [hs1, hs2]
mock_get_all_states.return_value = all_host_states
mock_claim.return_value = True
alloc_reqs_by_rp_uuid = {
uuids.cn1: [mock.sentinel.alloc_req_cn1],
uuids.cn2: [mock.sentinel.alloc_req_cn2],
}
# Simulate host 1 and host 2 being randomly returned first by
# _get_sorted_hosts() in the two iterations for each instance in
@@ -107,8 +331,17 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
]
ctx = mock.Mock()
self.driver._schedule(ctx, spec_obj, instance_uuids,
-mock.sentinel.alloc_reqs_by_rp_uuid,
-mock.sentinel.provider_summaries)
alloc_reqs_by_rp_uuid, mock.sentinel.provider_summaries)
# Check that we called _claim_resources() for both the first and second
# host state
claim_calls = [
mock.call(ctx.elevated.return_value, spec_obj,
uuids.instance0, [mock.sentinel.alloc_req_cn2]),
mock.call(ctx.elevated.return_value, spec_obj,
uuids.instance1, [mock.sentinel.alloc_req_cn1]),
]
mock_claim.assert_has_calls(claim_calls)
# Check that _get_sorted_hosts() is called twice and that the
# second time, we pass it the hosts that were returned from
@@ -218,6 +451,40 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
# weighed hosts and thus return [hs1, hs2]
self.assertEqual([hs1, hs2], results)
def test_cleanup_allocations(self):
instance_uuids = []
# Check we don't do anything if there's no instance UUIDs to cleanup
# allocations for
pc = self.placement_client
self.driver._cleanup_allocations(instance_uuids)
self.assertFalse(pc.delete_allocation_for_instance.called)
instance_uuids = [uuids.instance1, uuids.instance2]
self.driver._cleanup_allocations(instance_uuids)
exp_calls = [mock.call(uuids.instance1), mock.call(uuids.instance2)]
pc.delete_allocation_for_instance.assert_has_calls(exp_calls)
def test_claim_resources(self):
"""Tests that when _schedule() calls _claim_resources(), that we
appropriately call the placement client to claim resources for the
instance.
"""
ctx = mock.Mock(user_id=uuids.user_id)
spec_obj = mock.Mock(project_id=uuids.project_id)
instance_uuid = uuids.instance
alloc_reqs = [mock.sentinel.alloc_req]
pc = self.placement_client
pc.claim_resources.return_value = True
res = self.driver._claim_resources(ctx, spec_obj, instance_uuid,
alloc_reqs)
pc.claim_resources.assert_called_once_with(uuids.instance,
mock.sentinel.alloc_req, uuids.project_id, uuids.user_id)
self.assertTrue(res)
def test_add_retry_host(self):
retry = dict(num_attempts=1, hosts=[])
filter_properties = dict(retry=retry)

releasenotes/notes/… (new release note)

@@ -0,0 +1,14 @@
---
other:
- |
The filter scheduler will now attempt to claim a number of
resources in the placement API after determining a list of
potential hosts. We attempt to claim these resources for each instance
in the build request, and if a claim does not succeed, we try this
claim against the next potential host the scheduler selected. This
claim retry process can potentially attempt claims against a large
number of hosts, and we do not limit the number of hosts to attempt
claims against. Claims for resources may fail due to another scheduler
process concurrently claiming resources against the same compute node.
This concurrent resource claim is normal and the retry of a claim
request should be unusual but harmless.
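
For context on what a claim involves: each allocation request returned by GET /allocation_candidates is a JSON body that the scheduler report client sends back, together with the project and user ID, to the placement API's PUT /allocations/{consumer_uuid} call. A rough sketch of that payload at the time of this change (illustrative only; the resource amounts are example values and the exact structure depends on the placement microversion in use):

# Illustrative only: approximate shape of one allocation request and the
# claim body built from it. Values here are examples, not from this commit.
alloc_req = {
    "allocations": [
        {
            "resource_provider": {"uuid": "<compute-node-rp-uuid>"},
            "resources": {"VCPU": 1, "MEMORY_MB": 512, "DISK_GB": 512},
        },
    ],
}

# The claim adds the consumer's project and user and PUTs the body to
# /allocations/{consumer_uuid}; a 204 response means the claim succeeded.
claim_body = dict(alloc_req, project_id="<project-id>", user_id="<user-id>")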