Consumer gen support for delete instance allocations

The placement API version 1.28 introduced consumer generations as a way
to make updating allocations safe even when it is done from multiple
places.

This patch changes delete_allocation_for_instance to use PUT
/allocations instead of DELETE /allocations to benefit from the consumer
generation handling.

In this patch the report client will GET the current allocation of the
instance, including the consumer generation, and then try to PUT an
empty allocation with that generation. If this fails due to a consumer
generation conflict, meaning something modified the allocation of the
instance between the GET and the PUT, then the report client will raise
an AllocationDeleteFailed exception. This will cause the instance to go
to ERROR state.
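
For illustration, a minimal sketch of that GET-then-PUT flow against the
placement API at microversion 1.28, calling the HTTP API directly with the
requests library; the endpoint, token and function name here are assumptions
for the example only, and the real implementation lives in
SchedulerReportClient.delete_allocation_for_instance below.

    import requests

    PLACEMENT = 'http://placement.example.com'  # assumed endpoint
    HEADERS = {
        'X-Auth-Token': 'admin-token',  # assumed credentials
        'OpenStack-API-Version': 'placement 1.28',
    }

    def delete_allocations(consumer_uuid):
        url = '%s/allocations/%s' % (PLACEMENT, consumer_uuid)
        # At 1.28 the GET body carries consumer_generation alongside the
        # allocations, project_id and user_id.
        body = requests.get(url, headers=HEADERS).json()
        if not body['allocations']:
            # placement returns an empty allocation for an unknown consumer
            return False
        # PUT the body back with the resources cleared but the generation we
        # just read kept; placement auto-deletes the consumer when all
        # resources are removed.
        body['allocations'] = {}
        resp = requests.put(url, json=body, headers=HEADERS)
        if resp.status_code == 409:
            # something changed the allocation between the GET and the PUT
            raise RuntimeError('consumer generation conflict for %s'
                               % consumer_uuid)
        resp.raise_for_status()
        return True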

This patch only detects a small portion of the possible cases where the
allocation is modified outside of the delete code path. The rest could
only be detected if nova cached at least the consumer generation of the
instance.

To be able to put the instance into ERROR state the instance.destroy()
call is moved to the end of the deletion call path. To keep the
instance.delete.end notification behavior consistent with this move
(e.g. the deleted_at field is filled) the notification sending needed
to be moved too.

Blueprint: use-nested-allocation-candidates
Change-Id: I77f34788dd7ab8fdf60d668a4f76452e03cf9888
Balazs Gibizer 2018-08-14 10:37:03 +02:00
parent 337b24ca41
commit 6f1a1f5e8e
9 changed files with 184 additions and 44 deletions


@@ -58,12 +58,14 @@ changed or be partially complete at this time.
* `filter allocation candidates by aggregate membership`_
* `perform granular allocation candidate requests`_
* `inventory and allocation data migration`_ (reshaping provider trees)
* `handle allocation updates in a safe way`_
.. _Nested Resource Providers: http://specs.openstack.org/openstack/nova-specs/specs/queens/approved/nested-resource-providers.html
.. _Request Traits During Scheduling: https://specs.openstack.org/openstack/nova-specs/specs/queens/approved/request-traits-in-nova.html
.. _filter allocation candidates by aggregate membership: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/alloc-candidates-member-of.html
.. _perform granular allocation candidate requests: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/granular-resource-requests.html
.. _inventory and allocation data migration: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/reshape-provider-tree.html
.. _handle allocation updates in a safe way: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/add-consumer-generation.html
Deployment
==========


@@ -865,7 +865,8 @@ class ServersController(wsgi.Controller):
raise exc.HTTPNotFound(explanation=msg)
except exception.InstanceUnknownCell as e:
raise exc.HTTPNotFound(explanation=e.format_message())
except exception.InstanceIsLocked as e:
except (exception.InstanceIsLocked,
exception.AllocationDeleteFailed) as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,


@@ -749,19 +749,18 @@ class ComputeManager(manager.Manager):
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
self._complete_deletion(context,
instance,
bdms)
instance)
self._notify_about_instance_usage(context, instance, "delete.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.END, bdms=bdms)
def _complete_deletion(self, context, instance, bdms):
def _complete_deletion(self, context, instance):
self._update_resource_tracker(context, instance)
rt = self._get_resource_tracker()
rt.reportclient.delete_allocation_for_instance(context, instance.uuid)
self._notify_about_instance_usage(context, instance, "delete.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.END, bdms=bdms)
self._clean_instance_console_tokens(context, instance)
self._delete_scheduler_instance_info(context, instance.uuid)
@@ -2624,11 +2623,17 @@ class ComputeManager(manager.Manager):
instance.power_state = power_state.NOSTATE
instance.terminated_at = timeutils.utcnow()
instance.save()
self._complete_deletion(context, instance)
# only destroy the instance in the db if _complete_deletion
# doesn't raise and therefore the allocation is successfully
# deleted in placement
instance.destroy()
self._complete_deletion(context,
instance,
bdms)
self._notify_about_instance_usage(context, instance, "delete.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.END, bdms=bdms)
@wrap_exception()
@reverts_task_state


@@ -2322,6 +2322,11 @@ class AllocationUpdateFailed(NovaException):
'Error: %(error)s')
class AllocationDeleteFailed(NovaException):
msg_fmt = _('Failed to delete allocations for consumer %(consumer_uuid)s. '
'Error: %(error)s')
class TooManyComputesForHost(NovaException):
msg_fmt = _('Unexpected number of compute node records '
'(%(num_computes)d) found for host %(host)s. There should '


@@ -2047,21 +2047,63 @@ class SchedulerReportClient(object):
@safe_connect
def delete_allocation_for_instance(self, context, uuid):
"""Delete the instance allocation from placement
:param context: The security context
:param uuid: the instance UUID which will be used as the consumer UUID
towards placement
:return: Returns True if the allocation is successfully deleted by this
call. Returns False if the allocation does not exist.
:raises AllocationDeleteFailed: If the allocation cannot be read from
placement or it is changed by another process while we tried to
delete it.
"""
url = '/allocations/%s' % uuid
r = self.delete(url, global_request_id=context.global_id)
if r:
# We read the consumer generation then try to put an empty allocation
# for that consumer. If between the GET and the PUT the consumer
# generation changes then we raise AllocationDeleteFailed.
# NOTE(gibi): This only detects a small portion of the possible cases
# where the allocation is modified outside of the delete code path. The
# rest could only be detected if nova cached at least the consumer
# generation of the instance.
# NOTE(gibi): placement does not return 404 for a non-existing consumer
# but returns an empty allocation instead. Putting an empty allocation to
# that non-existing consumer won't be a 404 or other error either.
r = self.get(url, global_request_id=context.global_id,
version=CONSUMER_GENERATION_VERSION)
if not r:
# at the moment there is no way placement returns a failure so we
# could even delete this code
LOG.warning('Unable to delete allocation for instance '
'%(uuid)s: (%(code)i %(text)s)',
{'uuid': uuid,
'code': r.status_code,
'text': r.text})
raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
error=r.text)
allocations = r.json()
if allocations['allocations'] == {}:
# the consumer did not exist in the first place
LOG.debug('Cannot delete allocation for %s consumer in placement '
'as the consumer does not exist', uuid)
return False
# removing all resources from the allocation will auto delete the
# consumer in placement
allocations['allocations'] = {}
r = self.put(url, allocations, global_request_id=context.global_id,
version=CONSUMER_GENERATION_VERSION)
if r.status_code == 204:
LOG.info('Deleted allocation for instance %s', uuid)
return True
else:
# Check for 404 since we don't need to log a warning if we tried to
# delete something which doesn't actually exist.
if r.status_code != 404:
LOG.warning('Unable to delete allocation for instance '
'%(uuid)s: (%(code)i %(text)s)',
{'uuid': uuid,
'code': r.status_code,
'text': r.text})
return False
LOG.warning('Unable to delete allocation for instance '
'%(uuid)s: (%(code)i %(text)s)',
{'uuid': uuid,
'code': r.status_code,
'text': r.text})
raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
error=r.text)
def get_allocations_for_resource_provider(self, context, rp_uuid):
"""Retrieves the allocations for a specific provider.


@@ -20,6 +20,7 @@ import zlib
import mock
from oslo_log import log as logging
from oslo_serialization import base64
from oslo_serialization import jsonutils
from oslo_utils.fixture import uuidsentinel as uuids
from oslo_utils import timeutils
import six
@@ -40,6 +41,7 @@ from nova.tests.functional import integrated_helpers
from nova.tests.unit.api.openstack import fakes
from nova.tests.unit import fake_block_device
from nova.tests.unit import fake_notifier
from nova.tests.unit import fake_requests
import nova.tests.unit.image.fake
from nova.virt import fake
from nova import volume
@@ -4529,3 +4531,87 @@ class ServerTestV256RescheduleTestCase(ServerTestV256Common):
found_server = self.api.get_server(server['id'])
# Check that rescheduling is not occurred.
self.assertNotEqual(other_host, found_server['OS-EXT-SRV-ATTR:host'])
class ConsumerGenerationConflictTest(
integrated_helpers.ProviderUsageBaseTestCase):
# we need the medium driver to be able to allocate resources for more
# than a single instance
compute_driver = 'fake.MediumFakeDriver'
def setUp(self):
super(ConsumerGenerationConflictTest, self).setUp()
flavors = self.api.get_flavors()
self.flavor = flavors[0]
self.other_flavor = flavors[1]
self.compute1 = self._start_compute('compute1')
self.compute2 = self._start_compute('compute2')
def test_server_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host
server = self._boot_and_check_allocations(self.flavor, source_hostname)
rsp = fake_requests.FakeResponse(
409, jsonutils.dumps({'text': 'consumer generation conflict'}))
with mock.patch('keystoneauth1.adapter.Adapter.put',
autospec=True) as mock_put:
mock_put.return_value = rsp
self.api.delete_server(server['id'])
server = self._wait_for_state_change(self.admin_api, server,
'ERROR')
self.assertEqual(1, mock_put.call_count)
# We still have the allocations as deletion failed
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor, source_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
source_allocation = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
# retry the delete to make sure that allocations are removed this time
self._delete_and_check_allocations(server)
def test_server_local_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host
server = self._boot_and_check_allocations(self.flavor, source_hostname)
source_compute_id = self.admin_api.get_services(
host=self.compute1.host, binary='nova-compute')[0]['id']
self.compute1.stop()
self.admin_api.put_service(
source_compute_id, {'forced_down': 'true'})
rsp = fake_requests.FakeResponse(
409, jsonutils.dumps({'text': 'consumer generation conflict'}))
with mock.patch('keystoneauth1.adapter.Adapter.put',
autospec=True) as mock_put:
mock_put.return_value = rsp
ex = self.assertRaises(client.OpenStackApiException,
self.api.delete_server, server['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Failed to delete allocations for consumer',
jsonutils.loads(ex.response.content)[
'conflictingRequest']['message'])
self.assertEqual(1, mock_put.call_count)
# We still have the allocations as deletion failed
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor, source_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
source_allocation = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
# retry the delete to make sure that allocations are removed this time
self._delete_and_check_allocations(server)


@@ -7646,6 +7646,12 @@ class ComputeTestCase(BaseTestCase,
self.stub_out('nova.objects.quotas.Quotas.reserve',
lambda *a, **k: None)
self.stub_out('nova.compute.utils.notify_about_instance_usage',
lambda *a, **k: None)
self.stub_out('nova.compute.utils.notify_about_instance_action',
lambda *a, **k: None)
self.compute._complete_partial_deletion(admin_context, instance)
self.assertNotEqual(0, instance.deleted)


@@ -232,14 +232,22 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
methods_called = [n for n, a, k in call_tracker.mock_calls]
self.assertEqual(['clear_events_for_instance',
'_notify_about_instance_usage',
'_shutdown_instance'],
'_shutdown_instance',
'_notify_about_instance_usage'],
methods_called)
mock_notify.assert_called_once_with(self.context,
mock_inst,
specd_compute.host,
action='delete',
phase='start',
bdms=mock_bdms)
mock_notify.assert_has_calls([
mock.call(self.context,
mock_inst,
specd_compute.host,
action='delete',
phase='start',
bdms=mock_bdms),
mock.call(self.context,
mock_inst,
specd_compute.host,
action='delete',
phase='end',
bdms=mock_bdms)])
def _make_compute_node(self, hyp_hostname, cn_id):
cn = mock.Mock(spec_set=['hypervisor_hostname', 'id',


@@ -3222,21 +3222,6 @@ class TestAllocations(SchedulerReportClientTestCase):
}
self.assertEqual(expected, result)
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'delete')
@mock.patch('nova.scheduler.client.report.LOG')
def test_delete_allocation_for_instance_ignore_404(self, mock_log,
mock_delete):
"""Tests that we don't log a warning on a 404 response when trying to
delete an allocation record.
"""
mock_delete.return_value = fake_requests.FakeResponse(404)
self.client.delete_allocation_for_instance(self.context, uuids.rp_uuid)
# make sure we didn't screw up the logic or the mock
mock_log.info.assert_not_called()
# make sure warning wasn't called for the 404
mock_log.warning.assert_not_called()
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete")
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."