Merge "[placement] Retry allocation writes server side" into stable/queens

2018-08-09 22:35:09 +00:00 · 2018-08-09 22:35:09 +00:00 · ff200e12ef
parent 5ffa9cbb51 66a47b7623
commit ff200e12ef
3 changed files with 138 additions and 4 deletions
--- a/nova/exception.py
+++ b/nova/exception.py
@ -2122,6 +2122,11 @@ class ConcurrentUpdateDetected(NovaException):
                "Please retry your update")


+class ResourceProviderConcurrentUpdateDetected(ConcurrentUpdateDetected):
+    msg_fmt = _("Another thread concurrently updated the resource provider "
+                "data. Please retry your update")
+
+
 class ResourceClassNotFound(NotFound):
    msg_fmt = _("No such resource class %(resource_class)s.")

--- a/nova/objects/resource_provider.py
+++ b/nova/objects/resource_provider.py
@ -279,7 +279,7 @@ def _increment_provider_generation(ctx, rp):

    res = ctx.session.execute(upd_stmt)
    if res.rowcount != 1:
-        raise exception.ConcurrentUpdateDetected
+        raise exception.ResourceProviderConcurrentUpdateDetected()
    return new_generation


@ -1992,6 +1992,10 @@ def _get_allocations_by_consumer_uuid(ctx, consumer_uuid):
@base.NovaObjectRegistry.register_if(False)
 class AllocationList(base.ObjectListBase, base.NovaObject):

+    # The number of times to retry set_allocations if there has
+    # been a resource provider (not consumer) generation coflict.
+    RP_CONFLICT_RETRY_COUNT = 10
+
    fields = {
        'objects': fields.ListOfObjectsField('Allocation'),
    }
@ -2139,9 +2143,41 @@ class AllocationList(base.ObjectListBase, base.NovaObject):
        returned to the caller, nor are their database ids set. If
        those ids are required use one of the get_all_by* methods.
        """
-        # TODO(jaypipes): Retry the allocation writes on
-        # ConcurrentUpdateDetected
-        self._set_allocations(self._context, self.objects)
+        # Retry _set_allocations server side if there is a
+        # ResourceProviderConcurrentUpdateDetected. We don't care about
+        # sleeping, we simply want to reset the resource provider objects
+        # and try again. For sake of simplicity (and because we don't have
+        # easy access to the information) we reload all the resource
+        # providers that may be present.
+        retries = self.RP_CONFLICT_RETRY_COUNT
+        while retries:
+            retries -= 1
+            try:
+                self._set_allocations(self._context, self.objects)
+                break
+            except exception.ResourceProviderConcurrentUpdateDetected:
+                LOG.debug('Retrying allocations write on resource provider '
+                          'generation conflict')
+                # We only want to reload each unique resource provider once.
+                alloc_rp_uuids = set(
+                    alloc.resource_provider.uuid for alloc in self.objects)
+                seen_rps = {}
+                for rp_uuid in alloc_rp_uuids:
+                    seen_rps[rp_uuid] = ResourceProvider.get_by_uuid(
+                        self._context, rp_uuid)
+                for alloc in self.objects:
+                    rp_uuid = alloc.resource_provider.uuid
+                    alloc.resource_provider = seen_rps[rp_uuid]
+        else:
+            # We ran out of retries so we need to raise again.
+            # The log will automatically have request id info associated with
+            # it that will allow tracing back to specific allocations.
+            # Attempting to extract specific consumer or resource provider
+            # information from the allocations is not coherent as this
+            # could be multiple consumers and providers.
+            LOG.warning('Exceeded retry limit of %d on allocations write',
+                        self.RP_CONFLICT_RETRY_COUNT)
+            raise exception.ResourceProviderConcurrentUpdateDetected()

    def delete_all(self):
        # Allocations can only have a single consumer, so take advantage of
--- a/nova/tests/functional/db/test_resource_provider.py
+++ b/nova/tests/functional/db/test_resource_provider.py
@ -11,6 +11,7 @@
 #    under the License.


+import functools
 import mock
 import os_traits
 from oslo_db import exception as db_exc
@ -42,6 +43,15 @@ DISK_ALLOCATION = dict(
 )


+def add_inventory(rp, rc, total, **kwargs):
+    kwargs.setdefault('max_unit', total)
+    inv = rp_obj.Inventory(rp._context, resource_provider=rp,
+                           resource_class=rc, total=total, **kwargs)
+    inv.obj_set_defaults()
+    rp.add_inventory(inv)
+    return inv
+
+
 class ResourceProviderBaseCase(test.NoDBTestCase):

    USES_DB_SELF = True
@ -1866,6 +1876,89 @@ class TestAllocationListCreateDelete(ResourceProviderBaseCase):
            self.ctx, migration_uuid)
        self.assertEqual(0, len(allocations))

+    @mock.patch('nova.objects.resource_provider.LOG')
+    def test_set_allocations_retry(self, mock_log):
+        """Test server side allocation write retry handling."""
+
+        # Create a single resource provider and give it some inventory.
+        rp1 = rp_obj.ResourceProvider(
+            self.ctx, name='rp1', uuid=uuidsentinel.rp1)
+        rp1.create()
+        add_inventory(rp1, fields.ResourceClass.VCPU, 24,
+                      allocation_ratio=16.0)
+        add_inventory(rp1, fields.ResourceClass.MEMORY_MB, 1024,
+                      min_unit=64,
+                      max_unit=1024,
+                      step_size=64)
+        original_generation = rp1.generation
+        # Verify the generation is what we expect (we'll be checking again
+        # later).
+        self.assertEqual(2, original_generation)
+
+        inst_consumer = uuidsentinel.instance
+
+        alloc_list = rp_obj.AllocationList(context=self.ctx,
+            objects=[
+                rp_obj.Allocation(
+                    context=self.ctx,
+                    consumer_id=inst_consumer,
+                    resource_provider=rp1,
+                    resource_class=fields.ResourceClass.VCPU,
+                    used=12),
+                rp_obj.Allocation(
+                    context=self.ctx,
+                    consumer_id=inst_consumer,
+                    resource_provider=rp1,
+                    resource_class=fields.ResourceClass.MEMORY_MB,
+                    used=1024)
+            ])
+
+        # Make sure the right exception happens when the retry loop expires.
+        with mock.patch.object(rp_obj.AllocationList,
+                               'RP_CONFLICT_RETRY_COUNT', 0):
+            self.assertRaises(
+                exception.ResourceProviderConcurrentUpdateDetected,
+                alloc_list.create_all)
+            mock_log.warning.assert_called_with(
+                'Exceeded retry limit of %d on allocations write', 0)
+
+        # Make sure the right thing happens after a small number of failures.
+        # There's a bit of mock magic going on here to enusre that we can
+        # both do some side effects on _set_allocations as well as have the
+        # real behavior. Two generation conflicts and then a success.
+        mock_log.reset_mock()
+        with mock.patch.object(rp_obj.AllocationList,
+                               'RP_CONFLICT_RETRY_COUNT', 3):
+            unmocked_set = functools.partial(
+                rp_obj.AllocationList._set_allocations, alloc_list)
+            with mock.patch(
+                'nova.objects.resource_provider.'
+                'AllocationList._set_allocations') as mock_set:
+                exceptions = iter([
+                    exception.ResourceProviderConcurrentUpdateDetected(),
+                    exception.ResourceProviderConcurrentUpdateDetected(),
+                ])
+
+                def side_effect(*args, **kwargs):
+                    try:
+                        raise next(exceptions)
+                    except StopIteration:
+                        return unmocked_set(*args, **kwargs)
+
+                mock_set.side_effect = side_effect
+                alloc_list.create_all()
+                self.assertEqual(2, mock_log.debug.call_count)
+                mock_log.debug.called_with(
+                    'Retrying allocations write on resource provider '
+                    'generation conflict')
+                self.assertEqual(3, mock_set.call_count)
+
+        # Confirm we're using a different rp object after the change
+        # and that it has a higher generation.
+        new_rp = alloc_list[0].resource_provider
+        self.assertEqual(original_generation, rp1.generation)
+        self.assertEqual(original_generation + 1, new_rp.generation)
+

 class UsageListTestCase(ResourceProviderBaseCase):