From 66a47b7623035cff38a12fdace9a325bbfdd9a14 Mon Sep 17 00:00:00 2001
From: Chris Dent
Date: Thu, 26 Jul 2018 12:00:14 +0100
Subject: [PATCH] [placement] Retry allocation writes server side

This change adds a fast retry loop around AllocationList._set_allocations
if a resource provider generation conflict happens. It turns out that
under high concurrency of allocation claims being made on the same
resource provider, conflicts can be quite common and client side retries
are insufficient.

Because both consumer generation and resource provider generation
conflicts raised the same exception there was no way to distinguish
between the two, so a child of ConcurrentUpdateDetected has been created
as ResourceProviderConcurrentUpdateDetected. In the future this will
allow us to send different error codes to the client as well, but that
change is not done here.

When the conflict is detected, all the resource providers in the
AllocationList are reloaded and the list objects refreshed. Logging is
provided to indicate:

* at debug that a retry is going to happen
* at warning that all the retries failed and the client is going to
  see the conflict

The tests for this are a bit funky: some mocks are used to cause the
conflicts, then the real behavior is restored after a couple of
iterations.

This was backported from 72e4c4c8d7fb146862b899337626485dad10f15b with
conflicts because exceptions, tests and object files were moved to new
locations with Rocky and the AllocationList.create_all method was
renamed to the more accurate replace_all. Prior to Rocky there were no
Consumer objects and fewer helper methods in functional tests, so the
test is adjusted accordingly.

Change-Id: Id614d609fc8f3ed2d2ff29a2b52143f53b3b1b9a
Closes-Bug: #1719933
(cherry picked from commit 72e4c4c8d7fb146862b899337626485dad10f15b)
---
 nova/exception.py                           |  5 +
 nova/objects/resource_provider.py           | 44 ++++++++-
 .../functional/db/test_resource_provider.py | 93 +++++++++++++++++++
 3 files changed, 138 insertions(+), 4 deletions(-)
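Reviewer note (kept between the diffstat and the first diff header,
outside the applied hunks): the create_all() change below hinges on
Python's while/else construct, where the else block runs only when the
loop exhausts its retries without hitting break. A minimal standalone
sketch of the same bounded-retry pattern, using hypothetical stand-ins
(GenerationConflict, set_allocations, reload_providers) rather than the
real Nova objects:

    RETRY_COUNT = 10

    class GenerationConflict(Exception):
        """Stand-in for ResourceProviderConcurrentUpdateDetected."""

    def write_with_retry(allocations, set_allocations, reload_providers):
        retries = RETRY_COUNT
        while retries:
            retries -= 1
            try:
                set_allocations(allocations)
                break  # Success: the while/else block below is skipped.
            except GenerationConflict:
                # Another writer bumped the provider generation; refresh
                # the provider objects and immediately try again.
                reload_providers(allocations)
        else:
            # Reached only when every retry failed without a break.
            raise GenerationConflict()

There is deliberately no sleep between attempts: the conflict is
resolved by rereading provider generations, not by waiting.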
diff --git a/nova/exception.py b/nova/exception.py
index 01499a1f17f9..b60f4e0c0e9d 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -2122,6 +2122,11 @@ class ConcurrentUpdateDetected(NovaException):
                 "Please retry your update")
 
 
+class ResourceProviderConcurrentUpdateDetected(ConcurrentUpdateDetected):
+    msg_fmt = _("Another thread concurrently updated the resource provider "
+                "data. Please retry your update")
+
+
 class ResourceClassNotFound(NotFound):
     msg_fmt = _("No such resource class %(resource_class)s.")
 
diff --git a/nova/objects/resource_provider.py b/nova/objects/resource_provider.py
index b81aa1805f00..7ce5ec9685e4 100644
--- a/nova/objects/resource_provider.py
+++ b/nova/objects/resource_provider.py
@@ -279,7 +279,7 @@ def _increment_provider_generation(ctx, rp):
 
     res = ctx.session.execute(upd_stmt)
     if res.rowcount != 1:
-        raise exception.ConcurrentUpdateDetected
+        raise exception.ResourceProviderConcurrentUpdateDetected()
 
     return new_generation
 
@@ -1992,6 +1992,10 @@ def _get_allocations_by_consumer_uuid(ctx, consumer_uuid):
 
 @base.NovaObjectRegistry.register_if(False)
 class AllocationList(base.ObjectListBase, base.NovaObject):
+    # The number of times to retry set_allocations if there has
+    # been a resource provider (not consumer) generation conflict.
+    RP_CONFLICT_RETRY_COUNT = 10
+
     fields = {
         'objects': fields.ListOfObjectsField('Allocation'),
     }
@@ -2139,9 +2143,41 @@ class AllocationList(base.ObjectListBase, base.NovaObject):
         returned to the caller, nor are their database ids set.
         If those ids are required use one of the get_all_by* methods.
         """
-        # TODO(jaypipes): Retry the allocation writes on
-        # ConcurrentUpdateDetected
-        self._set_allocations(self._context, self.objects)
+        # Retry _set_allocations server side if there is a
+        # ResourceProviderConcurrentUpdateDetected. We don't care about
+        # sleeping, we simply want to reset the resource provider objects
+        # and try again. For sake of simplicity (and because we don't have
+        # easy access to the information) we reload all the resource
+        # providers that may be present.
+        retries = self.RP_CONFLICT_RETRY_COUNT
+        while retries:
+            retries -= 1
+            try:
+                self._set_allocations(self._context, self.objects)
+                break
+            except exception.ResourceProviderConcurrentUpdateDetected:
+                LOG.debug('Retrying allocations write on resource provider '
+                          'generation conflict')
+                # We only want to reload each unique resource provider once.
+                alloc_rp_uuids = set(
+                    alloc.resource_provider.uuid for alloc in self.objects)
+                seen_rps = {}
+                for rp_uuid in alloc_rp_uuids:
+                    seen_rps[rp_uuid] = ResourceProvider.get_by_uuid(
+                        self._context, rp_uuid)
+                for alloc in self.objects:
+                    rp_uuid = alloc.resource_provider.uuid
+                    alloc.resource_provider = seen_rps[rp_uuid]
+        else:
+            # We ran out of retries so we need to raise again.
+            # The log will automatically have request id info associated with
+            # it that will allow tracing back to specific allocations.
+            # Attempting to extract specific consumer or resource provider
+            # information from the allocations is not coherent as this
+            # could be multiple consumers and providers.
+            LOG.warning('Exceeded retry limit of %d on allocations write',
+                        self.RP_CONFLICT_RETRY_COUNT)
+            raise exception.ResourceProviderConcurrentUpdateDetected()
 
     def delete_all(self):
         # Allocations can only have a single consumer, so take advantage of
diff --git a/nova/tests/functional/db/test_resource_provider.py b/nova/tests/functional/db/test_resource_provider.py
index 60ec04ca2574..adce2419ee3f 100644
--- a/nova/tests/functional/db/test_resource_provider.py
+++ b/nova/tests/functional/db/test_resource_provider.py
@@ -11,6 +11,7 @@
 #    under the License.
 
+import functools
 import mock
 import os_traits
 from oslo_db import exception as db_exc
@@ -42,6 +43,15 @@ DISK_ALLOCATION = dict(
 )
 
 
+def add_inventory(rp, rc, total, **kwargs):
+    kwargs.setdefault('max_unit', total)
+    inv = rp_obj.Inventory(rp._context, resource_provider=rp,
+                           resource_class=rc, total=total, **kwargs)
+    inv.obj_set_defaults()
+    rp.add_inventory(inv)
+    return inv
+
+
 class ResourceProviderBaseCase(test.NoDBTestCase):
 
     USES_DB_SELF = True
@@ -1866,6 +1876,89 @@ class TestAllocationListCreateDelete(ResourceProviderBaseCase):
             self.ctx, migration_uuid)
         self.assertEqual(0, len(allocations))
 
+    @mock.patch('nova.objects.resource_provider.LOG')
+    def test_set_allocations_retry(self, mock_log):
+        """Test server side allocation write retry handling."""
+
+        # Create a single resource provider and give it some inventory.
+        rp1 = rp_obj.ResourceProvider(
+            self.ctx, name='rp1', uuid=uuidsentinel.rp1)
+        rp1.create()
+        add_inventory(rp1, fields.ResourceClass.VCPU, 24,
+                      allocation_ratio=16.0)
+        add_inventory(rp1, fields.ResourceClass.MEMORY_MB, 1024,
+                      min_unit=64,
+                      max_unit=1024,
+                      step_size=64)
+        original_generation = rp1.generation
+        # Verify the generation is what we expect (we'll be checking again
+        # later).
+        self.assertEqual(2, original_generation)
+
+        inst_consumer = uuidsentinel.instance
+
+        alloc_list = rp_obj.AllocationList(context=self.ctx,
+            objects=[
+                rp_obj.Allocation(
+                    context=self.ctx,
+                    consumer_id=inst_consumer,
+                    resource_provider=rp1,
+                    resource_class=fields.ResourceClass.VCPU,
+                    used=12),
+                rp_obj.Allocation(
+                    context=self.ctx,
+                    consumer_id=inst_consumer,
+                    resource_provider=rp1,
+                    resource_class=fields.ResourceClass.MEMORY_MB,
+                    used=1024)
+            ])
+
+        # Make sure the right exception happens when the retry loop expires.
+        with mock.patch.object(rp_obj.AllocationList,
+                               'RP_CONFLICT_RETRY_COUNT', 0):
+            self.assertRaises(
+                exception.ResourceProviderConcurrentUpdateDetected,
+                alloc_list.create_all)
+            mock_log.warning.assert_called_with(
+                'Exceeded retry limit of %d on allocations write', 0)
+
+        # Make sure the right thing happens after a small number of failures.
+        # There's a bit of mock magic going on here to ensure that we can
+        # both do some side effects on _set_allocations as well as have the
+        # real behavior. Two generation conflicts and then a success.
+        mock_log.reset_mock()
+        with mock.patch.object(rp_obj.AllocationList,
+                               'RP_CONFLICT_RETRY_COUNT', 3):
+            unmocked_set = functools.partial(
+                rp_obj.AllocationList._set_allocations, alloc_list)
+            with mock.patch(
+                    'nova.objects.resource_provider.'
+                    'AllocationList._set_allocations') as mock_set:
+                exceptions = iter([
+                    exception.ResourceProviderConcurrentUpdateDetected(),
+                    exception.ResourceProviderConcurrentUpdateDetected(),
+                ])
+
+                def side_effect(*args, **kwargs):
+                    try:
+                        raise next(exceptions)
+                    except StopIteration:
+                        return unmocked_set(*args, **kwargs)
+
+                mock_set.side_effect = side_effect
+                alloc_list.create_all()
+                self.assertEqual(2, mock_log.debug.call_count)
+                mock_log.debug.assert_called_with(
+                    'Retrying allocations write on resource provider '
+                    'generation conflict')
+                self.assertEqual(3, mock_set.call_count)
+
+        # Confirm we're using a different rp object after the change
+        # and that it has a higher generation.
+        new_rp = alloc_list[0].resource_provider
+        self.assertEqual(original_generation, rp1.generation)
+        self.assertEqual(original_generation + 1, new_rp.generation)
+
 
 class UsageListTestCase(ResourceProviderBaseCase):
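-- 
Reviewer note: the side_effect trick in test_set_allocations_retry
generalizes to any "fail N times, then run the real code" mock. A
minimal sketch using only builtins; the flaky() helper and its names
are illustrative, not part of the patch:

    def flaky(real_call, failures):
        """Raise each queued exception once, then call the real thing."""
        errors = iter(failures)

        def side_effect(*args, **kwargs):
            try:
                # next() raises StopIteration once the queue is drained.
                raise next(errors)
            except StopIteration:
                return real_call(*args, **kwargs)

        return side_effect

In the test above, the equivalent of real_call is bound with
functools.partial before _set_allocations is patched; capturing the
unmocked method first is what lets the third call through to the real
implementation.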