Always put 'uuid' into sort_keys for stable instance lists

If we're listing by sort keys that yield many ambiguous (equal-valued)
results, we can exacerbate client pagination problems because we're not
even bound by insertion order, given that multiple cell databases are
queried in parallel. So, even if the client didn't ask for it, append
'uuid' to the end of sort_keys to give us a stable ordering. The default
case was already covered by always including 'id' in the default set of
sort_keys, but a user could still hit this by requesting their own keys.
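
To make the failure mode concrete, here is a minimal sketch in plain
Python (not Nova code; the records, cell names, and helpers are invented
for illustration): when every row shares the same created_at, merging
per-cell results on that key alone yields an arbitrary interleaving,
while appending 'uuid' as a final key makes the order, and therefore any
pagination marker, repeatable.

import heapq
import uuid

def make_cell(name, count, created_at='1985-10-25T01:21:00'):
    # Every record shares the same created_at, as instances created in a
    # burst often do, so created_at alone cannot break ties.
    return [{'created_at': created_at, 'uuid': str(uuid.uuid4()), 'cell': name}
            for _ in range(count)]

cell1 = make_cell('cell1', 3)
cell2 = make_cell('cell2', 3)

def tiebreak(rec):
    # Sort by created_at first, then by the globally unique uuid.
    return (rec['created_at'], rec['uuid'])

# Merging on created_at alone: every record ties, so the result is whatever
# interleaving the merge happens to produce; a marker taken from this list
# does not identify a unique resume point for the next page.
ambiguous = list(heapq.merge(cell1, cell2, key=lambda r: r['created_at']))
print([r['cell'] for r in ambiguous])

# With 'uuid' appended as a final sort key, ties are broken the same way on
# every request, so the merged order (and any marker within it) is stable.
stable = list(heapq.merge(sorted(cell1, key=tiebreak),
                          sorted(cell2, key=tiebreak),
                          key=tiebreak))
assert stable == sorted(cell1 + cell2, key=tiebreak)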

Note this also removes the recently-added explicit sort in the
test_bug_1689692 case, since we're enforcing a strict ordering with
this patch. Also, mriedem is awesome.

Change-Id: Ida446acb1286a8b215451a5d8d7d23882643ef13
Closes-Bug: #1721791
Author: Dan Smith
Date:   2017-10-06 07:40:42 -07:00
Parent: 00bbc2ffd3
Commit: add69c0507
3 changed files with 51 additions and 6 deletions


@@ -114,6 +114,17 @@ def get_instances_sorted(ctx, filters, limit, marker, columns_to_join,
sort_keys = ['created_at', 'id']
sort_dirs = ['asc', 'asc']
if 'uuid' not in sort_keys:
# Historically the default sort includes 'id' (see above), which
# should give us a stable ordering. Since we're striping across
# cell databases here, many sort_keys arrangements will yield
# nothing unique across all the databases to give us a stable
# ordering, which can mess up expected client pagination behavior.
# So, throw uuid into the sort_keys at the end if it's not already
# there to keep us repeatable.
sort_keys = copy.copy(sort_keys) + ['uuid']
sort_dirs = copy.copy(sort_dirs) + ['asc']
sort_ctx = InstanceSortContext(sort_keys, sort_dirs)
if marker:
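
The hunk above builds new sort_keys/sort_dirs lists instead of appending
in place. A hedged illustration of that pattern (the helper name here is
invented for this sketch and is not a Nova API): callers who passed in
their own lists, or shared defaults, never see them mutated behind their
backs.

def ensure_stable_sort(sort_keys, sort_dirs, tiebreaker='uuid'):
    """Return sort_keys/sort_dirs that end with a unique tiebreaker key."""
    if tiebreaker in sort_keys:
        return sort_keys, sort_dirs
    # Build new lists rather than calling .append(), so the lists the
    # caller passed in are left untouched.
    return sort_keys + [tiebreaker], sort_dirs + ['asc']

keys, dirs = ensure_stable_sort(['created_at'], ['asc'])
assert keys == ['created_at', 'uuid'] and dirs == ['asc', 'asc']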


@@ -31,7 +31,8 @@ class InstanceListTestCase(test.TestCase):
self.num_instances = 3
self.instances = []
- dt = datetime.datetime(1985, 10, 25, 1, 21, 0)
+ start = datetime.datetime(1985, 10, 25, 1, 21, 0)
+ dt = start
spread = datetime.timedelta(minutes=10)
self.cells = objects.CellMappingList.get_all(self.context)
@@ -45,6 +46,7 @@ class InstanceListTestCase(test.TestCase):
context=cctx,
project_id=self.context.project_id,
user_id=self.context.user_id,
created_at=start,
launched_at=dt,
instance_type_id=i,
hostname='%s-inst%i' % (cell.name, i))
@@ -164,6 +166,11 @@ class InstanceListTestCase(test.TestCase):
break
insts.extend(batch)
page += 1
if page > len(self.instances) * 2:
# Do this sanity check in case we introduce (or find) another
# repeating page bug like #1721791. Without this we loop
# until timeout, which is less obvious.
raise Exception('Infinite paging loop')
# We should have requested exactly the expected number of pages (or one
# more in the unlimited case)
self.assertIn(page, (pages, pages + 1))
@@ -172,11 +179,12 @@ class InstanceListTestCase(test.TestCase):
found = [x[sort_by] for x in insts]
had = [x[sort_by] for x in self.instances]
- if sort_by == 'launched_at':
+ if sort_by in ('launched_at', 'created_at'):
# We're comparing objects and database entries, so we need to
# squash the tzinfo of the object ones so we can compare
had = [x.replace(tzinfo=None) for x in had]
self.assertEqual(len(had), len(found))
self.assertEqual(sorted(had), found)
def test_get_sorted_with_limit_marker_stable(self):
@@ -219,6 +227,14 @@ class InstanceListTestCase(test.TestCase):
"""
self._test_get_sorted_with_limit_marker(sort_by='launched_at')
def test_get_sorted_with_limit_marker_datetime_same(self):
"""Test sorted by created_at.
This tests that we can do all of this, but with datetime
fields that are identical.
"""
self._test_get_sorted_with_limit_marker(sort_by='created_at')
def test_get_sorted_with_deleted_marker(self):
marker = self.instances[1]['uuid']
@@ -376,7 +392,8 @@ class TestInstanceListObjects(test.TestCase):
self.num_instances = 3
self.instances = []
- dt = datetime.datetime(1985, 10, 25, 1, 21, 0)
+ start = datetime.datetime(1985, 10, 25, 1, 21, 0)
+ dt = start
spread = datetime.timedelta(minutes=10)
cells = objects.CellMappingList.get_all(self.context)
@@ -390,6 +407,7 @@ class TestInstanceListObjects(test.TestCase):
context=cctx,
project_id=self.context.project_id,
user_id=self.context.user_id,
created_at=start,
launched_at=dt,
instance_type_id=i,
hostname='%s-inst%i' % (cell.name, i))
@@ -451,3 +469,20 @@ class TestInstanceListObjects(test.TestCase):
# actual faults
self.assertEqual(2, len([inst for inst in insts
if inst.fault]))
def test_get_instance_objects_sorted_paged(self):
"""Query a full first page and ensure an empty second one.
This uses created_at which is enforced to be the same across
each instance by setUp(). This will help make sure we still
have a stable ordering, even when we only claim to care about
created_at.
"""
instp1 = instance_list.get_instance_objects_sorted(
self.context, {}, None, None, [],
['created_at'], ['asc'])
self.assertEqual(len(self.instances), len(instp1))
instp2 = instance_list.get_instance_objects_sorted(
self.context, {}, None, instp1[-1]['uuid'], [],
['created_at'], ['asc'])
self.assertEqual(0, len(instp2))
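
The functional tests above page through the instance list with a marker
until they get an empty page, guarding against the repeating-page symptom
of bug #1721791. A self-contained sketch of that pattern in plain Python
(fetch_page and fake_fetch are invented stand-ins, not Nova code):

def page_all(fetch_page, limit, max_pages):
    # Collect every record by walking pages via a marker, bailing out if
    # paging never terminates (e.g. because a page keeps repeating).
    results = []
    marker = None
    pages = 0
    while True:
        batch = fetch_page(limit=limit, marker=marker)
        if not batch:
            break
        results.extend(batch)
        marker = batch[-1]['uuid']
        pages += 1
        if pages > max_pages:
            raise Exception('Infinite paging loop')
    return results

data = [{'uuid': str(i)} for i in range(5)]

def fake_fetch(limit, marker):
    # Serve slices of 'data', resuming just after the marker record.
    start = 0 if marker is None else [d['uuid'] for d in data].index(marker) + 1
    return data[start:start + limit]

assert page_all(fake_fetch, limit=2, max_pages=10) == data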


@@ -71,14 +71,13 @@ class ServerListLimitMarkerCell0Test(test.TestCase,
self.addCleanup(self.api.delete_server, server['id'])
self._wait_for_state_change(self.api, server, 'ERROR')
- servers = self.api.get_servers(search_opts=dict(sort_key='uuid'))
+ servers = self.api.get_servers()
self.assertEqual(3, len(servers))
# Take the first server and use that as our marker.
marker = servers[0]['id']
# Since we're paging after the first server as our marker, there are
# only two left, so specifying three should just return two.
- servers = self.api.get_servers(search_opts=dict(sort_key='uuid',
- marker=marker,
+ servers = self.api.get_servers(search_opts=dict(marker=marker,
limit=3))
self.assertEqual(2, len(servers))