Split build/buildset list queries into two

The only way to get reasonable performance out of all three databases
(mariadb, mysql, postgres) appears to be to make the following changes:

* Use group-by instead of distinct.
  Some versions of mysql produce the wrong output with distinct;
  this is more descriptive of what we want anyway.
* Split the query into two statements instead of joining on a subquery.
  Mariadb seems to be unable to produce a good query plan for the
  buildset list query.  Neither mariadb nor mysql support using LIMIT
  inside a subquery that is used with the "IN" operator (so the idea
  of using IN with a subquery instead of JOIN is out).

These methods now perform a query to get the ids of the builds or
buildsets that match the criteria, then perform a second query to load
the ORM objects that match those ids.  This appears to be quite
fast for all three queries with the latest versions of all three database
systems.
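
For reference, a minimal sketch of the resulting two-query pattern. Build, BuildSet, the 'check' filter, and the in-memory SQLite engine below are simplified stand-ins for illustration, not Zuul's actual connection.buildModel classes:

    # A hedged sketch of the two-query approach; Build/BuildSet are
    # simplified stand-ins, not Zuul's ORM models.
    from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
    from sqlalchemy.orm import (Session, declarative_base, relationship,
                                selectinload)

    Base = declarative_base()

    class BuildSet(Base):
        __tablename__ = 'buildset'
        id = Column(Integer, primary_key=True)
        pipeline = Column(String(255))

    class Build(Base):
        __tablename__ = 'build'
        id = Column(Integer, primary_key=True)
        buildset_id = Column(Integer, ForeignKey('buildset.id'))
        buildset = relationship(BuildSet)

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        # Query 1: fetch only the matching ids; group_by stands in for
        # distinct, and limit/offset apply while the result is still narrow.
        id_query = (session.query(Build.id)
                    .join(BuildSet)
                    .filter(BuildSet.pipeline == 'check')
                    .group_by(Build.id)
                    .order_by(Build.id.desc())
                    .limit(50)
                    .offset(0))
        ids = [row[0] for row in id_query.all()]

        # Query 2: load the full ORM objects for exactly those ids.
        builds = (session.query(Build)
                  .options(selectinload(Build.buildset))
                  .filter(Build.id.in_(ids))
                  .order_by(Build.id.desc())
                  .all())

The first query returns only integers, so the planner can apply the ordering and limit on a narrow result set; the second query only loads the rows that will actually be returned.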

Change-Id: I30bb3214807dfa8b26a848f85bb7a7bc660c6c1d
James E. Blair 2024-04-08 16:02:40 -07:00
parent 014717bde6
commit 95bd778fcc
2 changed files with 96 additions and 52 deletions


@@ -2477,6 +2477,45 @@ class TestGerritCircularDependencies(ZuulTestCase):
     def test_job_deduplication_semaphore_resources_first(self):
         self._test_job_deduplication_semaphore()
 
+    def _merge_noop_pair(self):
+        A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
+        B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B')
+
+        # A <-> B
+        A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+            A.subject, B.data["url"]
+        )
+        B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+            B.subject, A.data["url"]
+        )
+
+        A.addApproval('Code-Review', 2)
+        B.addApproval('Code-Review', 2)
+        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+        self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
+        self.waitUntilSettled()
+
+        self.assertEqual(A.data['status'], 'MERGED')
+        self.assertEqual(B.data['status'], 'MERGED')
+
+    @simple_layout('layouts/job-dedup-noop.yaml')
+    def test_job_deduplication_buildsets(self):
+        self._merge_noop_pair()
+        self._merge_noop_pair()
+        self._merge_noop_pair()
+
+        buildsets = self.scheds.first.connections.connections[
+            'database'].getBuildsets()
+        self.assertEqual(len(buildsets), 3)
+
+        # If we accidentally limit by rows, we should get fewer than 2
+        # buildsets returned here.
+        buildsets = self.scheds.first.connections.connections[
+            'database'].getBuildsets(limit=2)
+        self.assertEqual(len(buildsets), 2)
+
     # Start check tests
 
     def _test_job_deduplication_check(self):
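
The limit=2 check above guards against the query limiting by joined rows rather than by buildsets. A hypothetical, self-contained illustration of the difference, using throwaway SQLAlchemy Core tables rather than Zuul's schema:

    # Each buildset joins to two refs, so the join yields two rows per
    # buildset; limiting the raw join counts rows, grouping first counts
    # buildsets.
    from sqlalchemy import (Column, ForeignKey, Integer, MetaData, Table,
                            create_engine, select)

    metadata = MetaData()
    buildset = Table('buildset', metadata,
                     Column('id', Integer, primary_key=True))
    buildset_ref = Table(
        'buildset_ref', metadata,
        Column('id', Integer, primary_key=True),
        Column('buildset_id', Integer, ForeignKey('buildset.id')))

    engine = create_engine('sqlite://')
    metadata.create_all(engine)
    with engine.begin() as conn:
        conn.execute(buildset.insert(), [{'id': i} for i in (1, 2, 3)])
        conn.execute(buildset_ref.insert(),
                     [{'buildset_id': i} for i in (1, 1, 2, 2, 3, 3)])

        j = buildset.join(buildset_ref)
        # Limit on the raw join: two rows may belong to a single buildset.
        rows = conn.execute(
            select(buildset.c.id).select_from(j).limit(2)).all()
        # Group first, then limit: two distinct buildset ids.
        sets = conn.execute(
            select(buildset.c.id).select_from(j)
            .group_by(buildset.c.id).limit(2)).all()
        print(len(set(r[0] for r in rows)), len(sets))  # e.g. 1 vs 2

group_by collapses the join to one row per buildset before the limit is applied, which is exactly what the limit=2 assertion checks.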


@@ -174,10 +174,10 @@ class DatabaseSession(object):
         provides_table = self.connection.zuul_provides_table
 
         q = self.session().query(
-            self.connection.buildModel.id.label('inner_id')).\
-            distinct().\
+            self.connection.buildModel.id).\
             join(self.connection.buildSetModel).\
-            join(self.connection.refModel)
+            join(self.connection.refModel).\
+            group_by(self.connection.buildModel.id)
 
         # If the query planner isn't able to reduce either the number
         # of rows returned by the buildset or build tables, then it
@@ -241,8 +241,26 @@ class DatabaseSession(object):
         else:
             q = q.order_by(build_table.c.id.desc())
-        q = q.limit(limit).offset(offset).subquery()
+        q = q.limit(limit).offset(offset)
+        if query_timeout:
+            # For MySQL, we can add a query hint directly.
+            q = q.prefix_with(
+                f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
+                dialect='mysql')
+            # For Postgres, we add a comment that we parse in our
+            # event handler.
+            q = q.with_statement_hint(
+                f'/* statement_timeout={query_timeout} */',
+                dialect_name='postgresql')
+        ids = [x[0] for x in q.all()]
+
+        # We use two queries here, principally because mariadb does
+        # not produce a reasonable plan if we combine these into one
+        # query with a join, and neither mysql nor mariadb support
+        # using limit inside a subquery that is used with the "in"
+        # operator. Therefore, we run a query to get the ids, then
+        # run a second to load the data.
 
         # contains_eager allows us to perform eager loading on the
         # buildset *and* use that table in filters (unlike
         # joinedload).
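
A side note on the timeout hints moved in this hunk: MySQL honors the MAX_EXECUTION_TIME optimizer hint on its own, while the Postgres comment carries no meaning to the server and only works because it is parsed in an event handler. A hedged sketch of what such a handler could look like; the listener below is an illustration under assumed names, not Zuul's exact implementation:

    import re
    from sqlalchemy import create_engine, event

    # Assumed DSN, for illustration only.
    engine = create_engine('postgresql://zuul:secret@localhost/zuul')

    TIMEOUT_COMMENT = re.compile(r'/\* statement_timeout=(\d+) \*/')

    @event.listens_for(engine, 'before_cursor_execute')
    def _apply_statement_timeout(conn, cursor, statement, parameters,
                                 context, executemany):
        # Translate the comment added via with_statement_hint() into a
        # transaction-scoped Postgres timeout before the query runs.
        match = TIMEOUT_COMMENT.search(statement)
        if match:
            cursor.execute(f'SET LOCAL statement_timeout={match.group(1)}')

SET LOCAL keeps the timeout scoped to the current transaction, so it does not leak into other statements on the same connection.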
@@ -254,6 +272,8 @@ class DatabaseSession(object):
                     orm.selectinload(self.connection.buildModel.provides),
                     orm.selectinload(self.connection.buildModel.artifacts))
+        bq = bq.filter(self.connection.buildModel.id.in_(ids))
+
         if sort_by_buildset:
             # If we don't need the builds to be strictly ordered, this
             # query can be much faster as it may avoid the use of a
@@ -262,19 +282,6 @@ class DatabaseSession(object):
         else:
             bq = bq.order_by(build_table.c.id.desc())
-        bq = bq.join(q, self.connection.buildModel.id == q.c.inner_id)
-        if query_timeout:
-            # For MySQL, we can add a query hint directly.
-            bq = bq.prefix_with(
-                f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
-                dialect='mysql')
-            # For Postgres, we add a comment that we parse in our
-            # event handler.
-            bq = bq.with_statement_hint(
-                f'/* statement_timeout={query_timeout} */',
-                dialect_name='postgresql')
 
         try:
             return bq.all()
         except sqlalchemy.orm.exc.NoResultFound:
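
On the contains_eager() options used throughout these queries: joinedload() joins against an anonymous alias purely for loading, so that alias cannot also drive filters or ordering, whereas contains_eager() populates the relationship from a join the query already spells out. A small sketch reusing the simplified Build/BuildSet stand-ins and session from the sketch after the commit message:

    from sqlalchemy.orm import contains_eager, joinedload

    # joinedload: the ORM adds its own aliased join just to populate
    # Build.buildset; filtering on BuildSet here would add a second,
    # independent join rather than constrain the eager-loaded one.
    builds = (session.query(Build)
              .options(joinedload(Build.buildset))
              .all())

    # contains_eager: join once explicitly, filter on that join, and tell
    # the ORM to fill Build.buildset from the same joined columns.
    builds = (session.query(Build)
              .join(Build.buildset)
              .filter(BuildSet.pipeline == 'gate')
              .options(contains_eager(Build.buildset))
              .all())

That is why the patched queries keep their explicit join() chains and reserve selectinload() for collections that are never filtered on.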
@@ -326,11 +333,11 @@ class DatabaseSession(object):
         buildset_table = self.connection.zuul_buildset_table
 
         q = self.session().query(
-            self.connection.buildModel.id.label('inner_id'),
-            self.connection.buildModel.end_time.label('inner_end_time')).\
-            distinct().\
+            self.connection.buildModel.id,
+            self.connection.buildModel.end_time).\
             join(self.connection.buildSetModel).\
-            join(self.connection.refModel)
+            join(self.connection.refModel).\
+            group_by(self.connection.buildModel.id)
 
         # See note above about the hint.
         if not (project):
@@ -357,9 +364,20 @@ class DatabaseSession(object):
         q = q.order_by(build_table.c.end_time.desc()).\
             limit(limit).\
-            offset(offset).\
-            subquery()
+            offset(offset)
+        if query_timeout:
+            # For MySQL, we can add a query hint directly.
+            q = q.prefix_with(
+                f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
+                dialect='mysql')
+            # For Postgres, we add a comment that we parse in our
+            # event handler.
+            q = q.with_statement_hint(
+                f'/* statement_timeout={query_timeout} */',
+                dialect_name='postgresql')
+        ids = [x[0] for x in q.all()]
 
         # contains_eager allows us to perform eager loading on the
         # buildset *and* use that table in filters (unlike
         # joinedload).
@@ -367,21 +385,9 @@ class DatabaseSession(object):
             join(self.connection.buildSetModel).\
             join(self.connection.refModel).\
             options(orm.contains_eager(self.connection.buildModel.buildset),
-                    orm.contains_eager(self.connection.buildModel.ref))
-        bq = bq.order_by(build_table.c.id.desc())
-        if query_timeout:
-            # For MySQL, we can add a query hint directly.
-            bq = bq.prefix_with(
-                f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
-                dialect='mysql')
-            # For Postgres, we add a comment that we parse in our
-            # event handler.
-            bq = bq.with_statement_hint(
-                f'/* statement_timeout={query_timeout} */',
-                dialect_name='postgresql')
-        bq = bq.join(q, self.connection.buildModel.id == q.c.inner_id)
+                    orm.contains_eager(self.connection.buildModel.ref)).\
+            filter(self.connection.buildModel.id.in_(ids)).\
+            order_by(build_table.c.end_time.desc())
 
         try:
             return bq.all()
@@ -430,10 +436,10 @@ class DatabaseSession(object):
         ref_table = self.connection.zuul_ref_table
 
         q = self.session().query(
-            self.connection.buildSetModel.id.label('inner_id')).\
-            distinct().\
+            self.connection.buildSetModel.id).\
             join(self.connection.buildSetRefModel).\
-            join(self.connection.refModel)
+            join(self.connection.refModel).\
+            group_by(self.connection.buildSetModel.id)
 
         q = self.listFilter(q, buildset_table.c.tenant, tenant)
         q = self.listFilterFuzzy(q, buildset_table.c.pipeline, pipeline)
@@ -459,27 +465,26 @@ class DatabaseSession(object):
             q = q.filter(buildset_table.c.updated < updated_max)
 
         q = q.order_by(buildset_table.c.id.desc()).\
             limit(limit).\
-            offset(offset).\
-            subquery()
-
-        bq = self.session().query(self.connection.buildSetModel).\
-            join(self.connection.buildSetRefModel).\
-            join(self.connection.refModel).\
-            options(orm.contains_eager(self.connection.buildSetModel.refs))
-        bq = bq.order_by(buildset_table.c.id.desc())
-
+            offset(offset)
         if query_timeout:
             # For MySQL, we can add a query hint directly.
-            bq = bq.prefix_with(
+            q = q.prefix_with(
                 f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
                 dialect='mysql')
             # For Postgres, we add a comment that we parse in our
             # event handler.
-            bq = bq.with_statement_hint(
+            q = q.with_statement_hint(
                 f'/* statement_timeout={query_timeout} */',
                 dialect_name='postgresql')
-        bq = bq.join(q, self.connection.buildSetModel.id == q.c.inner_id)
+        ids = [x[0] for x in q.all()]
+
+        bq = self.session().query(self.connection.buildSetModel).\
+            join(self.connection.buildSetRefModel).\
+            join(self.connection.refModel).\
+            options(orm.contains_eager(self.connection.buildSetModel.refs)).\
+            filter(self.connection.buildSetModel.id.in_(ids)).\
+            order_by(buildset_table.c.id.desc())
 
         try:
             return bq.all()