Further fixes to check deduplication

This fixes a similar case as I86c159c82b858e67433bdaa1e479471b21ea8b86
but at a different point in the pipeline lifecycle.

In both cases, a queue item in check has completed and been
dequeued, while a second item that deduplicated one of its jobs
remains.

We store both the node request and nodeset information on the buildset
object.  When builds are finished, we clear the node request from the
buildset object.  Depending on timing, we may have information about the
nodeset while we may or may not have the request id.

We use the presence of nodeset information to decide whether to run a
job, but we pass the node request to the executor for it to lock the
nodeset.

This patch handles the case where we have deduplicated nodeset information
but have already cleared the node request id.  In that case, we would
attempt to run the job a second time while passing in None as the
node request id.

To correct this, if we learn that we are unable to restore information
about a deduplicated build, clear all of the nodeset and node request
information about that job from the secondary buildset so that when it
attempts to re-run the build, it starts from a clean slate.

This case may be a superset of the other case, which means we won't hit
the failsafe code added in I86c159c82b858e67433bdaa1e479471b21ea8b86.
That should be fine, and we will remove code related to both cases once
the circular dependency refactor is complete.

Change-Id: I95511763236d5a99c14477989d573bd5c1726b76
This commit is contained in:
James E. Blair 2023-11-21 14:49:07 -08:00
parent edbe5405e8
commit e6856c8f98
2 changed files with 127 additions and 2 deletions

View File

@ -2169,6 +2169,63 @@ class TestGerritCircularDependencies(ZuulTestCase):
dict(name="project2-job2", result="SUCCESS", changes="2,1 1,1"),
], ordered=False)
@simple_layout('layouts/job-dedup-multi-sched-complete.yaml')
def test_job_deduplication_multi_scheduler_complete2(self):
    """Verify that a second scheduler correctly refreshes deduplicated
    builds after the first scheduler has completed one item's builds.

    Similar to the previous test, but the second scheduler is started
    slightly later in the pipeline lifecycle.
    """
    self.executor_server.hold_jobs_in_build = True
    change_a = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
    change_b = self.fake_gerrit.addFakeChange('org/project2', 'master', 'B')
    # Make the two changes circularly dependent: A <-> B
    change_a.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
        change_a.subject, change_b.data["url"])
    change_b.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
        change_b.subject, change_a.data["url"])
    change_a.addApproval('Code-Review', 2)
    change_b.addApproval('Code-Review', 2)

    # Enqueue A, then let its shared job finish before B arrives.
    self.fake_gerrit.addEvent(change_a.addApproval('Approved', 1))
    self.waitUntilSettled()
    self.executor_server.release('common-job')
    self.waitUntilSettled()
    self.fake_gerrit.addEvent(change_b.addApproval('Approved', 1))
    self.waitUntilSettled()
    self.executor_server.release('project1-job')
    self.waitUntilSettled()

    # Bring up a second scheduler at this point in the lifecycle.
    second_sched = self.createScheduler()
    second_sched.start()
    self.assertEqual(len(self.scheds), 2)

    # Hold the run handler lock on the first scheduler so that only
    # the second scheduler acts on the remaining events.
    with self.scheds.first.sched.run_handler_lock:
        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release('project2-job2')
        self.waitUntilSettled(matcher=[second_sched])
        self.executor_server.release()
        self.waitUntilSettled(matcher=[second_sched])

    # common-job should have been "un-deduplicated".
    self.assertHistory([
        dict(name="project1-job", result="SUCCESS", changes="1,1 2,1"),
        dict(name="common-job", result="SUCCESS", changes="1,1 2,1"),
        dict(name="project2-job", result="SUCCESS", changes="1,1 2,1"),
        dict(name="project2-job2", result="SUCCESS", changes="1,1 2,1"),
    ], ordered=False)
@simple_layout('layouts/job-dedup-noop.yaml')
def test_job_deduplication_noop(self):
# Test that we don't deduplicate noop (there's no good reason
@ -2938,6 +2995,62 @@ class TestGerritCircularDependencies(ZuulTestCase):
dict(name="project2-job2", result="SUCCESS", changes="1,1 2,1"),
], ordered=False)
@simple_layout('layouts/job-dedup-multi-sched-complete.yaml')
def test_job_deduplication_check_multi_scheduler_complete2(self):
    """Verify that a second scheduler correctly refreshes deduplicated
    builds in check after the first scheduler has completed one item's
    builds.

    Similar to the previous test, but the second scheduler is started
    slightly later in the pipeline lifecycle.
    """
    self.executor_server.hold_jobs_in_build = True
    change_a = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
    change_b = self.fake_gerrit.addFakeChange('org/project2', 'master', 'B')
    # Make the two changes circularly dependent: A <-> B
    change_a.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
        change_a.subject, change_b.data["url"])
    change_b.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
        change_b.subject, change_a.data["url"])

    # Enqueue A in check, then let its shared job finish before B
    # arrives.
    self.fake_gerrit.addEvent(change_a.getPatchsetCreatedEvent(1))
    self.waitUntilSettled()
    self.executor_server.release('common-job')
    self.waitUntilSettled()
    self.fake_gerrit.addEvent(change_b.getPatchsetCreatedEvent(1))
    self.waitUntilSettled()
    self.executor_server.release('project1-job')
    self.waitUntilSettled()

    # Bring up a second scheduler at this point in the lifecycle.
    second_sched = self.createScheduler()
    second_sched.start()
    self.assertEqual(len(self.scheds), 2)

    # Hold the run handler lock on the first scheduler so that only
    # the second scheduler acts on the remaining events.
    with self.scheds.first.sched.run_handler_lock:
        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release('project2-job2')
        self.waitUntilSettled(matcher=[second_sched])
        self.executor_server.release()
        self.waitUntilSettled(matcher=[second_sched])

    # common-job should have been "un-deduplicated" and run once for
    # each item.
    self.assertHistory([
        dict(name="project1-job", result="SUCCESS", changes="2,1 1,1"),
        dict(name="common-job", result="SUCCESS", changes="2,1 1,1"),
        dict(name="project2-job", result="SUCCESS", changes="1,1 2,1"),
        dict(name="common-job", result="SUCCESS", changes="1,1 2,1"),
        dict(name="project2-job2", result="SUCCESS", changes="1,1 2,1"),
    ], ordered=False)
@simple_layout('layouts/job-dedup-noop.yaml')
def test_job_deduplication_check_noop(self):
# Test that we don't deduplicate noop (there's no good reason

View File

@ -922,6 +922,7 @@ class PipelineState(zkobject.ZKObject):
for build_job, build in buildset.builds.items():
if isinstance(build, BuildReference):
to_replace_dicts.append((item,
buildset,
buildset.builds,
build_job,
build._path))
@ -931,12 +932,14 @@ class PipelineState(zkobject.ZKObject):
for build in build_list:
if isinstance(build, BuildReference):
to_replace_lists.append((item,
None,
build_list,
build,
build._path))
else:
build_map[build.getPath()] = build
for (item, build_dict, build_job, build_path) in to_replace_dicts:
for (item, buildset, build_dict, build_job, build_path
) in to_replace_dicts:
orig_build = build_map.get(build_path)
if orig_build:
build_dict[build_job] = orig_build
@ -944,7 +947,16 @@ class PipelineState(zkobject.ZKObject):
log.warning("Unable to find deduplicated build %s for %s",
build_path, item)
del build_dict[build_job]
for (item, build_list, build, build_path) in to_replace_lists:
# We're not going to be able to use the results of
# this deduplication, which means we're going to try
# to launch the job again. To make sure that happens
# cleanly, go ahead and remove any nodeset information
# that we copied when we thought we were going to
# deduplicate it.
buildset.nodeset_info.pop(build_job, None)
buildset.node_requests.pop(build_job, None)
for (item, buildset, build_list, build, build_path
) in to_replace_lists:
idx = build_list.index(build)
orig_build = build_map.get(build_path)
if orig_build: