From 28b3477fddd3f2201699712e9451ae8404e36d58 Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Sun, 3 Feb 2019 09:31:39 +0100 Subject: [PATCH] Reset dependent jobs when aborting paused job A paused job often holds temporary artifacts that the child jobs rely on. If a paused job gets aborted (e.g. by an executor restart) all child jobs must be canceled and restarted regardless of their state. Otherwise this can cause weird and hard to debug failure symptoms in the child jobs. Change-Id: I7adf64d5121891f038c773245bcd5561f9b86020 --- tests/base.py | 18 ++++++- tests/unit/test_v3.py | 111 +++++++++++++++++++++++++++++++++++++++ zuul/executor/server.py | 2 + zuul/manager/__init__.py | 17 +++++- 4 files changed, 144 insertions(+), 4 deletions(-) diff --git a/tests/base.py b/tests/base.py index 3d7c457383..b9fd2ae330 100644 --- a/tests/base.py +++ b/tests/base.py @@ -1357,6 +1357,7 @@ class FakeBuild(object): self.name = self.parameters['job'] self.wait_condition = threading.Condition() self.waiting = False + self.paused = False self.aborted = False self.requeue = False self.created = time.time() @@ -1530,7 +1531,9 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob): if self.executor_server._run_ansible: # Call run on the fake build omitting the result so we also can # hold real ansible jobs. - build.run() + if playbook.path: + build.run() + result = super(RecordingAnsibleJob, self).runAnsible( cmd, timeout, playbook, wrapped) else: @@ -1554,6 +1557,17 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob): host_keys=[])) return hosts + def pause(self): + build = self.executor_server.job_builds[self.job.unique] + build.paused = True + super().pause() + + def resume(self): + build = self.executor_server.job_builds.get(self.job.unique) + if build: + build.paused = False + super().resume() + class RecordingExecutorServer(zuul.executor.server.ExecutorServer): """An Ansible executor to be used in tests. 
@@ -3055,7 +3069,7 @@ class ZuulTestCase(BaseTestCase): worker_build = self.executor_server.job_builds.get( server_job.unique.decode('utf8')) if worker_build: - if worker_build.isWaiting(): + if worker_build.isWaiting() or worker_build.paused: continue else: self.log.debug("%s is running" % worker_build) diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py index 0f697e3452..c144c57ec6 100644 --- a/tests/unit/test_v3.py +++ b/tests/unit/test_v3.py @@ -4631,6 +4631,117 @@ class TestJobPause(AnsibleZuulTestCase): self.assertEqual('compile1', history_compile1.name) self.assertEqual('compile2', history_compile2.name) + def test_job_pause_retry(self): + """ + Tests that a paused job that gets lost due to an executor restart is + retried together with all child jobs. + + This test will wait until compile1 is paused and then fails it. The + expectation is that all child jobs are retried even if they already + were successful. + + compile1 --+ + +--> test1-after-compile1 + +--> test2-after-compile1 + +--> compile2 --+ + +--> test-after-compile2 + test-good + test-fail + """ + self.wait_timeout = 120 + + self.executor_server.hold_jobs_in_build = True + + # Output extra ansible info so we might see errors. + self.executor_server.verbose = True + self.executor_server.keep_jobdir = True + + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + self.executor_server.release('test-.*') + self.executor_server.release('compile1') + self.waitUntilSettled() + + # test-fail and test-good must be finished by now + self.assertHistory([ + dict(name='test-fail', result='FAILURE', changes='1,1'), + dict(name='test-good', result='SUCCESS', changes='1,1'), + ], ordered=False) + + # Further, compile1 must be in paused state and its three children in + # the queue. waitUntilSettled can return either directly after the job + # pause or after the child jobs are enqueued. 
So to make this + # deterministic we wait for the child jobs here + for _ in iterate_timeout(30, 'waiting for child jobs'): + if len(self.builds) == 4: + break + self.waitUntilSettled() + + compile1 = self.builds[0] + self.assertTrue(compile1.paused) + + # Now resume the compile2 sub tree so we can later check if all + # children restarted + self.executor_server.release('compile2') + for _ in iterate_timeout(30, 'waiting for child jobs'): + if len(self.builds) == 5: + break + self.waitUntilSettled() + self.executor_server.release('test-after-compile2') + self.waitUntilSettled() + self.executor_server.release('compile2') + self.waitUntilSettled() + self.assertHistory([ + dict(name='test-fail', result='FAILURE', changes='1,1'), + dict(name='test-good', result='SUCCESS', changes='1,1'), + dict(name='compile2', result='SUCCESS', changes='1,1'), + dict(name='test-after-compile2', result='SUCCESS', changes='1,1'), + ], ordered=False) + + # Stop the job worker of compile1 to simulate an executor restart + for job_worker in self.executor_server.job_workers.values(): + if job_worker.job.unique == compile1.unique: + job_worker.stop() + self.waitUntilSettled() + + # All still running child jobs must be aborted + self.assertHistory([ + dict(name='test-fail', result='FAILURE', changes='1,1'), + dict(name='test-good', result='SUCCESS', changes='1,1'), + dict(name='compile2', result='SUCCESS', changes='1,1'), + dict(name='test-after-compile2', result='SUCCESS', changes='1,1'), + dict(name='compile1', result='ABORTED', changes='1,1'), + dict(name='test1-after-compile1', result='ABORTED', changes='1,1'), + dict(name='test2-after-compile1', result='ABORTED', changes='1,1'), + ], ordered=False) + + # Only compile1 must be waiting + for _ in iterate_timeout(30, 'waiting for compile1 job'): + if len(self.builds) == 1: + break + + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + self.waitUntilSettled() + + self.assertHistory([ + 
dict(name='test-fail', result='FAILURE', changes='1,1'), + dict(name='test-good', result='SUCCESS', changes='1,1'), + dict(name='compile2', result='SUCCESS', changes='1,1'), + dict(name='compile2', result='SUCCESS', changes='1,1'), + dict(name='test-after-compile2', result='SUCCESS', changes='1,1'), + dict(name='test-after-compile2', result='SUCCESS', changes='1,1'), + dict(name='compile1', result='ABORTED', changes='1,1'), + dict(name='compile1', result='SUCCESS', changes='1,1'), + dict(name='test1-after-compile1', result='ABORTED', changes='1,1'), + dict(name='test2-after-compile1', result='ABORTED', changes='1,1'), + dict(name='test1-after-compile1', result='SUCCESS', changes='1,1'), + dict(name='test2-after-compile1', result='SUCCESS', changes='1,1'), + ], ordered=False) + def test_job_node_failure_resume(self): self.wait_timeout = 120 diff --git a/zuul/executor/server.py b/zuul/executor/server.py index bf1745135e..c60db8ec6d 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -1172,6 +1172,8 @@ class AnsibleJob(object): pause = result_data.get('zuul', {}).get('pause') if pause: self.pause() + if self.aborted: + return 'ABORTED' post_timeout = args['post_timeout'] unreachable = False diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index 95922e7113..3e0672b975 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -794,6 +794,15 @@ class PipelineManager(object): self.sched.executor.resumeBuild(build) build.paused = False + def _resetDependentBuilds(self, build_set, build): + jobgraph = build_set.item.job_graph + + for job in jobgraph.getDependentJobsRecursively(build.job.name): + self.sched.cancelJob(build_set, job) + build = build_set.getBuild(job.name) + if build: + build_set.removeBuild(build) + def onBuildCompleted(self, build): item = build.build_set.item @@ -804,8 +813,12 @@ class PipelineManager(object): self.log.debug("Item %s status is now:\n %s" % (item, item.formatStatus())) - if build.retry and 
build.build_set.getJobNodeSet(build.job.name): - build.build_set.removeJobNodeSet(build.job.name) + if build.retry: + if build.build_set.getJobNodeSet(build.job.name): + build.build_set.removeJobNodeSet(build.job.name) + + # in case this was a paused build we need to retry all child jobs + self._resetDependentBuilds(build.build_set, build) self._resumeBuilds(build.build_set) return True