From 8f9aff7ca9795dcea3d20cabd7ac5c96962cd87e Mon Sep 17 00:00:00 2001 From: Simon Westphahl Date: Thu, 11 Oct 2018 13:20:56 +0200 Subject: [PATCH] Check paused parent on node failure of child job When all or the last child jobs of a paused job have failed because of a node failure we have to resume the parent. Otherwise the buildset will hang indefinitely. Change-Id: If4bea8b7b2d3395ec33aef3bdccce2fcd0b17413 --- .../org_project2/playbooks/just-pause.yaml | 7 ++++++ .../playbooks/test-node-failure.yaml | 4 ++++ .../job-pause/git/org_project2/zuul.yaml | 24 +++++++++++++++++++ tests/fixtures/config/job-pause/main.yaml | 1 + tests/unit/test_v3.py | 20 ++++++++++++++++ zuul/manager/__init__.py | 1 + 6 files changed, 57 insertions(+) create mode 100644 tests/fixtures/config/job-pause/git/org_project2/playbooks/just-pause.yaml create mode 100644 tests/fixtures/config/job-pause/git/org_project2/playbooks/test-node-failure.yaml create mode 100644 tests/fixtures/config/job-pause/git/org_project2/zuul.yaml diff --git a/tests/fixtures/config/job-pause/git/org_project2/playbooks/just-pause.yaml b/tests/fixtures/config/job-pause/git/org_project2/playbooks/just-pause.yaml new file mode 100644 index 0000000000..9d6f2dae80 --- /dev/null +++ b/tests/fixtures/config/job-pause/git/org_project2/playbooks/just-pause.yaml @@ -0,0 +1,7 @@ +- hosts: all + tasks: + - name: Pause and let child run + zuul_return: + data: + zuul: + pause: true diff --git a/tests/fixtures/config/job-pause/git/org_project2/playbooks/test-node-failure.yaml b/tests/fixtures/config/job-pause/git/org_project2/playbooks/test-node-failure.yaml new file mode 100644 index 0000000000..bd661afdb2 --- /dev/null +++ b/tests/fixtures/config/job-pause/git/org_project2/playbooks/test-node-failure.yaml @@ -0,0 +1,4 @@ +- hosts: all + tasks: + - debug: + msg: "This should not be executed" diff --git a/tests/fixtures/config/job-pause/git/org_project2/zuul.yaml b/tests/fixtures/config/job-pause/git/org_project2/zuul.yaml new file 
mode 100644 index 0000000000..284ddb0ec5 --- /dev/null +++ b/tests/fixtures/config/job-pause/git/org_project2/zuul.yaml @@ -0,0 +1,24 @@ +- job: + name: just-pause + run: playbooks/just-pause.yaml + nodeset: + nodes: + - name: fake + label: fake + +- job: + name: test-node-failure + run: playbooks/test-node-failure.yaml + nodeset: + nodes: + - name: fail + label: fail + + +- project: + check: + jobs: + - just-pause + - test-node-failure: + dependencies: + - just-pause diff --git a/tests/fixtures/config/job-pause/main.yaml b/tests/fixtures/config/job-pause/main.yaml index a2130154fd..d4d7c324af 100644 --- a/tests/fixtures/config/job-pause/main.yaml +++ b/tests/fixtures/config/job-pause/main.yaml @@ -5,3 +5,4 @@ config-projects: - common-config - org/project + - org/project2 diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py index e203059b9a..5bef1238f2 100644 --- a/tests/unit/test_v3.py +++ b/tests/unit/test_v3.py @@ -4216,3 +4216,23 @@ class TestJobPause(AnsibleZuulTestCase): history_compile2 = self.history[-2] self.assertEqual('compile1', history_compile1.name) self.assertEqual('compile2', history_compile2.name) + + def test_job_node_failure_resume(self): + self.wait_timeout = 120 + + # Output extra ansible info so we might see errors. 
+ self.executor_server.verbose = True + + # Second node request should fail + fail = {'_oid': '200-0000000001'} + self.fake_nodepool.addFailRequest(fail) + + A = self.fake_gerrit.addFakeChange('org/project2', 'master', 'A') + + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + self.assertEqual([], self.builds) + self.assertHistory([ + dict(name='just-pause', result='SUCCESS', changes='1,1'), + ], ordered=False) diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index 4d16042b0b..a21411bba4 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -799,6 +799,7 @@ class PipelineManager(object): self.log.info("Node request %s: failure for %s" % (request, request.job.name,)) build_set.item.setNodeRequestFailure(request.job) + self._resumeParents(request) self.log.info("Completed node request %s for job %s of item %s " "with nodes %s" % (request, request.job, build_set.item,