From c5e6f5cefe3ee33c65e0007a18241ab7e197b1d7 Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Mon, 9 Apr 2018 16:02:45 +0200 Subject: [PATCH] Fix missing semaphore release on zk error During problems with zk connectivity jobs can fail locking nodes [1]. In this case the build doesn't get created and attached to the queue item. However semaphores are already aquired at this point and don't get released in this case. Fix this by releasing the semaphore when hitting this exception. [1] Trace: 2018-04-05 10:56:56,936 ERROR zuul.Pipeline.example.check: Exception while executing job example-test for change : Traceback (most recent call last): File "/usr/lib/python3.6/site-packages/zuul/manager/__init__.py", line 396, in _executeJobs self.sched.nodepool.useNodeSet(nodeset) File "/usr/lib/python3.6/site-packages/zuul/nodepool.py", line 117, in useNodeSet self.sched.zk.storeNode(node) File "/usr/lib/python3.6/site-packages/zuul/zk.py", line 213, in storeNode self.client.set(path, self._dictToStr(node.toDict())) File "/usr/lib/python3.6/site-packages/kazoo/client.py", line 1242, in set return self.set_async(path, value, version).get() File "/usr/lib/python3.6/site-packages/kazoo/handlers/utils.py", line 79, in get raise self._exception kazoo.exceptions.NoNodeError Change-Id: I851876ece318aa047e523c50f4c721417d1af6b7 --- .../semaphore/git/common-config/zuul.yaml | 15 ++++++++++ .../config/semaphore/git/org_project2/README | 1 + tests/fixtures/config/semaphore/main.yaml | 1 + tests/unit/test_scheduler.py | 30 +++++++++++++++++++ zuul/manager/__init__.py | 8 +++++ 5 files changed, 55 insertions(+) create mode 100644 tests/fixtures/config/semaphore/git/org_project2/README diff --git a/tests/fixtures/config/semaphore/git/common-config/zuul.yaml b/tests/fixtures/config/semaphore/git/common-config/zuul.yaml index 52a0e7d61d..600543cdb4 100644 --- a/tests/fixtures/config/semaphore/git/common-config/zuul.yaml +++ b/tests/fixtures/config/semaphore/git/common-config/zuul.yaml @@ -47,6 +47,15 @@ semaphore: test-semaphore-two run: playbooks/semaphore-two-test2.yaml +- job: + name: semaphore-one-test3 + semaphore: test-semaphore + run: playbooks/semaphore-two-test1.yaml + nodeset: + nodes: + - name: controller + label: label1 + - project: name: org/project check: @@ -62,3 +71,9 @@ - project-test1 - semaphore-two-test1 - semaphore-two-test2 + +- project: + name: org/project2 + check: + jobs: + - semaphore-one-test3 diff --git a/tests/fixtures/config/semaphore/git/org_project2/README b/tests/fixtures/config/semaphore/git/org_project2/README new file mode 100644 index 0000000000..9daeafb986 --- /dev/null +++ b/tests/fixtures/config/semaphore/git/org_project2/README @@ -0,0 +1 @@ +test diff --git a/tests/fixtures/config/semaphore/main.yaml b/tests/fixtures/config/semaphore/main.yaml index 5f57245ccb..83ed0925a9 100644 --- a/tests/fixtures/config/semaphore/main.yaml +++ b/tests/fixtures/config/semaphore/main.yaml @@ -7,3 +7,4 @@ untrusted-projects: - org/project - org/project1 + - org/project2 diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index 675c2c6f6f..39a13241e1 100755 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -22,6 +22,7 @@ import os import shutil import time from unittest import skip +from kazoo.exceptions import NoNodeError import git import testtools @@ -5442,6 +5443,35 @@ class TestSemaphore(ZuulTestCase): self.assertEqual(A.reported, 1) self.assertEqual(B.reported, 1) + def test_semaphore_zk_error(self): + "Test semaphore release with zk error" + tenant = self.sched.abide.tenants.get('tenant-one') + + A = self.fake_gerrit.addFakeChange('org/project2', 'master', 'A') + self.assertFalse('test-semaphore' in + tenant.semaphore_handler.semaphores) + + # Simulate a single zk error in useNodeSet + orig_useNodeSet = self.nodepool.useNodeSet + + def broken_use_nodeset(nodeset): + # restore original useNodeSet + self.nodepool.useNodeSet = orig_useNodeSet + raise NoNodeError() + + self.nodepool.useNodeSet = broken_use_nodeset + + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + # The semaphore should be released + self.assertFalse('test-semaphore' in + tenant.semaphore_handler.semaphores) + + # cleanup the queue + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + def test_semaphore_abandon(self): "Test abandon with job semaphores" self.executor_server.hold_jobs_in_build = True diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index c1bec2d4b9..c29dedac02 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -399,6 +399,14 @@ class PipelineManager(object): except Exception: self.log.exception("Exception while executing job %s " "for change %s:" % (job, item.change)) + try: + # If we hit an exception we don't have a build in the + # current item so a potentially aquired semaphore must be + # released as it won't be released on dequeue of the item. + tenant = item.pipeline.layout.tenant + tenant.semaphore_handler.release(item, job) + except Exception: + self.log.exception("Exception while releasing semaphore") def executeJobs(self, item): # TODO(jeblair): This should return a value indicating a job