Fix builds queued forever after failure to get node request

When accepting nodes, we check whether the node request still exists. If this check throws a ZK exception, the exception is only caught at the run_handler level [1]. This leaves the build waiting forever in the queued state, which is only recoverable by rebasing the change or restarting the scheduler. Furthermore, we don't have to ask ZK for an empty node request. [1] Trace: 2018-01-23 16:38:57,285 ERROR zuul.Scheduler: Exception in run handler: Traceback (most recent call last): File "/usr/lib/python3.6/site-packages/zuul/scheduler.py", line 843, in run self.process_result_queue() File "/usr/lib/python3.6/site-packages/zuul/scheduler.py", line 952, in process_result_queue self._doNodesProvisionedEvent(event) File "/usr/lib/python3.6/site-packages/zuul/scheduler.py", line 1039, in _doNodesProvisionedEvent self.nodepool.acceptNodes(request, request_id) File "/usr/lib/python3.6/site-packages/zuul/nodepool.py", line 210, in acceptNodes if not self.sched.zk.nodeRequestExists(request): File "/usr/lib/python3.6/site-packages/zuul/zk.py", line 199, in nodeRequestExists if self.client.exists(path): File "/usr/lib/python3.6/site-packages/kazoo/client.py", line 1007, in exists return self.exists_async(path, watch).get() File "/usr/lib/python3.6/site-packages/kazoo/handlers/utils.py", line 73, in get raise self._exception kazoo.exceptions.SessionExpiredError Change-Id: I8e6f0ace8955f02bb97bd0c62961234191d5e0bf
2018-01-24 10:45:01 +01:00 · 2018-01-24 10:45:01 +01:00 · f3cbe4b4a8
parent bdcd29b8fe
commit f3cbe4b4a8
1 changed files with 20 additions and 9 deletions
--- a/zuul/nodepool.py
+++ b/zuul/nodepool.py
@@ -61,7 +61,7 @@ class Nodepool(object):
        if nodeset.nodes:
            self.sched.zk.submitNodeRequest(req, self._updateNodeRequest)
            # Logged after submission so that we have the request id
-            self.log.info("Submited node request %s" % (req,))
+            self.log.info("Submitted node request %s" % (req,))
            self.emitStats(req)
        else:
            self.log.info("Fulfilling empty node request %s" % (req,))
@@ -215,14 +215,25 @@ class Nodepool(object):
        # response was added to our queue, and when we actually get around to
        # processing it. Nodepool will automatically reallocate the assigned
        # nodes in that situation.
-        if not self.sched.zk.nodeRequestExists(request):
-            self.log.info("Request %s no longer exists, resubmitting",
-                          request.id)
-            request.id = None
-            request.state = model.STATE_REQUESTED
-            self.requests[request.uid] = request
-            self.sched.zk.submitNodeRequest(request, self._updateNodeRequest)
-            return False
+        try:
+            if not self.sched.zk.nodeRequestExists(request):
+                self.log.info("Request %s no longer exists, resubmitting",
+                              request.id)
+                request.id = None
+                request.state = model.STATE_REQUESTED
+                self.requests[request.uid] = request
+                self.sched.zk.submitNodeRequest(
+                    request, self._updateNodeRequest)
+                return False
+        except Exception:
+            # If we cannot retrieve the node request from ZK we probably lost
+            # the connection and thus the ZK session. Resubmitting the node
+            # request probably doesn't make sense at this point in time as it
+            # is likely to directly fail again. So just log the problem
+            # with zookeeper and fail here.
+            self.log.exception("Error getting node request %s:" % request_id)
+            request.failed = True
+            return True

        locked = False
        if request.fulfilled: