Fix builds queued forever after failure to get node request

When accepting nodes, we check whether the node request still exists. If
this check throws a ZK exception, it is only caught at the run_handler
level [1].

This leads to a build waiting forever in the queued state, which is only
recoverable by rebasing the change or restarting the scheduler.

Furthermore, we don't have to ask ZK about an empty node request.

[1] Trace:
2018-01-23 16:38:57,285 ERROR zuul.Scheduler: Exception in run handler:
Traceback (most recent call last):
  File "/usr/lib/python3.6/site-packages/zuul/scheduler.py", line 843, in run
    self.process_result_queue()
  File "/usr/lib/python3.6/site-packages/zuul/scheduler.py", line 952, in process_result_queue
    self._doNodesProvisionedEvent(event)
  File "/usr/lib/python3.6/site-packages/zuul/scheduler.py", line 1039, in _doNodesProvisionedEvent
    self.nodepool.acceptNodes(request, request_id)
  File "/usr/lib/python3.6/site-packages/zuul/nodepool.py", line 210, in acceptNodes
    if not self.sched.zk.nodeRequestExists(request):
  File "/usr/lib/python3.6/site-packages/zuul/zk.py", line 199, in nodeRequestExists
    if self.client.exists(path):
  File "/usr/lib/python3.6/site-packages/kazoo/client.py", line 1007, in exists
    return self.exists_async(path, watch).get()
  File "/usr/lib/python3.6/site-packages/kazoo/handlers/utils.py", line 73, in get
    raise self._exception
kazoo.exceptions.SessionExpiredError

Change-Id: I8e6f0ace8955f02bb97bd0c62961234191d5e0bf
This commit is contained in:
Tobias Henkel 2018-01-24 10:45:01 +01:00
parent bdcd29b8fe
commit f3cbe4b4a8
1 changed files with 20 additions and 9 deletions

View File

@ -61,7 +61,7 @@ class Nodepool(object):
if nodeset.nodes:
self.sched.zk.submitNodeRequest(req, self._updateNodeRequest)
# Logged after submission so that we have the request id
self.log.info("Submited node request %s" % (req,))
self.log.info("Submitted node request %s" % (req,))
self.emitStats(req)
else:
self.log.info("Fulfilling empty node request %s" % (req,))
@ -215,14 +215,25 @@ class Nodepool(object):
# response was added to our queue, and when we actually get around to
# processing it. Nodepool will automatically reallocate the assigned
# nodes in that situation.
if not self.sched.zk.nodeRequestExists(request):
self.log.info("Request %s no longer exists, resubmitting",
request.id)
request.id = None
request.state = model.STATE_REQUESTED
self.requests[request.uid] = request
self.sched.zk.submitNodeRequest(request, self._updateNodeRequest)
return False
try:
if not self.sched.zk.nodeRequestExists(request):
self.log.info("Request %s no longer exists, resubmitting",
request.id)
request.id = None
request.state = model.STATE_REQUESTED
self.requests[request.uid] = request
self.sched.zk.submitNodeRequest(
request, self._updateNodeRequest)
return False
except Exception:
# If we cannot retrieve the node request from ZK we probably lost
# the connection and thus the ZK session. Resubmitting the node
# request probably doesn't make sense at this point in time as it
# is likely to directly fail again. So just log the problem
# with zookeeper and fail here.
self.log.exception("Error getting node request %s:" % request_id)
request.failed = True
return True
locked = False
if request.fulfilled: