Add autohold debug info

Also:

* Check the lock before attempting to write the znodes (matches
  other methods in nodepool.py).
* Process exceptions one level higher in the scheduler, and
  separate autohold exceptions from nodeset return exceptions, so
  that in the unlikely event we throw an exception dealing with the
  autohold, we just might later succeed when returning the node.
* Don't remove the autohold request on exception.  The error is
  most likely to come from a zookeeper problem and not be systemic.
  Let Zuul try again after the system has recovered.

Change-Id: Idba331576a43f738883d61be72a6f400c233bf0e
This commit is contained in:
James E. Blair 2018-03-22 07:50:11 -07:00
parent 0901777d37
commit acef0f5ee8
2 changed files with 13 additions and 12 deletions

View File

@ -81,16 +81,19 @@ class Nodepool(object):
def holdNodeSet(self, nodeset, autohold_key):
    '''
    Perform a hold on the given set of nodes.

    Marks every node in the set as held so nodepool will not recycle
    it, recording which autohold request (tenant/project/job) caused
    the hold and the operator-supplied reason.

    :param NodeSet nodeset: The object containing the set of nodes to hold.
    :param set autohold_key: A set with the tenant/project/job names
        associated with the given NodeSet.

    :raises Exception: If any node in the set is not locked.  We must
        hold the lock before writing the znodes, to match the other
        methods in this class.
    '''
    self.log.info("Holding nodeset %s" % (nodeset,))
    (hold_iterations, reason) = self.sched.autohold_requests[autohold_key]
    nodes = nodeset.getNodes()

    # Verify every lock up front, before mutating any node state, so a
    # partially-unlocked set fails fast without writing any znodes.
    for node in nodes:
        if node.lock is None:
            raise Exception("Node %s is not locked" % (node,))
        node.state = model.STATE_HOLD
        node.hold_job = " ".join(autohold_key)
        node.comment = reason

View File

@ -1019,6 +1019,7 @@ class Scheduler(threading.Thread):
# of requests - the most specific is selected.
autohold_key = None
scope = Scope.NONE
self.log.debug("Checking build autohold key %s", autohold_key_base)
for request in self.autohold_requests:
ref_filter = request[-1]
if not autohold_key_base_issubset(autohold_key_base, request) \
@ -1032,6 +1033,8 @@ class Scheduler(threading.Thread):
else:
candidate_scope = Scope.REF
self.log.debug("Build autohold key %s matched scope %s",
autohold_key_base, candidate_scope)
if candidate_scope > scope:
scope = candidate_scope
autohold_key = request
@ -1039,7 +1042,6 @@ class Scheduler(threading.Thread):
return autohold_key
def _processAutohold(self, build):
# We explicitly only want to hold nodes for jobs if they have
# failed / retry_limit / post_failure and have an autohold request.
hold_list = ["FAILURE", "RETRY_LIMIT", "POST_FAILURE"]
@ -1047,16 +1049,9 @@ class Scheduler(threading.Thread):
return
autohold_key = self._getAutoholdRequestKey(build)
try:
if autohold_key is not None:
self.nodepool.holdNodeSet(build.nodeset, autohold_key)
except Exception:
self.log.exception("Unable to process autohold for %s:",
autohold_key)
if autohold_key in self.autohold_requests:
self.log.debug("Removing autohold %s due to exception",
autohold_key)
del self.autohold_requests[autohold_key]
self.log.debug("Got autohold key %s", autohold_key)
if autohold_key is not None:
self.nodepool.holdNodeSet(build.nodeset, autohold_key)
def _doBuildCompletedEvent(self, event):
build = event.build
@ -1066,6 +1061,9 @@ class Scheduler(threading.Thread):
# the nodes to nodepool.
try:
self._processAutohold(build)
except Exception:
self.log.exception("Unable to process autohold for %s" % build)
try:
self.nodepool.returnNodeSet(build.nodeset)
except Exception:
self.log.exception("Unable to return nodeset %s" % build.nodeset)