Add autohold debug info

Also:

* Check the lock before attempting to write the znodes (matches
  other methods in nodepool.py).
* Process exceptions one level higher in the scheduler, and
  separate autohold exceptions from nodeset return exceptions, so
  that in the unlikely event we throw an exception dealing with the
  autohold, we just might later succeed when returning the node.
* Don't remove the autohold request on exception.  The error is
  most likely to come from a zookeeper problem and not be systemic.
  Let Zuul try again after the system has recovered.

Change-Id: Idba331576a43f738883d61be72a6f400c233bf0e
This commit is contained in:
James E. Blair 2018-03-22 07:50:11 -07:00
parent 0901777d37
commit acef0f5ee8
2 changed files with 13 additions and 12 deletions

View File

@ -81,16 +81,19 @@ class Nodepool(object):
def holdNodeSet(self, nodeset, autohold_key):
    '''
    Perform a hold on the given set of nodes.

    Marks every node in the set as held so nodepool will not recycle
    it, recording which autohold request (tenant/project/job) caused
    the hold and the operator-supplied reason.

    :param NodeSet nodeset: The object containing the set of nodes to hold.
    :param set autohold_key: A set with the tenant/project/job names
        associated with the given NodeSet.

    :raises Exception: If any node in the set is not locked.  We must
        hold the lock before writing the znodes, to match the other
        methods in this class.
    '''
    self.log.info("Holding nodeset %s" % (nodeset,))
    (hold_iterations, reason) = self.sched.autohold_requests[autohold_key]
    nodes = nodeset.getNodes()

    # Verify every lock up front, before mutating any node state, so a
    # partially-unlocked set fails fast without writing any znodes.
    for node in nodes:
        if node.lock is None:
            raise Exception("Node %s is not locked" % (node,))
        node.state = model.STATE_HOLD
        node.hold_job = " ".join(autohold_key)
        node.comment = reason

View File

@ -1019,6 +1019,7 @@ class Scheduler(threading.Thread):
# of requests - the most specific is selected.
autohold_key = None
scope = Scope.NONE
self.log.debug("Checking build autohold key %s", autohold_key_base)
for request in self.autohold_requests:
ref_filter = request[-1]
if not autohold_key_base_issubset(autohold_key_base, request) \
@ -1032,6 +1033,8 @@ class Scheduler(threading.Thread):
else:
candidate_scope = Scope.REF
self.log.debug("Build autohold key %s matched scope %s",
autohold_key_base, candidate_scope)
if candidate_scope > scope:
scope = candidate_scope
autohold_key = request
@ -1039,7 +1042,6 @@ class Scheduler(threading.Thread):
return autohold_key
def _processAutohold(self, build):
# We explicitly only want to hold nodes for jobs if they have
# failed / retry_limit / post_failure and have an autohold request.
hold_list = ["FAILURE", "RETRY_LIMIT", "POST_FAILURE"]
@ -1047,16 +1049,9 @@ class Scheduler(threading.Thread):
return
autohold_key = self._getAutoholdRequestKey(build)
try:
if autohold_key is not None:
self.nodepool.holdNodeSet(build.nodeset, autohold_key)
except Exception:
self.log.exception("Unable to process autohold for %s:",
autohold_key)
if autohold_key in self.autohold_requests:
self.log.debug("Removing autohold %s due to exception",
autohold_key)
del self.autohold_requests[autohold_key]
self.log.debug("Got autohold key %s", autohold_key)
if autohold_key is not None:
self.nodepool.holdNodeSet(build.nodeset, autohold_key)
def _doBuildCompletedEvent(self, event):
build = event.build
@ -1066,6 +1061,9 @@ class Scheduler(threading.Thread):
# the nodes to nodepool.
try:
self._processAutohold(build)
except Exception:
self.log.exception("Unable to process autohold for %s" % build)
try:
self.nodepool.returnNodeSet(build.nodeset)
except Exception:
self.log.exception("Unable to return nodeset %s" % build.nodeset)