Add post-timeout setting

This adds a post-timeout setting, which applies as a timeout to each post-run playbook. It is separate from and independent of the normal job timeout, which now applies only to the pre-run and run playbooks, cumulatively. The reason for this change is that when a pre-run or run playbook hits the timeout and the job fails, you still want to do your best to copy all of the log data that you can find so that you can debug the timeout. As with timeout, if post-timeout is not set, post-run playbooks have no timeout and can run indefinitely. Change-Id: I830a6a14d2623f50fbc3f05396cc909d79de04bb
2018-02-16 11:00:50 -08:00 · 2018-02-16 11:00:50 -08:00 · b2c22b31fe
parent 5e898285a8
commit b2c22b31fe
9 changed files with 51 additions and 5 deletions
--- a/doc/source/user/config.rst
+++ b/doc/source/user/config.rst
@ -710,6 +710,21 @@ Here is an example of two job definitions:
      timeout is supplied, the job may run indefinitely.  Supplying a
      timeout is highly recommended.

+      This timeout only applies to the pre-run and run playbooks in a
+      job.
+
+   .. attr:: post-timeout
+
+      The time in seconds that each post-run playbook should be allowed
+      to run before it is automatically aborted and failure is reported.
+      If no post-timeout is supplied, post-run playbooks may run
+      indefinitely.  Supplying a post-timeout is highly recommended.
+
+      The post-timeout is handled separately from the above timeout because
+      the post-run playbooks are typically where you will copy job logs.
+      In the event of the pre-run or run playbooks timing out we want to
+      do our best to copy the job logs in the post-run playbooks.
+
   .. attr:: attempts
      :default: 3

--- a/doc/source/user/jobs.rst
+++ b/doc/source/user/jobs.rst
@ -289,6 +289,10 @@ of item.

      The job timeout, in seconds.

+   .. var:: post_timeout
+
+      The post-run playbook timeout, in seconds.
+
   .. var:: jobtags

      A list of tags associated with the job.  Not to be confused with
--- a/tests/fixtures/config/ansible/git/common-config/zuul.yaml
+++ b/tests/fixtures/config/ansible/git/common-config/zuul.yaml
@ -97,6 +97,12 @@
    run: playbooks/timeout.yaml
    timeout: 1

+- job:
+    parent: python27
+    name: post-timeout
+    post-run: playbooks/timeout.yaml
+    post-timeout: 1
+
 - job:
    parent: python27
    name: check-vars
--- a/tests/fixtures/config/ansible/git/org_project/.zuul.yaml
+++ b/tests/fixtures/config/ansible/git/org_project/.zuul.yaml
@ -17,5 +17,6 @@
        - check-vars
        - check-secret-names
        - timeout
+        - post-timeout
        - hello-world
        - failpost
--- a/tests/unit/test_v3.py
+++ b/tests/unit/test_v3.py
@ -2048,6 +2048,12 @@ class TestAnsible(AnsibleZuulTestCase):
        build_timeout = self.getJobFromHistory('timeout')
        with self.jobLog(build_timeout):
            self.assertEqual(build_timeout.result, 'TIMED_OUT')
+            post_flag_path = os.path.join(self.test_root, build_timeout.uuid +
+                                          '.post.flag')
+            self.assertTrue(os.path.exists(post_flag_path))
+        build_post_timeout = self.getJobFromHistory('post-timeout')
+        with self.jobLog(build_post_timeout):
+            self.assertEqual(build_post_timeout.result, 'POST_FAILURE')
        build_faillocal = self.getJobFromHistory('faillocal')
        with self.jobLog(build_faillocal):
            self.assertEqual(build_faillocal.result, 'FAILURE')
--- a/zuul/configloader.py
+++ b/zuul/configloader.py
@ -491,6 +491,7 @@ class JobParser(object):
                      # validation happens in NodeSetParser
                      'nodeset': vs.Any(dict, str),
                      'timeout': int,
+                      'post-timeout': int,
                      'attempts': int,
                      'pre-run': to_list(str),
                      'post-run': to_list(str),
@ -518,6 +519,7 @@ class JobParser(object):
        'abstract',
        'protected',
        'timeout',
+        'post-timeout',
        'workspace',
        'voting',
        'hold-following-changes',
@ -627,6 +629,10 @@ class JobParser(object):
           int(conf['timeout']) > tenant.max_job_timeout:
            raise MaxTimeoutError(job, tenant)

+        if conf.get('post-timeout') and tenant.max_job_timeout != -1 and \
+           int(conf['post-timeout']) > tenant.max_job_timeout:
+            raise MaxTimeoutError(job, tenant)
+
        if 'post-review' in conf:
            if conf['post-review']:
                job.post_review = True
--- a/zuul/executor/client.py
+++ b/zuul/executor/client.py
@ -186,6 +186,7 @@ class ExecutorClient(object):
        params = dict()
        params['job'] = job.name
        params['timeout'] = job.timeout
+        params['post_timeout'] = job.post_timeout
        params['items'] = merger_items
        params['projects'] = []
        if hasattr(item.change, 'branch'):
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@ -878,8 +878,10 @@ class AnsibleJob(object):
        success = False
        self.started = True
        time_started = time.time()
-        # timeout value is total job timeout or put another way
-        # the cummulative time that pre, run, and post can consume.
+        # timeout value is "total" job timeout which accounts for
+        # pre-run and run playbooks. post-run is different because
+        # it is used to copy out job logs and we want to do our best
+        # to copy logs even when the job has timed out.
        job_timeout = args['timeout']
        for index, playbook in enumerate(self.jobdir.pre_playbooks):
            # TODOv3(pabelanger): Implement pre-run timeout setting.
@ -914,11 +916,15 @@ class AnsibleJob(object):
                # run it again.
                return None

+        post_timeout = args['post_timeout']
        for index, playbook in enumerate(self.jobdir.post_playbooks):
-            # TODOv3(pabelanger): Implement post-run timeout setting.
-            ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout)
+            # Post timeout operates a little differently to the main job
+            # timeout. We give each post playbook the full post timeout to
+            # do its job because post is where you'll often record job logs
+            # which are vital to understanding why timeouts have happened in
+            # the first place.
            post_status, post_code = self.runAnsiblePlaybook(
-                playbook, ansible_timeout, success, phase='post', index=index)
+                playbook, post_timeout, success, phase='post', index=index)
            if post_status == self.RESULT_ABORTED:
                return 'ABORTED'
            if post_status != self.RESULT_NORMAL or post_code != 0:
--- a/zuul/model.py
+++ b/zuul/model.py
@ -839,6 +839,7 @@ class Job(object):
        self.execution_attributes = dict(
            parent=None,
            timeout=None,
+            post_timeout=None,
            variables={},
            nodeset=NodeSet(),
            workspace=None,