From b2c22b31fef3174bb55553c545b053341401dd83 Mon Sep 17 00:00:00 2001 From: Clark Boylan Date: Fri, 16 Feb 2018 11:00:50 -0800 Subject: [PATCH] Add post-timeout setting This adds a post-timeout setting which applies as a timeout to each post-run playbook. This is separate and independent of the normal job timeout which now only applies to the pre-run and run playbooks in a cumulative fashion. The reason for this change is when a pre-run or run playbook hits the timeout and the job fails you still want to do your best to copy all of the log data that you can find so that you can debug the timeout. Similarly to timeout, if post-timeout is not set then post-run playbooks will have no timeout and can run indefinitely. Change-Id: I830a6a14d2623f50fbc3f05396cc909d79de04bb --- doc/source/user/config.rst | 15 +++++++++++++++ doc/source/user/jobs.rst | 4 ++++ .../config/ansible/git/common-config/zuul.yaml | 6 ++++++ .../config/ansible/git/org_project/.zuul.yaml | 1 + tests/unit/test_v3.py | 6 ++++++ zuul/configloader.py | 6 ++++++ zuul/executor/client.py | 1 + zuul/executor/server.py | 16 +++++++++++----- zuul/model.py | 1 + 9 files changed, 51 insertions(+), 5 deletions(-) diff --git a/doc/source/user/config.rst b/doc/source/user/config.rst index 8492423e33..7d2dae1c82 100644 --- a/doc/source/user/config.rst +++ b/doc/source/user/config.rst @@ -710,6 +710,21 @@ Here is an example of two job definitions: timeout is supplied, the job may run indefinitely. Supplying a timeout is highly recommended. + This timeout only applies to the pre-run and run playbooks in a + job. + + .. attr:: post-timeout + + The time in seconds that each post playbook should be allowed to run + before it is automatically aborted and failure is reported. If no + post-timeout is supplied, the job may run indefinitely. Supplying a + post-timeout is highly recommended. + + The post-timeout is handled separately from the above timeout because + the post playbooks are typically where you will copy jobs logs. + In the event of the pre-run or run playbooks timing out we want to + do our best to copy the job logs in the post-run playbooks. + .. attr:: attempts :default: 3 diff --git a/doc/source/user/jobs.rst b/doc/source/user/jobs.rst index 820e3166f3..4e1c33dc59 100644 --- a/doc/source/user/jobs.rst +++ b/doc/source/user/jobs.rst @@ -289,6 +289,10 @@ of item. The job timeout, in seconds. + .. var:: post_timeout + + The post-run playbook timeout, in seconds. + .. var:: jobtags A list of tags associated with the job. Not to be confused with diff --git a/tests/fixtures/config/ansible/git/common-config/zuul.yaml b/tests/fixtures/config/ansible/git/common-config/zuul.yaml index d0a8f7b315..0112141e4e 100644 --- a/tests/fixtures/config/ansible/git/common-config/zuul.yaml +++ b/tests/fixtures/config/ansible/git/common-config/zuul.yaml @@ -97,6 +97,12 @@ run: playbooks/timeout.yaml timeout: 1 +- job: + parent: python27 + name: post-timeout + post-run: playbooks/timeout.yaml + post-timeout: 1 + - job: parent: python27 name: check-vars diff --git a/tests/fixtures/config/ansible/git/org_project/.zuul.yaml b/tests/fixtures/config/ansible/git/org_project/.zuul.yaml index 447f6cd6de..a1e144f0aa 100644 --- a/tests/fixtures/config/ansible/git/org_project/.zuul.yaml +++ b/tests/fixtures/config/ansible/git/org_project/.zuul.yaml @@ -17,5 +17,6 @@ - check-vars - check-secret-names - timeout + - post-timeout - hello-world - failpost diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py index 1338d2015f..e36c8f61e2 100755 --- a/tests/unit/test_v3.py +++ b/tests/unit/test_v3.py @@ -2048,6 +2048,12 @@ class TestAnsible(AnsibleZuulTestCase): build_timeout = self.getJobFromHistory('timeout') with self.jobLog(build_timeout): self.assertEqual(build_timeout.result, 'TIMED_OUT') + post_flag_path = os.path.join(self.test_root, build_timeout.uuid + + '.post.flag') + self.assertTrue(os.path.exists(post_flag_path)) + build_post_timeout = self.getJobFromHistory('post-timeout') + with self.jobLog(build_post_timeout): + self.assertEqual(build_post_timeout.result, 'POST_FAILURE') build_faillocal = self.getJobFromHistory('faillocal') with self.jobLog(build_faillocal): self.assertEqual(build_faillocal.result, 'FAILURE') diff --git a/zuul/configloader.py b/zuul/configloader.py index f0f78b76b1..d251106cbf 100644 --- a/zuul/configloader.py +++ b/zuul/configloader.py @@ -491,6 +491,7 @@ class JobParser(object): # validation happens in NodeSetParser 'nodeset': vs.Any(dict, str), 'timeout': int, + 'post-timeout': int, 'attempts': int, 'pre-run': to_list(str), 'post-run': to_list(str), @@ -518,6 +519,7 @@ class JobParser(object): 'abstract', 'protected', 'timeout', + 'post-timeout', 'workspace', 'voting', 'hold-following-changes', @@ -627,6 +629,10 @@ class JobParser(object): int(conf['timeout']) > tenant.max_job_timeout: raise MaxTimeoutError(job, tenant) + if conf.get('post-timeout') and tenant.max_job_timeout != -1 and \ + int(conf['post-timeout']) > tenant.max_job_timeout: + raise MaxTimeoutError(job, tenant) + if 'post-review' in conf: if conf['post-review']: job.post_review = True diff --git a/zuul/executor/client.py b/zuul/executor/client.py index d561232f39..c09d4e1811 100644 --- a/zuul/executor/client.py +++ b/zuul/executor/client.py @@ -186,6 +186,7 @@ class ExecutorClient(object): params = dict() params['job'] = job.name params['timeout'] = job.timeout + params['post_timeout'] = job.post_timeout params['items'] = merger_items params['projects'] = [] if hasattr(item.change, 'branch'): diff --git a/zuul/executor/server.py b/zuul/executor/server.py index 8de6fe09d5..fbc34c466b 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -878,8 +878,10 @@ class AnsibleJob(object): success = False self.started = True time_started = time.time() - # timeout value is total job timeout or put another way - # the cummulative time that pre, run, and post can consume. + # timeout value is "total" job timeout which accounts for + # pre-run and run playbooks. post-run is different because + # it is used to copy out job logs and we want to do our best + # to copy logs even when the job has timed out. job_timeout = args['timeout'] for index, playbook in enumerate(self.jobdir.pre_playbooks): # TODOv3(pabelanger): Implement pre-run timeout setting. @@ -914,11 +916,15 @@ class AnsibleJob(object): # run it again. return None + post_timeout = args['post_timeout'] for index, playbook in enumerate(self.jobdir.post_playbooks): - # TODOv3(pabelanger): Implement post-run timeout setting. - ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout) + # Post timeout operates a little differently to the main job + # timeout. We give each post playbook the full post timeout to + # do its job because post is where you'll often record job logs + # which are vital to understanding why timeouts have happened in + # the first place. post_status, post_code = self.runAnsiblePlaybook( - playbook, ansible_timeout, success, phase='post', index=index) + playbook, post_timeout, success, phase='post', index=index) if post_status == self.RESULT_ABORTED: return 'ABORTED' if post_status != self.RESULT_NORMAL or post_code != 0: diff --git a/zuul/model.py b/zuul/model.py index 45fc1a8f00..ff8404884d 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -839,6 +839,7 @@ class Job(object): self.execution_attributes = dict( parent=None, timeout=None, + post_timeout=None, variables={}, nodeset=NodeSet(), workspace=None,