Add post-timeout setting

This adds a post-timeout setting which applies as a timeout to each
post-run playbook. This is separate and independent of the normal job
timeout which now only applies to the pre-run and run playbooks in a
cumulative fashion.

The reason for this change is that when a pre-run or run playbook hits
the timeout and the job fails, you still want to do your best to copy all
of the log data that you can find so that you can debug the timeout.
Similarly to timeout, if post-timeout is not set then post-run playbooks
will have no timeout and can run indefinitely.

Change-Id: I830a6a14d2623f50fbc3f05396cc909d79de04bb
This commit is contained in:
Clark Boylan 2018-02-16 11:00:50 -08:00
parent 5e898285a8
commit b2c22b31fe
9 changed files with 51 additions and 5 deletions

View File

@ -710,6 +710,21 @@ Here is an example of two job definitions:
timeout is supplied, the job may run indefinitely. Supplying a
timeout is highly recommended.
This timeout only applies to the pre-run and run playbooks in a
job.
.. attr:: post-timeout
The time in seconds that each post playbook should be allowed to run
before it is automatically aborted and failure is reported. If no
post-timeout is supplied, the job may run indefinitely. Supplying a
post-timeout is highly recommended.
The post-timeout is handled separately from the above timeout because
the post playbooks are typically where you will copy job logs.
In the event of the pre-run or run playbooks timing out we want to
do our best to copy the job logs in the post-run playbooks.
.. attr:: attempts
:default: 3

View File

@ -289,6 +289,10 @@ of item.
The job timeout, in seconds.
.. var:: post_timeout
The post-run playbook timeout, in seconds.
.. var:: jobtags
A list of tags associated with the job. Not to be confused with

View File

@ -97,6 +97,12 @@
run: playbooks/timeout.yaml
timeout: 1
- job:
parent: python27
name: post-timeout
post-run: playbooks/timeout.yaml
post-timeout: 1
- job:
parent: python27
name: check-vars

View File

@ -17,5 +17,6 @@
- check-vars
- check-secret-names
- timeout
- post-timeout
- hello-world
- failpost

View File

@ -2048,6 +2048,12 @@ class TestAnsible(AnsibleZuulTestCase):
build_timeout = self.getJobFromHistory('timeout')
with self.jobLog(build_timeout):
self.assertEqual(build_timeout.result, 'TIMED_OUT')
post_flag_path = os.path.join(self.test_root, build_timeout.uuid +
'.post.flag')
self.assertTrue(os.path.exists(post_flag_path))
build_post_timeout = self.getJobFromHistory('post-timeout')
with self.jobLog(build_post_timeout):
self.assertEqual(build_post_timeout.result, 'POST_FAILURE')
build_faillocal = self.getJobFromHistory('faillocal')
with self.jobLog(build_faillocal):
self.assertEqual(build_faillocal.result, 'FAILURE')

View File

@ -491,6 +491,7 @@ class JobParser(object):
# validation happens in NodeSetParser
'nodeset': vs.Any(dict, str),
'timeout': int,
'post-timeout': int,
'attempts': int,
'pre-run': to_list(str),
'post-run': to_list(str),
@ -518,6 +519,7 @@ class JobParser(object):
'abstract',
'protected',
'timeout',
'post-timeout',
'workspace',
'voting',
'hold-following-changes',
@ -627,6 +629,10 @@ class JobParser(object):
int(conf['timeout']) > tenant.max_job_timeout:
raise MaxTimeoutError(job, tenant)
if conf.get('post-timeout') and tenant.max_job_timeout != -1 and \
int(conf['post-timeout']) > tenant.max_job_timeout:
raise MaxTimeoutError(job, tenant)
if 'post-review' in conf:
if conf['post-review']:
job.post_review = True

View File

@ -186,6 +186,7 @@ class ExecutorClient(object):
params = dict()
params['job'] = job.name
params['timeout'] = job.timeout
params['post_timeout'] = job.post_timeout
params['items'] = merger_items
params['projects'] = []
if hasattr(item.change, 'branch'):

View File

@ -878,8 +878,10 @@ class AnsibleJob(object):
success = False
self.started = True
time_started = time.time()
# timeout value is total job timeout or put another way
# the cummulative time that pre, run, and post can consume.
# timeout value is "total" job timeout which accounts for
# pre-run and run playbooks. post-run is different because
# it is used to copy out job logs and we want to do our best
# to copy logs even when the job has timed out.
job_timeout = args['timeout']
for index, playbook in enumerate(self.jobdir.pre_playbooks):
# TODOv3(pabelanger): Implement pre-run timeout setting.
@ -914,11 +916,15 @@ class AnsibleJob(object):
# run it again.
return None
post_timeout = args['post_timeout']
for index, playbook in enumerate(self.jobdir.post_playbooks):
# TODOv3(pabelanger): Implement post-run timeout setting.
ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout)
# Post timeout operates a little differently to the main job
# timeout. We give each post playbook the full post timeout to
# do its job because post is where you'll often record job logs
# which are vital to understanding why timeouts have happened in
# the first place.
post_status, post_code = self.runAnsiblePlaybook(
playbook, ansible_timeout, success, phase='post', index=index)
playbook, post_timeout, success, phase='post', index=index)
if post_status == self.RESULT_ABORTED:
return 'ABORTED'
if post_status != self.RESULT_NORMAL or post_code != 0:

View File

@ -839,6 +839,7 @@ class Job(object):
self.execution_attributes = dict(
parent=None,
timeout=None,
post_timeout=None,
variables={},
nodeset=NodeSet(),
workspace=None,