Support fail-fast in project pipelines

In some cases, such as resource-constrained environments, it is
beneficial to report on changes in a fail-fast manner, i.e. to report
immediately when one job fails. This is especially useful if a project
has many expensive, long-running jobs. This change introduces a
fail-fast flag on the project pipeline that lets a project choose the
trade-off between full information and quick feedback.

Change-Id: Ie4a5ac8e025362dbaacd3ae82f2e8369f7447a62
Tobias Henkel 2019-04-15 20:07:55 +02:00
parent 4affbd1152
commit b0d7c3c69a
13 changed files with 232 additions and 2 deletions

View File

@@ -1419,6 +1419,19 @@ pipeline.
      difficult to determine why Zuul did or did not run a certain
      job, the additional information this provides may help.

   .. attr:: fail-fast
      :default: false

      If this is set to `true`, Zuul will report a build failure
      immediately and abort all still-running builds. This can be used
      to save resources in resource-constrained environments, at the
      cost of potentially requiring multiple attempts if more than one
      problem is present.

      Once this is defined, it cannot be overridden later, so it can be
      forced to a specific value by, for example, defining it in a
      config repo.

.. _project-template:

Project Template

View File

@@ -0,0 +1,5 @@
---
features:
  - |
    Zuul now supports :attr:`project.<pipeline>.fail-fast` to immediately
    report and cancel builds on the first failure in a buildset.

View File

@@ -0,0 +1,2 @@
- hosts: all
  tasks: []

View File

@@ -0,0 +1,63 @@
- pipeline:
    name: check
    manager: independent
    trigger:
      gerrit:
        - event: patchset-created
    success:
      gerrit:
        Verified: 1
    failure:
      gerrit:
        Verified: -1

- job:
    name: base
    parent: null
    run: playbooks/run.yaml

- job:
    name: project-merge

- job:
    name: project-test1

- job:
    name: project-test2

- job:
    name: project-test3

- job:
    name: project-test4

- job:
    name: project-test5
    nodeset:
      nodes:
        - name: controller
          label: label1

- job:
    name: project-test6

- project:
    name: org/project
    check:
      fail-fast: true
      jobs:
        - project-merge
        - project-test1:
            dependencies: project-merge
        - project-test2:
            dependencies: project-merge
        - project-test3:
            dependencies:
              - name: project-test2
                soft: true
        - project-test4:
            dependencies: project-test2
        - project-test5
        - project-test6:
            dependencies: project-merge
            voting: false

View File

@@ -0,0 +1 @@
test

View File

@@ -0,0 +1,5 @@
# This tries to unset fail-fast which should not be possible because it's
# already set to true in common-config.
- project:
    check:
      fail-fast: false

View File

@@ -0,0 +1,8 @@
- tenant:
    name: tenant-one
    source:
      gerrit:
        config-projects:
          - common-config
        untrusted-projects:
          - org/project

View File

@@ -7104,3 +7104,92 @@ class TestSchedulerBranchMatcher(ZuulTestCase):
                         "A should report start and success")
        self.assertIn('gate', A.messages[1],
                      "A should transit gate")


class TestSchedulerFailFast(ZuulTestCase):
    tenant_config_file = 'config/fail-fast/main.yaml'

    def test_fail_fast(self):
        """
        Tests that a pipeline that is flagged with fail-fast
        aborts jobs early.
        """
        self.executor_server.hold_jobs_in_build = True
        self.fake_nodepool.pause()

        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        self.executor_server.failJob('project-test1', A)
        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))

        self.waitUntilSettled()
        self.waitUntilSettled()

        self.assertEqual(len(self.builds), 1)
        self.assertEqual(self.builds[0].name, 'project-merge')
        self.executor_server.release('project-merge')
        self.waitUntilSettled()

        # Now project-test1, project-test2 and project-test6
        # should be running
        self.assertEqual(len(self.builds), 3)

        # Release project-test1 which will fail
        self.executor_server.release('project-test1')
        self.waitUntilSettled()
        self.fake_nodepool.unpause()
        self.waitUntilSettled()

        # Now project-test2 must be aborted
        self.assertEqual(len(self.builds), 0)
        self.assertEqual(A.reported, 1)
        self.assertHistory([
            dict(name='project-merge', result='SUCCESS', changes='1,1'),
            dict(name='project-test1', result='FAILURE', changes='1,1'),
            dict(name='project-test2', result='ABORTED', changes='1,1'),
            dict(name='project-test6', result='ABORTED', changes='1,1'),
        ], ordered=False)

    def test_fail_fast_nonvoting(self):
        """
        Tests that a pipeline that is flagged with fail-fast
        doesn't abort jobs due to a non-voting job.
        """
        self.executor_server.hold_jobs_in_build = True
        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        self.executor_server.failJob('project-test6', A)
        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))

        self.waitUntilSettled()
        self.waitUntilSettled()

        self.assertEqual(len(self.builds), 2)
        self.assertEqual(self.builds[0].name, 'project-merge')
        self.executor_server.release('project-merge')
        self.waitUntilSettled()

        # Now project-test1, project-test2, project-test5 and project-test6
        # should be running
        self.assertEqual(len(self.builds), 4)

        # Release project-test6 which will fail
        self.executor_server.release('project-test6')
        self.waitUntilSettled()

        # Now project-test1, project-test2 and project-test5 should be running
        self.assertEqual(len(self.builds), 3)

        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release()
        self.waitUntilSettled()

        self.assertEqual(len(self.builds), 0)
        self.assertEqual(A.reported, 1)
        self.assertHistory([
            dict(name='project-merge', result='SUCCESS', changes='1,1'),
            dict(name='project-test1', result='SUCCESS', changes='1,1'),
            dict(name='project-test2', result='SUCCESS', changes='1,1'),
            dict(name='project-test3', result='SUCCESS', changes='1,1'),
            dict(name='project-test4', result='SUCCESS', changes='1,1'),
            dict(name='project-test5', result='SUCCESS', changes='1,1'),
            dict(name='project-test6', result='FAILURE', changes='1,1'),
        ], ordered=False)

View File

@@ -926,6 +926,7 @@ class ProjectTemplateParser(object):
        pipeline_contents = {
            'queue': str,
            'debug': bool,
            'fail-fast': bool,
            'jobs': job_list
        }

@@ -955,6 +956,8 @@ class ProjectTemplateParser(object):
            project_template.pipelines[pipeline_name] = project_pipeline
            project_pipeline.queue_name = conf_pipeline.get('queue')
            project_pipeline.debug = conf_pipeline.get('debug')
            project_pipeline.fail_fast = conf_pipeline.get(
                'fail-fast')
            self.parseJobList(
                conf_pipeline.get('jobs', []),
                source_context, start_mark, project_pipeline.job_list)

@@ -1007,6 +1010,7 @@ class ProjectParser(object):
        pipeline_contents = {
            'queue': str,
            'debug': bool,
            'fail-fast': bool,
            'jobs': job_list
        }
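
For illustration, here is a minimal standalone sketch of how a voluptuous
schema like the one above treats the new boolean key. This is not Zuul's
full project-pipeline schema (which has many more keys); the example data
is hypothetical.

import voluptuous as vs

# Trimmed-down stand-in for the pipeline_contents schema above; only the
# keys relevant to this change are listed.
pipeline_contents = vs.Schema({
    'queue': str,
    'debug': bool,
    'fail-fast': bool,
})

pipeline_contents({'fail-fast': True})         # a boolean passes validation

try:
    pipeline_contents({'fail-fast': 'maybe'})  # not a bool
except vs.Invalid as exc:
    print('rejected:', exc)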

View File

@@ -449,6 +449,13 @@ class ExecutorClient(object):
        # track of which results are non-final.
        if build.retry:
            result = None

        # If the build was canceled, we actively canceled the job, so
        # don't overwrite the result and don't retry.
        if build.canceled:
            result = build.result
            build.retry = False

        self.sched.onBuildCompleted(build, result, result_data, warnings)
        # The test suite expects the build to be removed from the
        # internal dict after it's added to the report queue.
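
As a plain-Python illustration of the precedence applied above (a sketch,
not Zuul's API, with hypothetical names): a retry clears the result so it
is treated as non-final, but an actively canceled build keeps whatever
result it carries and is never retried.

def final_result(result, retry, canceled, canceled_result):
    # Mirrors the ordering above: the 'canceled' branch wins over 'retry'.
    if retry:
        result = None               # non-final; the job will run again
    if canceled:
        result = canceled_result    # keep the result set when canceling
        retry = False               # and never retry a canceled build
    return result, retry

# Hypothetical values for illustration:
assert final_result('ABORTED', retry=True, canceled=True,
                    canceled_result='ABORTED') == ('ABORTED', False)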

View File

@@ -845,6 +845,13 @@ class PipelineManager(object):
        if build:
            build_set.removeBuild(build)

    def _cancelRunningBuilds(self, build_set):
        item = build_set.item
        for job in item.getJobs():
            build = build_set.getBuild(job.name)
            if not build or not build.result:
                self.sched.cancelJob(build_set, job, final=True)

    def onBuildCompleted(self, build):
        item = build.build_set.item

@@ -863,6 +870,15 @@ class PipelineManager(object):
            self._resetDependentBuilds(build.build_set, build)

        self._resumeBuilds(build.build_set)

        if (item.project_pipeline_config.fail_fast and
            build.failed and build.job.voting):
            # If fail-fast is set and the build was not successful,
            # cancel all remaining jobs.
            self.log.debug("Build %s failed and fail-fast enabled, canceling "
                           "running builds", build)
            self._cancelRunningBuilds(build.build_set)

        return True

    def onFilesChangesCompleted(self, event):
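
A minimal standalone sketch of the condition evaluated in onBuildCompleted
above (the helper name is hypothetical, not part of Zuul): only a failed
voting build in a pipeline with fail-fast enabled triggers cancellation of
the remaining builds.

def should_fail_fast(fail_fast_enabled, build_failed, build_voting):
    # Mirrors the guard above: all three conditions must hold.
    return bool(fail_fast_enabled and build_failed and build_voting)

assert should_fail_fast(True, True, True) is True    # cancel remaining builds
assert should_fail_fast(True, True, False) is False  # non-voting failure is ignored
assert should_fail_fast(False, True, True) is False  # fail-fast not enabled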

View File

@@ -1818,6 +1818,12 @@ class Build(object):
        return ('<Build %s of %s voting:%s on %s>' %
                (self.uuid, self.job.name, self.job.voting, self.worker))

    @property
    def failed(self):
        if self.result and self.result not in ['SUCCESS', 'SKIPPED']:
            return True
        return False

    @property
    def pipeline(self):
        return self.build_set.item.pipeline

@@ -2466,7 +2472,7 @@ class QueueItem(object):
            build = build_set.getBuild(job.name)
            if build and (build.result == 'SUCCESS' or build.paused):
                successful_job_names.add(job.name)
            elif build and build.result in ('SKIPPED', 'FAILURE', 'CANCELED'):
                pass
            else:
                nodeset = build_set.getJobNodeSet(job.name)

@@ -3125,6 +3131,7 @@ class ProjectPipelineConfig(ConfigObject):
        self.queue_name = None
        self.debug = False
        self.debug_messages = []
        self.fail_fast = None
        self.variables = {}

    def addDebug(self, msg):

@@ -3137,6 +3144,8 @@ class ProjectPipelineConfig(ConfigObject):
        self.queue_name = other.queue_name
        if other.debug:
            self.debug = other.debug
        if self.fail_fast is None:
            self.fail_fast = other.fail_fast
        self.job_list.inheritFrom(other.job_list)

    def updateVariables(self, other):
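
The `if self.fail_fast is None` check above gives the option
first-definition-wins semantics, which is what the earlier fixture comment
("tries to unset fail-fast") relies on. A standalone sketch with a
hypothetical helper name:

def effective_fail_fast(current, other):
    # First definition wins: once a layer has set the value (non-None),
    # later layers cannot change it.
    return other if current is None else current

assert effective_fail_fast(None, True) is True   # config repo sets it first
assert effective_fail_fast(True, False) is True  # project cannot unset it later
assert effective_fail_fast(None, None) is None   # never set -> remains falsy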

View File

@@ -37,6 +37,7 @@ from zuul.lib.config import get_default
from zuul.lib.gear_utils import getGearmanFunctions
from zuul.lib.statsd import get_statsd
import zuul.lib.queue
from zuul.model import Build

COMMANDS = ['full-reconfigure', 'stop']

@@ -1417,7 +1418,7 @@ class Scheduler(threading.Thread):
            other_change.refresh_deps = True
        change.refresh_deps = True

    def cancelJob(self, buildset, job, build=None, final=False):
        item = buildset.item
        job_name = job.name
        try:

@@ -1457,6 +1458,13 @@ class Scheduler(threading.Thread):
            nodeset = buildset.getJobNodeSet(job_name)
            if nodeset:
                self.nodepool.returnNodeSet(nodeset)

            if final:
                # If final is set make sure that the job is not resurrected
                # later by re-requesting nodes.
                fakebuild = Build(job, None)
                fakebuild.result = 'CANCELED'
                buildset.addBuild(fakebuild)
        finally:
            # Release the semaphore in any case
            tenant = buildset.item.pipeline.tenant
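
To see why the fake CANCELED build keeps the job from coming back, recall
the findJobsToRequest change in model.py above: a build carrying one of the
listed results is skipped when node requests are made. A rough standalone
sketch with hypothetical names:

# Results that tell the scheduler a job is done and must not be
# re-requested (the tuple extended by this change in model.py).
FINISHED_RESULTS = ('SKIPPED', 'FAILURE', 'CANCELED')

def blocks_node_request(existing_result):
    # A build with one of these results (including the fake CANCELED
    # build added above) prevents any further node request for the job.
    return existing_result in FINISHED_RESULTS

assert blocks_node_request('CANCELED') is True  # fake build blocks resurrection
assert blocks_node_request(None) is False       # no build yet -> nodes may be requested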