Fix bug with removing a failed job

A recent change in I96195cb9996b01075e3705b6cd89a9863528898e added
code to re-set build states for changes during a re-enqueue to
ensure that any side effects (such as marking new builds as skipped)
occurred.  However, we have found that if a change has a
non-succeeding build and the job for that build is removed, then
this new code will run and attempt to perform a lookup on the removed
job, fail, and raise an exception that aborts reconfiguration.

Correct this by fully removing builds from their build sets when
the corresponding job is removed so that no further processing
happens on them.

Change-Id: I2d5cc8f4c5b73d6a2b184cd21016943b8c08949d
This commit is contained in:
James E. Blair 2015-07-30 17:44:45 -07:00
parent 5bf78a3ade
commit 400e8fd0a5
3 changed files with 84 additions and 5 deletions

View File

@ -0,0 +1,25 @@
pipelines:
- name: check
manager: IndependentPipelineManager
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
verified: 1
failure:
gerrit:
verified: -1
jobs:
- name: ^.*-merge$
failure-message: Unable to merge change
hold-following-changes: true
projects:
- name: org/project
merge-mode: cherry-pick
check:
- project-merge:
- project-test2
- project-testfile

View File

@ -2325,7 +2325,7 @@ class TestScheduler(ZuulTestCase):
'SUCCESS')
self.assertEqual(len(self.history), 4)
def test_live_reconfiguration_failed_job(self):
def test_live_reconfiguration_failed_root(self):
# An extrapolation of test_live_reconfiguration_merge_conflict
# that tests a job added to a job tree with a failed root does
# not run.
@ -2387,6 +2387,59 @@ class TestScheduler(ZuulTestCase):
self.assertEqual(self.history[4].result, 'SUCCESS')
self.assertEqual(len(self.history), 5)
def test_live_reconfiguration_failed_job(self):
# Test that a change with a removed failing job does not
# disrupt reconfiguration. If a change has a failed job and
# that job is removed during a reconfiguration, we observed a
# bug where the code to re-set build statuses would run on
# that build and raise an exception because the job no longer
# existed.
self.worker.hold_jobs_in_build = True
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
# This change will fail and later be removed by the reconfiguration.
self.worker.addFailTest('project-test1', A)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.worker.release('.*-merge')
self.waitUntilSettled()
self.worker.release('project-test1')
self.waitUntilSettled()
self.assertEqual(A.data['status'], 'NEW')
self.assertEqual(A.reported, 0)
self.assertEqual(self.getJobFromHistory('project-merge').result,
'SUCCESS')
self.assertEqual(self.getJobFromHistory('project-test1').result,
'FAILURE')
self.assertEqual(len(self.history), 2)
# Remove the test1 job.
self.config.set('zuul', 'layout_config',
'tests/fixtures/layout-live-'
'reconfiguration-failed-job.yaml')
self.sched.reconfigure(self.config)
self.waitUntilSettled()
self.worker.hold_jobs_in_build = False
self.worker.release()
self.waitUntilSettled()
self.assertEqual(self.getJobFromHistory('project-test2').result,
'SUCCESS')
self.assertEqual(self.getJobFromHistory('project-testfile').result,
'SUCCESS')
self.assertEqual(len(self.history), 4)
self.assertEqual(A.data['status'], 'NEW')
self.assertEqual(A.reported, 1)
self.assertIn('Build succeeded', A.messages[0])
# Ensure the removed job was not included in the report.
self.assertNotIn('project-test1', A.messages[0])
def test_live_reconfiguration_functions(self):
"Test live reconfiguration with a custom function"
self.worker.registerFunction('build:node-project-test1:debian')

View File

@ -679,7 +679,7 @@ class Scheduler(threading.Thread):
continue
self.log.debug("Re-enqueueing changes for pipeline %s" % name)
items_to_remove = []
builds_to_remove = []
builds_to_cancel = []
last_head = None
for shared_queue in old_pipeline.queues:
for item in shared_queue.queue:
@ -703,14 +703,15 @@ class Scheduler(threading.Thread):
if job:
build.job = job
else:
builds_to_remove.append(build)
item.removeBuild(build)
builds_to_cancel.append(build)
if not new_pipeline.manager.reEnqueueItem(item,
last_head):
items_to_remove.append(item)
for item in items_to_remove:
for build in item.current_build_set.getBuilds():
builds_to_remove.append(build)
for build in builds_to_remove:
builds_to_cancel.append(build)
for build in builds_to_cancel:
self.log.warning(
"Canceling build %s during reconfiguration" % (build,))
try: