zuul autohold: allow operator to specify nodes TTL

Add the option --node-hold-expiration to `zuul autohold`. This parameter
allows an operator to specify how long a node set should remain in
HOLD state after a build failure.

Change-Id: I25020d1722de97426e6699653ff72eba03c46b16
Depends-On: I9a09728e5728c537ee44721f5d5e774dc0dcefa7
Committed by mhuin on 2018-02-12 11:06:10 +01:00
Commit: e4bf201286
Parent: f5ea4cf8a3
8 changed files with 58 additions and 9 deletions
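
For reference, the new flag is meant to be passed alongside the existing autohold options; a sketch of an operator invocation (the tenant, project, job and TTL values are purely illustrative, and the other flags are assumed from the current `zuul autohold` interface):

    zuul autohold --tenant tenant-one --project org/project \
        --job project-test2 --reason "debugging a flaky test" \
        --node-hold-expiration 3600

With the values above, nodes held after a failed run of project-test2 would be kept for one hour rather than falling back to nodepool's max-hold-age.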

@@ -1667,7 +1667,8 @@ class FakeNodepool(object):
                     updated_time=now,
                     image_id=None,
                     host_keys=host_keys,
-                    executor='fake-nodepool')
+                    executor='fake-nodepool',
+                    hold_expiration=None)
         if self.remote_ansible:
             data['connection_type'] = 'ssh'
         if 'fakeuser' in node_type:

@@ -1755,6 +1755,38 @@ class TestScheduler(ZuulTestCase):
                 break
         self.assertIsNone(held_node)

+    @simple_layout('layouts/autohold.yaml')
+    def test_autohold_hold_expiration(self):
+        client = zuul.rpcclient.RPCClient('127.0.0.1',
+                                          self.gearman_server.port)
+        self.addCleanup(client.shutdown)
+
+        r = client.autohold('tenant-one', 'org/project', 'project-test2',
+                            "", "", "reason text", 1, node_hold_expiration=30)
+        self.assertTrue(r)
+
+        # Hold a failed job
+        B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
+        self.executor_server.failJob('project-test2', B)
+        self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
+        self.waitUntilSettled()
+
+        self.assertEqual(B.data['status'], 'NEW')
+        self.assertEqual(B.reported, 1)
+        # project-test2
+        self.assertEqual(self.history[0].result, 'FAILURE')
+
+        # Check nodepool for a held node
+        held_node = None
+        for node in self.fake_nodepool.getNodes():
+            if node['state'] == zuul.model.STATE_HOLD:
+                held_node = node
+                break
+        self.assertIsNotNone(held_node)
+
+        # Validate node has hold_expiration property
+        self.assertEqual(int(held_node['hold_expiration']), 30)
+
     @simple_layout('layouts/autohold.yaml')
     def test_autohold_list(self):
         client = zuul.rpcclient.RPCClient('127.0.0.1',
@@ -1779,7 +1811,7 @@ class TestScheduler(ZuulTestCase):
         self.assertEqual(".*", ref_filter)

         # Note: the value is converted from set to list by json.
-        self.assertEqual([1, "reason text"], autohold_requests[key])
+        self.assertEqual([1, "reason text", None], autohold_requests[key])

     @simple_layout('layouts/three-projects.yaml')
     def test_dependent_behind_dequeue(self):

@@ -61,6 +61,12 @@ class Client(zuul.cmd.ZuulApp):
         cmd_autohold.add_argument('--count',
                                   help='number of job runs (default: 1)',
                                   required=False, type=int, default=1)
+        cmd_autohold.add_argument('--node-hold-expiration',
+                                  help=('how long in seconds should the '
+                                        'node set be in HOLD status '
+                                        '(default: nodepool\'s max-hold-age '
+                                        'if set, or indefinitely)'),
+                                  required=False, default=0)
         cmd_autohold.set_defaults(func=self.autohold)

         cmd_autohold_list = subparsers.add_parser(
@@ -182,13 +188,15 @@ class Client(zuul.cmd.ZuulApp):
             print("Change and ref can't be both used for the same request")
             return False

+        node_hold_expiration = self.args.node_hold_expiration
         r = client.autohold(tenant=self.args.tenant,
                             project=self.args.project,
                             job=self.args.job,
                             change=self.args.change,
                             ref=self.args.ref,
                             reason=self.args.reason,
-                            count=self.args.count)
+                            count=self.args.count,
+                            node_hold_expiration=node_hold_expiration)
         return r

     def autohold_list(self):
@@ -209,7 +217,7 @@ class Client(zuul.cmd.ZuulApp):
             # The key comes to us as a CSV string because json doesn't like
             # non-str keys.
             tenant_name, project_name, job_name, ref_filter = key.split(',')
-            count, reason = value
+            count, reason, node_hold_expiration = value

             table.add_row([
                 tenant_name, project_name, job_name, ref_filter, count, reason
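
A side note on the unpacking above: the scheduler keys its registry with a (tenant, project, job, ref_filter) tuple and the value is now a 3-tuple, so the RPC layer has to flatten both for JSON. A minimal standalone sketch of that round trip, with made-up request values (not a claim about the exact RPC payload):

    import json

    autohold_requests = {
        ('tenant-one', 'org/project', 'project-test2', '.*'):
            (1, 'reason text', 30),
    }

    # json can't use tuple keys, so keys travel as CSV strings; tuple
    # values come back as lists on the other side.
    wire = {','.join(key): value for key, value in autohold_requests.items()}
    decoded = json.loads(json.dumps(wire))

    for key, value in decoded.items():
        tenant_name, project_name, job_name, ref_filter = key.split(',')
        count, reason, node_hold_expiration = value
        print(tenant_name, job_name, count, reason, node_hold_expiration)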

@@ -390,6 +390,7 @@ class Node(object):
         self.provider = None
         self.region = None
         self.username = None
+        self.hold_expiration = None

     @property
     def state(self):

@@ -88,7 +88,9 @@ class Nodepool(object):
         associated with the given NodeSet.
         '''
         self.log.info("Holding nodeset %s" % (nodeset,))
-        (hold_iterations, reason) = self.sched.autohold_requests[autohold_key]
+        (hold_iterations,
+         reason,
+         node_hold_expiration) = self.sched.autohold_requests[autohold_key]
         nodes = nodeset.getNodes()

         for node in nodes:
@@ -97,6 +99,8 @@ class Nodepool(object):
             node.state = model.STATE_HOLD
             node.hold_job = " ".join(autohold_key)
             node.comment = reason
+            if node_hold_expiration:
+                node.hold_expiration = node_hold_expiration
             self.sched.zk.storeNode(node)

         # We remove the autohold when the number of nodes in hold
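
Note that Zuul only records the TTL on the held node here; acting on it is up to nodepool. As a rough illustration of the semantics described in the option's help text (per-request TTL first, then nodepool's max-hold-age, otherwise hold indefinitely), using a hypothetical helper and field names borrowed from the fake nodepool data earlier in this change:

    import time

    def held_node_expired(node, max_hold_age=0):
        # Hypothetical helper, not nodepool code: prefer the per-request
        # TTL, fall back to a site-wide max-hold-age; 0/None means the
        # node stays held until an operator releases it.
        ttl = node.get('hold_expiration') or max_hold_age
        if not ttl:
            return False
        return time.time() - node['updated_time'] > int(ttl)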

@@ -48,14 +48,16 @@ class RPCClient(object):
         self.log.debug("Job complete, success: %s" % (not job.failure))
         return job

-    def autohold(self, tenant, project, job, change, ref, reason, count):
+    def autohold(self, tenant, project, job, change, ref, reason, count,
+                 node_hold_expiration=None):
         data = {'tenant': tenant,
                 'project': project,
                 'job': job,
                 'change': change,
                 'ref': ref,
                 'reason': reason,
-                'count': count}
+                'count': count,
+                'node_hold_expiration': node_hold_expiration}
         return not self.submitJob('zuul:autohold', data).failure

     def autohold_list(self):
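
For completeness, a caller-side sketch of the extended signature; the gearman host/port and the request values are placeholders (the new scheduler test exercises the same call against the fake gearman server):

    import zuul.rpcclient

    # 4730 is gearman's default port; use the actual server address here.
    client = zuul.rpcclient.RPCClient('127.0.0.1', 4730)
    ok = client.autohold(tenant='tenant-one', project='org/project',
                         job='project-test2', change='', ref='',
                         reason='reason text', count=1,
                         node_hold_expiration=30)
    print('autohold accepted:', ok)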

@@ -172,6 +172,7 @@ class RPCListener(object):
             return

         params['count'] = args['count']
+        params['node_hold_expiration'] = args['node_hold_expiration']
         self.sched.autohold(**params)
         job.sendWorkComplete()

@@ -439,14 +439,14 @@ class Scheduler(threading.Thread):
     # TODOv3(jeblair): reconfigure time should be per-tenant

     def autohold(self, tenant_name, project_name, job_name, ref_filter,
-                 reason, count):
+                 reason, count, node_hold_expiration):
         key = (tenant_name, project_name, job_name, ref_filter)
         if count == 0 and key in self.autohold_requests:
             self.log.debug("Removing autohold for %s", key)
             del self.autohold_requests[key]
         else:
             self.log.debug("Autohold requested for %s", key)
-            self.autohold_requests[key] = (count, reason)
+            self.autohold_requests[key] = (count, reason, node_hold_expiration)

     def promote(self, tenant_name, pipeline_name, change_ids):
         event = PromoteEvent(tenant_name, pipeline_name, change_ids)
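
To summarize the registry behaviour after this change, a standalone mock of the bookkeeping (not the Scheduler class itself): a count of 0 clears an existing request, anything else stores the new 3-tuple:

    autohold_requests = {}

    def autohold(tenant, project, job, ref_filter, reason, count,
                 node_hold_expiration):
        key = (tenant, project, job, ref_filter)
        if count == 0 and key in autohold_requests:
            # count=0 is the "remove this hold request" signal
            del autohold_requests[key]
        else:
            autohold_requests[key] = (count, reason, node_hold_expiration)

    autohold('tenant-one', 'org/project', 'project-test2', '.*',
             'reason text', 1, 30)
    assert autohold_requests[
        ('tenant-one', 'org/project', 'project-test2', '.*')
    ] == (1, 'reason text', 30)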