Merge "Clean held nodes automatically after configurable timeout"

Zuul authored 2018-02-21 15:39:04 +00:00; committed by Gerrit Code Review
commit a3aec88423
8 changed files with 321 additions and 1 deletion

View File

@@ -146,6 +146,17 @@ written to run on nodes of a certain label. Example::
to -1 to have the label considered disabled, so that no nodes will
be created at all.
.. _maxholdage:
max-hold-age
------------
Maximum number of seconds a node may remain in the "hold" state. If
this limit is exceeded, the node will be deleted. A value of 0
disables the check. Defaults to 0.
This setting is applied to all nodes, regardless of label or provider.
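For example, to delete held nodes automatically after one day::

  max-hold-age: 86400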
.. _diskimages:
diskimages

View File

@@ -68,6 +68,7 @@ class ConfigValidator:
'providers': list,
'labels': [label],
'diskimages': [diskimage],
'max-hold-age': int,
}
log.info("validating %s" % self.config_file)
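The validator is built on voluptuous-style schemas, so the new key only has to be declared as a top-level int. A minimal sketch of that check in isolation (the schema below is a hypothetical reduction, not the full nodepool schema):

import voluptuous as v

# Hypothetical reduction: validate only the new top-level key and
# allow everything else through.
schema = v.Schema({'max-hold-age': int}, extra=v.ALLOW_EXTRA)

schema({'max-hold-age': 3600})        # accepted: value is an int
try:
    schema({'max-hold-age': 'soon'})  # rejected: not an int
except v.Invalid:
    pass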

View File

@@ -16,6 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
import yaml
@@ -115,6 +116,9 @@ def loadConfig(config_path):
newconfig.provider_managers = {}
newconfig.zookeeper_servers = {}
newconfig.diskimages = {}
newconfig.max_hold_age = config.get('max-hold-age', math.inf)
if newconfig.max_hold_age <= 0:
newconfig.max_hold_age = math.inf
for server in config.get('zookeeper-servers', []):
z = zk.ZooKeeperConnectionConfig(server['host'],
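The loader maps the disabled value (0 or below, or an unset option) to math.inf, so the cleanup code can take min() against per-node TTLs without special-casing "disabled". A standalone sketch of that normalization (function name hypothetical):

import math

def normalize_max_hold_age(raw):
    # 0, a negative value, or an unset option all mean "no limit";
    # math.inf keeps later min()/comparison logic uniform.
    if raw is None or raw <= 0:
        return math.inf
    return raw

assert normalize_max_hold_age(None) == math.inf   # option not set
assert normalize_max_hold_age(0) == math.inf      # explicitly disabled
assert normalize_max_hold_age(7200) == 7200       # two-hour limit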

View File

@@ -17,6 +17,7 @@
# limitations under the License.
import logging
import math
import os
import os.path
import socket
@@ -538,6 +539,62 @@ class CleanupWorker(BaseCleanupWorker):
finally:
zk_conn.unlockNode(node)
def _cleanupMaxHoldAge(self):
'''
Delete any held node whose time in the "hold" state has exceeded
the effective limit: the smaller of its own hold_expiration and
the global max-hold-age setting.
'''
self.log.debug('Cleaning up held nodes...')
zk_conn = self._nodepool.getZK()
held_nodes = [n for n in zk_conn.nodeIterator() if n.state == zk.HOLD]
for node in held_nodes:
# Can't do anything if we aren't configured for this provider.
if node.provider not in self._nodepool.config.providers:
continue
if node.hold_expiration is not None and node.hold_expiration > 0:
expiration = node.hold_expiration
else:
expiration = math.inf
max_uptime = min(expiration, self._nodepool.config.max_hold_age)
if math.isinf(max_uptime):
continue
# check state time against now
now = int(time.time())
if (now - node.state_time) < max_uptime:
continue
try:
zk_conn.lockNode(node, blocking=False)
except exceptions.ZKLockException:
continue
# Double check the state now that we have a lock since it
# may have changed on us.
if node.state != zk.HOLD:
zk_conn.unlockNode(node)
continue
self.log.debug("Node %s exceeds max hold age (%s): %s >= %s",
node.id,
("manual setting"
and node.hold_expiration == max_uptime
or "configuration setting"),
now - node.state_time,
max_uptime)
try:
node.state = zk.DELETING
zk_conn.storeNode(node)
except Exception:
self.log.exception(
"Failure marking aged node %s for delete:", node.id)
finally:
zk_conn.unlockNode(node)
def _run(self):
'''
Catch exceptions individually so that other cleanup routines may
@@ -567,6 +624,12 @@ class CleanupWorker(BaseCleanupWorker):
self.log.exception(
"Exception in CleanupWorker (max ready age cleanup):")
try:
self._cleanupMaxHoldAge()
except Exception:
self.log.exception(
"Exception in CleanupWorker (max hold age cleanup):")
class DeletedNodeWorker(BaseCleanupWorker):
'''
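The decision in _cleanupMaxHoldAge reduces to: take the smaller of the node's own hold_expiration and the global max-hold-age, treat math.inf as "hold forever", and mark the node for deletion once its time in state reaches that limit. The same logic as a standalone sketch (helper name hypothetical):

import math
import time

def held_past_limit(state_time, hold_expiration, max_hold_age):
    # A positive per-node TTL participates; otherwise only the global
    # limit applies. The stricter (smaller) of the two wins.
    if hold_expiration is not None and hold_expiration > 0:
        expiration = hold_expiration
    else:
        expiration = math.inf
    max_uptime = min(expiration, max_hold_age)
    if math.isinf(max_uptime):
        return False                  # no limit configured anywhere
    return (int(time.time()) - state_time) >= max_uptime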

View File

@@ -0,0 +1,50 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
max-hold-age: 2
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
labels:
- name: fake-label
min-ready: 1
providers:
- name: fake-provider
cloud: fake
driver: fake
region-name: fake-region
rate: 0.0001
diskimages:
- name: fake-image
meta:
key: value
key2: value
pools:
- name: main
max-servers: 96
availability-zones:
- az1
networks:
- net-name
labels:
- name: fake-label
diskimage: fake-image
min-ram: 8192
flavor-name: 'Fake'
diskimages:
- name: fake-image
elements:
- fedora
- vm
release: 21
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@@ -0,0 +1,50 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
max-hold-age: 10
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
labels:
- name: fake-label
min-ready: 2
providers:
- name: fake-provider
cloud: fake
driver: fake
region-name: fake-region
rate: 0.0001
diskimages:
- name: fake-image
meta:
key: value
key2: value
pools:
- name: main
max-servers: 96
availability-zones:
- az1
networks:
- net-name
labels:
- name: fake-label
diskimage: fake-image
min-ram: 8192
flavor-name: 'Fake'
diskimages:
- name: fake-image
elements:
- fedora
- vm
release: 21
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@@ -660,6 +660,143 @@ class TestLauncher(tests.DBTestCase):
manager = pool.getProviderManager('fake-provider')
self.waitForInstanceDeletion(manager, nodes[0].external_id)
def test_max_hold_age(self):
"""Test a held node with exceeded max-hold-age is deleted"""
configfile = self.setup_config('node_max_hold_age.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.useBuilder(configfile)
pool.start()
self.waitForImage('fake-provider', 'fake-image')
self.log.debug("Waiting for initial pool...")
nodes = self.waitForNodes('fake-label')
self.log.debug("...done waiting for initial pool.")
node = nodes[0]
self.log.debug("Holding node %s..." % node.id)
# hold the node
node.state = zk.HOLD
node.comment = 'testing'
self.zk.lockNode(node, blocking=False)
self.zk.storeNode(node)
self.zk.unlockNode(node)
znode = self.zk.getNode(node.id)
self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
# Wait for the instance to be cleaned up
manager = pool.getProviderManager('fake-provider')
self.waitForInstanceDeletion(manager, node.external_id)
def test_hold_expiration_no_default(self):
"""Test a held node is deleted when past its operator-specified TTL,
no max-hold-age set"""
configfile = self.setup_config('node_max_ready_age.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.useBuilder(configfile)
pool.start()
self.waitForImage('fake-provider', 'fake-image')
self.log.debug("Waiting for initial pool...")
nodes = self.waitForNodes('fake-label')
self.log.debug("...done waiting for initial pool.")
node = nodes[0]
self.log.debug("Holding node %s..." % node.id)
# hold the node
node.state = zk.HOLD
node.comment = 'testing'
node.hold_expiration = 5
self.zk.lockNode(node, blocking=False)
self.zk.storeNode(node)
self.zk.unlockNode(node)
znode = self.zk.getNode(node.id)
self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
# Wait for the instance to be cleaned up
manager = pool.getProviderManager('fake-provider')
self.waitForInstanceDeletion(manager, node.external_id)
def test_hold_expiration_lower_than_default(self):
"""Test a held node is deleted when past its operator-specified TTL,
with max-hold-age set in the configuration"""
configfile = self.setup_config('node_max_hold_age_2.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.useBuilder(configfile)
pool.start()
self.waitForImage('fake-provider', 'fake-image')
self.log.debug("Waiting for initial pool...")
nodes = self.waitForNodes('fake-label', 2)
self.log.debug("...done waiting for initial pool.")
node_custom = nodes[0]
# TODO make it a fraction of fixture's max-hold-age
hold_expiration = 2
node = nodes[1]
self.log.debug("Holding node %s... (default)" % node.id)
self.log.debug("Holding node %s...(%s seconds)" % (node_custom.id,
hold_expiration))
# hold the nodes
node.state = zk.HOLD
node.comment = 'testing'
node_custom.state = zk.HOLD
node_custom.comment = 'testing hold_expiration'
node_custom.hold_expiration = hold_expiration
self.zk.lockNode(node, blocking=False)
self.zk.storeNode(node)
self.zk.unlockNode(node)
self.zk.lockNode(node_custom, blocking=False)
self.zk.storeNode(node_custom)
self.zk.unlockNode(node_custom)
znode = self.zk.getNode(node.id)
self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
znode_custom = self.zk.getNode(node_custom.id)
self.log.debug("Node %s in state '%s'" % (znode_custom.id,
znode_custom.state))
# Wait for the instance to be cleaned up
manager = pool.getProviderManager('fake-provider')
self.waitForInstanceDeletion(manager, node_custom.external_id)
# control node should still be held
held_nodes = [n for n in self.zk.nodeIterator() if n.state == zk.HOLD]
self.assertTrue(any(n.id == node.id for n in held_nodes),
held_nodes)
# finally, control node gets deleted
self.waitForInstanceDeletion(manager, node.external_id)
def test_hold_expiration_higher_than_default(self):
"""Test a held node is deleted after max-hold-age seconds if the
operator specifies a larger TTL"""
configfile = self.setup_config('node_max_hold_age_2.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.useBuilder(configfile)
pool.start()
self.waitForImage('fake-provider', 'fake-image')
self.log.debug("Waiting for initial pool...")
nodes = self.waitForNodes('fake-label', 2)
self.log.debug("...done waiting for initial pool.")
node_custom = nodes[0]
# TODO make it a multiple of fixture's max-hold-age
hold_expiration = 20
node = nodes[1]
self.log.debug("Holding node %s... (default)" % node.id)
self.log.debug("Holding node %s...(%s seconds)" % (node_custom.id,
hold_expiration))
# hold the nodes
node.state = zk.HOLD
node.comment = 'testing'
node_custom.state = zk.HOLD
node_custom.comment = 'testing hold_expiration'
node_custom.hold_expiration = hold_expiration
self.zk.lockNode(node, blocking=False)
self.zk.storeNode(node)
self.zk.unlockNode(node)
self.zk.lockNode(node_custom, blocking=False)
self.zk.storeNode(node_custom)
self.zk.unlockNode(node_custom)
znode = self.zk.getNode(node.id)
self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
znode_custom = self.zk.getNode(node_custom.id)
self.log.debug("Node %s in state '%s'" % (znode_custom.id,
znode_custom.state))
# Wait for the instance to be cleaned up
manager = pool.getProviderManager('fake-provider')
self.waitForInstanceDeletion(manager, node.external_id)
# custom node should be deleted as well
held_nodes = [n for n in self.zk.nodeIterator() if n.state == zk.HOLD]
self.assertEqual(0, len(held_nodes), held_nodes)
def test_label_provider(self):
"""Test that only providers listed in the label satisfy the request"""
configfile = self.setup_config('node_label_provider.yaml')

View File

@@ -447,6 +447,7 @@ class Node(BaseModel):
self.username = None
self.connection_type = None
self.host_keys = []
self.hold_expiration = None
def __repr__(self):
d = self.toDict()
@@ -479,7 +480,8 @@
self.hold_job == other.hold_job and
self.username == other.username and
self.connection_type == other.connection_type and
self.host_keys == other.host_keys)
self.host_keys == other.host_keys and
self.hold_expiration == other.hold_expiration)
else:
return False
@@ -513,6 +515,7 @@
d['host_keys'] = self.host_keys
d['username'] = self.username
d['connection_type'] = self.connection_type
d['hold_expiration'] = self.hold_expiration
return d
@staticmethod
@@ -549,6 +552,7 @@
o.username = d.get('username', 'zuul')
o.connection_type = d.get('connection_type')
o.host_keys = d.get('host_keys', [])
o.hold_expiration = d.get('hold_expiration')
return o
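Since fromDict falls back to None when the key is missing, node records written before this change deserialize cleanly. A minimal round-trip sketch (assuming the Node API shown above is importable as nodepool.zk):

from nodepool import zk

node = zk.Node()
assert node.hold_expiration is None        # default from __init__

node.hold_expiration = 3600
d = node.toDict()
assert d['hold_expiration'] == 3600        # key included when stored

restored = zk.Node.fromDict(d, node.id)    # signature assumed: (d, id)
assert restored.hold_expiration == 3600    # survives the round trip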