Add max-age to metastatic driver

This allows the metastatic driver to gracefully remove a backing
node from service after a certain amount of time.  This forced
retirement can be used to periodically ensure that fresh backing
nodes are used even in busy systems (which can help ensure that,
over time, job behavior does not change based on the contents of
the backing node server).

Change-Id: I62a95411a5d0b75185739a3c2553c75124c78c25
This commit is contained in:
James E. Blair 2024-04-24 14:15:45 -07:00
parent c253ebfef3
commit a57231cb60
5 changed files with 73 additions and 1 deletions

View File

@ -203,6 +203,18 @@ itself, which is "meta".
used to ensure that the backing node is retained for at
least the minimum billing interval.
.. attr:: max-age
:type: int
If this value is set, the backing node will be removed
from service after this amount of time (in seconds) has
passed since the backing node was launched. After a
backing node reaches this point, any existing jobs will
be permitted to run to completion, but no new metastatic
nodes will be created with that backing node; once all
metastatic nodes using it have been deleted, the backing
node will be deleted. If set, this value must be greater
than or equal to ``min-retention-time``.
.. attr:: host-key-checking
:type: bool
:default: False

View File

@ -298,6 +298,15 @@ class MetastaticAdapter(statemachine.Adapter):
if label_config:
grace_time = label_config.grace_time
min_time = label_config.min_retention_time
if label_config.max_age:
if now - bnr.launched > label_config.max_age:
# Mark it as failed; even though it
# hasn't really failed, the lifecycle
# is the same: do not allocate any
# more jobs to this node but let any
# remaining ones finish, then delete
# ASAP.
bnr.failed = True
else:
# The label doesn't exist in our config any more,
# it must have been removed.

View File

@ -46,8 +46,12 @@ class MetastaticLabel(ConfigValue):
self.max_parallel_jobs = label.get('max-parallel-jobs', 1)
self.grace_time = label.get('grace-time', 60)
self.min_retention_time = label.get('min-retention-time', 0)
self.max_age = label.get('max-age', None)
self.host_key_checking = label.get('host-key-checking',
self.pool.host_key_checking)
if self.max_age and self.max_age < self.min_retention_time:
raise Exception("The max_age must be greater than or "
"equal to the min_retention_time")
@staticmethod
def getSchema():
@ -57,6 +61,7 @@ class MetastaticLabel(ConfigValue):
'max-parallel-jobs': int,
'grace-time': int,
'min-retention-time': int,
'max-age': int,
'host-key-checking': bool,
}
@ -66,7 +71,8 @@ class MetastaticLabel(ConfigValue):
self.backing_label == other.backing_label and
self.max_parallel_jobs == other.max_parallel_jobs and
self.grace_time == other.grace_time and
self.min_retention_time == other.min_retention_time
self.min_retention_time == other.min_retention_time and
self.max_age == other.max_age
)

View File

@ -66,6 +66,7 @@ providers:
backing-label: backing-label
max-parallel-jobs: 2
grace-time: 2
max-age: 300
host-key-checking: true
- name: user-label-min-retention
backing-label: backing-label-min-retention

View File

@ -372,3 +372,47 @@ class TestDriverMetastatic(tests.DBTestCase):
meta_manager.adapter.listResources()
nodes = self._getNodes()
self.waitForNodeDeletion(bn1)
def test_metastatic_max_age(self):
# Test the max-age option
# Verifies that once a backing node is older than the label's
# max-age, new metastatic nodes are placed on a different backing
# node rather than the aged one.
configfile = self.setup_config('metastatic.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.startPool(pool)
manager = pool.getProviderManager('fake-provider')
manager.adapter._client.create_image(name="fake-image")
# Launch one metastatic node on a backing node
node1 = self._requestNode()
nodes = self._getNodes()
# Two nodes are expected: the metastatic node and its backing node.
self.assertEqual(len(nodes), 2)
bn1 = nodes[1]
# Confirm that bn1 really is the backing node recorded on node1.
self.assertEqual(bn1.provider, 'fake-provider')
self.assertEqual(bn1.id, node1.driver_data['backing_node'])
# Create a second node and verify it uses the same backing node.
node2 = self._requestNode()
nodes = self._getNodes()
# Only one node was added, so no new backing node was launched.
self.assertEqual(len(nodes), 3)
self.assertEqual(bn1.id, node2.driver_data['backing_node'])
# Delete the second node.
node2.state = zk.DELETING
self.zk.storeNode(node2)
self.waitForNodeDeletion(node2)
nodes = self._getNodes()
# node1 and its backing node remain.
self.assertEqual(len(nodes), 2)
# Falsify the launch time so that the node is older than
# max_age (300).
meta_manager = pool.getProviderManager('meta-provider')
bnr = meta_manager.adapter.backing_node_records['user-label'][0]
# A launch time of 0 makes the record appear far older than max-age.
bnr.launched = 0
# This has the side effect of marking the backing node as failed.
meta_manager.adapter.listResources()
# Create another node and verify it gets a new backing node.
node3 = self._requestNode()
nodes = self._getNodes()
# Two nodes were added to the previous two: node3 and a new backing
# node. bn1 is presumably still present because node1 still uses it.
self.assertEqual(len(nodes), 4)
self.assertNotEqual(bn1.id, node3.driver_data['backing_node'])