Merge "Add host-key-checking to metastatic driver"

Zuul 2024-02-27 19:03:48 +00:00 committed by Gerrit Code Review
commit fe70068909
7 changed files with 188 additions and 17 deletions

View File

@@ -107,6 +107,23 @@ itself, which is "meta".
to apply to all pools within that provider, or it can be
overridden here for a specific pool.
.. attr:: host-key-checking
:type: bool
:default: False
Whether to validate SSH host keys. When true, this helps
ensure that nodes are ready to receive SSH connections
before they are supplied to the requestor. When set to
false, nodepool-launcher will not attempt to ssh-keyscan
nodes after they are booted. Unlike other drivers, this
defaults to false here because it is presumed that the
backing node has already been checked for connectivity.
Enabling it here will cause the launcher to check
connectivity each time it allocates a new slot on the
backing node, and if a check fails, it will mark the backing
node as failed and stop allocating any more slots on that
node.
.. attr:: node-attributes
:type: dict
@@ -175,3 +192,23 @@ itself, which is "meta".
last requested node is deleted to see if new requests
arrive for this label before deleting the backing node.
Set this value to the amount of time in seconds to wait.
.. attr:: host-key-checking
:type: bool
:default: False
Whether to validate SSH host keys. When true, this helps
ensure that nodes are ready to receive SSH connections
before they are supplied to the requestor. When set to
false, nodepool-launcher will not attempt to ssh-keyscan
nodes after they are booted. Unlike other drivers, this
defaults to false here because it is presumed that the
backing node has already been checked for connectivity.
Enabling it here will cause the launcher to check
connectivity each time it allocates a new slot on the
backing node, and if a check fails, it will mark the backing
node as failed and stop allocating any more slots on that
node.
.. note:: This value will override the value for
:attr:`providers.[metastatic].pools.host-key-checking`.
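
As a rough illustration of how the two options interact, here is a minimal configuration sketch; the provider and label names (meta-provider, small-node, trusted-node) are invented for this example and are not part of this change. The pool-level setting supplies the default, and an individual label can override it:

providers:
  - name: meta-provider
    driver: metastatic
    pools:
      - name: main
        max-servers: 10
        # Re-check connectivity to the backing node each time a new
        # slot is allocated on it.
        host-key-checking: true
        labels:
          - name: small-node
            backing-label: backing-label
            max-parallel-jobs: 4
          - name: trusted-node
            backing-label: backing-label
            max-parallel-jobs: 2
            # The label-level value overrides the pool default.
            host-key-checking: false

With checking enabled, a failed keyscan marks the backing node as failed, no further slots are allocated on it, and it is cleaned up once its remaining nodes are deleted.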

View File

@@ -220,6 +220,8 @@ class BackingNodeRecord:
self.last_used = time.time()
def hasAvailableSlot(self):
if self.failed:
return False
return None in self.allocated_nodes
def isEmpty(self):
@@ -297,6 +299,8 @@ class MetastaticAdapter(statemachine.Adapter):
# The label doesn't exist in our config any more,
# it must have been removed.
grace_time = 0
if bnr.failed:
grace_time = 0
if (bnr.isEmpty() and
now - bnr.last_used > grace_time):
self.log.info("Backing node %s has been idle for "
@@ -325,6 +329,21 @@ class MetastaticAdapter(statemachine.Adapter):
def getQuotaForLabel(self, label):
return QuotaInformation(instances=1)
def notifyNodescanFailure(self, label, external_id):
with self.allocation_lock:
backing_node_records = self.backing_node_records.get(
label.name, [])
for bnr in backing_node_records:
if bnr.backsNode(external_id):
self.log.info(
"Nodescan failure of %s on %s, failing backing node",
external_id, bnr.node_id)
bnr.failed = True
backing_node = self._getNode(bnr.node_id)
backing_node.user_data = self._makeBackingNodeUserData(bnr)
self.zk.storeNode(backing_node)
return
# Local implementation below
def _init(self):
@@ -352,6 +371,7 @@ class MetastaticAdapter(statemachine.Adapter):
backing_node_record = BackingNodeRecord(user_data['label'],
user_data['slots'])
backing_node_record.node_id = node.id
backing_node_record.failed = user_data.get('failed', False)
self.log.info("Found backing node %s for %s",
node.id, user_data['label'])
self._addBackingNode(user_data['label'],
@@ -381,6 +401,9 @@ class MetastaticAdapter(statemachine.Adapter):
# if we have room for the label, allocate and return existing slot
# otherwise, make a new backing node
with self.allocation_lock:
# First, find out if we're retrying a request for the same
# node id; if so, immediately deallocate the old one.
self._deallocateBackingNodeInner(node_id)
backing_node_record = None
for bnr in self.backing_node_records.get(label.name, []):
if bnr.hasAvailableSlot():
@@ -411,15 +434,26 @@ class MetastaticAdapter(statemachine.Adapter):
def _deallocateBackingNode(self, node_id):
self._init()
with self.allocation_lock:
for label_name, backing_node_records in \
self.backing_node_records.items():
for bn in backing_node_records:
if bn.backsNode(node_id):
slot = bn.deallocateSlot(node_id)
self.log.info(
"Unassigned node %s from backing node %s slot %s",
node_id, bn.node_id, slot)
return
self._deallocateBackingNodeInner(node_id)
def _deallocateBackingNodeInner(self, node_id):
for label_name, backing_node_records in \
self.backing_node_records.items():
for bnr in backing_node_records:
if bnr.backsNode(node_id):
slot = bnr.deallocateSlot(node_id)
self.log.info(
"Unassigned node %s from backing node %s slot %s",
node_id, bnr.node_id, slot)
return
def _makeBackingNodeUserData(self, bnr):
return json.dumps({
'owner': self.my_id,
'label': bnr.label_name,
'slots': bnr.slot_count,
'failed': bnr.failed,
})
def _checkBackingNodeRequests(self):
self._init()
@@ -452,11 +486,7 @@ class MetastaticAdapter(statemachine.Adapter):
node = self._getNode(node_id)
self.zk.lockNode(node, blocking=True, timeout=30,
ephemeral=False, identifier=self.my_id)
node.user_data = json.dumps({
'owner': self.my_id,
'label': bnr.label_name,
'slots': bnr.slot_count,
})
node.user_data = self._makeBackingNodeUserData(bnr)
node.state = zk.IN_USE
self.zk.storeNode(node)
self.zk.deleteNodeRequest(request)

View File

@@ -45,7 +45,8 @@ class MetastaticLabel(ConfigValue):
self.cloud_image = MetastaticCloudImage()
self.max_parallel_jobs = label.get('max-parallel-jobs', 1)
self.grace_time = label.get('grace-time', 60)
self.host_key_checking = self.pool.host_key_checking
self.host_key_checking = label.get('host-key-checking',
self.pool.host_key_checking)
@staticmethod
def getSchema():
@@ -54,6 +55,7 @@ class MetastaticLabel(ConfigValue):
v.Required('backing-label'): str,
'max-parallel-jobs': int,
'grace-time': int,
'host-key-checking': bool,
}
def isBackingConfigEqual(self, other):
@@ -74,7 +76,9 @@ class MetastaticPool(ConfigPool):
self.labels = {}
# We will just use the interface_ip of the backing node
self.use_internal_ip = False
self.host_key_checking = False
# Allow extra checking of the backing node to detect failure
# after it's already running.
self.host_key_checking = pool_config.get('host-key-checking', False)
self.load(pool_config)
@@ -95,6 +99,7 @@ class MetastaticPool(ConfigPool):
v.Required('name'): str,
v.Required('labels'): [label],
'max-servers': int,
'host-key-checking': bool,
})
return pool

View File

@@ -344,6 +344,8 @@ class StateMachineNodeLauncher(stats.StatsReporter):
if isinstance(e, exceptions.LaunchKeyscanException):
try:
label = self.handler.pool.labels[node.type[0]]
self.manager.adapter.notifyNodescanFailure(
label, node.external_id)
console = self.manager.adapter.getConsoleLog(
label, node.external_id)
if console:
@@ -1651,7 +1653,7 @@ class Adapter:
"""
raise NotImplementedError()
# The following method is optional
# The following methods are optional
def getConsoleLog(self, label, external_id):
"""Return the console log from the specified server
@@ -1659,3 +1661,11 @@ class Adapter:
:param external_id str: The external id of the server
"""
raise NotImplementedError()
def notifyNodescanFailure(self, label, external_id):
"""Notify the adapter of a nodescan failure
:param label ConfigLabel: The label config for the node
:param external_id str: The external id of the server
"""
pass

View File

@@ -49,6 +49,7 @@ providers:
- name: main
max-servers: 10
priority: 1
host-key-checking: true
node-attributes:
metaattr: meta
testattr: metastatic
@@ -57,3 +58,4 @@ providers:
backing-label: backing-label
max-parallel-jobs: 2
grace-time: 2
host-key-checking: true

View File

@@ -17,11 +17,13 @@ import json
import logging
import os
import fixtures
import testtools
from nodepool import tests
from nodepool.zk import zookeeper as zk
from nodepool.driver.statemachine import StateMachineProvider
import nodepool.driver.statemachine
from nodepool.cmd.config_validator import ConfigValidator
@@ -228,3 +230,79 @@ class TestDriverMetastatic(tests.DBTestCase):
self.waitForNodeDeletion(bn1)
nodes = self._getNodes()
self.assertEqual(nodes, [])
def test_metastatic_nodescan(self):
# Test that a nodescan failure takes a backing node out of service
counter = -1
# bn1, node1, node2
fail = [False, False, True]
orig_advance = nodepool.driver.statemachine.NodescanRequest.advance
def handler(*args, **kw):
nonlocal counter, fail
counter += 1
if counter >= len(fail):
return orig_advance(*args, **kw)
if fail[counter]:
raise nodepool.exceptions.ConnectionTimeoutException()
return orig_advance(*args, **kw)
self.useFixture(fixtures.MonkeyPatch(
'nodepool.driver.statemachine.NodescanRequest.advance',
handler))
configfile = self.setup_config('metastatic.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.startPool(pool)
manager = pool.getProviderManager('fake-provider')
manager.adapter._client.create_image(name="fake-image")
# Launch one metastatic node on a backing node
node1 = self._requestNode()
nodes = self._getNodes()
self.assertEqual(len(nodes), 2)
bn1 = nodes[1]
self.assertEqual(bn1.provider, 'fake-provider')
self.assertEqual(bn1.id, node1.driver_data['backing_node'])
# Launch a second one with a failed nodescan; should have a
# second backing node
node2 = self._requestNode()
nodes = self._getNodes()
bn2 = nodes[3]
# Reload bn1 since its userdata changed
self.assertEqual(bn1.id, nodes[1].id)
bn1 = nodes[1]
self.assertNotEqual(bn1.id, bn2.id)
self.assertEqual(nodes, [node1, bn1, node2, bn2])
self.assertEqual(bn2.id, node2.driver_data['backing_node'])
# Allocate a third node, should use the second backing node
node3 = self._requestNode()
nodes = self._getNodes()
self.assertEqual(nodes, [node1, bn1, node2, bn2, node3])
self.assertEqual(bn2.id, node3.driver_data['backing_node'])
# Delete node3, verify that both backing nodes exist
node3.state = zk.DELETING
self.zk.storeNode(node3)
self.waitForNodeDeletion(node3)
nodes = self._getNodes()
self.assertEqual(nodes, [node1, bn1, node2, bn2])
# Delete node2, verify that only the first backing node exists
node2.state = zk.DELETING
self.zk.storeNode(node2)
self.waitForNodeDeletion(node2)
self.waitForNodeDeletion(bn2)
nodes = self._getNodes()
self.assertEqual(nodes, [node1, bn1])
# Delete node1, verify that no nodes exist
node1.state = zk.DELETING
self.zk.storeNode(node1)
self.waitForNodeDeletion(node1)
self.waitForNodeDeletion(bn1)
nodes = self._getNodes()
self.assertEqual(nodes, [])

View File

@@ -0,0 +1,9 @@
---
features:
- |
The metastatic driver now includes the
:attr:`providers.[metastatic].pools.host-key-checking` and
:attr:`providers.[metastatic].pools.labels.host-key-checking`
options. These can be used to verify that a backing node is still
functioning correctly before allocating new metastatic nodes to
it.