Make estimatedNodepoolQuotaUsed more resilient

We have seen cases where znodes were stored without a pool or type. At
least znodes without a type break the quota calculation and can lead to
wedged providers. Make the calculation more resilient by catching and
logging exceptions per node instead of failing the complete
calculation. This way we don't wedge on bogus data in ZooKeeper while
still being able to debug what's wrong with the affected znodes.

Change-Id: I4a33ffbbf3684dc3830913ed8dc7b158f2426602
Tobias Henkel 2018-12-05 10:18:41 +01:00
parent 6fe861f42a
commit 41c968e3ac
2 changed files with 57 additions and 20 deletions
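
The change boils down to a per-node try/except: any exception raised
while accounting a single node is logged with its traceback and the
loop moves on, so one bogus znode can no longer abort the whole quota
calculation. Below is a minimal standalone sketch of that pattern; the
`nodes` iterable and `cost` function are illustrative stand-ins, not
nodepool's API:

    import logging

    log = logging.getLogger("quota")

    def used_quota(nodes, cost):
        total = 0
        for node in nodes:
            try:
                # Per-node failures (missing pool, empty type, ...) are
                # contained here instead of aborting the whole loop.
                total += cost(node)
            except Exception:
                # log.exception() records the full traceback, which is
                # what keeps the broken znode debuggable after the fact.
                log.exception("Couldn't consider invalid node %s for quota:",
                              node)
        return total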


@@ -151,26 +151,30 @@ class OpenStackProvider(Provider):
         for node in self._zk.nodeIterator():
             if node.provider == self.provider.name:
-                if pool and not node.pool == pool.name:
-                    continue
-                provider_pool = self.provider.pools.get(node.pool)
-                if not provider_pool:
-                    self.log.warning(
-                        "Cannot find provider pool for node %s" % node)
-                    # This node is in a funny state we log it for debugging
-                    # but move on and don't account it as we can't properly
-                    # calculate its cost without pool info.
-                    continue
-                if node.type[0] not in provider_pool.labels:
-                    self.log.warning(
-                        "Node type is not in provider pool for node %s" % node)
-                    # This node is also in a funny state; the config
-                    # may have changed under it. It should settle out
-                    # eventually when it's deleted.
-                    continue
-                node_resources = self.quotaNeededByNodeType(
-                    node.type[0], provider_pool)
-                used_quota.add(node_resources)
+                try:
+                    if pool and not node.pool == pool.name:
+                        continue
+                    provider_pool = self.provider.pools.get(node.pool)
+                    if not provider_pool:
+                        self.log.warning(
+                            "Cannot find provider pool for node %s" % node)
+                        # This node is in a funny state we log it for debugging
+                        # but move on and don't account it as we can't properly
+                        # calculate its cost without pool info.
+                        continue
+                    if node.type[0] not in provider_pool.labels:
+                        self.log.warning("Node type is not in provider pool "
+                                         "for node %s" % node)
+                        # This node is also in a funny state; the config
+                        # may have changed under it. It should settle out
+                        # eventually when it's deleted.
+                        continue
+                    node_resources = self.quotaNeededByNodeType(
+                        node.type[0], provider_pool)
+                    used_quota.add(node_resources)
+                except Exception:
+                    self.log.exception("Couldn't consider invalid node %s "
+                                       "for quota:" % node)
 
         return used_quota
 
     def unmanagedQuotaUsed(self):
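
As a hedged illustration of why a missing type breaks the old code: the
quota path indexes `node.type[0]`, so a znode whose type was never
written blows up with an unhandled exception (an IndexError, assuming
the attribute defaults to an empty list). The stand-in class below is
illustrative only, not nodepool's zk.Node:

    # Stand-in for a znode stored without a type; not nodepool's zk.Node.
    class BrokenNode:
        def __init__(self):
            self.type = []  # no label was ever written for this node

    node = BrokenNode()
    try:
        node.type[0]  # the first thing the quota path does with node.type
    except IndexError:
        # Before this commit such an exception escaped the quota
        # calculation and could wedge the provider; now it is logged per
        # node and the bogus znode is skipped.
        print("empty type list -> IndexError")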


@@ -729,6 +729,39 @@ class TestLauncher(tests.DBTestCase):
         # retries in config is set to 2, so 2 attempts to create a server
         self.assertEqual(0, manager.createServer_fails)
 
+    def test_node_launch_with_broken_znodes(self):
+        """Test that node launch still works if there are broken znodes"""
+        # Create a znode without type
+        znode = zk.Node()
+        znode.provider = 'fake-provider'
+        znode.pool = 'main'
+        znode.external_id = 'fakeid'
+        znode.state = zk.READY
+        self.zk.storeNode(znode)
+
+        # Create znode without pool
+        znode = zk.Node()
+        znode.provider = 'fake-provider'
+        znode.type = ['fake-label']
+        znode.external_id = 'fakeid'
+        znode.state = zk.READY
+        self.zk.storeNode(znode)
+
+        configfile = self.setup_config('node_launch_retry.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.useBuilder(configfile)
+        pool.start()
+        self.wait_for_config(pool)
+        self.waitForImage('fake-provider', 'fake-image')
+
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(req.state, zk.FULFILLED)
+
     def test_node_launch_retries_with_external_id(self):
         configfile = self.setup_config('node_launch_retry.yaml')
         pool = self.useNodepool(configfile, watermark_sleep=1)
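
To exercise only the new test, nodepool's usual tox/stestr setup should
accept a test-id filter after `--`; something along these lines (the
py36 environment name and the module path are assumptions inferred from
the class name and may differ per checkout):

    tox -e py36 -- nodepool.tests.test_launcher.TestLauncher.test_node_launch_with_broken_znodes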