From 41c968e3acfb2bd7e728ac7a325feb7bc0fbd7f2 Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Wed, 5 Dec 2018 10:18:41 +0100 Subject: [PATCH] Make estimatedNodepoolQuotaUsed more resilient We had the case that we stored znodes without pool or type. At least znodes without type break the quota calculation and can lead to wedged providers. So make that more resilient and log exceptions per node instead of failing the complete calculation. This way we don't wedge in case we have bogus data in zk while still being able to debug what's wrong with certain znodes. Change-Id: I4a33ffbbf3684dc3830913ed8dc7b158f2426602 --- nodepool/driver/openstack/provider.py | 44 +++++++++++++++------------ nodepool/tests/unit/test_launcher.py | 33 ++++++++++++++++++++ 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/nodepool/driver/openstack/provider.py b/nodepool/driver/openstack/provider.py index a78255b5f..b9a1462d8 100755 --- a/nodepool/driver/openstack/provider.py +++ b/nodepool/driver/openstack/provider.py @@ -151,26 +151,30 @@ class OpenStackProvider(Provider): for node in self._zk.nodeIterator(): if node.provider == self.provider.name: - if pool and not node.pool == pool.name: - continue - provider_pool = self.provider.pools.get(node.pool) - if not provider_pool: - self.log.warning( - "Cannot find provider pool for node %s" % node) - # This node is in a funny state we log it for debugging - # but move on and don't account it as we can't properly - # calculate its cost without pool info. - continue - if node.type[0] not in provider_pool.labels: - self.log.warning( - "Node type is not in provider pool for node %s" % node) - # This node is also in a funny state; the config - # may have changed under it. It should settle out - # eventually when it's deleted. - continue - node_resources = self.quotaNeededByNodeType( - node.type[0], provider_pool) - used_quota.add(node_resources) + try: + if pool and not node.pool == pool.name: + continue + provider_pool = self.provider.pools.get(node.pool) + if not provider_pool: + self.log.warning( + "Cannot find provider pool for node %s" % node) + # This node is in a funny state we log it for debugging + # but move on and don't account it as we can't properly + # calculate its cost without pool info. + continue + if node.type[0] not in provider_pool.labels: + self.log.warning("Node type is not in provider pool " + "for node %s" % node) + # This node is also in a funny state; the config + # may have changed under it. It should settle out + # eventually when it's deleted. + continue + node_resources = self.quotaNeededByNodeType( + node.type[0], provider_pool) + used_quota.add(node_resources) + except Exception: + self.log.exception("Couldn't consider invalid node %s " + "for quota:" % node) return used_quota def unmanagedQuotaUsed(self): diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py index 3422b1027..acc07760f 100644 --- a/nodepool/tests/unit/test_launcher.py +++ b/nodepool/tests/unit/test_launcher.py @@ -729,6 +729,39 @@ class TestLauncher(tests.DBTestCase): # retries in config is set to 2, so 2 attempts to create a server self.assertEqual(0, manager.createServer_fails) + def test_node_launch_with_broken_znodes(self): + """Test that node launch still works if there are broken znodes""" + # Create a znode without type + znode = zk.Node() + znode.provider = 'fake-provider' + znode.pool = 'main' + znode.external_id = 'fakeid' + znode.state = zk.READY + + # Create znode without pool + self.zk.storeNode(znode) + znode = zk.Node() + znode.provider = 'fake-provider' + znode.type = ['fake-label'] + znode.external_id = 'fakeid' + znode.state = zk.READY + self.zk.storeNode(znode) + + configfile = self.setup_config('node_launch_retry.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.useBuilder(configfile) + pool.start() + self.wait_for_config(pool) + self.waitForImage('fake-provider', 'fake-image') + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + + req = self.waitForNodeRequest(req) + self.assertEqual(req.state, zk.FULFILLED) + def test_node_launch_retries_with_external_id(self): configfile = self.setup_config('node_launch_retry.yaml') pool = self.useNodepool(configfile, watermark_sleep=1)