From ebdc35fc5e96a15f83b29f031c54e159f7dfa04b Mon Sep 17 00:00:00 2001 From: mhuin Date: Wed, 8 Aug 2018 13:38:46 +0200 Subject: [PATCH] Do not abort node launch if failed node cannot be deleted There was a possibility for an uncaught exception when launching a node: if the launch fails and the subsequent node cleaning fails as well, nodepool would break out of the loop prematurely. Also add request information when logging launch failure to make debugging easier. Change-Id: I4a08ebe7b0ce763b6ac4bcd314403b869e487ac1 --- nodepool/driver/fake/provider.py | 35 ++++++++++++++++++++++++++ nodepool/driver/openstack/handler.py | 16 ++++++++---- nodepool/tests/test_launcher.py | 37 ++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 5 deletions(-) diff --git a/nodepool/driver/fake/provider.py b/nodepool/driver/fake/provider.py index 1230dc4e4..6e6fdcd4d 100644 --- a/nodepool/driver/fake/provider.py +++ b/nodepool/driver/fake/provider.py @@ -300,6 +300,41 @@ class FakeUploadFailCloud(FakeOpenStackCloud): return super(FakeUploadFailCloud, self).create_image(**kwargs) +class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud): + log = logging.getLogger("nodepool.FakeLaunchAndDeleteFailCloud") + + def __init__(self, times_to_fail=None): + super(FakeLaunchAndDeleteFailCloud, self).__init__() + self.times_to_fail_delete = times_to_fail + self.times_to_fail_launch = times_to_fail + self.times_failed_delete = 0 + self.times_failed_launch = 0 + self.launch_success = False + self.delete_success = False + + def wait_for_server(self, **kwargs): + if self.times_to_fail_launch is None: + raise Exception("Test fail server launch.") + if self.times_failed_launch < self.times_to_fail_launch: + self.times_failed_launch += 1 + raise exceptions.ServerDeleteException("Test fail server launch.") + else: + self.launch_success = True + return super(FakeLaunchAndDeleteFailCloud, + self).wait_for_server(**kwargs) + + def delete_server(self, *args, **kwargs): + if self.times_to_fail_delete is None: + raise exceptions.ServerDeleteException("Test fail server delete.") + if self.times_failed_delete < self.times_to_fail_delete: + self.times_failed_delete += 1 + raise exceptions.ServerDeleteException("Test fail server delete.") + else: + self.delete_success = True + return super(FakeLaunchAndDeleteFailCloud, + self).delete_server(*args, **kwargs) + + class FakeProvider(OpenStackProvider): fake_cloud = FakeOpenStackCloud diff --git a/nodepool/driver/openstack/handler.py b/nodepool/driver/openstack/handler.py index e89917ca1..171677840 100644 --- a/nodepool/driver/openstack/handler.py +++ b/nodepool/driver/openstack/handler.py @@ -232,13 +232,19 @@ class OpenStackNodeLauncher(NodeLauncher): except Exception as e: if attempts <= self._retries: self.log.exception( - "Launch attempt %d/%d failed for node %s:", - attempts, self._retries, self.node.id) + "Request %s: Launch attempt %d/%d failed for node %s:", + self.handler.request.id, attempts, + self._retries, self.node.id) # If we created an instance, delete it. if self.node.external_id: - self.handler.manager.cleanupNode(self.node.external_id) - self.handler.manager.waitForNodeCleanup( - self.node.external_id) + deleting_node = zk.Node() + deleting_node.provider = self.node.provider + deleting_node.external_id = self.node.external_id + deleting_node.state = zk.DELETING + self.zk.storeNode(deleting_node) + self.log.info( + "Request %s: Node %s scheduled for cleanup", + self.handler.request.id, deleting_node.external_id) self.node.external_id = None self.node.public_ipv4 = None self.node.public_ipv6 = None diff --git a/nodepool/tests/test_launcher.py b/nodepool/tests/test_launcher.py index 7d646bc2f..df377430a 100644 --- a/nodepool/tests/test_launcher.py +++ b/nodepool/tests/test_launcher.py @@ -1497,6 +1497,43 @@ class TestLauncher(tests.DBTestCase): while self.zk.countPoolNodes('fake-provider', 'main'): time.sleep(0) + def test_launchNode_delete_error(self): + ''' + Test that the launcher keeps trying to spawn a node in case of a + delete error + ''' + fake_client = fakeprovider.FakeLaunchAndDeleteFailCloud( + times_to_fail=1) + + def get_fake_client(*args, **kwargs): + return fake_client + + self.useFixture(fixtures.MockPatchObject( + fakeprovider.FakeProvider, '_getClient', + get_fake_client)) + + configfile = self.setup_config('node_launch_retry.yaml') + self.useBuilder(configfile) + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.cleanup_interval = 60 + pool.start() + self.waitForImage('fake-provider', 'fake-image') + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + + req = self.waitForNodeRequest(req) + self.assertTrue(fake_client.launch_success) + self.assertTrue(fake_client.delete_success) + self.assertEqual(fake_client.times_to_fail_delete, + fake_client.times_failed_delete) + self.assertEqual(fake_client.times_to_fail_launch, + fake_client.times_failed_launch) + self.assertEqual(req.state, zk.FULFILLED) + self.assertEqual(len(req.nodes), 1) + @mock.patch( 'nodepool.driver.openstack.handler.OpenStackNodeRequestHandler.poll') def test_handler_poll_session_expired(self, mock_poll):