Do not abort node launch if failed node cannot be deleted

There was a possibility for an uncaught exception when launching a
node: if the launch fails and the subsequent node cleaning fails
as well, nodepool would break out of the loop prematurely.

Also add request information when logging launch failure to make
debugging easier.

Change-Id: I4a08ebe7b0ce763b6ac4bcd314403b869e487ac1
This commit is contained in:
mhuin 2018-08-08 13:38:46 +02:00
parent 7fb7d69e3b
commit ebdc35fc5e
3 changed files with 83 additions and 5 deletions

View File

@ -300,6 +300,41 @@ class FakeUploadFailCloud(FakeOpenStackCloud):
return super(FakeUploadFailCloud, self).create_image(**kwargs)
class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud):
log = logging.getLogger("nodepool.FakeLaunchAndDeleteFailCloud")
def __init__(self, times_to_fail=None):
super(FakeLaunchAndDeleteFailCloud, self).__init__()
self.times_to_fail_delete = times_to_fail
self.times_to_fail_launch = times_to_fail
self.times_failed_delete = 0
self.times_failed_launch = 0
self.launch_success = False
self.delete_success = False
def wait_for_server(self, **kwargs):
if self.times_to_fail_launch is None:
raise Exception("Test fail server launch.")
if self.times_failed_launch < self.times_to_fail_launch:
self.times_failed_launch += 1
raise exceptions.ServerDeleteException("Test fail server launch.")
else:
self.launch_success = True
return super(FakeLaunchAndDeleteFailCloud,
self).wait_for_server(**kwargs)
def delete_server(self, *args, **kwargs):
if self.times_to_fail_delete is None:
raise exceptions.ServerDeleteException("Test fail server delete.")
if self.times_failed_delete < self.times_to_fail_delete:
self.times_failed_delete += 1
raise exceptions.ServerDeleteException("Test fail server delete.")
else:
self.delete_success = True
return super(FakeLaunchAndDeleteFailCloud,
self).delete_server(*args, **kwargs)
class FakeProvider(OpenStackProvider):
fake_cloud = FakeOpenStackCloud

View File

@ -232,13 +232,19 @@ class OpenStackNodeLauncher(NodeLauncher):
except Exception as e:
if attempts <= self._retries:
self.log.exception(
"Launch attempt %d/%d failed for node %s:",
attempts, self._retries, self.node.id)
"Request %s: Launch attempt %d/%d failed for node %s:",
self.handler.request.id, attempts,
self._retries, self.node.id)
# If we created an instance, delete it.
if self.node.external_id:
self.handler.manager.cleanupNode(self.node.external_id)
self.handler.manager.waitForNodeCleanup(
self.node.external_id)
deleting_node = zk.Node()
deleting_node.provider = self.node.provider
deleting_node.external_id = self.node.external_id
deleting_node.state = zk.DELETING
self.zk.storeNode(deleting_node)
self.log.info(
"Request %s: Node %s scheduled for cleanup",
self.handler.request.id, deleting_node.external_id)
self.node.external_id = None
self.node.public_ipv4 = None
self.node.public_ipv6 = None

View File

@ -1497,6 +1497,43 @@ class TestLauncher(tests.DBTestCase):
while self.zk.countPoolNodes('fake-provider', 'main'):
time.sleep(0)
def test_launchNode_delete_error(self):
'''
Test that the launcher keeps trying to spawn a node in case of a
delete error
'''
fake_client = fakeprovider.FakeLaunchAndDeleteFailCloud(
times_to_fail=1)
def get_fake_client(*args, **kwargs):
return fake_client
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider, '_getClient',
get_fake_client))
configfile = self.setup_config('node_launch_retry.yaml')
self.useBuilder(configfile)
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.cleanup_interval = 60
pool.start()
self.waitForImage('fake-provider', 'fake-image')
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.zk.storeNodeRequest(req)
req = self.waitForNodeRequest(req)
self.assertTrue(fake_client.launch_success)
self.assertTrue(fake_client.delete_success)
self.assertEqual(fake_client.times_to_fail_delete,
fake_client.times_failed_delete)
self.assertEqual(fake_client.times_to_fail_launch,
fake_client.times_failed_launch)
self.assertEqual(req.state, zk.FULFILLED)
self.assertEqual(len(req.nodes), 1)
@mock.patch(
'nodepool.driver.openstack.handler.OpenStackNodeRequestHandler.poll')
def test_handler_poll_session_expired(self, mock_poll):