Merge "Do not abort node launch if failed node cannot be deleted"

This commit is contained in:
Zuul 2018-09-17 17:58:21 +00:00 committed by Gerrit Code Review
commit da8a94d341
3 changed files with 83 additions and 5 deletions

View File

@ -300,6 +300,41 @@ class FakeUploadFailCloud(FakeOpenStackCloud):
return super(FakeUploadFailCloud, self).create_image(**kwargs)
class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud):
log = logging.getLogger("nodepool.FakeLaunchAndDeleteFailCloud")
def __init__(self, times_to_fail=None):
super(FakeLaunchAndDeleteFailCloud, self).__init__()
self.times_to_fail_delete = times_to_fail
self.times_to_fail_launch = times_to_fail
self.times_failed_delete = 0
self.times_failed_launch = 0
self.launch_success = False
self.delete_success = False
def wait_for_server(self, **kwargs):
if self.times_to_fail_launch is None:
raise Exception("Test fail server launch.")
if self.times_failed_launch < self.times_to_fail_launch:
self.times_failed_launch += 1
raise exceptions.ServerDeleteException("Test fail server launch.")
else:
self.launch_success = True
return super(FakeLaunchAndDeleteFailCloud,
self).wait_for_server(**kwargs)
def delete_server(self, *args, **kwargs):
if self.times_to_fail_delete is None:
raise exceptions.ServerDeleteException("Test fail server delete.")
if self.times_failed_delete < self.times_to_fail_delete:
self.times_failed_delete += 1
raise exceptions.ServerDeleteException("Test fail server delete.")
else:
self.delete_success = True
return super(FakeLaunchAndDeleteFailCloud,
self).delete_server(*args, **kwargs)
class FakeProvider(OpenStackProvider):
fake_cloud = FakeOpenStackCloud

View File

@ -232,13 +232,19 @@ class OpenStackNodeLauncher(NodeLauncher):
except Exception as e:
if attempts <= self._retries:
self.log.exception(
"Launch attempt %d/%d failed for node %s:",
attempts, self._retries, self.node.id)
"Request %s: Launch attempt %d/%d failed for node %s:",
self.handler.request.id, attempts,
self._retries, self.node.id)
# If we created an instance, delete it.
if self.node.external_id:
self.handler.manager.cleanupNode(self.node.external_id)
self.handler.manager.waitForNodeCleanup(
self.node.external_id)
deleting_node = zk.Node()
deleting_node.provider = self.node.provider
deleting_node.external_id = self.node.external_id
deleting_node.state = zk.DELETING
self.zk.storeNode(deleting_node)
self.log.info(
"Request %s: Node %s scheduled for cleanup",
self.handler.request.id, deleting_node.external_id)
self.node.external_id = None
self.node.public_ipv4 = None
self.node.public_ipv6 = None

View File

@ -1499,6 +1499,43 @@ class TestLauncher(tests.DBTestCase):
while self.zk.countPoolNodes('fake-provider', 'main'):
time.sleep(0)
def test_launchNode_delete_error(self):
'''
Test that the launcher keeps trying to spawn a node in case of a
delete error
'''
fake_client = fakeprovider.FakeLaunchAndDeleteFailCloud(
times_to_fail=1)
def get_fake_client(*args, **kwargs):
return fake_client
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider, '_getClient',
get_fake_client))
configfile = self.setup_config('node_launch_retry.yaml')
self.useBuilder(configfile)
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.cleanup_interval = 60
pool.start()
self.waitForImage('fake-provider', 'fake-image')
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.zk.storeNodeRequest(req)
req = self.waitForNodeRequest(req)
self.assertTrue(fake_client.launch_success)
self.assertTrue(fake_client.delete_success)
self.assertEqual(fake_client.times_to_fail_delete,
fake_client.times_failed_delete)
self.assertEqual(fake_client.times_to_fail_launch,
fake_client.times_failed_launch)
self.assertEqual(req.state, zk.FULFILLED)
self.assertEqual(len(req.nodes), 1)
@mock.patch(
'nodepool.driver.openstack.handler.OpenStackNodeRequestHandler.poll')
def test_handler_poll_session_expired(self, mock_poll):