Merge "Handle ZK session loss during node launch"

This commit is contained in:
Zuul 2018-04-12 18:03:29 +00:00 committed by Gerrit Code Review
commit b44e8491b3
2 changed files with 49 additions and 0 deletions

View File

@ -21,6 +21,8 @@ import random
import threading
import time
from kazoo import exceptions as kze
from nodepool import exceptions
from nodepool import nodeutils as utils
from nodepool import stats
@ -218,6 +220,10 @@ class NodeLauncher(threading.Thread, stats.StatsReporter):
try:
self._launchNode()
break
except kze.SessionExpiredError:
# If we lost our ZooKeeper session, we've lost our node lock
# so there's no need to continue.
raise
except Exception as e:
if attempts <= self._retries:
self.log.exception(
@ -252,6 +258,16 @@ class NodeLauncher(threading.Thread, stats.StatsReporter):
try:
self._run()
except kze.SessionExpiredError:
# Our node lock is gone, leaving the node state as BUILDING.
# This will get cleaned up in ZooKeeper automatically, but we
# must still set our cached node state to FAILED for the
# NodeLaunchManager's poll() method.
self.log.error(
"Lost ZooKeeper session trying to launch for node %s",
self._node.id)
self._node.state = zk.FAILED
statsd_key = 'error.zksession'
except Exception as e:
self.log.exception("Launch failed for node %s:",
self._node.id)

View File

@ -17,12 +17,15 @@ import logging
import math
import time
import fixtures
import mock
from nodepool import tests
from nodepool import zk
from nodepool.driver import Drivers
import nodepool.launcher
from kazoo import exceptions as kze
class TestLauncher(tests.DBTestCase):
log = logging.getLogger("nodepool.TestLauncher")
@ -1277,3 +1280,33 @@ class TestLauncher(tests.DBTestCase):
while launchers[0].supported_labels != {'fake-label', 'fake-label2'}:
time.sleep(1)
launchers = self.zk.getRegisteredLaunchers()
@mock.patch('nodepool.driver.openstack.handler.NodeLauncher._launchNode')
def test_launchNode_session_expired(self, mock_launch):
    '''
    Test ZK session lost during _launchNode().

    _launchNode() is patched to raise kazoo's SessionExpiredError. The
    test then submits a node request and checks that:

    * the request reaches the FAILED state,
    * _launchNode() was called exactly once, and
    * the pool's node count for the provider eventually drops to zero.
    '''
    mock_launch.side_effect = kze.SessionExpiredError()

    # use a config with min-ready of 0
    configfile = self.setup_config('node_launch_retry.yaml')
    self.useBuilder(configfile)
    pool = self.useNodepool(configfile, watermark_sleep=1)
    pool.cleanup_interval = 60
    pool.start()
    self.waitForImage('fake-provider', 'fake-image')

    req = zk.NodeRequest()
    req.state = zk.REQUESTED
    req.node_types.append('fake-label')
    self.zk.storeNodeRequest(req)

    # A session loss during node launch should at least try to set the
    # request state to FAILED (in a non-test scenario, it may actually
    # be missing).
    req = self.waitForNodeRequest(req, states=(zk.FAILED,))
    self.assertEqual(1, mock_launch.call_count)

    # Any znodes created for the request should eventually get deleted.
    # Pause between polls: time.sleep(0) returns almost immediately, so
    # the loop would otherwise call countPoolNodes() back-to-back and
    # spin the CPU until the count reaches zero.
    while self.zk.countPoolNodes('fake-provider', 'main'):
        time.sleep(1)