Merge "Handle ZK session loss during node launch"
This commit is contained in:
commit
b44e8491b3
|
@ -21,6 +21,8 @@ import random
|
|||
import threading
|
||||
import time
|
||||
|
||||
from kazoo import exceptions as kze
|
||||
|
||||
from nodepool import exceptions
|
||||
from nodepool import nodeutils as utils
|
||||
from nodepool import stats
|
||||
|
@ -218,6 +220,10 @@ class NodeLauncher(threading.Thread, stats.StatsReporter):
|
|||
try:
|
||||
self._launchNode()
|
||||
break
|
||||
except kze.SessionExpiredError:
|
||||
# If we lost our ZooKeeper session, we've lost our node lock
|
||||
# so there's no need to continue.
|
||||
raise
|
||||
except Exception as e:
|
||||
if attempts <= self._retries:
|
||||
self.log.exception(
|
||||
|
@ -252,6 +258,16 @@ class NodeLauncher(threading.Thread, stats.StatsReporter):
|
|||
|
||||
try:
|
||||
self._run()
|
||||
except kze.SessionExpiredError:
|
||||
# Our node lock is gone, leaving the node state as BUILDING.
|
||||
# This will get cleaned up in ZooKeeper automatically, but we
|
||||
# must still set our cached node state to FAILED for the
|
||||
# NodeLaunchManager's poll() method.
|
||||
self.log.error(
|
||||
"Lost ZooKeeper session trying to launch for node %s",
|
||||
self._node.id)
|
||||
self._node.state = zk.FAILED
|
||||
statsd_key = 'error.zksession'
|
||||
except Exception as e:
|
||||
self.log.exception("Launch failed for node %s:",
|
||||
self._node.id)
|
||||
|
|
|
@ -17,12 +17,15 @@ import logging
|
|||
import math
|
||||
import time
|
||||
import fixtures
|
||||
import mock
|
||||
|
||||
from nodepool import tests
|
||||
from nodepool import zk
|
||||
from nodepool.driver import Drivers
|
||||
import nodepool.launcher
|
||||
|
||||
from kazoo import exceptions as kze
|
||||
|
||||
|
||||
class TestLauncher(tests.DBTestCase):
|
||||
log = logging.getLogger("nodepool.TestLauncher")
|
||||
|
@ -1277,3 +1280,33 @@ class TestLauncher(tests.DBTestCase):
|
|||
while launchers[0].supported_labels != {'fake-label', 'fake-label2'}:
|
||||
time.sleep(1)
|
||||
launchers = self.zk.getRegisteredLaunchers()
|
||||
|
||||
@mock.patch('nodepool.driver.openstack.handler.NodeLauncher._launchNode')
def test_launchNode_session_expired(self, mock_launch):
    '''
    Test ZK session lost during _launchNode().

    NodeLauncher._launchNode() is patched to raise kazoo's
    SessionExpiredError. The test then submits a node request for
    'fake-label' and checks that:

    * the request reaches the FAILED state,
    * _launchNode() was called exactly once, and
    * the node count for pool 'main' of 'fake-provider' drops to zero.

    :param mock_launch: Mock replacing NodeLauncher._launchNode, injected
        by the mock.patch decorator.
    '''
    mock_launch.side_effect = kze.SessionExpiredError()

    # use a config with min-ready of 0
    configfile = self.setup_config('node_launch_retry.yaml')
    self.useBuilder(configfile)
    pool = self.useNodepool(configfile, watermark_sleep=1)
    pool.cleanup_interval = 60
    pool.start()
    self.waitForImage('fake-provider', 'fake-image')

    req = zk.NodeRequest()
    req.state = zk.REQUESTED
    req.node_types.append('fake-label')
    self.zk.storeNodeRequest(req)

    # A session loss during node launch should at least try to set the
    # request state to FAILED (in a non-test scenario, it may actually
    # be missing).
    req = self.waitForNodeRequest(req, states=(zk.FAILED,))
    # Exactly one call means the failed launch was not attempted again.
    self.assertEqual(1, mock_launch.call_count)

    # Any znodes created for the request should eventually get deleted.
    # Pause briefly between polls instead of spinning with sleep(0), so
    # the loop does not flood ZooKeeper with count queries or steal CPU
    # from the launcher threads doing the cleanup.
    while self.zk.countPoolNodes('fake-provider', 'main'):
        time.sleep(0.1)
|
||||
|
|
Loading…
Reference in New Issue