diff --git a/nodepool/driver/openstack/handler.py b/nodepool/driver/openstack/handler.py index 6550ed494..80a978b4f 100644 --- a/nodepool/driver/openstack/handler.py +++ b/nodepool/driver/openstack/handler.py @@ -21,6 +21,8 @@ import random import threading import time +from kazoo import exceptions as kze + from nodepool import exceptions from nodepool import nodeutils as utils from nodepool import stats @@ -218,6 +220,10 @@ class NodeLauncher(threading.Thread, stats.StatsReporter): try: self._launchNode() break + except kze.SessionExpiredError: + # If we lost our ZooKeeper session, we've lost our node lock + # so there's no need to continue. + raise except Exception as e: if attempts <= self._retries: self.log.exception( @@ -252,6 +258,16 @@ class NodeLauncher(threading.Thread, stats.StatsReporter): try: self._run() + except kze.SessionExpiredError: + # Our node lock is gone, leaving the node state as BUILDING. + # This will get cleaned up in ZooKeeper automatically, but we + # must still set our cached node state to FAILED for the + # NodeLaunchManager's poll() method. + self.log.error( + "Lost ZooKeeper session trying to launch for node %s", + self._node.id) + self._node.state = zk.FAILED + statsd_key = 'error.zksession' except Exception as e: self.log.exception("Launch failed for node %s:", self._node.id) diff --git a/nodepool/tests/test_launcher.py b/nodepool/tests/test_launcher.py index 0b4d67b09..84b1ca901 100644 --- a/nodepool/tests/test_launcher.py +++ b/nodepool/tests/test_launcher.py @@ -17,12 +17,15 @@ import logging import math import time import fixtures +import mock from nodepool import tests from nodepool import zk from nodepool.driver import Drivers import nodepool.launcher +from kazoo import exceptions as kze + class TestLauncher(tests.DBTestCase): log = logging.getLogger("nodepool.TestLauncher") @@ -1277,3 +1280,33 @@ class TestLauncher(tests.DBTestCase): while launchers[0].supported_labels != {'fake-label', 'fake-label2'}: time.sleep(1) launchers = self.zk.getRegisteredLaunchers() + + @mock.patch('nodepool.driver.openstack.handler.NodeLauncher._launchNode') + def test_launchNode_session_expired(self, mock_launch): + ''' + Test ZK session lost during _launchNode(). + ''' + mock_launch.side_effect = kze.SessionExpiredError() + + # use a config with min-ready of 0 + configfile = self.setup_config('node_launch_retry.yaml') + self.useBuilder(configfile) + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.cleanup_interval = 60 + pool.start() + self.waitForImage('fake-provider', 'fake-image') + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + + # A session loss during node launch should at least try to set the + # request state to FAILED (in a non-test scenario, it may actually + # be missing). + req = self.waitForNodeRequest(req, states=(zk.FAILED,)) + self.assertEqual(1, mock_launch.call_count) + + # Any znodes created for the request should eventually get deleted. + while self.zk.countPoolNodes('fake-provider', 'main'): + time.sleep(0)