diff --git a/ironic/common/hash_ring.py b/ironic/common/hash_ring.py
index 9034d69197..d682fcc0d2 100644
--- a/ironic/common/hash_ring.py
+++ b/ironic/common/hash_ring.py
@@ -16,6 +16,7 @@
 import threading
 import time
 
+from oslo_log import log
 from tooz import hashring
 
 from ironic.common import exception
@@ -24,6 +25,9 @@ from ironic.conf import CONF
 from ironic.db import api as dbapi
 
 
+LOG = log.getLogger(__name__)
+
+
 class HashRingManager(object):
     _hash_rings = None
     _lock = threading.Lock()
@@ -42,15 +46,20 @@ class HashRingManager(object):
         if not self.cache:
             return self._load_hash_rings()
 
-        # Hot path, no lock
-        if self.__class__._hash_rings is not None and self.updated_at >= limit:
-            return self.__class__._hash_rings
+        # Hot path, no lock. Using a local variable to avoid races with code
+        # changing the class variable.
+        hash_rings = self.__class__._hash_rings
+        if hash_rings is not None and self.updated_at >= limit:
+            return hash_rings
 
         with self._lock:
             if self.__class__._hash_rings is None or self.updated_at < limit:
+                LOG.debug('Rebuilding cached hash rings')
                 rings = self._load_hash_rings()
                 self.__class__._hash_rings = rings
                 self.updated_at = time.time()
+                LOG.debug('Finished rebuilding hash rings, available drivers '
+                          'are %s', ', '.join(rings))
             return self.__class__._hash_rings
 
     def _load_hash_rings(self):
@@ -67,9 +76,28 @@ class HashRingManager(object):
     @classmethod
     def reset(cls):
         with cls._lock:
+            LOG.debug('Resetting cached hash rings')
             cls._hash_rings = None
 
     def get_ring(self, driver_name, conductor_group):
+        try:
+            return self._get_ring(driver_name, conductor_group)
+        except (exception.DriverNotFound, exception.TemporaryFailure):
+            # NOTE(dtantsur): we assume that this case is more often caused by
+            # conductors coming and leaving, so we try to rebuild the rings.
+            LOG.debug('No conductor from group %(group)s found for driver '
+                      '%(driver)s, trying to rebuild the hash rings',
+                      {'driver': driver_name,
+                       'group': conductor_group or '<none>'})
+
+            self.__class__.reset()
+            return self._get_ring(driver_name, conductor_group)
+
+    def _get_ring(self, driver_name, conductor_group):
+        # There are no conductors, temporary failure - 503 Service Unavailable
+        if not self.ring:
+            raise exception.TemporaryFailure()
+
         try:
             if self.use_groups:
                 return self.ring['%s:%s' % (conductor_group, driver_name)]
diff --git a/ironic/conductor/rpcapi.py b/ironic/conductor/rpcapi.py
index 95f6445a5e..fb9bc70dc9 100644
--- a/ironic/conductor/rpcapi.py
+++ b/ironic/conductor/rpcapi.py
@@ -131,12 +131,6 @@ class ConductorAPI(object):
         :raises: NoValidHost
 
         """
-        self.ring_manager.reset()
-
-        # There are no conductors, temporary failure - 503 Service Unavailable
-        if not self.ring_manager.ring:
-            raise exception.TemporaryFailure()
-
         try:
             ring = self.ring_manager.get_ring(node.driver,
                                               node.conductor_group)
@@ -166,7 +160,12 @@
         # does not take groups into account.
         local_ring_manager = hash_ring.HashRingManager(use_groups=False,
                                                        cache=False)
-        ring = local_ring_manager.get_ring(driver_name, '')
+        try:
+            ring = local_ring_manager.get_ring(driver_name, '')
+        except exception.TemporaryFailure:
+            # NOTE(dtantsur): even if no conductors are registered, it makes
+            # sense to report 404 on any driver request.
+            raise exception.DriverNotFound(_("No conductors registered."))
         host = random.choice(list(ring.nodes))
         return self.topic + "." + host
 
diff --git a/ironic/conf/default.py b/ironic/conf/default.py
index 3b35a93b10..46d1394b7b 100644
--- a/ironic/conf/default.py
+++ b/ironic/conf/default.py
@@ -192,8 +192,10 @@ hash_opts = [
                      'and potentially allow the Ironic cluster to recover '
                      'more quickly if a conductor instance is terminated.')),
     cfg.IntOpt('hash_ring_reset_interval',
-               default=180,
-               help=_('Interval (in seconds) between hash ring resets.')),
+               default=15,
+               help=_('Time (in seconds) after which the hash ring is '
+                      'considered outdated and is refreshed on the next '
+                      'access.')),
 ]
 
 image_opts = [
diff --git a/ironic/tests/unit/common/test_hash_ring.py b/ironic/tests/unit/common/test_hash_ring.py
index c87203cca6..5ad5ee8701 100644
--- a/ironic/tests/unit/common/test_hash_ring.py
+++ b/ironic/tests/unit/common/test_hash_ring.py
@@ -80,29 +80,42 @@ class HashRingManagerTestCase(db_base.DbTestCase):
                           self.ring_manager.get_ring,
                           'driver3', '')
 
-    def test_hash_ring_manager_no_refresh(self):
-        # If a new conductor is registered after the ring manager is
-        # initialized, it won't be seen. Long term this is probably
-        # undesirable, but today is the intended behavior.
-        self.assertRaises(exception.DriverNotFound,
+    def test_hash_ring_manager_automatic_retry(self):
+        self.assertRaises(exception.TemporaryFailure,
                           self.ring_manager.get_ring,
                           'hardware-type', '')
         self.register_conductors()
-        self.assertRaises(exception.DriverNotFound,
-                          self.ring_manager.get_ring,
-                          'hardware-type', '')
-
-    def test_hash_ring_manager_refresh(self):
-        CONF.set_override('hash_ring_reset_interval', 30)
-        # Initialize the ring manager to make _hash_rings not None, then
-        # hash ring will refresh only when time interval exceeded.
-        self.assertRaises(exception.DriverNotFound,
-                          self.ring_manager.get_ring,
-                          'hardware-type', '')
-        self.register_conductors()
-        self.ring_manager.updated_at = time.time() - 31
         self.ring_manager.get_ring('hardware-type', '')
 
+    def test_hash_ring_manager_reset_interval(self):
+        CONF.set_override('hash_ring_reset_interval', 30)
+        # Just to simplify calculations
+        CONF.set_override('hash_partition_exponent', 0)
+        c1 = self.dbapi.register_conductor({
+            'hostname': 'host1',
+            'drivers': ['driver1', 'driver2'],
+        })
+        c2 = self.dbapi.register_conductor({
+            'hostname': 'host2',
+            'drivers': ['driver1'],
+        })
+        self.dbapi.register_conductor_hardware_interfaces(
+            c1.id, 'hardware-type', 'deploy', ['iscsi', 'direct'], 'iscsi')
+
+        ring = self.ring_manager.get_ring('hardware-type', '')
+        self.assertEqual(1, len(ring))
+
+        self.dbapi.register_conductor_hardware_interfaces(
+            c2.id, 'hardware-type', 'deploy', ['iscsi', 'direct'], 'iscsi')
+        ring = self.ring_manager.get_ring('hardware-type', '')
+        # The new conductor is not known yet. Automatic retry does not kick in,
+        # since there is an active conductor for the requested hardware type.
+        self.assertEqual(1, len(ring))
+
+        self.ring_manager.updated_at = time.time() - 31
+        ring = self.ring_manager.get_ring('hardware-type', '')
+        self.assertEqual(2, len(ring))
+
     def test_hash_ring_manager_uncached(self):
         ring_mgr = hash_ring.HashRingManager(cache=False,
                                              use_groups=self.use_groups)
diff --git a/ironic/tests/unit/conductor/test_rpcapi.py b/ironic/tests/unit/conductor/test_rpcapi.py
index 32302848a6..9749c4c924 100644
--- a/ironic/tests/unit/conductor/test_rpcapi.py
+++ b/ironic/tests/unit/conductor/test_rpcapi.py
@@ -130,14 +130,16 @@ class RPCAPITestCase(db_base.DbTestCase):
 
     def test_get_topic_for_driver_unknown_driver(self):
         CONF.set_override('host', 'fake-host')
-        self.dbapi.register_conductor({
+        c = self.dbapi.register_conductor({
             'hostname': 'fake-host',
             'drivers': [],
         })
+        self.dbapi.register_conductor_hardware_interfaces(
+            c.id, 'fake-driver', 'deploy', ['iscsi', 'direct'], 'iscsi')
         rpcapi = conductor_rpcapi.ConductorAPI(topic='fake-topic')
         self.assertRaises(exception.DriverNotFound,
                           rpcapi.get_topic_for_driver,
-                          'fake-driver')
+                          'fake-driver-2')
 
     def test_get_topic_for_driver_doesnt_cache(self):
         CONF.set_override('host', 'fake-host')
diff --git a/releasenotes/notes/hash-ring-race-da0d584de1f46788.yaml b/releasenotes/notes/hash-ring-race-da0d584de1f46788.yaml
new file mode 100644
index 0000000000..b47b1959e2
--- /dev/null
+++ b/releasenotes/notes/hash-ring-race-da0d584de1f46788.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    Fixes a race condition in the hash ring implementation that could cause
+    an internal server error on any request. See `story 2003966
+    <https://storyboard.openstack.org/#!/story/2003966>`_ for details.
+upgrade:
+  - |
+    The ``hash_ring_reset_interval`` configuration option was changed from
+    180 to 15 seconds. Previously, this option was essentially ignored on
+    the API side, because the hash ring was reset on each API access. The
+    lower value minimizes the probability of a request being routed to the
+    wrong conductor when the ring needs rebalancing.
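
The core of the change in hash_ring.py is a double-checked locking cache with a
time-based expiry: the bug was that the unlocked hot path read
`self.__class__._hash_rings` twice, so a concurrent `reset()` could set it to
None between the check and the return. The sketch below reproduces the
corrected pattern outside of ironic; the names `RingCache` and `load_rings` are
illustrative stand-ins, not part of the ironic code base.

# Minimal standalone sketch of the caching pattern used by HashRingManager.
import threading
import time


def load_rings():
    # Stand-in for HashRingManager._load_hash_rings(), which builds one hash
    # ring per driver (or per group:driver pair) from the database.
    return {'hardware-type': object()}


class RingCache(object):
    _rings = None                  # shared cache, may be reset concurrently
    _lock = threading.Lock()

    def __init__(self, reset_interval=15):
        self.reset_interval = reset_interval
        self.updated_at = time.time()

    @property
    def rings(self):
        limit = time.time() - self.reset_interval

        # Hot path, no lock: copy the class attribute into a local first.
        # Checking self.__class__._rings and then returning it again would
        # race with reset() setting it to None in between; that is the bug
        # the patch fixes.
        rings = self.__class__._rings
        if rings is not None and self.updated_at >= limit:
            return rings

        with self._lock:
            # Re-check under the lock: another thread may have rebuilt the
            # cache while this one was waiting.
            if self.__class__._rings is None or self.updated_at < limit:
                self.__class__._rings = load_rings()
                self.updated_at = time.time()
            return self.__class__._rings

    @classmethod
    def reset(cls):
        with cls._lock:
            cls._rings = None

The retry in get_ring() follows the same reasoning: a miss is most often a
stale cache (a conductor joined or left), so one reset-and-retry is attempted
before the error is propagated to the caller.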