Merge "Fixes a race condition in the hash ring code"
This commit is contained in:
commit
108801007c
|
@ -16,6 +16,7 @@
|
|||
import threading
|
||||
import time
|
||||
|
||||
from oslo_log import log
|
||||
from tooz import hashring
|
||||
|
||||
from ironic.common import exception
|
||||
|
@ -24,6 +25,9 @@ from ironic.conf import CONF
|
|||
from ironic.db import api as dbapi
|
||||
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
class HashRingManager(object):
|
||||
_hash_rings = None
|
||||
_lock = threading.Lock()
|
||||
|
@ -42,15 +46,20 @@ class HashRingManager(object):
|
|||
if not self.cache:
|
||||
return self._load_hash_rings()
|
||||
|
||||
# Hot path, no lock
|
||||
if self.__class__._hash_rings is not None and self.updated_at >= limit:
|
||||
return self.__class__._hash_rings
|
||||
# Hot path, no lock. Using a local variable to avoid races with code
|
||||
# changing the class variable.
|
||||
hash_rings = self.__class__._hash_rings
|
||||
if hash_rings is not None and self.updated_at >= limit:
|
||||
return hash_rings
|
||||
|
||||
with self._lock:
|
||||
if self.__class__._hash_rings is None or self.updated_at < limit:
|
||||
LOG.debug('Rebuilding cached hash rings')
|
||||
rings = self._load_hash_rings()
|
||||
self.__class__._hash_rings = rings
|
||||
self.updated_at = time.time()
|
||||
LOG.debug('Finished rebuilding hash rings, available drivers '
|
||||
'are %s', ', '.join(rings))
|
||||
return self.__class__._hash_rings
|
||||
|
||||
def _load_hash_rings(self):
|
||||
|
@ -67,9 +76,28 @@ class HashRingManager(object):
|
|||
@classmethod
|
||||
def reset(cls):
|
||||
with cls._lock:
|
||||
LOG.debug('Resetting cached hash rings')
|
||||
cls._hash_rings = None
|
||||
|
||||
def get_ring(self, driver_name, conductor_group):
|
||||
try:
|
||||
return self._get_ring(driver_name, conductor_group)
|
||||
except (exception.DriverNotFound, exception.TemporaryFailure):
|
||||
# NOTE(dtantsur): we assume that this case is more often caused by
|
||||
# conductors coming and leaving, so we try to rebuild the rings.
|
||||
LOG.debug('No conductor from group %(group)s found for driver '
|
||||
'%(driver)s, trying to rebuild the hash rings',
|
||||
{'driver': driver_name,
|
||||
'group': conductor_group or '<none>'})
|
||||
|
||||
self.__class__.reset()
|
||||
return self._get_ring(driver_name, conductor_group)
|
||||
|
||||
def _get_ring(self, driver_name, conductor_group):
|
||||
# There are no conductors, temporary failure - 503 Service Unavailable
|
||||
if not self.ring:
|
||||
raise exception.TemporaryFailure()
|
||||
|
||||
try:
|
||||
if self.use_groups:
|
||||
return self.ring['%s:%s' % (conductor_group, driver_name)]
|
||||
|
|
|
@ -131,12 +131,6 @@ class ConductorAPI(object):
|
|||
:raises: NoValidHost
|
||||
|
||||
"""
|
||||
self.ring_manager.reset()
|
||||
|
||||
# There are no conductors, temporary failure - 503 Service Unavailable
|
||||
if not self.ring_manager.ring:
|
||||
raise exception.TemporaryFailure()
|
||||
|
||||
try:
|
||||
ring = self.ring_manager.get_ring(node.driver,
|
||||
node.conductor_group)
|
||||
|
@ -166,7 +160,12 @@ class ConductorAPI(object):
|
|||
# does not take groups into account.
|
||||
local_ring_manager = hash_ring.HashRingManager(use_groups=False,
|
||||
cache=False)
|
||||
ring = local_ring_manager.get_ring(driver_name, '')
|
||||
try:
|
||||
ring = local_ring_manager.get_ring(driver_name, '')
|
||||
except exception.TemporaryFailure:
|
||||
# NOTE(dtantsur): even if no conductors are registered, it makes
|
||||
# sense to report 404 on any driver request.
|
||||
raise exception.DriverNotFound(_("No conductors registered."))
|
||||
host = random.choice(list(ring.nodes))
|
||||
return self.topic + "." + host
|
||||
|
||||
|
|
|
@ -192,8 +192,10 @@ hash_opts = [
|
|||
'and potentially allow the Ironic cluster to recover '
|
||||
'more quickly if a conductor instance is terminated.')),
|
||||
cfg.IntOpt('hash_ring_reset_interval',
|
||||
default=180,
|
||||
help=_('Interval (in seconds) between hash ring resets.')),
|
||||
default=15,
|
||||
help=_('Time (in seconds) after which the hash ring is '
|
||||
'considered outdated and is refreshed on the next '
|
||||
'access.')),
|
||||
]
|
||||
|
||||
image_opts = [
|
||||
|
|
|
@ -80,29 +80,42 @@ class HashRingManagerTestCase(db_base.DbTestCase):
|
|||
self.ring_manager.get_ring,
|
||||
'driver3', '')
|
||||
|
||||
def test_hash_ring_manager_no_refresh(self):
|
||||
# If a new conductor is registered after the ring manager is
|
||||
# initialized, it won't be seen. Long term this is probably
|
||||
# undesirable, but today is the intended behavior.
|
||||
self.assertRaises(exception.DriverNotFound,
|
||||
def test_hash_ring_manager_automatic_retry(self):
|
||||
self.assertRaises(exception.TemporaryFailure,
|
||||
self.ring_manager.get_ring,
|
||||
'hardware-type', '')
|
||||
self.register_conductors()
|
||||
self.assertRaises(exception.DriverNotFound,
|
||||
self.ring_manager.get_ring,
|
||||
'hardware-type', '')
|
||||
|
||||
def test_hash_ring_manager_refresh(self):
|
||||
CONF.set_override('hash_ring_reset_interval', 30)
|
||||
# Initialize the ring manager to make _hash_rings not None, then
|
||||
# hash ring will refresh only when time interval exceeded.
|
||||
self.assertRaises(exception.DriverNotFound,
|
||||
self.ring_manager.get_ring,
|
||||
'hardware-type', '')
|
||||
self.register_conductors()
|
||||
self.ring_manager.updated_at = time.time() - 31
|
||||
self.ring_manager.get_ring('hardware-type', '')
|
||||
|
||||
def test_hash_ring_manager_reset_interval(self):
|
||||
CONF.set_override('hash_ring_reset_interval', 30)
|
||||
# Just to simplify calculations
|
||||
CONF.set_override('hash_partition_exponent', 0)
|
||||
c1 = self.dbapi.register_conductor({
|
||||
'hostname': 'host1',
|
||||
'drivers': ['driver1', 'driver2'],
|
||||
})
|
||||
c2 = self.dbapi.register_conductor({
|
||||
'hostname': 'host2',
|
||||
'drivers': ['driver1'],
|
||||
})
|
||||
self.dbapi.register_conductor_hardware_interfaces(
|
||||
c1.id, 'hardware-type', 'deploy', ['iscsi', 'direct'], 'iscsi')
|
||||
|
||||
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||
self.assertEqual(1, len(ring))
|
||||
|
||||
self.dbapi.register_conductor_hardware_interfaces(
|
||||
c2.id, 'hardware-type', 'deploy', ['iscsi', 'direct'], 'iscsi')
|
||||
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||
# The new conductor is not known yet. Automatic retry does not kick in,
|
||||
# since there is an active conductor for the requested hardware type.
|
||||
self.assertEqual(1, len(ring))
|
||||
|
||||
self.ring_manager.updated_at = time.time() - 31
|
||||
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||
self.assertEqual(2, len(ring))
|
||||
|
||||
def test_hash_ring_manager_uncached(self):
|
||||
ring_mgr = hash_ring.HashRingManager(cache=False,
|
||||
use_groups=self.use_groups)
|
||||
|
|
|
@ -130,14 +130,16 @@ class RPCAPITestCase(db_base.DbTestCase):
|
|||
|
||||
def test_get_topic_for_driver_unknown_driver(self):
|
||||
CONF.set_override('host', 'fake-host')
|
||||
self.dbapi.register_conductor({
|
||||
c = self.dbapi.register_conductor({
|
||||
'hostname': 'fake-host',
|
||||
'drivers': [],
|
||||
})
|
||||
self.dbapi.register_conductor_hardware_interfaces(
|
||||
c.id, 'fake-driver', 'deploy', ['iscsi', 'direct'], 'iscsi')
|
||||
rpcapi = conductor_rpcapi.ConductorAPI(topic='fake-topic')
|
||||
self.assertRaises(exception.DriverNotFound,
|
||||
rpcapi.get_topic_for_driver,
|
||||
'fake-driver')
|
||||
'fake-driver-2')
|
||||
|
||||
def test_get_topic_for_driver_doesnt_cache(self):
|
||||
CONF.set_override('host', 'fake-host')
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes a race condition in the hash ring implementation that could cause
|
||||
an internal server error on any request. See `story 2003966
|
||||
<https://storyboard.openstack.org/#!/story/2003966>`_ for details.
|
||||
upgrade:
|
||||
- |
|
||||
The ``hash_ring_reset_interval`` configuration option was changed from 180
|
||||
to 15 seconds. Previously, this option was essentially ignored on the API
|
||||
side, becase the hash ring was reset on each API access. The lower value
|
||||
minimizes the probability of a request routed to a wrong conductor when the
|
||||
ring needs rebalancing.
|
Loading…
Reference in New Issue