Merge "Fixes a race condition in the hash ring code"

This commit is contained in:
Zuul 2018-10-06 00:31:32 +00:00 committed by Gerrit Code Review
commit 108801007c
6 changed files with 89 additions and 32 deletions

View File

@ -16,6 +16,7 @@
import threading
import time
from oslo_log import log
from tooz import hashring
from ironic.common import exception
@ -24,6 +25,9 @@ from ironic.conf import CONF
from ironic.db import api as dbapi
LOG = log.getLogger(__name__)
class HashRingManager(object):
_hash_rings = None
_lock = threading.Lock()
@ -42,15 +46,20 @@ class HashRingManager(object):
if not self.cache:
return self._load_hash_rings()
# Hot path, no lock
if self.__class__._hash_rings is not None and self.updated_at >= limit:
return self.__class__._hash_rings
# Hot path, no lock. Using a local variable to avoid races with code
# changing the class variable.
hash_rings = self.__class__._hash_rings
if hash_rings is not None and self.updated_at >= limit:
return hash_rings
with self._lock:
if self.__class__._hash_rings is None or self.updated_at < limit:
LOG.debug('Rebuilding cached hash rings')
rings = self._load_hash_rings()
self.__class__._hash_rings = rings
self.updated_at = time.time()
LOG.debug('Finished rebuilding hash rings, available drivers '
'are %s', ', '.join(rings))
return self.__class__._hash_rings
def _load_hash_rings(self):
@ -67,9 +76,28 @@ class HashRingManager(object):
@classmethod
def reset(cls):
with cls._lock:
LOG.debug('Resetting cached hash rings')
cls._hash_rings = None
def get_ring(self, driver_name, conductor_group):
try:
return self._get_ring(driver_name, conductor_group)
except (exception.DriverNotFound, exception.TemporaryFailure):
# NOTE(dtantsur): we assume that this case is more often caused by
# conductors coming and leaving, so we try to rebuild the rings.
LOG.debug('No conductor from group %(group)s found for driver '
'%(driver)s, trying to rebuild the hash rings',
{'driver': driver_name,
'group': conductor_group or '<none>'})
self.__class__.reset()
return self._get_ring(driver_name, conductor_group)
def _get_ring(self, driver_name, conductor_group):
# There are no conductors, temporary failure - 503 Service Unavailable
if not self.ring:
raise exception.TemporaryFailure()
try:
if self.use_groups:
return self.ring['%s:%s' % (conductor_group, driver_name)]

View File

@ -131,12 +131,6 @@ class ConductorAPI(object):
:raises: NoValidHost
"""
self.ring_manager.reset()
# There are no conductors, temporary failure - 503 Service Unavailable
if not self.ring_manager.ring:
raise exception.TemporaryFailure()
try:
ring = self.ring_manager.get_ring(node.driver,
node.conductor_group)
@ -166,7 +160,12 @@ class ConductorAPI(object):
# does not take groups into account.
local_ring_manager = hash_ring.HashRingManager(use_groups=False,
cache=False)
ring = local_ring_manager.get_ring(driver_name, '')
try:
ring = local_ring_manager.get_ring(driver_name, '')
except exception.TemporaryFailure:
# NOTE(dtantsur): even if no conductors are registered, it makes
# sense to report 404 on any driver request.
raise exception.DriverNotFound(_("No conductors registered."))
host = random.choice(list(ring.nodes))
return self.topic + "." + host

View File

@ -192,8 +192,10 @@ hash_opts = [
'and potentially allow the Ironic cluster to recover '
'more quickly if a conductor instance is terminated.')),
cfg.IntOpt('hash_ring_reset_interval',
default=180,
help=_('Interval (in seconds) between hash ring resets.')),
default=15,
help=_('Time (in seconds) after which the hash ring is '
'considered outdated and is refreshed on the next '
'access.')),
]
image_opts = [

View File

@ -80,29 +80,42 @@ class HashRingManagerTestCase(db_base.DbTestCase):
self.ring_manager.get_ring,
'driver3', '')
def test_hash_ring_manager_no_refresh(self):
# If a new conductor is registered after the ring manager is
# initialized, it won't be seen. Long term this is probably
# undesirable, but today is the intended behavior.
self.assertRaises(exception.DriverNotFound,
def test_hash_ring_manager_automatic_retry(self):
self.assertRaises(exception.TemporaryFailure,
self.ring_manager.get_ring,
'hardware-type', '')
self.register_conductors()
self.assertRaises(exception.DriverNotFound,
self.ring_manager.get_ring,
'hardware-type', '')
def test_hash_ring_manager_refresh(self):
CONF.set_override('hash_ring_reset_interval', 30)
# Initialize the ring manager to make _hash_rings not None, then
# hash ring will refresh only when time interval exceeded.
self.assertRaises(exception.DriverNotFound,
self.ring_manager.get_ring,
'hardware-type', '')
self.register_conductors()
self.ring_manager.updated_at = time.time() - 31
self.ring_manager.get_ring('hardware-type', '')
def test_hash_ring_manager_reset_interval(self):
CONF.set_override('hash_ring_reset_interval', 30)
# Just to simplify calculations
CONF.set_override('hash_partition_exponent', 0)
c1 = self.dbapi.register_conductor({
'hostname': 'host1',
'drivers': ['driver1', 'driver2'],
})
c2 = self.dbapi.register_conductor({
'hostname': 'host2',
'drivers': ['driver1'],
})
self.dbapi.register_conductor_hardware_interfaces(
c1.id, 'hardware-type', 'deploy', ['iscsi', 'direct'], 'iscsi')
ring = self.ring_manager.get_ring('hardware-type', '')
self.assertEqual(1, len(ring))
self.dbapi.register_conductor_hardware_interfaces(
c2.id, 'hardware-type', 'deploy', ['iscsi', 'direct'], 'iscsi')
ring = self.ring_manager.get_ring('hardware-type', '')
# The new conductor is not known yet. Automatic retry does not kick in,
# since there is an active conductor for the requested hardware type.
self.assertEqual(1, len(ring))
self.ring_manager.updated_at = time.time() - 31
ring = self.ring_manager.get_ring('hardware-type', '')
self.assertEqual(2, len(ring))
def test_hash_ring_manager_uncached(self):
ring_mgr = hash_ring.HashRingManager(cache=False,
use_groups=self.use_groups)

View File

@ -130,14 +130,16 @@ class RPCAPITestCase(db_base.DbTestCase):
def test_get_topic_for_driver_unknown_driver(self):
CONF.set_override('host', 'fake-host')
self.dbapi.register_conductor({
c = self.dbapi.register_conductor({
'hostname': 'fake-host',
'drivers': [],
})
self.dbapi.register_conductor_hardware_interfaces(
c.id, 'fake-driver', 'deploy', ['iscsi', 'direct'], 'iscsi')
rpcapi = conductor_rpcapi.ConductorAPI(topic='fake-topic')
self.assertRaises(exception.DriverNotFound,
rpcapi.get_topic_for_driver,
'fake-driver')
'fake-driver-2')
def test_get_topic_for_driver_doesnt_cache(self):
CONF.set_override('host', 'fake-host')

View File

@ -0,0 +1,13 @@
---
fixes:
- |
Fixes a race condition in the hash ring implementation that could cause
an internal server error on any request. See `story 2003966
<https://storyboard.openstack.org/#!/story/2003966>`_ for details.
upgrade:
- |
The ``hash_ring_reset_interval`` configuration option was changed from 180
to 15 seconds. Previously, this option was essentially ignored on the API
side, becase the hash ring was reset on each API access. The lower value
minimizes the probability of a request routed to a wrong conductor when the
ring needs rebalancing.