NVMe-oF: Fix attach when reconnecting
When an nvme subsystem has all portals in connecting state and we try to attach a new volume to that same subsystem it will fail. We can reproduce it with LVM+nvmet if we configure it to share targets and then: - Create instance - Attach 2 volumes - Delete instance (this leaves the subsystem in connecting state [1]) - Create instance - Attach volume <== FAILS The problem comes from the '_connect_target' method that ignores subsystems in 'connecting' state, so if they are all in that state it considers it equivalent to all portals being inaccessible. This patch changes this behavior and if we cannot connect to a target but we have portals in 'connecting' state we wait for the next retry of the nvme linux driver. Specifically we wait 10 more seconds that the interval between retries. [1]: https://bugs.launchpad.net/nova/+bug/2035375 Closes-Bug: #2035695 Change-Id: Ife710f52c339d67f2dcb160c20ad0d75480a1f48
This commit is contained in:
parent
aa83cc22e5
commit
ec22c32de6
|
@ -133,6 +133,9 @@ class Portal(object):
|
|||
"""Representation of an NVMe-oF Portal with some related operations."""
|
||||
LIVE = 'live'
|
||||
MISSING = None # Unkown or not present in the system
|
||||
CONNECTING = 'connecting'
|
||||
# Default value of reconnect_delay in sysfs
|
||||
DEFAULT_RECONNECT_DELAY = 10
|
||||
controller: Optional[str] = None # Don't know controller name on start
|
||||
|
||||
def __str__(self) -> str:
|
||||
|
@ -176,6 +179,15 @@ class Portal(object):
|
|||
return ctrl_property('state', self.controller)
|
||||
return None
|
||||
|
||||
@property
|
||||
def reconnect_delay(self) -> int:
|
||||
# 10 seconds is the default value of reconnect_delay
|
||||
if self.controller:
|
||||
res = ctrl_property('reconnect_delay', self.controller)
|
||||
if res is not None:
|
||||
return int(res)
|
||||
return self.DEFAULT_RECONNECT_DELAY
|
||||
|
||||
def get_device(self) -> Optional[str]:
|
||||
"""Get a device path using available volume identification markers.
|
||||
|
||||
|
@ -683,6 +695,8 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
|
||||
# Use a class attribute since a restart is needed to change it on the host
|
||||
native_multipath_supported = None
|
||||
# Time we think is more than reasonable to establish an NVMe-oF connection
|
||||
TIME_TO_CONNECT = 10
|
||||
|
||||
def __init__(self,
|
||||
root_helper: str,
|
||||
|
@ -861,6 +875,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
"""
|
||||
connected = False
|
||||
missing_portals = []
|
||||
reconnecting_portals = []
|
||||
|
||||
for portal in target.portals:
|
||||
state = portal.state # store it so we read only once from sysfs
|
||||
|
@ -871,6 +886,9 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
# Remember portals that are not present in the system
|
||||
elif state == portal.MISSING:
|
||||
missing_portals.append(portal)
|
||||
elif state == portal.CONNECTING:
|
||||
LOG.debug('%s is reconnecting', portal)
|
||||
reconnecting_portals.append(portal)
|
||||
# Ignore reconnecting/dead portals
|
||||
else:
|
||||
LOG.debug('%s exists but is %s', portal, state)
|
||||
|
@ -905,11 +923,32 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
LOG.warning('Race condition with some other application '
|
||||
'when connecting to %s, please check your '
|
||||
'system configuration.', portal)
|
||||
connected = True
|
||||
state = portal.state
|
||||
if state == portal.LIVE:
|
||||
connected = True
|
||||
elif state == portal.CONNECTING:
|
||||
reconnecting_portals.append(portal)
|
||||
else:
|
||||
LOG.error('Ignoring %s due to unknown state (%s)',
|
||||
portal, state)
|
||||
|
||||
if not do_multipath:
|
||||
break # We are connected
|
||||
|
||||
if not connected and reconnecting_portals:
|
||||
delay = self.TIME_TO_CONNECT + max(p.reconnect_delay
|
||||
for p in reconnecting_portals)
|
||||
LOG.debug('Waiting %s seconds for some nvme controllers to '
|
||||
'reconnect', delay)
|
||||
timeout = time.time() + delay
|
||||
while time.time() < timeout:
|
||||
time.sleep(1)
|
||||
if any(p.is_live for p in reconnecting_portals):
|
||||
LOG.debug('Reconnected')
|
||||
connected = True
|
||||
break
|
||||
LOG.debug('No controller reconnected')
|
||||
|
||||
if not connected:
|
||||
raise exception.VolumeDeviceNotFound(device=target.nqn)
|
||||
|
||||
|
|
|
@ -160,6 +160,13 @@ class PortalTestCase(test_base.TestCase):
|
|||
self.assertIs(expected, self.portal.is_live)
|
||||
mock_state.assert_called_once_with()
|
||||
|
||||
@mock.patch.object(nvmeof, 'ctrl_property', return_value='10')
|
||||
def test_reconnect_delay(self, mock_property):
|
||||
"""Reconnect delay returns an int."""
|
||||
self.portal.controller = 'nvme0'
|
||||
self.assertIs(10, self.portal.reconnect_delay)
|
||||
mock_property.assert_called_once_with('reconnect_delay', 'nvme0')
|
||||
|
||||
@mock.patch.object(nvmeof, 'ctrl_property')
|
||||
def test_state(self, mock_property):
|
||||
"""State uses sysfs to check the value."""
|
||||
|
@ -1360,6 +1367,11 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
|
|||
else:
|
||||
mock_cli.assert_not_called()
|
||||
|
||||
@mock.patch('time.time', side_effect=[0, 1, 20] * 3)
|
||||
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
|
||||
new_callable=mock.PropertyMock, return_value=10)
|
||||
@mock.patch.object(nvmeof.Portal, 'is_live',
|
||||
new_callable=mock.PropertyMock, return_value=False)
|
||||
@mock.patch.object(nvmeof.Target, 'find_device')
|
||||
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, 'run_nvme_cli')
|
||||
|
@ -1367,7 +1379,7 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
|
|||
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock)
|
||||
def test__connect_target_portals_down(
|
||||
self, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
|
||||
mock_find_dev):
|
||||
mock_find_dev, mock_is_live, mock_delay, mock_time):
|
||||
"""Test connect target has all portal connections down."""
|
||||
retries = 3
|
||||
mock_state.side_effect = retries * 3 * ['connecting']
|
||||
|
@ -1377,12 +1389,19 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
|
|||
self.conn_props.targets[0])
|
||||
|
||||
self.assertEqual(retries * 3, mock_state.call_count)
|
||||
self.assertEqual(retries * 3, mock_is_live.call_count)
|
||||
self.assertEqual(retries * 3, mock_delay.call_count)
|
||||
mock_state.assert_has_calls(retries * 3 * [mock.call()])
|
||||
mock_rescan.assert_not_called()
|
||||
mock_set_ctrls.assert_not_called()
|
||||
mock_find_dev.assert_not_called()
|
||||
mock_cli.assert_not_called()
|
||||
|
||||
@mock.patch('time.time', side_effect=[0, 1, 20] * 3)
|
||||
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
|
||||
new_callable=mock.PropertyMock, return_value=10)
|
||||
@mock.patch.object(nvmeof.Portal, 'is_live',
|
||||
new_callable=mock.PropertyMock, return_value=False)
|
||||
@mock.patch.object(nvmeof.LOG, 'error')
|
||||
@mock.patch.object(nvmeof.Target, 'find_device')
|
||||
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
|
||||
|
@ -1391,7 +1410,7 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
|
|||
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock)
|
||||
def test__connect_target_no_portals_connect(
|
||||
self, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
|
||||
mock_find_dev, mock_log):
|
||||
mock_find_dev, mock_log, mock_is_live, mock_delay, mock_time):
|
||||
"""Test connect target when fails to connect to any portal."""
|
||||
retries = 3
|
||||
mock_state.side_effect = retries * ['connecting', 'connecting', None]
|
||||
|
@ -1414,6 +1433,9 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
|
|||
retries * [mock.call(['connect', '-a', portal.address,
|
||||
'-s', portal.port, '-t', portal.transport,
|
||||
'-n', target.nqn, '-Q', '128', '-l', '-1'])])
|
||||
# There are 2 in connecting state
|
||||
self.assertEqual(retries * 2, mock_is_live.call_count)
|
||||
self.assertEqual(retries * 2, mock_delay.call_count)
|
||||
|
||||
@mock.patch.object(nvmeof.Target, 'find_device')
|
||||
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
|
||||
|
@ -1490,7 +1512,7 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
|
|||
self, exit_code, mock_state, mock_rescan, mock_cli,
|
||||
mock_set_ctrls, mock_find_dev, mock_log):
|
||||
"""Treat race condition with sysadmin as success."""
|
||||
mock_state.side_effect = ['connecting', 'connecting', None]
|
||||
mock_state.side_effect = ['connecting', 'connecting', None, 'live']
|
||||
dev_path = '/dev/nvme0n1'
|
||||
mock_find_dev.return_value = dev_path
|
||||
mock_cli.side_effect = putils.ProcessExecutionError(
|
||||
|
@ -1501,8 +1523,8 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
|
|||
res = self.connector._connect_target(target)
|
||||
self.assertEqual(dev_path, res)
|
||||
|
||||
self.assertEqual(3, mock_state.call_count)
|
||||
mock_state.assert_has_calls(3 * [mock.call()])
|
||||
self.assertEqual(4, mock_state.call_count)
|
||||
mock_state.assert_has_calls(4 * [mock.call()])
|
||||
mock_rescan.assert_not_called()
|
||||
mock_set_ctrls.assert_called_once_with()
|
||||
mock_find_dev.assert_called_once_with()
|
||||
|
@ -1513,6 +1535,126 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
|
|||
portal.transport, '-n', target.nqn, '-Q', '128', '-l', '-1'])
|
||||
self.assertEqual(1, mock_log.call_count)
|
||||
|
||||
@ddt.data(70, errno.EALREADY)
|
||||
@mock.patch.object(nvmeof.LOG, 'warning')
|
||||
@mock.patch('time.sleep')
|
||||
@mock.patch('time.time', side_effect=[0, 0.1, 0.6])
|
||||
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
|
||||
new_callable=mock.PropertyMock, return_value=10)
|
||||
@mock.patch.object(nvmeof.Portal, 'is_live',
|
||||
new_callable=mock.PropertyMock)
|
||||
@mock.patch.object(nvmeof.Target, 'find_device')
|
||||
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, 'run_nvme_cli')
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, 'rescan')
|
||||
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock)
|
||||
def test__connect_target_race_connecting(
|
||||
self, exit_code, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
|
||||
mock_find_dev, mock_is_live, mock_delay, mock_time, mock_sleep,
|
||||
mock_log):
|
||||
"""Test connect target when portal is reconnecting after race."""
|
||||
mock_cli.side_effect = putils.ProcessExecutionError(
|
||||
exit_code=exit_code)
|
||||
mock_state.side_effect = ['connecting', 'connecting', None,
|
||||
'connecting']
|
||||
mock_is_live.side_effect = [False, False, False, False, True]
|
||||
|
||||
target = self.conn_props.targets[0]
|
||||
res = self.connector._connect_target(target)
|
||||
|
||||
self.assertEqual(mock_find_dev.return_value, res)
|
||||
self.assertEqual(4, mock_state.call_count)
|
||||
self.assertEqual(5, mock_is_live.call_count)
|
||||
self.assertEqual(3, mock_delay.call_count)
|
||||
self.assertEqual(2, mock_sleep.call_count)
|
||||
mock_sleep.assert_has_calls(2 * [mock.call(1)])
|
||||
mock_rescan.assert_not_called()
|
||||
mock_set_ctrls.assert_called_once()
|
||||
mock_find_dev.assert_called_once()
|
||||
portal = target.portals[-1]
|
||||
mock_cli.assert_called_once_with([
|
||||
'connect', '-a', portal.address, '-s', portal.port, '-t',
|
||||
portal.transport, '-n', target.nqn, '-Q', '128', '-l', '-1'])
|
||||
self.assertEqual(1, mock_log.call_count)
|
||||
|
||||
@ddt.data(70, errno.EALREADY)
|
||||
@mock.patch.object(nvmeof.LOG, 'warning')
|
||||
@mock.patch.object(nvmeof.LOG, 'error')
|
||||
@mock.patch('time.sleep')
|
||||
@mock.patch('time.time', side_effect=[0, 0.1, 0.6])
|
||||
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
|
||||
new_callable=mock.PropertyMock, return_value=10)
|
||||
@mock.patch.object(nvmeof.Portal, 'is_live',
|
||||
new_callable=mock.PropertyMock)
|
||||
@mock.patch.object(nvmeof.Target, 'find_device')
|
||||
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, 'run_nvme_cli')
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, 'rescan')
|
||||
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock)
|
||||
def test__connect_target_race_unknown(
|
||||
self, exit_code, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
|
||||
mock_find_dev, mock_is_live, mock_delay, mock_time, mock_sleep,
|
||||
mock_log_err, mock_log_warn):
|
||||
"""Test connect target when portal is unknown after race."""
|
||||
mock_cli.side_effect = putils.ProcessExecutionError(
|
||||
exit_code=exit_code)
|
||||
mock_state.side_effect = ['connecting', 'connecting', None,
|
||||
'unknown']
|
||||
mock_is_live.side_effect = [False, False, False, True]
|
||||
|
||||
target = self.conn_props.targets[0]
|
||||
res = self.connector._connect_target(target)
|
||||
|
||||
self.assertEqual(mock_find_dev.return_value, res)
|
||||
self.assertEqual(4, mock_state.call_count)
|
||||
self.assertEqual(4, mock_is_live.call_count)
|
||||
self.assertEqual(2, mock_delay.call_count)
|
||||
self.assertEqual(2, mock_sleep.call_count)
|
||||
mock_sleep.assert_has_calls(2 * [mock.call(1)])
|
||||
mock_rescan.assert_not_called()
|
||||
mock_set_ctrls.assert_called_once()
|
||||
mock_find_dev.assert_called_once()
|
||||
portal = target.portals[-1]
|
||||
mock_cli.assert_called_once_with([
|
||||
'connect', '-a', portal.address, '-s', portal.port, '-t',
|
||||
portal.transport, '-n', target.nqn, '-Q', '128', '-l', '-1'])
|
||||
self.assertEqual(1, mock_log_err.call_count)
|
||||
self.assertEqual(1, mock_log_warn.call_count)
|
||||
|
||||
@mock.patch('time.sleep')
|
||||
@mock.patch('time.time', side_effect=[0, 0.1, 0.6])
|
||||
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
|
||||
new_callable=mock.PropertyMock, return_value=10)
|
||||
@mock.patch.object(nvmeof.Portal, 'is_live',
|
||||
new_callable=mock.PropertyMock)
|
||||
@mock.patch.object(nvmeof.Target, 'find_device')
|
||||
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, 'run_nvme_cli')
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, 'rescan')
|
||||
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock,
|
||||
return_value='connecting')
|
||||
def test__connect_target_portals_connecting(
|
||||
self, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
|
||||
mock_find_dev, mock_is_live, mock_delay, mock_time, mock_sleep):
|
||||
"""Test connect target when portals reconnect."""
|
||||
# First pass everything connecting, second pass the second portal is up
|
||||
mock_is_live.side_effect = [False, False, False, False, True]
|
||||
|
||||
# Connecting state changing after 2 sleeps
|
||||
target = self.conn_props.targets[0]
|
||||
res = self.connector._connect_target(target)
|
||||
|
||||
self.assertEqual(mock_find_dev.return_value, res)
|
||||
self.assertEqual(3, mock_state.call_count)
|
||||
self.assertEqual(5, mock_is_live.call_count)
|
||||
self.assertEqual(3, mock_delay.call_count)
|
||||
self.assertEqual(2, mock_sleep.call_count)
|
||||
mock_sleep.assert_has_calls(2 * [mock.call(1)])
|
||||
mock_rescan.assert_not_called()
|
||||
mock_set_ctrls.assert_called_once()
|
||||
mock_find_dev.assert_called_once()
|
||||
mock_cli.assert_not_called()
|
||||
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, 'stop_and_assemble_raid')
|
||||
@mock.patch.object(nvmeof.NVMeOFConnector, '_is_device_in_raid')
|
||||
def test_handle_replicated_volume_existing(
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
NVMe-oF connector `bug #2035695
|
||||
<https://bugs.launchpad.net/os-brick/+bug/2035695>`_: Fixed
|
||||
attaching volumes when all portals of a subsystem are reconnecting at the
|
||||
time of the request.
|
Loading…
Reference in New Issue