Brick: Fix race in removing iSCSI device

Some folks are seeing race conditions on overloaded
or slow machines, where the time between issuing the
remove volume command, and the volume being from
/dev/disk/by-path by the kernel isn't long enough.

This patch adds the Cinder retry decorator mechanism.
This patch uses the new retry decorator to try 3 times
to wait for the volume to disappear from the system,
before moving on.

Change-Id: I24cc18d188b8eeb3b7a1392b2bd8b520dc2a0e03
Closes-Bug: 1437441
This commit is contained in:
Walter A. Boring IV 2015-03-27 15:37:19 -07:00
parent 7d341f1543
commit f4ecdf0f60
8 changed files with 279 additions and 36 deletions

View File

@ -98,5 +98,9 @@ class VolumeDeviceNotFound(BrickException):
message = _("Volume device not found at %(device)s.")
class VolumePathNotRemoved(BrickException):
message = _("Volume path %(volume_path)s was not removed in time.")
class ProtocolNotSupported(BrickException):
message = _("Connect to volume via protocol %(protocol)s not supported.")

View File

@ -412,6 +412,13 @@ class ISCSIConnector(InitiatorConnector):
if dev_name:
self._linuxscsi.remove_scsi_device(dev_name)
# NOTE(jdg): On busy systems we can have a race here
# where remove_iscsi_device is called before the device file
# has actually been removed. The result is an orphaned
# iscsi session that never gets logged out. The following
# call to wait addresses that issue.
self._linuxscsi.wait_for_volume_removal(host_device)
# NOTE(vish): Only disconnect from the target if no luns from the
# target are in use.
device_prefix = ("/dev/disk/by-path/ip-%(portal)s-iscsi-%(iqn)s-lun-" %

View File

@ -22,8 +22,10 @@ import re
from oslo_concurrency import processutils as putils
from oslo_log import log as logging
from os_brick import exception
from os_brick import executor
from os_brick.i18n import _LW
from os_brick import utils
LOG = logging.getLogger(__name__)
@ -67,6 +69,19 @@ class LinuxSCSI(executor.Executor):
{'device': device, 'path': path})
self.echo_scsi_command(path, "1")
@utils.retry(exceptions=exception.VolumePathNotRemoved, retries=3,
backoff_rate=1)
def wait_for_volume_removal(self, volume_path):
"""This is used to ensure that volumes are gone."""
LOG.debug("Checking to see if SCSI volume %s has been removed.",
volume_path)
if os.path.exists(volume_path):
LOG.debug("%(path)s still exists.", {'path': volume_path})
raise exception.VolumePathNotRemoved(
volume_path=volume_path)
else:
LOG.debug("SCSI volume %s has been removed.", volume_path)
def get_device_info(self, device):
(out, _err) = self._execute('sg_scan', device, run_as_root=True,
root_helper=self._root_helper)

View File

@ -234,9 +234,13 @@ class ISCSIConnectorTestCase(ConnectorTestCase):
initiator = self.connector.get_initiator()
self.assertEqual(initiator, 'iqn.1234-56.foo.bar:01:23456789abc')
@mock.patch.object(os.path, 'exists', return_value=True)
def _test_connect_volume(self, extra_props, additional_commands,
exists_mock):
disconnect_mock=None):
# for making sure the /dev/disk/by-path is gone
exists_mock = mock.Mock()
exists_mock.return_value = True
os.path.exists = exists_mock
location = '10.0.2.15:3260'
name = 'volume-00000001'
iqn = 'iqn.2010-10.org.openstack:%s' % name
@ -249,29 +253,40 @@ class ISCSIConnectorTestCase(ConnectorTestCase):
self.assertEqual(device['type'], 'block')
self.assertEqual(device['path'], dev_str)
self.connector.disconnect_volume(connection_info['data'], device)
expected_commands = [('iscsiadm -m node -T %s -p %s' %
(iqn, location)),
('iscsiadm -m session'),
('iscsiadm -m node -T %s -p %s --login' %
(iqn, location)),
('iscsiadm -m node -T %s -p %s --op update'
' -n node.startup -v automatic'
% (iqn, location)),
('iscsiadm -m node --rescan'),
('iscsiadm -m session --rescan'),
('blockdev --flushbufs /dev/sdb'),
('tee -a /sys/block/sdb/device/delete'),
('iscsiadm -m node -T %s -p %s --op update'
' -n node.startup -v manual' % (iqn, location)),
('iscsiadm -m node -T %s -p %s --logout' %
(iqn, location)),
('iscsiadm -m node -T %s -p %s --op delete' %
(iqn, location)), ] + additional_commands
LOG.debug("self.cmds = %s" % self.cmds)
LOG.debug("expected = %s" % expected_commands)
self.count = 0
self.assertEqual(expected_commands, self.cmds)
def mock_exists_effect(*args, **kwargs):
self.count = self.count + 1
if self.count == 4:
return False
else:
return True
if disconnect_mock is None:
disconnect_mock = mock_exists_effect
with mock.patch.object(os.path, 'exists',
side_effect=disconnect_mock):
self.connector.disconnect_volume(connection_info['data'], device)
expected_commands = [
('iscsiadm -m node -T %s -p %s' % (iqn, location)),
('iscsiadm -m session'),
('iscsiadm -m node -T %s -p %s --login' % (iqn, location)),
('iscsiadm -m node -T %s -p %s --op update'
' -n node.startup -v automatic' % (iqn, location)),
('iscsiadm -m node --rescan'),
('iscsiadm -m session --rescan'),
('blockdev --flushbufs /dev/sdb'),
('tee -a /sys/block/sdb/device/delete'),
('iscsiadm -m node -T %s -p %s --op update'
' -n node.startup -v manual' % (iqn, location)),
('iscsiadm -m node -T %s -p %s --logout' % (iqn, location)),
('iscsiadm -m node -T %s -p %s --op delete' %
(iqn, location)), ] + additional_commands
LOG.debug("self.cmds = %s", self.cmds)
LOG.debug("expected = %s", expected_commands)
self.assertEqual(expected_commands, self.cmds)
@testtools.skipUnless(os.path.exists('/dev/disk/by-path'),
'Test requires /dev/disk/by-path')
@ -297,7 +312,20 @@ class ISCSIConnectorTestCase(ConnectorTestCase):
(iqn2, location2)),
('iscsiadm -m node -T %s -p %s --op delete' %
(iqn2, location2))]
self._test_connect_volume(extra_props, additional_commands)
def mock_exists_effect(*args, **kwargs):
self.count = self.count + 1
# we have 2 targets in this test, so we need
# to make sure we remove and detect removal
# for both.
if (self.count == 4 or
self.count == 8):
return False
else:
return True
self._test_connect_volume(extra_props, additional_commands,
disconnect_mock=mock_exists_effect)
@testtools.skipUnless(os.path.exists('/dev/disk/by-path'),
'Test requires /dev/disk/by-path')
@ -338,17 +366,19 @@ class ISCSIConnectorTestCase(ConnectorTestCase):
check_exit_code=[0, 255])
mock_iscsiadm.reset_mock()
self.connector.disconnect_volume(connection_info['data'], device)
props = connection_info['data'].copy()
for key in ('target_portals', 'target_iqns', 'target_luns'):
props.pop(key, None)
mock_iscsiadm.assert_any_call(props, ('--logout',),
check_exit_code=[0, 21, 255])
props['target_portal'] = location2
props['target_iqn'] = iqn2
props['target_lun'] = 2
mock_iscsiadm.assert_any_call(props, ('--logout',),
check_exit_code=[0, 21, 255])
with mock.patch.object(os.path, 'exists',
return_value=False):
self.connector.disconnect_volume(connection_info['data'], device)
props = connection_info['data'].copy()
for key in ('target_portals', 'target_iqns', 'target_luns'):
props.pop(key, None)
mock_iscsiadm.assert_any_call(props, ('--logout',),
check_exit_code=[0, 21, 255])
props['target_portal'] = location2
props['target_iqn'] = iqn2
props['target_lun'] = 2
mock_iscsiadm.assert_any_call(props, ('--logout',),
check_exit_code=[0, 21, 255])
@mock.patch.object(connector.ISCSIConnector, '_run_iscsiadm_bare')
@mock.patch.object(connector.ISCSIConnector,

View File

@ -19,6 +19,7 @@ import string
import mock
from oslo_log import log as logging
from os_brick import exception
from os_brick.initiator import linuxscsi
from os_brick.tests import base
@ -67,6 +68,22 @@ class LinuxSCSITestCase(base.TestCase):
('tee -a /sys/block/sdc/device/delete')]
self.assertEqual(expected_commands, self.cmds)
def test_wait_for_volume_removal(self):
fake_path = '/dev/disk/by-path/fake-iscsi-iqn-lun-0'
exists_mock = mock.Mock()
exists_mock.return_value = True
os.path.exists = exists_mock
self.assertRaises(exception.VolumePathNotRemoved,
self.linuxscsi.wait_for_volume_removal,
fake_path)
exists_mock = mock.Mock()
exists_mock.return_value = False
os.path.exists = exists_mock
self.linuxscsi.wait_for_volume_removal(fake_path)
expected_commands = []
self.assertEqual(expected_commands, self.cmds)
def test_flush_multipath_device(self):
self.linuxscsi.flush_multipath_device('/dev/dm-9')
expected_commands = [('multipath -f /dev/dm-9')]

View File

@ -0,0 +1,111 @@
# (c) Copyright 2015 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import time
import mock
from oslo_log import log as logging
from os_brick import exception
from os_brick.tests import base
from os_brick import utils
LOG = logging.getLogger(__name__)
class WrongException(exception.BrickException):
pass
class TestRetryDecorator(base.TestCase):
def test_no_retry_required(self):
self.counter = 0
with mock.patch.object(time, 'sleep') as mock_sleep:
@utils.retry(exceptions=exception.VolumeDeviceNotFound,
interval=2,
retries=3,
backoff_rate=2)
def succeeds():
self.counter += 1
return 'success'
ret = succeeds()
self.assertFalse(mock_sleep.called)
self.assertEqual(ret, 'success')
self.assertEqual(self.counter, 1)
def test_retries_once(self):
self.counter = 0
interval = 2
backoff_rate = 2
retries = 3
with mock.patch.object(time, 'sleep') as mock_sleep:
@utils.retry(exception.VolumeDeviceNotFound,
interval,
retries,
backoff_rate)
def fails_once():
self.counter += 1
if self.counter < 2:
raise exception.VolumeDeviceNotFound(device='fake')
else:
return 'success'
ret = fails_once()
self.assertEqual(ret, 'success')
self.assertEqual(self.counter, 2)
self.assertEqual(mock_sleep.call_count, 1)
mock_sleep.assert_called_with(interval * backoff_rate)
def test_limit_is_reached(self):
self.counter = 0
retries = 3
interval = 2
backoff_rate = 4
with mock.patch.object(time, 'sleep') as mock_sleep:
@utils.retry(exception.VolumeDeviceNotFound,
interval,
retries,
backoff_rate)
def always_fails():
self.counter += 1
raise exception.VolumeDeviceNotFound(device='fake')
self.assertRaises(exception.VolumeDeviceNotFound,
always_fails)
self.assertEqual(retries, self.counter)
expected_sleep_arg = []
for i in xrange(retries):
if i > 0:
interval *= backoff_rate
expected_sleep_arg.append(float(interval))
mock_sleep.assert_has_calls(map(mock.call, expected_sleep_arg))
def test_wrong_exception_no_retry(self):
with mock.patch.object(time, 'sleep') as mock_sleep:
@utils.retry(exceptions=exception.VolumeDeviceNotFound)
def raise_unexpected_error():
raise WrongException("wrong exception")
self.assertRaises(WrongException, raise_unexpected_error)
self.assertFalse(mock_sleep.called)

58
os_brick/utils.py Normal file
View File

@ -0,0 +1,58 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
"""Utilities and helper functions."""
from oslo_log import log as logging
import retrying
import six
from os_brick.i18n import _
LOG = logging.getLogger(__name__)
def retry(exceptions, interval=1, retries=3, backoff_rate=2):
def _retry_on_exception(e):
return isinstance(e, exceptions)
def _backoff_sleep(previous_attempt_number, delay_since_first_attempt_ms):
exp = backoff_rate ** previous_attempt_number
wait_for = max(0, interval * exp)
LOG.debug("Sleeping for %s seconds", wait_for)
return wait_for * 1000.0
def _print_stop(previous_attempt_number, delay_since_first_attempt_ms):
delay_since_first_attempt = delay_since_first_attempt_ms / 1000.0
LOG.debug("Failed attempt %s", previous_attempt_number)
LOG.debug("Have been at this for %s seconds",
delay_since_first_attempt)
return previous_attempt_number == retries
if retries < 1:
raise ValueError(_('Retries must be greater than or '
'equal to 1 (received: %s). ') % retries)
def _decorator(f):
@six.wraps(f)
def _wrapper(*args, **kwargs):
r = retrying.Retrying(retry_on_exception=_retry_on_exception,
wait_func=_backoff_sleep,
stop_func=_print_stop)
return r.call(f, *args, **kwargs)
return _wrapper
return _decorator

View File

@ -10,4 +10,5 @@ oslo.log>=1.0.0,<1.1.0
oslo.serialization>=1.4.0,<1.5.0
oslo.i18n>=1.5.0,<1.6.0
oslo.utils>=1.4.0,<1.5.0
retrying>=1.2.3,!=1.3.0
six>=1.9.0