Fix workaround for the "device is busy" problem

Right after partitioning, we have a sleep(3) to avoid the "device is
busy" problem. A less error-prone solution is to check with fuser,
whether there is any process currently using the disk.

The following config options are introduced under the 'disk_partitioner'
group:
- check_device_interval: seconds between consecutive checks for the
device status
- check_device_max_retries: number of retries for checking the device
status

Change-Id: Icb73db5aeb16d1abc390975c2bd8bf4b486566e5
Implements: blueprint device-status-check-with-fuser
Closes-Bug: #1301975
This commit is contained in:
Imre Farkas 2014-04-24 17:44:36 +02:00
parent 510b967870
commit 5166aab73a
5 changed files with 219 additions and 29 deletions

View File

@ -652,6 +652,25 @@
#db_max_retries=20
[disk_partitioner]
#
# Options defined in ironic.common.disk_partitioner
#
# After Ironic has completed creating the partition table, it
# continues to check for activity on the attached iSCSI device
# status at this interval prior to copying the image to the
# node, in seconds (integer value)
#check_device_interval=1
# The maximum number of times to check that the device is not
# accessed by another process. If the device is still busy
# after that, the disk partitioning will be treated as having
# failed. (integer value)
#check_device_max_retries=20
[glance]
#

View File

@ -13,4 +13,5 @@ mkswap: CommandFilter, mkswap, root
mkfs: CommandFilter, mkfs, root
# ironic/common/disk_partitioner.py
fuser: CommandFilter, fuser, root
parted: CommandFilter, parted, root

View File

@ -14,11 +14,35 @@
# under the License.
import re
import time
from oslo.config import cfg
from ironic.common import exception
from ironic.common import utils
from ironic.openstack.common import log as logging
from ironic.openstack.common import loopingcall
from ironic.openstack.common import processutils
opts = [
cfg.IntOpt('check_device_interval',
default=1,
help='After Ironic has completed creating the partition table, '
'it continues to check for activity on the attached iSCSI '
'device status at this interval prior to copying the image'
' to the node, in seconds'),
cfg.IntOpt('check_device_max_retries',
default=20,
help='The maximum number of times to check that the device is '
'not accessed by another process. If the device is still '
'busy after that, the disk partitioning will be treated as'
' having failed.'),
]
CONF = cfg.CONF
opt_group = cfg.OptGroup(name='disk_partitioner',
title='Options for the disk partitioner')
CONF.register_group(opt_group)
CONF.register_opts(opts, opt_group)
LOG = logging.getLogger(__name__)
@ -41,6 +65,7 @@ class DiskPartitioner(object):
self._disk_label = disk_label
self._alignment = alignment
self._partitions = []
self._fuser_pids_re = re.compile(r'((\d)+\s*)+')
def _exec(self, *args):
# NOTE(lucasagomes): utils.execute() is already a wrapper on top
@ -82,6 +107,30 @@ class DiskPartitioner(object):
"""
return enumerate(self._partitions, 1)
def _wait_for_disk_to_become_available(self, retries, max_retries, pids,
stderr):
retries[0] += 1
if retries[0] > max_retries:
raise loopingcall.LoopingCallDone()
try:
# NOTE(ifarkas): fuser returns a non-zero return code if none of
# the specified files is accessed
out, err = utils.execute('fuser', self._device,
check_exit_code=[0, 1], run_as_root=True)
if not out and not err:
raise loopingcall.LoopingCallDone()
else:
if err:
stderr[0] = err
if out:
pids_match = re.search(self._fuser_pids_re, out)
pids[0] = pids_match.group()
except processutils.ProcessExecutionError as exc:
LOG.warning(_('Failed to check the device %(device)s with fuser: '
' %(err)s') % {'device': self._device, 'err': exc})
def commit(self):
"""Write to the disk."""
cmd_args = ['mklabel', self._disk_label]
@ -97,10 +146,31 @@ class DiskPartitioner(object):
start = end
self._exec(*cmd_args)
# TODO(lucasagomes): Do not sleep, use another mechanism to avoid
# the "device is busy" problem. lsof, fuser...
# avoid "device is busy"
time.sleep(3)
retries = [0]
pids = ['']
fuser_err = ['']
interval = CONF.disk_partitioner.check_device_interval
max_retries = CONF.disk_partitioner.check_device_max_retries
timer = loopingcall.FixedIntervalLoopingCall(
self._wait_for_disk_to_become_available,
retries, max_retries, pids, fuser_err)
timer.start(interval=interval).wait()
if retries[0] > max_retries:
if pids[0]:
raise exception.InstanceDeployFailure(
_('Disk partitioning failed on device %(device)s. '
'Processes with the following PIDs are holding it: '
'%(pids)s. Time out waiting for completion.')
% {'device': self._device, 'pids': pids[0]})
else:
raise exception.InstanceDeployFailure(
_('Disk partitioning failed on device %(device)s. Fuser '
'exited with "%(fuser_err)s". Time out waiting for '
'completion.')
% {'device': self._device, 'fuser_err': fuser_err[0]})
_PARTED_PRINT_RE = re.compile(r"^\d+:([\d\.]+)MiB:"

View File

@ -14,10 +14,8 @@
# License for the specific language governing permissions and limitations
# under the License.
import itertools
import time
import fixtures
import itertools
import mock
import os
import tempfile
@ -490,7 +488,6 @@ class WorkOnDiskTestCase(tests_base.TestCase):
commit=True)
@mock.patch.object(time, 'sleep', lambda _: None)
@mock.patch.object(common_utils, 'execute')
class MakePartitionsTestCase(tests_base.TestCase):
@ -504,13 +501,19 @@ class MakePartitionsTestCase(tests_base.TestCase):
'--', 'unit', 'MiB', 'mklabel', 'msdos']
def test_make_partitions(self, mock_exc):
expected_mkpart = ['mkpart', 'primary', 'linux-swap', '1', '513',
'mkpart', 'primary', '', '513', '1537']
cmd = self.parted_static_cmd + expected_mkpart
mock_exc.return_value = (None, None)
utils.make_partitions(self.dev, self.root_mb, self.swap_mb,
self.ephemeral_mb)
mock_exc.assert_called_once_with(*cmd, run_as_root=True,
check_exit_code=[0])
expected_mkpart = ['mkpart', 'primary', 'linux-swap', '1', '513',
'mkpart', 'primary', '', '513', '1537']
parted_cmd = self.parted_static_cmd + expected_mkpart
parted_call = mock.call(*parted_cmd, run_as_root=True,
check_exit_code=[0])
fuser_cmd = ['fuser', 'fake-dev']
fuser_call = mock.call(*fuser_cmd, run_as_root=True,
check_exit_code=[0, 1])
mock_exc.assert_has_calls([parted_call, fuser_call])
def test_make_partitions_with_ephemeral(self, mock_exc):
self.ephemeral_mb = 2048
@ -518,10 +521,12 @@ class MakePartitionsTestCase(tests_base.TestCase):
'mkpart', 'primary', 'linux-swap', '2049', '2561',
'mkpart', 'primary', '', '2561', '3585']
cmd = self.parted_static_cmd + expected_mkpart
mock_exc.return_value = (None, None)
utils.make_partitions(self.dev, self.root_mb, self.swap_mb,
self.ephemeral_mb)
mock_exc.assert_called_once_with(*cmd, run_as_root=True,
check_exit_code=[0])
parted_call = mock.call(*cmd, run_as_root=True, check_exit_code=[0])
mock_exc.assert_has_calls(parted_call)
@mock.patch.object(utils, 'get_dev_block_size')
@ -592,8 +597,6 @@ class GetDeviceBlockSizeTestCase(tests_base.TestCase):
@mock.patch.object(utils, 'block_uuid', lambda p: 'uuid')
@mock.patch.object(utils, 'dd', lambda *_: None)
@mock.patch.object(common_utils, 'mkfs', lambda *_: None)
# TODO(dtantsur): remove once https://review.openstack.org/90126 lands
@mock.patch.object(time, 'sleep', lambda *_: None)
# NOTE(dtantsur): destroy_disk_metadata resets file size, disabling it
@mock.patch.object(utils, 'destroy_disk_metadata', lambda *_: None)
class RealFilePartitioningTestCase(tests_base.TestCase):
@ -611,8 +614,10 @@ class RealFilePartitioningTestCase(tests_base.TestCase):
common_utils.execute('parted', '--version')
except OSError as exc:
self.skipTest('parted utility was not found: %s' % exc)
self.file = tempfile.NamedTemporaryFile()
self.addCleanup(lambda: self.file.close())
self.file = tempfile.NamedTemporaryFile(delete=False)
# NOTE(ifarkas): the file needs to be closed, so fuser won't report
# any usage
self.file.close()
# NOTE(dtantsur): 20 MiB file with zeros
common_utils.execute('dd', 'if=/dev/zero', 'of=%s' % self.file.name,
'bs=1', 'count=0', 'seek=20MiB')

View File

@ -13,20 +13,27 @@
# License for the specific language governing permissions and limitations
# under the License.
import time
import fixtures
import mock
from testtools.matchers import HasLength
from ironic.common import disk_partitioner
from ironic.common import exception
from ironic.common import utils
from ironic.tests import base
@mock.patch.object(time, 'sleep', lambda _: None)
@mock.patch.object(utils, 'execute', lambda _: None)
class DiskPartitionerTestCase(base.TestCase):
def setUp(self):
super(DiskPartitionerTestCase, self).setUp()
def noop(*args, **kwargs):
pass
self.useFixture(fixtures.MonkeyPatch('eventlet.greenthread.sleep',
noop))
def test_add_partition(self):
dp = disk_partitioner.DiskPartitioner('/dev/fake')
dp.add_partition(1024)
@ -49,7 +56,8 @@ class DiskPartitionerTestCase(base.TestCase):
self.assertEqual(expected, partitions)
@mock.patch.object(disk_partitioner.DiskPartitioner, '_exec')
def test_commit(self, mock_exec):
@mock.patch.object(utils, 'execute')
def test_commit(self, mock_utils_exc, mock_disk_partitioner_exec):
dp = disk_partitioner.DiskPartitioner('/dev/fake')
fake_parts = [(1, {'bootable': False,
'fs_type': 'fake-fs-type',
@ -61,11 +69,98 @@ class DiskPartitionerTestCase(base.TestCase):
'size': 1})]
with mock.patch.object(dp, 'get_partitions') as mock_gp:
mock_gp.return_value = fake_parts
mock_utils_exc.return_value = (None, None)
dp.commit()
mock_exec.assert_called_once_with('mklabel', 'msdos',
'mkpart', 'fake-type', 'fake-fs-type', '1', '2',
'mkpart', 'fake-type', 'fake-fs-type', '2', '3',
'set', '2', 'boot', 'on')
mock_disk_partitioner_exec.assert_called_once_with('mklabel', 'msdos',
'mkpart', 'fake-type', 'fake-fs-type', '1', '2',
'mkpart', 'fake-type', 'fake-fs-type', '2', '3',
'set', '2', 'boot', 'on')
mock_utils_exc.assert_called_once_with('fuser', '/dev/fake',
run_as_root=True, check_exit_code=[0, 1])
@mock.patch.object(disk_partitioner.DiskPartitioner, '_exec')
@mock.patch.object(utils, 'execute')
def test_commit_with_device_is_busy_once(self, mock_utils_exc,
mock_disk_partitioner_exec):
dp = disk_partitioner.DiskPartitioner('/dev/fake')
fake_parts = [(1, {'bootable': False,
'fs_type': 'fake-fs-type',
'type': 'fake-type',
'size': 1}),
(2, {'bootable': True,
'fs_type': 'fake-fs-type',
'type': 'fake-type',
'size': 1})]
fuser_outputs = [("/dev/fake: 10000 10001", None), (None, None)]
with mock.patch.object(dp, 'get_partitions') as mock_gp:
mock_gp.return_value = fake_parts
mock_utils_exc.side_effect = fuser_outputs
dp.commit()
mock_disk_partitioner_exec.assert_called_once_with('mklabel', 'msdos',
'mkpart', 'fake-type', 'fake-fs-type', '1', '2',
'mkpart', 'fake-type', 'fake-fs-type', '2', '3',
'set', '2', 'boot', 'on')
mock_utils_exc.assert_called_with('fuser', '/dev/fake',
run_as_root=True, check_exit_code=[0, 1])
self.assertEqual(2, mock_utils_exc.call_count)
@mock.patch.object(disk_partitioner.DiskPartitioner, '_exec')
@mock.patch.object(utils, 'execute')
def test_commit_with_device_is_always_busy(self, mock_utils_exc,
mock_disk_partitioner_exec):
dp = disk_partitioner.DiskPartitioner('/dev/fake')
fake_parts = [(1, {'bootable': False,
'fs_type': 'fake-fs-type',
'type': 'fake-type',
'size': 1}),
(2, {'bootable': True,
'fs_type': 'fake-fs-type',
'type': 'fake-type',
'size': 1})]
with mock.patch.object(dp, 'get_partitions') as mock_gp:
mock_gp.return_value = fake_parts
mock_utils_exc.return_value = ("/dev/fake: 10000 10001", None)
self.assertRaises(exception.InstanceDeployFailure, dp.commit)
mock_disk_partitioner_exec.assert_called_once_with('mklabel', 'msdos',
'mkpart', 'fake-type', 'fake-fs-type', '1', '2',
'mkpart', 'fake-type', 'fake-fs-type', '2', '3',
'set', '2', 'boot', 'on')
mock_utils_exc.assert_called_with('fuser', '/dev/fake',
run_as_root=True, check_exit_code=[0, 1])
self.assertEqual(20, mock_utils_exc.call_count)
@mock.patch.object(disk_partitioner.DiskPartitioner, '_exec')
@mock.patch.object(utils, 'execute')
def test_commit_with_device_disconnected(self, mock_utils_exc,
mock_disk_partitioner_exec):
dp = disk_partitioner.DiskPartitioner('/dev/fake')
fake_parts = [(1, {'bootable': False,
'fs_type': 'fake-fs-type',
'type': 'fake-type',
'size': 1}),
(2, {'bootable': True,
'fs_type': 'fake-fs-type',
'type': 'fake-type',
'size': 1})]
with mock.patch.object(dp, 'get_partitions') as mock_gp:
mock_gp.return_value = fake_parts
mock_utils_exc.return_value = (None, "Specified filename /dev/fake"
" does not exist.")
self.assertRaises(exception.InstanceDeployFailure, dp.commit)
mock_disk_partitioner_exec.assert_called_once_with('mklabel', 'msdos',
'mkpart', 'fake-type', 'fake-fs-type', '1', '2',
'mkpart', 'fake-type', 'fake-fs-type', '2', '3',
'set', '2', 'boot', 'on')
mock_utils_exc.assert_called_with('fuser', '/dev/fake',
run_as_root=True, check_exit_code=[0, 1])
self.assertEqual(20, mock_utils_exc.call_count)
@mock.patch.object(utils, 'execute')