rbd_utils: increase _destroy_volume timeout
If the RBD backend is used for Nova ephemeral storage, Nova tries to remove the ephemeral storage volume from Ceph in a retry loop: 10 attempts at 1 second intervals, 10 seconds overall. Due to the 30 second Ceph watcher timeout, this can result in intermittent volume removal failures on the Ceph side.

This patch adds the params rbd_destroy_volume_retries, defaulting to 12, and rbd_destroy_volume_retry_interval, defaulting to 5, which, multiplied together, give Ceph a reasonable amount of time to complete the operation successfully.

Closes-Bug: #1856845
Change-Id: Icfd55617f0126f79d9610f8a2fc6b4c817d1a2bd
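The arithmetic behind the new defaults: each attempt is followed by a fixed sleep, so the total removal budget is retries × interval. The old budget, 10 × 1 s = 10 s, expires well inside the 30 second watcher window; the new defaults give 12 × 5 s = 60 s, twice the watcher timeout.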
parent 382d9b2336 · commit 6458c3dba5
@@ -1095,6 +1095,27 @@ The libvirt UUID of the secret for the rbd_user volumes.
                default=5,
                help="""
 The RADOS client timeout in seconds when initially connecting to the cluster.
 """),
+    cfg.IntOpt('rbd_destroy_volume_retry_interval',
+               default=5,
+               min=0,
+               help="""
+Number of seconds to wait between each consecutive retry to destroy a
+RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
+"""),
+    cfg.IntOpt('rbd_destroy_volume_retries',
+               default=12,
+               min=0,
+               help="""
+Number of retries to destroy a RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
+"""),
 ]
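Operators who need a different budget can override the new options in nova.conf; a hypothetical tuning (values illustrative only, not a recommendation):

    [libvirt]
    images_type = rbd
    rbd_destroy_volume_retries = 24
    rbd_destroy_volume_retry_interval = 5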
@ -435,8 +435,8 @@ class RbdTestCase(test.NoDBTestCase):
|
||||
self.driver.cleanup_volumes(filter_fn)
|
||||
rbd.remove.assert_any_call(client.__enter__.return_value.ioctx,
|
||||
'%s_test' % uuids.instance)
|
||||
# NOTE(danms): 10 retries + 1 final attempt to propagate = 11
|
||||
self.assertEqual(11, len(rbd.remove.call_args_list))
|
||||
# NOTE(sandonov): 12 retries + 1 final attempt to propagate = 13
|
||||
self.assertEqual(13, len(rbd.remove.call_args_list))
|
||||
|
||||
def test_cleanup_volumes_fail_not_found(self):
|
||||
self._test_cleanup_exception('ImageBusy')
|
||||
@@ -508,6 +508,27 @@ class RbdTestCase(test.NoDBTestCase):
         client.__enter__.assert_called_once_with()
         client.__exit__.assert_called_once_with(None, None, None)

+    @mock.patch.object(rbd_utils, 'RADOSClient')
+    @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall')
+    def test_destroy_volume_with_retries(self, mock_loopingcall, mock_client):
+        vol = '12345_test'
+        client = mock_client.return_value
+        loopingcall = mock_loopingcall.return_value
+
+        # Try for sixty seconds: six retries at 10 second interval
+        self.flags(rbd_destroy_volume_retries=6, group='libvirt')
+        self.flags(rbd_destroy_volume_retry_interval=10, group='libvirt')
+        self.driver.destroy_volume(vol)
+
+        # Make sure both params have the expected values
+        retryctx = mock_loopingcall.call_args.args[3]
+        self.assertEqual(retryctx, {'retries': 6})
+        loopingcall.start.assert_called_with(interval=10)
+
+        # Make sure that we entered and exited the RADOSClient
+        client.__enter__.assert_called_once_with()
+        client.__exit__.assert_called_once_with(None, None, None)
+
     @mock.patch.object(rbd_utils, 'RADOSClient')
     def test_remove_image(self, mock_client):
         name = '12345_disk.config.rescue'
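A note on the retryctx assertion above: FixedIntervalLoopingCall is patched out, so its recorded positional arguments mirror the production call (_cleanup_vol, client.ioctx, volume, retryctx), putting retryctx at index 3. A minimal sketch of that mock idiom (call_args.args needs Python 3.8+):

    from unittest import mock

    m = mock.Mock()
    # Same call shape as the driver: (callable, ioctx, volume, retryctx)
    m('_cleanup_vol', 'ioctx', 'volume', {'retries': 6})
    # call_args.args holds the positional arguments of the most recent call
    assert m.call_args.args[3] == {'retries': 6}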
@@ -351,11 +351,13 @@ class RBDDriver(object):
             if retryctx['retries'] <= 0:
                 raise loopingcall.LoopingCallDone()

-        # NOTE(danms): We let it go for ten seconds
-        retryctx = {'retries': 10}
+        # NOTE(sandonov): We let it go for:
+        # rbd_destroy_volume_retries*rbd_destroy_volume_retry_interval seconds
+        retryctx = {'retries': CONF.libvirt.rbd_destroy_volume_retries}
         timer = loopingcall.FixedIntervalLoopingCall(
             _cleanup_vol, client.ioctx, volume, retryctx)
-        timed_out = timer.start(interval=1).wait()
+        timed_out = timer.start(
+            interval=CONF.libvirt.rbd_destroy_volume_retry_interval).wait()
         if timed_out:
             # NOTE(danms): Run this again to propagate the error, but
             # if it succeeds, don't raise the loopingcall exception
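To make the retry/timeout control flow concrete, here is a minimal, self-contained sketch of the same FixedIntervalLoopingCall pattern. It is an illustration only, not Nova's actual _destroy_volume; try_remove() is a hypothetical stand-in for the RBD remove call:

    import random

    from oslo_service import loopingcall


    def try_remove():
        # Hypothetical stand-in: succeed ~30% of the time, the way a busy
        # RBD volume eventually frees up once its watcher expires.
        return random.random() < 0.3


    def _attempt(retryctx):
        if try_remove():
            # Success: stop the loop and report "did not time out".
            raise loopingcall.LoopingCallDone(retvalue=False)
        retryctx['retries'] -= 1
        if retryctx['retries'] <= 0:
            # Budget exhausted; retvalue defaults to True, i.e. timed out.
            raise loopingcall.LoopingCallDone()


    # With the new defaults: at most ~12 * 5 = 60 seconds before giving up.
    retryctx = {'retries': 12}
    timer = loopingcall.FixedIntervalLoopingCall(_attempt, retryctx)
    timed_out = timer.start(interval=5).wait()
    print('timed out' if timed_out else 'removed')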
@@ -0,0 +1,19 @@
+---
+features:
+  - |
+    Added the params ``[libvirt]/rbd_destroy_volume_retries``, defaulting to
+    12, and ``[libvirt]/rbd_destroy_volume_retry_interval``, defaulting to 5,
+    that Nova uses when trying to remove a volume from Ceph in a retry loop
+    that combines these parameters. Thus, the maximum elapsed time is 60
+    seconds by default.
+fixes:
+  - |
+    Nova tries to remove a volume from Ceph in a retry loop of 10 attempts
+    at 1 second intervals, totaling 10 seconds overall, which, due to the 30
+    second Ceph watcher timeout, might result in intermittent object removal
+    failures on the Ceph side (`bug 1856845`_). The new defaults of 12 for
+    ``[libvirt]/rbd_destroy_volume_retries`` and 5 for
+    ``[libvirt]/rbd_destroy_volume_retry_interval`` now give Ceph a
+    reasonable amount of time to complete the operation successfully.
+
+    .. _`bug 1856845`: https://bugs.launchpad.net/nova/+bug/1856845