rbd_utils: increase _destroy_volume timeout

If the RBD backend is used for Nova ephemeral storage, Nova tries to remove
the ephemeral storage volume from Ceph in a retry loop: 10 attempts at
1 second intervals, 10 seconds overall - which, due to the thirty second
Ceph watcher timeout, might result in intermittent volume removal failures
on the Ceph side.

This patch adds the params rbd_destroy_volume_retries, defaulting to 12, and
rbd_destroy_volume_retry_interval, defaulting to 5, which, multiplied
together, give Ceph a reasonable amount of time (60 seconds by default) to
complete the operation successfully.

Closes-Bug: #1856845
Change-Id: Icfd55617f0126f79d9610f8a2fc6b4c817d1a2bd
Sasha Andonov, 2020-02-04 16:59:14 +01:00 (committed by melanie witt)
parent 382d9b2336, commit 6458c3dba5
4 changed files with 68 additions and 5 deletions

@@ -1095,6 +1095,27 @@ The libvirt UUID of the secret for the rbd_user volumes.
     cfg.IntOpt('rados_connect_timeout',
                default=5,
                help="""
 The RADOS client timeout in seconds when initially connecting to the cluster.
 """),
+    cfg.IntOpt('rbd_destroy_volume_retry_interval',
+               default=5,
+               min=0,
+               help="""
+Number of seconds to wait between each consecutive retry to destroy a
+RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
+"""),
+    cfg.IntOpt('rbd_destroy_volume_retries',
+               default=12,
+               min=0,
+               help="""
+Number of retries to destroy a RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
+"""),
 ]
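
For operators, both options land in the [libvirt] section of nova.conf. A
minimal sketch of the relevant settings (the values shown are just the
defaults spelled out explicitly):

    [libvirt]
    images_type = rbd
    # Retry budget for destroying an ephemeral volume:
    # 12 retries x 5 second interval = up to 60 seconds per volume,
    # comfortably above the 30 second Ceph watcher timeout.
    rbd_destroy_volume_retries = 12
    rbd_destroy_volume_retry_interval = 5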

@@ -435,8 +435,8 @@ class RbdTestCase(test.NoDBTestCase):
         self.driver.cleanup_volumes(filter_fn)
         rbd.remove.assert_any_call(client.__enter__.return_value.ioctx,
                                    '%s_test' % uuids.instance)
-        # NOTE(danms): 10 retries + 1 final attempt to propagate = 11
-        self.assertEqual(11, len(rbd.remove.call_args_list))
+        # NOTE(sandonov): 12 retries + 1 final attempt to propagate = 13
+        self.assertEqual(13, len(rbd.remove.call_args_list))

     def test_cleanup_volumes_fail_not_found(self):
         self._test_cleanup_exception('ImageBusy')
@@ -508,6 +508,27 @@ class RbdTestCase(test.NoDBTestCase):
         client.__enter__.assert_called_once_with()
         client.__exit__.assert_called_once_with(None, None, None)

+    @mock.patch.object(rbd_utils, 'RADOSClient')
+    @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall')
+    def test_destroy_volume_with_retries(self, mock_loopingcall, mock_client):
+        vol = '12345_test'
+        client = mock_client.return_value
+        loopingcall = mock_loopingcall.return_value
+
+        # Try for sixty seconds: six retries at 10 second interval
+        self.flags(rbd_destroy_volume_retries=6, group='libvirt')
+        self.flags(rbd_destroy_volume_retry_interval=10, group='libvirt')
+        self.driver.destroy_volume(vol)
+
+        # Make sure both params have the expected values
+        retryctx = mock_loopingcall.call_args.args[3]
+        self.assertEqual(retryctx, {'retries': 6})
+        loopingcall.start.assert_called_with(interval=10)
+
+        # Make sure that we entered and exited the RADOSClient
+        client.__enter__.assert_called_once_with()
+        client.__exit__.assert_called_once_with(None, None, None)
+
     @mock.patch.object(rbd_utils, 'RADOSClient')
     def test_remove_image(self, mock_client):
         name = '12345_disk.config.rescue'
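
Side note on the retryctx assertion in the new test: it relies on
unittest.mock exposing a call's positional arguments via call_args.args
(available since Python 3.8). A minimal, self-contained sketch of that
mechanism, independent of the patch:

    from unittest import mock

    # Record a call the way the mocked FixedIntervalLoopingCall would be
    # invoked: FixedIntervalLoopingCall(_cleanup_vol, ioctx, volume, retryctx)
    m = mock.Mock()
    m('_cleanup_vol', 'ioctx', 'volume', {'retries': 6})

    # call_args.args holds the positional arguments of the most recent call;
    # index 3 is the retry context.  On Python < 3.8, use call_args[0][3].
    assert m.call_args.args[3] == {'retries': 6}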

@@ -351,11 +351,13 @@ class RBDDriver(object):
             if retryctx['retries'] <= 0:
                 raise loopingcall.LoopingCallDone()

-        # NOTE(danms): We let it go for ten seconds
-        retryctx = {'retries': 10}
+        # NOTE(sandonov): We let it go for:
+        # rbd_destroy_volume_retries*rbd_destroy_volume_retry_interval seconds
+        retryctx = {'retries': CONF.libvirt.rbd_destroy_volume_retries}
         timer = loopingcall.FixedIntervalLoopingCall(
             _cleanup_vol, client.ioctx, volume, retryctx)
-        timed_out = timer.start(interval=1).wait()
+        timed_out = timer.start(
+            interval=CONF.libvirt.rbd_destroy_volume_retry_interval).wait()
         if timed_out:
             # NOTE(danms): Run this again to propagate the error, but
             # if it succeeds, don't raise the loopingcall exception
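
The mechanism above - a FixedIntervalLoopingCall driving a nested function
that decrements a shared retry counter and signals completion by raising
LoopingCallDone - can be exercised standalone. A minimal sketch of the same
pattern under illustrative values (the fake flaky operation is an
assumption for the demo, not Nova code):

    from oslo_service import loopingcall

    attempts = {'count': 0}

    def _cleanup_vol(retryctx):
        attempts['count'] += 1
        if attempts['count'] >= 3:
            # Pretend the operation succeeds on the third try; report
            # success by returning False (i.e. not timed out) to wait().
            raise loopingcall.LoopingCallDone(retvalue=False)
        retryctx['retries'] -= 1
        if retryctx['retries'] <= 0:
            # Retry budget exhausted; LoopingCallDone defaults to
            # retvalue=True, which the caller reads as a timeout.
            raise loopingcall.LoopingCallDone()

    retryctx = {'retries': 12}
    timer = loopingcall.FixedIntervalLoopingCall(_cleanup_vol, retryctx)
    # Nova passes the config values here: 12 retries x 5 s interval = 60 s
    # maximum; a short interval keeps this sketch quick to run.
    timed_out = timer.start(interval=0.1).wait()
    assert timed_out is False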

@@ -0,0 +1,19 @@
+---
+features:
+  - |
+    Added params ``[libvirt]/rbd_destroy_volume_retries``, defaulting to 12,
+    and ``[libvirt]/rbd_destroy_volume_retry_interval``, defaulting to 5,
+    which Nova uses when removing a volume from Ceph in a retry loop. The
+    retry count and the interval multiply, so the maximum elapsed time is
+    60 seconds by default.
+fixes:
+  - |
+    Nova used to remove a volume from Ceph in a retry loop of 10 attempts at
+    1 second intervals, 10 seconds overall - which, due to the 30 second Ceph
+    watcher timeout, could result in intermittent object removal failures on
+    the Ceph side (`bug 1856845`_). The new defaults of 12 for
+    ``[libvirt]/rbd_destroy_volume_retries`` and 5 for
+    ``[libvirt]/rbd_destroy_volume_retry_interval`` now give Ceph a
+    reasonable amount of time to complete the operation successfully.
+
+    .. _`bug 1856845`: https://bugs.launchpad.net/nova/+bug/1856845