rbd_utils: increase _destroy_volume timeout

If the RBD backend is used for Nova ephemeral storage, Nova tries to remove
the ephemeral storage volume from Ceph in a retry loop: 10 attempts at
1 second intervals, 10 seconds overall - which, due to the thirty second
Ceph watcher timeout, might result in intermittent volume removal failures
on the Ceph side.

This patch adds the params rbd_destroy_volume_retries, defaulting to 12, and
rbd_destroy_volume_retry_interval, defaulting to 5, which, multiplied
together, give Ceph a reasonable amount of time (60 seconds by default) to
complete the operation successfully.

Closes-Bug: #1856845
Change-Id: Icfd55617f0126f79d9610f8a2fc6b4c817d1a2bd
Sasha Andonov, 2020-02-04 16:59:14 +01:00 (committed by melanie witt)
parent 382d9b2336, commit 6458c3dba5
4 changed files with 68 additions and 5 deletions

@@ -1095,6 +1095,27 @@ The libvirt UUID of the secret for the rbd_user volumes.
     cfg.IntOpt('rados_connect_timeout',
                default=5,
                help="""
 The RADOS client timeout in seconds when initially connecting to the cluster.
 """),
+    cfg.IntOpt('rbd_destroy_volume_retry_interval',
+               default=5,
+               min=0,
+               help="""
+Number of seconds to wait between each consecutive retry to destroy a
+RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
+"""),
+    cfg.IntOpt('rbd_destroy_volume_retries',
+               default=12,
+               min=0,
+               help="""
+Number of retries to destroy a RBD volume.
+
+Related options:
+
+* [libvirt]/images_type = 'rbd'
+"""),
 ]
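
For operators, both options land in the [libvirt] section of nova.conf. A
minimal sketch of the relevant settings (the values shown are just the
defaults spelled out explicitly):

    [libvirt]
    images_type = rbd
    # Retry budget for destroying an ephemeral volume:
    # 12 retries x 5 second interval = up to 60 seconds per volume,
    # comfortably above the 30 second Ceph watcher timeout.
    rbd_destroy_volume_retries = 12
    rbd_destroy_volume_retry_interval = 5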

@@ -435,8 +435,8 @@ class RbdTestCase(test.NoDBTestCase):
         self.driver.cleanup_volumes(filter_fn)
         rbd.remove.assert_any_call(client.__enter__.return_value.ioctx,
                                    '%s_test' % uuids.instance)
-        # NOTE(danms): 10 retries + 1 final attempt to propagate = 11
-        self.assertEqual(11, len(rbd.remove.call_args_list))
+        # NOTE(sandonov): 12 retries + 1 final attempt to propagate = 13
+        self.assertEqual(13, len(rbd.remove.call_args_list))

     def test_cleanup_volumes_fail_not_found(self):
         self._test_cleanup_exception('ImageBusy')
@@ -508,6 +508,27 @@ class RbdTestCase(test.NoDBTestCase):
         client.__enter__.assert_called_once_with()
         client.__exit__.assert_called_once_with(None, None, None)

+    @mock.patch.object(rbd_utils, 'RADOSClient')
+    @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall')
+    def test_destroy_volume_with_retries(self, mock_loopingcall, mock_client):
+        vol = '12345_test'
+        client = mock_client.return_value
+        loopingcall = mock_loopingcall.return_value
+
+        # Try for sixty seconds: six retries at 10 second interval
+        self.flags(rbd_destroy_volume_retries=6, group='libvirt')
+        self.flags(rbd_destroy_volume_retry_interval=10, group='libvirt')
+        self.driver.destroy_volume(vol)
+
+        # Make sure both params have the expected values
+        retryctx = mock_loopingcall.call_args.args[3]
+        self.assertEqual(retryctx, {'retries': 6})
+        loopingcall.start.assert_called_with(interval=10)
+
+        # Make sure that we entered and exited the RADOSClient
+        client.__enter__.assert_called_once_with()
+        client.__exit__.assert_called_once_with(None, None, None)
+
     @mock.patch.object(rbd_utils, 'RADOSClient')
     def test_remove_image(self, mock_client):
         name = '12345_disk.config.rescue'
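
Side note on the retryctx assertion in the new test: it relies on
unittest.mock exposing a call's positional arguments via call_args.args
(available since Python 3.8). A minimal, self-contained sketch of that
mechanism, independent of the patch:

    from unittest import mock

    # Record a call the way the mocked FixedIntervalLoopingCall would be
    # invoked: FixedIntervalLoopingCall(_cleanup_vol, ioctx, volume, retryctx)
    m = mock.Mock()
    m('_cleanup_vol', 'ioctx', 'volume', {'retries': 6})

    # call_args.args holds the positional arguments of the most recent call;
    # index 3 is the retry context.  On Python < 3.8, use call_args[0][3].
    assert m.call_args.args[3] == {'retries': 6}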

@@ -351,11 +351,13 @@ class RBDDriver(object):
             if retryctx['retries'] <= 0:
                 raise loopingcall.LoopingCallDone()

-        # NOTE(danms): We let it go for ten seconds
-        retryctx = {'retries': 10}
+        # NOTE(sandonov): We let it go for:
+        # rbd_destroy_volume_retries*rbd_destroy_volume_retry_interval seconds
+        retryctx = {'retries': CONF.libvirt.rbd_destroy_volume_retries}
         timer = loopingcall.FixedIntervalLoopingCall(
             _cleanup_vol, client.ioctx, volume, retryctx)
-        timed_out = timer.start(interval=1).wait()
+        timed_out = timer.start(
+            interval=CONF.libvirt.rbd_destroy_volume_retry_interval).wait()
         if timed_out:
             # NOTE(danms): Run this again to propagate the error, but
             # if it succeeds, don't raise the loopingcall exception
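
The mechanism above - a FixedIntervalLoopingCall driving a nested function
that decrements a shared retry counter and signals completion by raising
LoopingCallDone - can be exercised standalone. A minimal sketch of the same
pattern under illustrative values (the fake flaky operation is an
assumption for the demo, not Nova code):

    from oslo_service import loopingcall

    attempts = {'count': 0}

    def _cleanup_vol(retryctx):
        attempts['count'] += 1
        if attempts['count'] >= 3:
            # Pretend the operation succeeds on the third try; report
            # success by returning False (i.e. not timed out) to wait().
            raise loopingcall.LoopingCallDone(retvalue=False)
        retryctx['retries'] -= 1
        if retryctx['retries'] <= 0:
            # Retry budget exhausted; LoopingCallDone defaults to
            # retvalue=True, which the caller reads as a timeout.
            raise loopingcall.LoopingCallDone()

    retryctx = {'retries': 12}
    timer = loopingcall.FixedIntervalLoopingCall(_cleanup_vol, retryctx)
    # Nova passes the config values here: 12 retries x 5 s interval = 60 s
    # maximum; a short interval keeps this sketch quick to run.
    timed_out = timer.start(interval=0.1).wait()
    assert timed_out is False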

@@ -0,0 +1,19 @@
+---
+features:
+  - |
+    Added params ``[libvirt]/rbd_destroy_volume_retries``, defaulting to 12,
+    and ``[libvirt]/rbd_destroy_volume_retry_interval``, defaulting to 5,
+    which Nova uses when removing a volume from Ceph in a retry loop. The
+    retry count and the interval multiply, so the maximum elapsed time is
+    60 seconds by default.
+fixes:
+  - |
+    Nova used to remove a volume from Ceph in a retry loop of 10 attempts at
+    1 second intervals, 10 seconds overall - which, due to the 30 second Ceph
+    watcher timeout, could result in intermittent object removal failures on
+    the Ceph side (`bug 1856845`_). The new defaults of 12 for
+    ``[libvirt]/rbd_destroy_volume_retries`` and 5 for
+    ``[libvirt]/rbd_destroy_volume_retry_interval`` now give Ceph a
+    reasonable amount of time to complete the operation successfully.
+
+    .. _`bug 1856845`: https://bugs.launchpad.net/nova/+bug/1856845