diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py index 5f8dd49e11a4..a15d2c8062cb 100644 --- a/nova/conf/libvirt.py +++ b/nova/conf/libvirt.py @@ -1068,6 +1068,11 @@ the Ceph RBD server. cfg.StrOpt('rbd_secret_uuid', help=""" The libvirt UUID of the secret for the rbd_user volumes. +"""), + cfg.IntOpt('rbd_connect_timeout', + default=5, + help=""" +The RADOS client timeout in seconds when initially connecting to the cluster. """), ] diff --git a/nova/tests/unit/virt/libvirt/storage/test_rbd.py b/nova/tests/unit/virt/libvirt/storage/test_rbd.py index 5464f1ba7b2c..cd1c33bbaad6 100644 --- a/nova/tests/unit/virt/libvirt/storage/test_rbd.py +++ b/nova/tests/unit/virt/libvirt/storage/test_rbd.py @@ -83,7 +83,9 @@ class RbdTestCase(test.NoDBTestCase): self.mock_rbd.ImageHasSnapshots = FakeException self.rbd_pool = 'rbd' - self.driver = rbd_utils.RBDDriver(self.rbd_pool, None, None) + self.rbd_connect_timeout = 5 + self.driver = rbd_utils.RBDDriver(self.rbd_pool, None, None, + self.rbd_connect_timeout) self.volume_name = u'volume-00000001' self.snap_name = u'test-snap' @@ -276,7 +278,8 @@ class RbdTestCase(test.NoDBTestCase): def test_connect_to_rados_default(self): ret = self.driver._connect_to_rados() - self.assertTrue(self.mock_rados.Rados.connect.called) + self.mock_rados.Rados.connect.assert_called_once_with( + timeout=self.rbd_connect_timeout) self.assertTrue(self.mock_rados.Rados.open_ioctx.called) self.assertIsInstance(ret[0], self.mock_rados.Rados) self.assertEqual(self.mock_rados.Rados.ioctx, ret[1]) @@ -284,7 +287,8 @@ class RbdTestCase(test.NoDBTestCase): def test_connect_to_rados_different_pool(self): ret = self.driver._connect_to_rados('alt_pool') - self.assertTrue(self.mock_rados.Rados.connect.called) + self.mock_rados.Rados.connect.assert_called_once_with( + timeout=self.rbd_connect_timeout) self.assertTrue(self.mock_rados.Rados.open_ioctx.called) self.assertIsInstance(ret[0], self.mock_rados.Rados) self.assertEqual(self.mock_rados.Rados.ioctx, 
ret[1]) diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 05e759516e61..57328604ff0a 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -1262,7 +1262,8 @@ class LibvirtDriver(driver.ComputeDriver): return rbd_utils.RBDDriver( pool=CONF.libvirt.images_rbd_pool, ceph_conf=CONF.libvirt.images_rbd_ceph_conf, - rbd_user=CONF.libvirt.rbd_user) + rbd_user=CONF.libvirt.rbd_user, + rbd_connect_timeout=CONF.libvirt.rbd_connect_timeout) def _cleanup_rbd(self, instance): # NOTE(nic): On revert_resize, the cleanup steps for the root diff --git a/nova/virt/libvirt/imagebackend.py b/nova/virt/libvirt/imagebackend.py index 4da0c6568c1b..66f2ce3196a4 100644 --- a/nova/virt/libvirt/imagebackend.py +++ b/nova/virt/libvirt/imagebackend.py @@ -847,6 +847,7 @@ class Rbd(Image): self.pool = CONF.libvirt.images_rbd_pool self.rbd_user = CONF.libvirt.rbd_user + self.rbd_connect_timeout = CONF.libvirt.rbd_connect_timeout self.ceph_conf = CONF.libvirt.images_rbd_ceph_conf path = 'rbd:%s/%s' % (self.pool, self.rbd_name) @@ -860,7 +861,8 @@ class Rbd(Image): self.driver = rbd_utils.RBDDriver( pool=self.pool, ceph_conf=self.ceph_conf, - rbd_user=self.rbd_user) + rbd_user=self.rbd_user, + rbd_connect_timeout=self.rbd_connect_timeout) self.discard_mode = CONF.libvirt.hw_disk_discard diff --git a/nova/virt/libvirt/storage/rbd_utils.py b/nova/virt/libvirt/storage/rbd_utils.py index 133f72f410ec..f78dfd228f6c 100644 --- a/nova/virt/libvirt/storage/rbd_utils.py +++ b/nova/virt/libvirt/storage/rbd_utils.py @@ -118,12 +118,13 @@ class RADOSClient(object): class RBDDriver(object): - def __init__(self, pool, ceph_conf, rbd_user): + def __init__(self, pool, ceph_conf, rbd_user, rbd_connect_timeout): self.pool = pool # NOTE(angdraug): rados.Rados fails to connect if ceph_conf is None: # https://github.com/ceph/ceph/pull/1787 self.ceph_conf = ceph_conf or '' self.rbd_user = rbd_user or None + self.rbd_connect_timeout = rbd_connect_timeout if rbd is None: 
raise RuntimeError(_('rbd python libraries not found')) @@ -131,7 +132,7 @@ class RBDDriver(object): client = rados.Rados(rados_id=self.rbd_user, conffile=self.ceph_conf) try: - client.connect() + client.connect(timeout=self.rbd_connect_timeout) pool_to_open = pool or self.pool # NOTE(luogangyi): open_ioctx >= 10.1.0 could handle unicode # arguments perfectly as part of Python 3 support. diff --git a/releasenotes/notes/bug-1834048-8b19ae1c5048b801.yaml b/releasenotes/notes/bug-1834048-8b19ae1c5048b801.yaml new file mode 100644 index 000000000000..1bae7903dea9 --- /dev/null +++ b/releasenotes/notes/bug-1834048-8b19ae1c5048b801.yaml @@ -0,0 +1,12 @@ +--- +other: + - | + A new ``[libvirt]/rbd_connect_timeout`` configuration option has been + introduced to limit the time spent waiting when connecting to an RBD cluster + via the RADOS API. This timeout currently defaults to 5 seconds. + + This aims to address issues reported in `bug 1834048`_ where failures to + initially connect to an RBD cluster left the nova-compute service inoperable + due to constant RPC timeouts being hit. + + .. _bug 1834048: https://bugs.launchpad.net/nova/+bug/1834048