From e0c7d6c8816d0c21b2913adc7a3d6cbd59604cd0 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Wed, 14 Mar 2018 16:43:22 -0400 Subject: [PATCH] libvirt: handle DiskNotFound during update_available_resource The update_available_resource periodic task in the compute manager eventually calls through to the resource tracker and virt driver get_available_resource method, which gets the guests running on the hypervisor, and builds up a set of information about the host. This includes disk information for the active domains. However, the periodic task can race with instances being deleted concurrently and the hypervisor can report the domain but the driver has already deleted the backing files as part of deleting the instance, and this leads to failures when running "qemu-img info" on the disk path which is now gone. When that happens, the entire periodic update fails. This change simply tries to detect the specific failure from 'qemu-img info' and translate it into a DiskNotFound exception which the driver can handle. In this case, if the associated instance is undergoing a task state transition such as moving to another host or being deleted, we log a message and continue. If the instance is in steady state (task_state is not set), then we consider it a failure and re-raise it up. Note that we could add the deleted=False filter to the instance query in _get_disk_over_committed_size_total but that doesn't help us in this case because the hypervisor says the domain is still active and the instance is not actually considered deleted in the DB yet. Conflicts: nova/virt/libvirt/driver.py nova/tests/unit/virt/libvirt/test_driver.py NOTE(lyarwood): Conflicts due to the substantial refactoring of _get_instance_disk_info via I9616a602ee0605f7f1dc1f47b6416f01895e025b and removal of _LW etc during Pike, Change-Id: Icec2769bf42455853cbe686fb30fda73df791b25 Closes-Bug: #1662867 (cherry picked from commit 5f16e714f58336344752305f94451e7c7c55742c) (cherry picked from commit 5a4c6913a37f912489543abd5e12a54feeeb89e2) (cherry picked from commit d251b95083731829ba104dc5c7f642dd5097d510) --- nova/tests/unit/virt/libvirt/test_driver.py | 44 +++++++++++++++++++++ nova/tests/unit/virt/test_images.py | 20 +++++++++- nova/virt/images.py | 7 +++- nova/virt/libvirt/driver.py | 18 +++++++++ 4 files changed, 87 insertions(+), 2 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 7010c3dc6294..f34f03bf8d20 100755 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -12938,6 +12938,50 @@ class LibvirtConnTestCase(test.NoDBTestCase): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) self.assertEqual(0, drvr._get_disk_over_committed_size_total()) + @mock.patch('nova.virt.libvirt.host.Host.list_instance_domains') + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' + '_get_instance_disk_info', + side_effect=exception.DiskNotFound(location='/opt/stack/foo')) + @mock.patch('nova.objects.BlockDeviceMappingList.bdms_by_instance_uuid', + return_value=objects.BlockDeviceMappingList()) + @mock.patch('nova.objects.InstanceList.get_by_filters', + return_value=objects.InstanceList(objects=[ + objects.Instance(uuid=uuids.instance, + task_state=task_states.DELETING)])) + def test_disk_over_committed_size_total_disk_not_found_ignore( + self, mock_get, mock_bdms, mock_get_disk_info, mock_list_domains): + """Tests that we handle DiskNotFound gracefully for an instance that + is undergoing a task_state transition. + """ + mock_dom = mock.Mock() + mock_dom.XMLDesc.return_value = "" + mock_dom.UUIDString.return_value = uuids.instance + mock_list_domains.return_value = [mock_dom] + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + self.assertEqual(0, drvr._get_disk_over_committed_size_total()) + + @mock.patch('nova.virt.libvirt.host.Host.list_instance_domains') + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' + '_get_instance_disk_info', + side_effect=exception.DiskNotFound(location='/opt/stack/foo')) + @mock.patch('nova.objects.BlockDeviceMappingList.bdms_by_instance_uuid', + return_value=objects.BlockDeviceMappingList()) + @mock.patch('nova.objects.InstanceList.get_by_filters', + return_value=objects.InstanceList(objects=[ + objects.Instance(uuid=uuids.instance, task_state=None)])) + def test_disk_over_committed_size_total_disk_not_found_reraise( + self, mock_get, mock_bdms, mock_get_disk_info, mock_list_domains): + """Tests that we handle DiskNotFound gracefully for an instance that + is NOT undergoing a task_state transition and the error is re-raised. + """ + mock_dom = mock.Mock() + mock_dom.XMLDesc.return_value = "" + mock_dom.UUIDString.return_value = uuids.instance + mock_list_domains.return_value = [mock_dom] + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + self.assertRaises(exception.DiskNotFound, + drvr._get_disk_over_committed_size_total) + def test_cpu_info(self): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py index c6864f295df9..ac6392d99a70 100644 --- a/nova/tests/unit/virt/test_images.py +++ b/nova/tests/unit/virt/test_images.py @@ -32,7 +32,7 @@ class QemuTestCase(test.NoDBTestCase): @mock.patch.object(os.path, 'exists', return_value=True) def test_qemu_info_with_errors(self, path_exists): - self.assertRaises(exception.InvalidDiskInfo, + self.assertRaises(exception.DiskNotFound, images.qemu_img_info, '/fake/path') @@ -65,6 +65,24 @@ class QemuTestCase(test.NoDBTestCase): '/fake/path') self.assertIn('qemu-img aborted by prlimits', six.text_type(exc)) + @mock.patch.object(utils, 'execute') + @mock.patch.object(os.path, 'exists', return_value=True) + def test_qemu_img_info_with_disk_not_found(self, exists, mocked_execute): + """Tests that the initial os.path.exists check passes but the qemu-img + command fails because the path is gone by the time the command runs. + """ + path = '/opt/stack/data/nova/instances/some-uuid/disk' + stderr = (u"qemu-img: Could not open " + "'/opt/stack/data/nova/instances/some-uuid/disk': " + "Could not open '/opt/stack/data/nova/instances/some-uuid/" + "disk': No such file or directory\n") + mocked_execute.side_effect = ( + processutils.ProcessExecutionError( + exit_code=1, stderr=stderr)) + self.assertRaises(exception.DiskNotFound, images.qemu_img_info, path) + exists.assert_called_once_with(path) + mocked_execute.assert_called_once() + @mock.patch.object(images, 'convert_image', side_effect=exception.ImageUnacceptable) @mock.patch.object(images, 'qemu_img_info') diff --git a/nova/virt/images.py b/nova/virt/images.py index 59e1ddf24dec..4abc5bce4357 100644 --- a/nova/virt/images.py +++ b/nova/virt/images.py @@ -62,10 +62,15 @@ def qemu_img_info(path, format=None): cmd = cmd + ('-f', format) out, err = utils.execute(*cmd, prlimit=QEMU_IMG_LIMITS) except processutils.ProcessExecutionError as exp: - # this means we hit prlimits, make the exception more specific if exp.exit_code == -9: + # this means we hit prlimits, make the exception more specific msg = (_("qemu-img aborted by prlimits when inspecting " "%(path)s : %(exp)s") % {'path': path, 'exp': exp}) + elif exp.exit_code == 1 and 'No such file or directory' in exp.stderr: + # The os.path.exists check above can race so this is a simple + # best effort at catching that type of failure and raising a more + # specific error. + raise exception.DiskNotFound(location=path) else: msg = (_("qemu-img failed to execute on %(path)s : %(exp)s") % {'path': path, 'exp': exp}) diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 0a003200cc01..3ca158aaa6f6 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -7316,6 +7316,24 @@ class LibvirtDriver(driver.ComputeDriver): 'Error: %(error)s'), {'i_name': guest.name, 'error': e}) + except exception.DiskNotFound: + with excutils.save_and_reraise_exception() as err_ctxt: + # If the instance is undergoing a task state transition, + # like moving to another host or is being deleted, we + # should ignore this instance and move on. + if guest.uuid in local_instances: + inst = local_instances[guest.uuid] + if inst.task_state is not None: + LOG.info(_LI('Periodic task is updating the host ' + 'stats; it is trying to get disk info ' + 'for %(i_name)s, but the backing disk ' + 'was removed by a concurrent operation ' + '(task_state=%(task_state)s)'), + {'i_name': guest.name, + 'task_state': inst.task_state}, + instance=inst) + err_ctxt.reraise = False + # NOTE(gtt116): give other tasks a chance. greenthread.sleep(0) return disk_over_committed_size