From 994cdb234b2b16d97f0276c6356db65817944ee2 Mon Sep 17 00:00:00 2001 From: Matthew Booth Date: Tue, 24 Jun 2014 12:12:59 +0100 Subject: [PATCH] VMware: Fix race in spawn() when resizing cached image spawn() guards against multiple threads simultaneously attempting to cache the same image, but it wasn't guarding against them simultanously trying to create a resized copy in the cache. Attempting to create a large number of images simultaneously of an uncached image would result in a race to create the resized image. This resulted in 2 classes of failed instance: 1. Instances whose disk was a linked clone of a copy which had been subsequently overwritten. These were corrupt. 2. Instances whose spawn() failed in ExtendVirtualDisk_Task due to a locked image. This patch creates a Nova-local lock for the resized image. The image is in a per-Nova directory on the datastore, so inter-Nova locking is not a concern. The lock guards both testing for the existence of the image, and its creation. Therefore when multiple processes race, only 1 will create the resized copy, and all others will find and use it. In normal usage this will add the overhead of an additional uncontended local lock creation and deletion in spawn(). In wrapping this code in a lock, we also make certain that any failure to create the resized image is appropriately cleaned up. Otherwise subsequent users will attempt to use a corrupt copy. Change-Id: I3df3d614656e511c909b6c1837582c0d34bf84c6 Closes-bug: 1333587 --- nova/tests/virt/vmwareapi/test_driver_api.py | 114 +++++++++++++++++++ nova/virt/vmwareapi/vmops.py | 79 +++++++++---- 2 files changed, 173 insertions(+), 20 deletions(-) diff --git a/nova/tests/virt/vmwareapi/test_driver_api.py b/nova/tests/virt/vmwareapi/test_driver_api.py index e7576e01165e..1a8b272fe70b 100644 --- a/nova/tests/virt/vmwareapi/test_driver_api.py +++ b/nova/tests/virt/vmwareapi/test_driver_api.py @@ -864,6 +864,120 @@ class VMwareAPIVMTestCase(test.NoDBTestCase): self.assertFalse(vmwareapi_fake.get_file(cached_image)) self.assertFalse(vmwareapi_fake.get_file(tmp_file)) + def test_spawn_disk_extend_failed_copy(self): + # Spawn instance + # copy for extend fails without creating a file + # + # Expect the copy error to be raised + self.flags(use_linked_clone=True, group='vmware') + self.wait_task = self.conn._session._wait_for_task + self.call_method = self.conn._session._call_method + + CopyError = error_util.FileFaultException + + def fake_wait_for_task(task_ref): + if task_ref == 'fake-copy-task': + raise CopyError('Copy failed!') + return self.wait_task(task_ref) + + def fake_call_method(module, method, *args, **kwargs): + if method == "CopyVirtualDisk_Task": + return 'fake-copy-task' + + return self.call_method(module, method, *args, **kwargs) + + with contextlib.nested( + mock.patch.object(self.conn._session, '_call_method', + new=fake_call_method), + mock.patch.object(self.conn._session, '_wait_for_task', + new=fake_wait_for_task)): + self.assertRaises(CopyError, self._create_vm) + + def test_spawn_disk_extend_failed_partial_copy(self): + # Spawn instance + # Copy for extend fails, leaving a file behind + # + # Expect the file to be cleaned up + # Expect the copy error to be raised + self.flags(use_linked_clone=True, group='vmware') + self.wait_task = self.conn._session._wait_for_task + self.call_method = self.conn._session._call_method + self.task_ref = None + uuid = self.fake_image_uuid + cached_image = '[%s] vmware_base/%s/%s.80.vmdk' % (self.ds, + uuid, uuid) + + CopyError = error_util.FileFaultException + + def fake_wait_for_task(task_ref): + if task_ref == self.task_ref: + self.task_ref = None + self.assertTrue(vmwareapi_fake.get_file(cached_image)) + # N.B. We don't test for -flat here because real + # CopyVirtualDisk_Task doesn't actually create it + raise CopyError('Copy failed!') + return self.wait_task(task_ref) + + def fake_call_method(module, method, *args, **kwargs): + task_ref = self.call_method(module, method, *args, **kwargs) + if method == "CopyVirtualDisk_Task": + self.task_ref = task_ref + return task_ref + + with contextlib.nested( + mock.patch.object(self.conn._session, '_call_method', + new=fake_call_method), + mock.patch.object(self.conn._session, '_wait_for_task', + new=fake_wait_for_task)): + self.assertRaises(CopyError, self._create_vm) + self.assertFalse(vmwareapi_fake.get_file(cached_image)) + + def test_spawn_disk_extend_failed_partial_copy_failed_cleanup(self): + # Spawn instance + # Copy for extend fails, leaves file behind + # File cleanup fails + # + # Expect file to be left behind + # Expect file cleanup error to be raised + self.flags(use_linked_clone=True, group='vmware') + self.wait_task = self.conn._session._wait_for_task + self.call_method = self.conn._session._call_method + self.task_ref = None + uuid = self.fake_image_uuid + cached_image = '[%s] vmware_base/%s/%s.80.vmdk' % (self.ds, + uuid, uuid) + + CopyError = error_util.FileFaultException + DeleteError = error_util.CannotDeleteFileException + + def fake_wait_for_task(task_ref): + if task_ref == self.task_ref: + self.task_ref = None + self.assertTrue(vmwareapi_fake.get_file(cached_image)) + # N.B. We don't test for -flat here because real + # CopyVirtualDisk_Task doesn't actually create it + raise CopyError('Copy failed!') + elif task_ref == 'fake-delete-task': + raise DeleteError('Delete failed!') + return self.wait_task(task_ref) + + def fake_call_method(module, method, *args, **kwargs): + if method == "DeleteDatastoreFile_Task": + return 'fake-delete-task' + + task_ref = self.call_method(module, method, *args, **kwargs) + if method == "CopyVirtualDisk_Task": + self.task_ref = task_ref + return task_ref + + with contextlib.nested( + mock.patch.object(self.conn._session, '_wait_for_task', + new=fake_wait_for_task), + mock.patch.object(self.conn._session, '_call_method', + new=fake_call_method)): + self.assertRaises(DeleteError, self._create_vm) + self.assertTrue(vmwareapi_fake.get_file(cached_image)) + def test_spawn_disk_invalid_disk_size(self): self.mox.StubOutWithMock(vmware_images, 'get_vmdk_size_and_properties') result = [82 * units.Gi, diff --git a/nova/virt/vmwareapi/vmops.py b/nova/virt/vmwareapi/vmops.py index 331f0c76f78f..b7000eb76991 100644 --- a/nova/virt/vmwareapi/vmops.py +++ b/nova/virt/vmwareapi/vmops.py @@ -34,7 +34,7 @@ from nova import context as nova_context from nova import exception from nova.network import model as network_model from nova.openstack.common import excutils -from nova.openstack.common.gettextutils import _ +from nova.openstack.common.gettextutils import _, _LE from nova.openstack.common import lockutils from nova.openstack.common import log as logging from nova.openstack.common import strutils @@ -453,27 +453,66 @@ class VMwareVMOps(object): root_vmdk_name = "%s.%s.vmdk" % (upload_name, root_gb) root_vmdk_path = str(datastore.build_path( upload_folder, root_vmdk_name)) - if not self._check_if_folder_file_exists( - ds_browser, - datastore.ref, datastore.name, - upload_folder, - upload_name + ".%s.vmdk" % root_gb): - LOG.debug("Copying root disk of size %sGb", root_gb) - try: + + # Ensure only a single thread extends the image at once. + # We do this by taking a lock on the name of the extended + # image. This allows multiple threads to create resized + # copies simultaneously, as long as they are different + # sizes. Threads attempting to create the same resized copy + # will be serialized, with only the first actually creating + # the copy. + # + # Note that the object is in a per-nova cache directory, + # so inter-nova locking is not a concern. Consequently we + # can safely use simple thread locks. + + with lockutils.lock(root_vmdk_path, + lock_file_prefix='nova-vmware-image'): + if not self._check_if_folder_file_exists( + ds_browser, + datastore.ref, datastore.name, + upload_folder, + root_vmdk_name): + LOG.debug("Copying root disk of size %sGb", + root_gb) + copy_spec = self.get_copy_virtual_disk_spec( client_factory, adapter_type, disk_type) - vm_util.copy_virtual_disk(self._session, - dc_info.ref, - uploaded_file_path, - root_vmdk_path, - copy_spec) - except Exception as e: - LOG.warning(_("Root disk file creation " - "failed - %s"), e) - if root_gb_in_kb > vmdk_file_size_in_kb: - self._extend_virtual_disk(instance, root_gb_in_kb, - root_vmdk_path, - dc_info.ref) + + # Create a copy of the base image, ensuring we + # clean up on failure + try: + vm_util.copy_virtual_disk(self._session, + dc_info.ref, + uploaded_file_path, + root_vmdk_path, + copy_spec) + except Exception as e: + with excutils.save_and_reraise_exception(): + LOG.error(_LE('Failed to copy cached ' + 'image %(source)s to ' + '%(dest)s for resize: ' + '%(error)s'), + {'source': uploaded_file_path, + 'dest': root_vmdk_path, + 'error': e.message}) + try: + ds_util.file_delete(self._session, + root_vmdk_path, + dc_info.ref) + except error_util.FileNotFoundException: + # File was never created: cleanup not + # required + pass + + # Resize the copy to the appropriate size. No need + # for cleanup up here, as _extend_virtual_disk + # already does it + if root_gb_in_kb > vmdk_file_size_in_kb: + self._extend_virtual_disk(instance, + root_gb_in_kb, + root_vmdk_path, + dc_info.ref) # Attach the root disk to the VM. if root_vmdk_path: