VMware: Fix race in spawn() when resizing cached image

spawn() guards against multiple threads simultaneously attempting to
cache the same image, but it wasn't guarding against them
simultanously trying to create a resized copy in the cache. Attempting
to create a large number of images simultaneously of an uncached image
would result in a race to create the resized image. This resulted in 2
classes of failed instance:

1. Instances whose disk was a linked clone of a copy which had been
   subsequently overwritten. These were corrupt.
2. Instances whose spawn() failed in ExtendVirtualDisk_Task due to a
   locked image.

This patch creates a Nova-local lock for the resized image. The image
is in a per-Nova directory on the datastore, so inter-Nova locking is
not a concern. The lock guards both testing for the existence of the
image, and its creation. Therefore when multiple processes race, only
1 will create the resized copy, and all others will find and use it.
In normal usage this will add the overhead of an additional
uncontended local lock creation and deletion in spawn().

In wrapping this code in a lock, we also make certain that any failure
to create the resized image is appropriately cleaned up. Otherwise
subsequent users will attempt to use a corrupt copy.

Change-Id: I3df3d614656e511c909b6c1837582c0d34bf84c6
Closes-bug: 1333587
This commit is contained in:
Matthew Booth 2014-06-24 12:12:59 +01:00
parent 7a78f39726
commit 994cdb234b
2 changed files with 173 additions and 20 deletions

View File

@ -864,6 +864,120 @@ class VMwareAPIVMTestCase(test.NoDBTestCase):
self.assertFalse(vmwareapi_fake.get_file(cached_image))
self.assertFalse(vmwareapi_fake.get_file(tmp_file))
def test_spawn_disk_extend_failed_copy(self):
# Spawn instance
# copy for extend fails without creating a file
#
# Expect the copy error to be raised
self.flags(use_linked_clone=True, group='vmware')
self.wait_task = self.conn._session._wait_for_task
self.call_method = self.conn._session._call_method
CopyError = error_util.FileFaultException
def fake_wait_for_task(task_ref):
if task_ref == 'fake-copy-task':
raise CopyError('Copy failed!')
return self.wait_task(task_ref)
def fake_call_method(module, method, *args, **kwargs):
if method == "CopyVirtualDisk_Task":
return 'fake-copy-task'
return self.call_method(module, method, *args, **kwargs)
with contextlib.nested(
mock.patch.object(self.conn._session, '_call_method',
new=fake_call_method),
mock.patch.object(self.conn._session, '_wait_for_task',
new=fake_wait_for_task)):
self.assertRaises(CopyError, self._create_vm)
def test_spawn_disk_extend_failed_partial_copy(self):
# Spawn instance
# Copy for extend fails, leaving a file behind
#
# Expect the file to be cleaned up
# Expect the copy error to be raised
self.flags(use_linked_clone=True, group='vmware')
self.wait_task = self.conn._session._wait_for_task
self.call_method = self.conn._session._call_method
self.task_ref = None
uuid = self.fake_image_uuid
cached_image = '[%s] vmware_base/%s/%s.80.vmdk' % (self.ds,
uuid, uuid)
CopyError = error_util.FileFaultException
def fake_wait_for_task(task_ref):
if task_ref == self.task_ref:
self.task_ref = None
self.assertTrue(vmwareapi_fake.get_file(cached_image))
# N.B. We don't test for -flat here because real
# CopyVirtualDisk_Task doesn't actually create it
raise CopyError('Copy failed!')
return self.wait_task(task_ref)
def fake_call_method(module, method, *args, **kwargs):
task_ref = self.call_method(module, method, *args, **kwargs)
if method == "CopyVirtualDisk_Task":
self.task_ref = task_ref
return task_ref
with contextlib.nested(
mock.patch.object(self.conn._session, '_call_method',
new=fake_call_method),
mock.patch.object(self.conn._session, '_wait_for_task',
new=fake_wait_for_task)):
self.assertRaises(CopyError, self._create_vm)
self.assertFalse(vmwareapi_fake.get_file(cached_image))
def test_spawn_disk_extend_failed_partial_copy_failed_cleanup(self):
# Spawn instance
# Copy for extend fails, leaves file behind
# File cleanup fails
#
# Expect file to be left behind
# Expect file cleanup error to be raised
self.flags(use_linked_clone=True, group='vmware')
self.wait_task = self.conn._session._wait_for_task
self.call_method = self.conn._session._call_method
self.task_ref = None
uuid = self.fake_image_uuid
cached_image = '[%s] vmware_base/%s/%s.80.vmdk' % (self.ds,
uuid, uuid)
CopyError = error_util.FileFaultException
DeleteError = error_util.CannotDeleteFileException
def fake_wait_for_task(task_ref):
if task_ref == self.task_ref:
self.task_ref = None
self.assertTrue(vmwareapi_fake.get_file(cached_image))
# N.B. We don't test for -flat here because real
# CopyVirtualDisk_Task doesn't actually create it
raise CopyError('Copy failed!')
elif task_ref == 'fake-delete-task':
raise DeleteError('Delete failed!')
return self.wait_task(task_ref)
def fake_call_method(module, method, *args, **kwargs):
if method == "DeleteDatastoreFile_Task":
return 'fake-delete-task'
task_ref = self.call_method(module, method, *args, **kwargs)
if method == "CopyVirtualDisk_Task":
self.task_ref = task_ref
return task_ref
with contextlib.nested(
mock.patch.object(self.conn._session, '_wait_for_task',
new=fake_wait_for_task),
mock.patch.object(self.conn._session, '_call_method',
new=fake_call_method)):
self.assertRaises(DeleteError, self._create_vm)
self.assertTrue(vmwareapi_fake.get_file(cached_image))
def test_spawn_disk_invalid_disk_size(self):
self.mox.StubOutWithMock(vmware_images, 'get_vmdk_size_and_properties')
result = [82 * units.Gi,

View File

@ -34,7 +34,7 @@ from nova import context as nova_context
from nova import exception
from nova.network import model as network_model
from nova.openstack.common import excutils
from nova.openstack.common.gettextutils import _
from nova.openstack.common.gettextutils import _, _LE
from nova.openstack.common import lockutils
from nova.openstack.common import log as logging
from nova.openstack.common import strutils
@ -453,27 +453,66 @@ class VMwareVMOps(object):
root_vmdk_name = "%s.%s.vmdk" % (upload_name, root_gb)
root_vmdk_path = str(datastore.build_path(
upload_folder, root_vmdk_name))
if not self._check_if_folder_file_exists(
ds_browser,
datastore.ref, datastore.name,
upload_folder,
upload_name + ".%s.vmdk" % root_gb):
LOG.debug("Copying root disk of size %sGb", root_gb)
try:
# Ensure only a single thread extends the image at once.
# We do this by taking a lock on the name of the extended
# image. This allows multiple threads to create resized
# copies simultaneously, as long as they are different
# sizes. Threads attempting to create the same resized copy
# will be serialized, with only the first actually creating
# the copy.
#
# Note that the object is in a per-nova cache directory,
# so inter-nova locking is not a concern. Consequently we
# can safely use simple thread locks.
with lockutils.lock(root_vmdk_path,
lock_file_prefix='nova-vmware-image'):
if not self._check_if_folder_file_exists(
ds_browser,
datastore.ref, datastore.name,
upload_folder,
root_vmdk_name):
LOG.debug("Copying root disk of size %sGb",
root_gb)
copy_spec = self.get_copy_virtual_disk_spec(
client_factory, adapter_type, disk_type)
vm_util.copy_virtual_disk(self._session,
dc_info.ref,
uploaded_file_path,
root_vmdk_path,
copy_spec)
except Exception as e:
LOG.warning(_("Root disk file creation "
"failed - %s"), e)
if root_gb_in_kb > vmdk_file_size_in_kb:
self._extend_virtual_disk(instance, root_gb_in_kb,
root_vmdk_path,
dc_info.ref)
# Create a copy of the base image, ensuring we
# clean up on failure
try:
vm_util.copy_virtual_disk(self._session,
dc_info.ref,
uploaded_file_path,
root_vmdk_path,
copy_spec)
except Exception as e:
with excutils.save_and_reraise_exception():
LOG.error(_LE('Failed to copy cached '
'image %(source)s to '
'%(dest)s for resize: '
'%(error)s'),
{'source': uploaded_file_path,
'dest': root_vmdk_path,
'error': e.message})
try:
ds_util.file_delete(self._session,
root_vmdk_path,
dc_info.ref)
except error_util.FileNotFoundException:
# File was never created: cleanup not
# required
pass
# Resize the copy to the appropriate size. No need
# for cleanup up here, as _extend_virtual_disk
# already does it
if root_gb_in_kb > vmdk_file_size_in_kb:
self._extend_virtual_disk(instance,
root_gb_in_kb,
root_vmdk_path,
dc_info.ref)
# Attach the root disk to the VM.
if root_vmdk_path: