VMware: Fix race in spawn() when resizing cached image
spawn() guards against multiple threads simultaneously attempting to
cache the same image, but it wasn't guarding against them
simultanously trying to create a resized copy in the cache. Attempting
to create a large number of images simultaneously of an uncached image
would result in a race to create the resized image. This resulted in 2
classes of failed instance:
1. Instances whose disk was a linked clone of a copy which had been
subsequently overwritten. These were corrupt.
2. Instances whose spawn() failed in ExtendVirtualDisk_Task due to a
locked image.
This patch creates a Nova-local lock for the resized image. The image
is in a per-Nova directory on the datastore, so inter-Nova locking is
not a concern. The lock guards both testing for the existence of the
image, and its creation. Therefore when multiple processes race, only
1 will create the resized copy, and all others will find and use it.
In normal usage this will add the overhead of an additional
uncontended local lock creation and deletion in spawn().
In wrapping this code in a lock, we also make certain that any failure
to create the resized image is appropriately cleaned up. Otherwise
subsequent users will attempt to use a corrupt copy.
Conflicts:
nova/virt/vmwareapi/vmops.py
Required changes:
datastore.build_path -> ds_util.build_datastore_path
No datastore object
vm_util.copy_virtual_disk -> _copy_virtual_disk
self.fake_image_uuid -> 'fake_image_uuid'
removed use of _LE() for log message
This change includes a test which depends on change
I2025bffa887582eaa9e9072d0400f90ca97d1898.
Change-Id: I3df3d614656e511c909b6c1837582c0d34bf84c6
Closes-bug: 1333587
(cherry picked from commit 994cdb234b
)
This commit is contained in:
parent
0142324b2f
commit
a8b52c05ed
|
@ -785,6 +785,120 @@ class VMwareAPIVMTestCase(test.NoDBTestCase):
|
|||
self.assertFalse(vmwareapi_fake.get_file(cached_image))
|
||||
self.assertFalse(vmwareapi_fake.get_file(tmp_file))
|
||||
|
||||
def test_spawn_disk_extend_failed_copy(self):
|
||||
# Spawn instance
|
||||
# copy for extend fails without creating a file
|
||||
#
|
||||
# Expect the copy error to be raised
|
||||
self.flags(use_linked_clone=True, group='vmware')
|
||||
self.wait_task = self.conn._session._wait_for_task
|
||||
self.call_method = self.conn._session._call_method
|
||||
|
||||
CopyError = error_util.FileFaultException
|
||||
|
||||
def fake_wait_for_task(task_ref):
|
||||
if task_ref == 'fake-copy-task':
|
||||
raise CopyError('Copy failed!')
|
||||
return self.wait_task(task_ref)
|
||||
|
||||
def fake_call_method(module, method, *args, **kwargs):
|
||||
if method == "CopyVirtualDisk_Task":
|
||||
return 'fake-copy-task'
|
||||
|
||||
return self.call_method(module, method, *args, **kwargs)
|
||||
|
||||
with contextlib.nested(
|
||||
mock.patch.object(self.conn._session, '_call_method',
|
||||
new=fake_call_method),
|
||||
mock.patch.object(self.conn._session, '_wait_for_task',
|
||||
new=fake_wait_for_task)):
|
||||
self.assertRaises(CopyError, self._create_vm)
|
||||
|
||||
def test_spawn_disk_extend_failed_partial_copy(self):
|
||||
# Spawn instance
|
||||
# Copy for extend fails, leaving a file behind
|
||||
#
|
||||
# Expect the file to be cleaned up
|
||||
# Expect the copy error to be raised
|
||||
self.flags(use_linked_clone=True, group='vmware')
|
||||
self.wait_task = self.conn._session._wait_for_task
|
||||
self.call_method = self.conn._session._call_method
|
||||
self.task_ref = None
|
||||
uuid = 'fake_image_uuid'
|
||||
cached_image = '[%s] vmware_base/%s/%s.80.vmdk' % (self.ds,
|
||||
uuid, uuid)
|
||||
|
||||
CopyError = error_util.FileFaultException
|
||||
|
||||
def fake_wait_for_task(task_ref):
|
||||
if task_ref == self.task_ref:
|
||||
self.task_ref = None
|
||||
self.assertTrue(vmwareapi_fake.get_file(cached_image))
|
||||
# N.B. We don't test for -flat here because real
|
||||
# CopyVirtualDisk_Task doesn't actually create it
|
||||
raise CopyError('Copy failed!')
|
||||
return self.wait_task(task_ref)
|
||||
|
||||
def fake_call_method(module, method, *args, **kwargs):
|
||||
task_ref = self.call_method(module, method, *args, **kwargs)
|
||||
if method == "CopyVirtualDisk_Task":
|
||||
self.task_ref = task_ref
|
||||
return task_ref
|
||||
|
||||
with contextlib.nested(
|
||||
mock.patch.object(self.conn._session, '_call_method',
|
||||
new=fake_call_method),
|
||||
mock.patch.object(self.conn._session, '_wait_for_task',
|
||||
new=fake_wait_for_task)):
|
||||
self.assertRaises(CopyError, self._create_vm)
|
||||
self.assertFalse(vmwareapi_fake.get_file(cached_image))
|
||||
|
||||
def test_spawn_disk_extend_failed_partial_copy_failed_cleanup(self):
|
||||
# Spawn instance
|
||||
# Copy for extend fails, leaves file behind
|
||||
# File cleanup fails
|
||||
#
|
||||
# Expect file to be left behind
|
||||
# Expect file cleanup error to be raised
|
||||
self.flags(use_linked_clone=True, group='vmware')
|
||||
self.wait_task = self.conn._session._wait_for_task
|
||||
self.call_method = self.conn._session._call_method
|
||||
self.task_ref = None
|
||||
uuid = 'fake_image_uuid'
|
||||
cached_image = '[%s] vmware_base/%s/%s.80.vmdk' % (self.ds,
|
||||
uuid, uuid)
|
||||
|
||||
CopyError = error_util.FileFaultException
|
||||
DeleteError = error_util.CannotDeleteFileException
|
||||
|
||||
def fake_wait_for_task(task_ref):
|
||||
if task_ref == self.task_ref:
|
||||
self.task_ref = None
|
||||
self.assertTrue(vmwareapi_fake.get_file(cached_image))
|
||||
# N.B. We don't test for -flat here because real
|
||||
# CopyVirtualDisk_Task doesn't actually create it
|
||||
raise CopyError('Copy failed!')
|
||||
elif task_ref == 'fake-delete-task':
|
||||
raise DeleteError('Delete failed!')
|
||||
return self.wait_task(task_ref)
|
||||
|
||||
def fake_call_method(module, method, *args, **kwargs):
|
||||
if method == "DeleteDatastoreFile_Task":
|
||||
return 'fake-delete-task'
|
||||
|
||||
task_ref = self.call_method(module, method, *args, **kwargs)
|
||||
if method == "CopyVirtualDisk_Task":
|
||||
self.task_ref = task_ref
|
||||
return task_ref
|
||||
|
||||
with contextlib.nested(
|
||||
mock.patch.object(self.conn._session, '_wait_for_task',
|
||||
new=fake_wait_for_task),
|
||||
mock.patch.object(self.conn._session, '_call_method',
|
||||
new=fake_call_method)):
|
||||
self.assertRaises(DeleteError, self._create_vm)
|
||||
self.assertTrue(vmwareapi_fake.get_file(cached_image))
|
||||
|
||||
def test_spawn_disk_invalid_disk_size(self):
|
||||
self.mox.StubOutWithMock(vmware_images, 'get_vmdk_size_and_properties')
|
||||
result = [82 * units.Gi,
|
||||
|
|
|
@ -570,21 +570,60 @@ class VMwareVMOps(object):
|
|||
root_gb)
|
||||
root_vmdk_path = ds_util.build_datastore_path(
|
||||
data_store_name, root_vmdk_name)
|
||||
if not self._check_if_folder_file_exists(ds_browser,
|
||||
data_store_ref, data_store_name,
|
||||
upload_folder,
|
||||
upload_name + ".%s.vmdk" % root_gb):
|
||||
LOG.debug(_("Copying root disk of size %sGb"), root_gb)
|
||||
try:
|
||||
_copy_virtual_disk(uploaded_file_path,
|
||||
root_vmdk_path)
|
||||
except Exception as e:
|
||||
LOG.warning(_("Root disk file creation "
|
||||
"failed - %s"), e)
|
||||
if root_gb_in_kb > vmdk_file_size_in_kb:
|
||||
self._extend_virtual_disk(instance, root_gb_in_kb,
|
||||
root_vmdk_path,
|
||||
dc_info.ref)
|
||||
|
||||
# Ensure only a single thread extends the image at once.
|
||||
# We do this by taking a lock on the name of the extended
|
||||
# image. This allows multiple threads to create resized
|
||||
# copies simultaneously, as long as they are different
|
||||
# sizes. Threads attempting to create the same resized copy
|
||||
# will be serialized, with only the first actually creating
|
||||
# the copy.
|
||||
#
|
||||
# Note that the object is in a per-nova cache directory,
|
||||
# so inter-nova locking is not a concern. Consequently we
|
||||
# can safely use simple thread locks.
|
||||
|
||||
with lockutils.lock(root_vmdk_path,
|
||||
lock_file_prefix='nova-vmware-image'):
|
||||
if not self._check_if_folder_file_exists(
|
||||
ds_browser,
|
||||
data_store_ref, data_store_name,
|
||||
upload_folder,
|
||||
upload_name + ".%s.vmdk" % root_gb):
|
||||
LOG.debug("Copying root disk of size %sGb",
|
||||
root_gb)
|
||||
|
||||
# Create a copy of the base image, ensuring we
|
||||
# clean up on failure
|
||||
try:
|
||||
_copy_virtual_disk(uploaded_file_path,
|
||||
root_vmdk_path)
|
||||
except Exception as e:
|
||||
with excutils.save_and_reraise_exception():
|
||||
LOG.error(_('Failed to copy cached '
|
||||
'image %(source)s to '
|
||||
'%(dest)s for resize: '
|
||||
'%(error)s'),
|
||||
{'source': uploaded_file_path,
|
||||
'dest': root_vmdk_path,
|
||||
'error': e.message})
|
||||
try:
|
||||
ds_util.file_delete(self._session,
|
||||
root_vmdk_path,
|
||||
dc_info.ref)
|
||||
except error_util.FileNotFoundException:
|
||||
# File was never created: cleanup not
|
||||
# required
|
||||
pass
|
||||
|
||||
# Resize the copy to the appropriate size. No need
|
||||
# for cleanup up here, as _extend_virtual_disk
|
||||
# already does it
|
||||
if root_gb_in_kb > vmdk_file_size_in_kb:
|
||||
self._extend_virtual_disk(instance,
|
||||
root_gb_in_kb,
|
||||
root_vmdk_path,
|
||||
dc_info.ref)
|
||||
|
||||
# Attach the root disk to the VM.
|
||||
if root_vmdk_path:
|
||||
|
|
Loading…
Reference in New Issue