Fix race conditions between imagebackend and imagecache

The race may occur in the situation:
* There is a base file that is not used for a long time enough
  to be removed by imagecache.
* imagebackend is provisioning a virtual disk from the base file.
* imagecache is removing the base file.

Then, the base file is removed even though it is about to be used.

To fix this, these changes are in this patch:

* A new function imagecache.refresh_timestamp(base_file) updates
  the owner and mtime of the base file with the lock dedicated
  to the base file.
* imagebacked calls refresh_timestamp(base_file) before provision
  a disk from the base file.
* imagecache.ImageCacheManager._remove_base_file(base_file) uses
  the same lock as used by refresh_timestamp()

Closes-Bug: #1256838
Change-Id: I7c897cf6071d87a2c4532fb3a73863d649d02782
This commit is contained in:
Arata Notsu 2013-12-03 18:27:26 +09:00
parent 7e4075301c
commit 2593469103
4 changed files with 96 additions and 28 deletions

View File

@ -29,6 +29,7 @@ from nova.tests import fake_processutils
from nova.tests.virt.libvirt import fake_libvirt_utils
from nova import unit
from nova.virt.libvirt import imagebackend
from nova.virt.libvirt import imagecache
CONF = cfg.CONF
@ -65,6 +66,7 @@ class _ImageTestCase(object):
fake_libvirt_utils))
def test_cache(self):
self.mox.StubOutWithMock(imagecache, 'refresh_timestamp')
self.mox.StubOutWithMock(os.path, 'exists')
if self.OLD_STYLE_INSTANCE_PATH:
os.path.exists(self.OLD_STYLE_INSTANCE_PATH).AndReturn(False)
@ -74,6 +76,7 @@ class _ImageTestCase(object):
fn(target=self.TEMPLATE_PATH)
self.mox.StubOutWithMock(imagebackend.fileutils, 'ensure_tree')
imagebackend.fileutils.ensure_tree(self.TEMPLATE_DIR)
imagecache.refresh_timestamp(self.TEMPLATE_PATH)
self.mox.ReplayAll()
image = self.image_class(self.INSTANCE, self.NAME)
@ -83,12 +86,14 @@ class _ImageTestCase(object):
self.mox.VerifyAll()
def test_cache_image_exists(self):
self.mox.StubOutWithMock(imagecache, 'refresh_timestamp')
self.mox.StubOutWithMock(os.path, 'exists')
if self.OLD_STYLE_INSTANCE_PATH:
os.path.exists(self.OLD_STYLE_INSTANCE_PATH).AndReturn(False)
os.path.exists(self.TEMPLATE_DIR).AndReturn(True)
os.path.exists(self.PATH).AndReturn(True)
os.path.exists(self.TEMPLATE_PATH).AndReturn(True)
imagecache.refresh_timestamp(self.TEMPLATE_PATH)
self.mox.ReplayAll()
image = self.image_class(self.INSTANCE, self.NAME)
@ -97,11 +102,13 @@ class _ImageTestCase(object):
self.mox.VerifyAll()
def test_cache_base_dir_exists(self):
self.mox.StubOutWithMock(imagecache, 'refresh_timestamp')
self.mox.StubOutWithMock(os.path, 'exists')
if self.OLD_STYLE_INSTANCE_PATH:
os.path.exists(self.OLD_STYLE_INSTANCE_PATH).AndReturn(False)
os.path.exists(self.TEMPLATE_DIR).AndReturn(True)
os.path.exists(self.PATH).AndReturn(False)
imagecache.refresh_timestamp(self.TEMPLATE_PATH)
fn = self.mox.CreateMockAnything()
fn(target=self.TEMPLATE_PATH)
self.mox.StubOutWithMock(imagebackend.fileutils, 'ensure_tree')
@ -114,11 +121,13 @@ class _ImageTestCase(object):
self.mox.VerifyAll()
def test_cache_template_exists(self):
self.mox.StubOutWithMock(imagecache, 'refresh_timestamp')
self.mox.StubOutWithMock(os.path, 'exists')
if self.OLD_STYLE_INSTANCE_PATH:
os.path.exists(self.OLD_STYLE_INSTANCE_PATH).AndReturn(False)
os.path.exists(self.TEMPLATE_DIR).AndReturn(True)
os.path.exists(self.PATH).AndReturn(False)
imagecache.refresh_timestamp(self.TEMPLATE_PATH)
fn = self.mox.CreateMockAnything()
fn(target=self.TEMPLATE_PATH)
self.mox.ReplayAll()
@ -141,6 +150,7 @@ class _ImageTestCase(object):
self.stubs.Set(os.path, 'exists', lambda _: True)
self.stubs.Set(os, 'access', lambda p, w: True)
self.stubs.Set(imagecache, 'refresh_timestamp', lambda _: None)
# Call twice to verify testing fallocate is only called once.
image.cache(fake_fetch, self.TEMPLATE_PATH, self.SIZE)
@ -165,6 +175,7 @@ class _ImageTestCase(object):
self.stubs.Set(image, '_can_fallocate', lambda: True)
self.stubs.Set(os.path, 'exists', lambda _: True)
self.stubs.Set(os, 'access', lambda p, w: False)
self.stubs.Set(imagecache, 'refresh_timestamp', lambda _: None)
# Testing fallocate is only called when user has write access.
image.cache(fake_fetch, self.TEMPLATE_PATH, self.SIZE)
@ -502,6 +513,7 @@ class LvmTestCase(_ImageTestCase, test.NoDBTestCase):
self.stubs.Set(os.path, 'exists', lambda _: True)
self.stubs.Set(image, 'check_image_exists', lambda: True)
self.stubs.Set(imagecache, 'refresh_timestamp', lambda _: None)
image.cache(fake_fetch, self.TEMPLATE_PATH, self.SIZE)
@ -537,8 +549,10 @@ class RbdTestCase(_ImageTestCase, test.NoDBTestCase):
self.mox.StubOutWithMock(os.path, 'exists')
self.mox.StubOutWithMock(image, 'check_image_exists')
self.mox.StubOutWithMock(imagecache, 'refresh_timestamp')
os.path.exists(self.TEMPLATE_DIR).AndReturn(False)
image.check_image_exists().AndReturn(False)
imagecache.refresh_timestamp(self.TEMPLATE_PATH)
fn = self.mox.CreateMockAnything()
fn(target=self.TEMPLATE_PATH)
self.mox.StubOutWithMock(imagebackend.fileutils, 'ensure_tree')
@ -555,8 +569,10 @@ class RbdTestCase(_ImageTestCase, test.NoDBTestCase):
self.mox.StubOutWithMock(os.path, 'exists')
self.mox.StubOutWithMock(image, 'check_image_exists')
self.mox.StubOutWithMock(imagecache, 'refresh_timestamp')
os.path.exists(self.TEMPLATE_DIR).AndReturn(True)
image.check_image_exists().AndReturn(False)
imagecache.refresh_timestamp(self.TEMPLATE_PATH)
fn = self.mox.CreateMockAnything()
fn(target=self.TEMPLATE_PATH)
self.mox.StubOutWithMock(imagebackend.fileutils, 'ensure_tree')
@ -572,9 +588,11 @@ class RbdTestCase(_ImageTestCase, test.NoDBTestCase):
self.mox.StubOutWithMock(os.path, 'exists')
self.mox.StubOutWithMock(image, 'check_image_exists')
self.mox.StubOutWithMock(imagecache, 'refresh_timestamp')
os.path.exists(self.TEMPLATE_DIR).AndReturn(True)
image.check_image_exists().AndReturn(True)
os.path.exists(self.TEMPLATE_PATH).AndReturn(True)
imagecache.refresh_timestamp(self.TEMPLATE_PATH)
self.mox.ReplayAll()
image.cache(None, self.TEMPLATE)
@ -586,8 +604,10 @@ class RbdTestCase(_ImageTestCase, test.NoDBTestCase):
self.mox.StubOutWithMock(os.path, 'exists')
self.mox.StubOutWithMock(image, 'check_image_exists')
self.mox.StubOutWithMock(imagecache, 'refresh_timestamp')
os.path.exists(self.TEMPLATE_DIR).AndReturn(True)
image.check_image_exists().AndReturn(False)
imagecache.refresh_timestamp(self.TEMPLATE_PATH)
fn = self.mox.CreateMockAnything()
fn(target=self.TEMPLATE_PATH)
self.mox.ReplayAll()
@ -635,6 +655,7 @@ class RbdTestCase(_ImageTestCase, test.NoDBTestCase):
self.stubs.Set(os.path, 'exists', lambda _: True)
self.stubs.Set(image, 'check_image_exists', lambda: True)
self.stubs.Set(imagecache, 'refresh_timestamp', lambda _: None)
image.cache(fake_fetch, self.TEMPLATE_PATH, self.SIZE)

View File

@ -655,6 +655,12 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
self.stubs.Set(os.path, 'getmtime', lambda x: getmtime(x))
# Fake decorator for lock
def passthru(f):
return f
self.stubs.Set(imagecache, '_lock_for_refresh', lambda _: passthru)
# Make sure we don't accidentally remove a real file
orig_remove = os.remove

View File

@ -34,6 +34,7 @@ from nova import utils
from nova.virt.disk import api as disk
from nova.virt import images
from nova.virt.libvirt import config as vconfig
from nova.virt.libvirt import imagecache
from nova.virt.libvirt import utils as libvirt_utils
@ -181,7 +182,7 @@ class Image(object):
if not os.path.exists(base_dir):
fileutils.ensure_tree(base_dir)
base = os.path.join(base_dir, filename)
imagecache.refresh_timestamp(base)
if not self.check_image_exists() or not os.path.exists(base):
self.create_image(fetch_func_sync, base, size,
*args, **kwargs)

View File

@ -238,10 +238,33 @@ def write_stored_checksum(target):
write_stored_info(target, field='sha1', value=_hash_file(target))
def _get_lock_path():
return os.path.join(CONF.instances_path, 'locks')
def _lock_for_refresh(base_file):
lock_name = 'refresh-%s' % os.path.split(base_file)[-1]
return utils.synchronized(lock_name, external=True,
lock_path=_get_lock_path())
def refresh_timestamp(base_file):
"""Update age of a file not to removed from the cache."""
@_lock_for_refresh(base_file)
def inner_refresh_timestamp():
if os.path.exists(base_file):
virtutils.chown(base_file, os.getuid())
os.utime(base_file, None)
if os.path.exists(base_file):
inner_refresh_timestamp()
class ImageCacheManager(imagecache.ImageCacheManager):
def __init__(self):
super(ImageCacheManager, self).__init__()
self.lock_path = os.path.join(CONF.instances_path, 'locks')
self.lock_path = _get_lock_path()
self._reset_state()
def _reset_state(self):
@ -426,33 +449,50 @@ class ImageCacheManager(imagecache.ImageCacheManager):
Returns nothing.
"""
# TODO(arata): If the lock is already owned by other process
# (or thread), it means the process is refreshing or removing
# the base file. Then, this process has nothing to do after
# the lock is acquired, because the base file is already
# refreshed (and should not be removed) or removed.
# So, unless the lock is immediately acquired, this method
# should return immediately instead of blocking meaninglessly.
@_lock_for_refresh(base_file)
def inner_remove_base_file():
if not os.path.exists(base_file):
LOG.debug(_('Cannot remove %(base_file)s, it does not exist'),
base_file)
return
mtime = os.path.getmtime(base_file)
age = time.time() - mtime
maxage = CONF.libvirt.remove_unused_resized_minimum_age_seconds
if base_file in self.originals:
maxage = CONF.remove_unused_original_minimum_age_seconds
if age < maxage:
LOG.info(_('Base file too young to remove: %s'),
base_file)
else:
LOG.info(_('Removing base file: %s'), base_file)
try:
os.remove(base_file)
signature = get_info_filename(base_file)
if os.path.exists(signature):
os.remove(signature)
except OSError as e:
LOG.error(_('Failed to remove %(base_file)s, '
'error was %(error)s'),
{'base_file': base_file,
'error': e})
if not os.path.exists(base_file):
LOG.debug(_('Cannot remove %(base_file)s, it does not exist'),
base_file)
return
mtime = os.path.getmtime(base_file)
age = time.time() - mtime
maxage = CONF.libvirt.remove_unused_resized_minimum_age_seconds
if base_file in self.originals:
maxage = CONF.remove_unused_original_minimum_age_seconds
if age < maxage:
LOG.info(_('Base file too young to remove: %s'),
base_file)
else:
LOG.info(_('Removing base file: %s'), base_file)
try:
os.remove(base_file)
signature = get_info_filename(base_file)
if os.path.exists(signature):
os.remove(signature)
except OSError as e:
LOG.error(_('Failed to remove %(base_file)s, '
'error was %(error)s'),
{'base_file': base_file,
'error': e})
inner_remove_base_file()
def _handle_base_image(self, img_id, base_file):
"""Handle the checks for a single base image."""
@ -519,9 +559,9 @@ class ImageCacheManager(imagecache.ImageCacheManager):
'use'),
{'id': img_id,
'base_file': base_file})
if os.path.exists(base_file):
virtutils.chown(base_file, os.getuid())
os.utime(base_file, None)
# TODO(arata): Should not block to acquire the lock here
# as well in _remove_base_file()
refresh_timestamp(base_file)
def _age_and_verify_cached_images(self, context, all_instances, base_dir):
LOG.debug(_('Verify base images'))