libvirt: Race condition leads to instance in error

ImageCacheManager deletes base image while image backend is copying
image to the instance path leading instance to go in the error state.

Acquired lock before removing image from cache. If libvirt is copying
image to the instance path, image cache manager won't be able to remove
it until libvirt finishes copying image completely.

Closes-Bug: 1256838
Closes-Bug: 1470437
Co-Authored-By: Michael Still <mikal@stillhq.com>
Depends-On: I337ce28e2fc516c91bec61ca3639ebff0029ad49
Change-Id: I376cc951922c338669fdf3f83da83e0d3cea1532
This commit is contained in:
Ankit Agrawal 2015-08-10 16:27:57 +10:00
parent 0d8a3d90fb
commit ec9d5e375e
9 changed files with 88 additions and 31 deletions

View File

@ -246,3 +246,6 @@ ploop: CommandFilter, ploop, root
# nova/virt/libvirt/utils.py: 'xend', 'status'
xend: CommandFilter, xend, root
# nova/virt/libvirt/utils.py:
touch: CommandFilter, touch, root

View File

@ -77,6 +77,10 @@ def chown(path, owner):
pass
def update_mtime(path):
pass
def extract_snapshot(disk_path, source_fmt, out_path, dest_fmt):
files[out_path] = ''

View File

@ -20,6 +20,7 @@ import os
import time
import mock
from oslo_concurrency import lockutils
from oslo_concurrency import processutils
from oslo_config import cfg
from oslo_log import formatters
@ -461,34 +462,32 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
[fname])
self.assertEqual(image_cache_manager.corrupt_base_files, [])
def test_handle_base_image_used(self):
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
@mock.patch.object(libvirt_utils, 'update_mtime')
def test_handle_base_image_used(self, mock_mtime):
img = '123'
with self._make_base_file() as fname:
os.utime(fname, (-1, time.time() - 3601))
image_cache_manager = imagecache.ImageCacheManager()
image_cache_manager.unexplained_images = [fname]
image_cache_manager.used_images = {'123': (1, 0, ['banana-42'])}
image_cache_manager._handle_base_image(img, fname)
mock_mtime.assert_called_once_with(fname)
self.assertEqual(image_cache_manager.unexplained_images, [])
self.assertEqual(image_cache_manager.removable_base_files, [])
self.assertEqual(image_cache_manager.corrupt_base_files, [])
def test_handle_base_image_used_remotely(self):
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
@mock.patch.object(libvirt_utils, 'update_mtime')
def test_handle_base_image_used_remotely(self, mock_mtime):
img = '123'
with self._make_base_file() as fname:
os.utime(fname, (-1, time.time() - 3601))
image_cache_manager = imagecache.ImageCacheManager()
image_cache_manager.unexplained_images = [fname]
image_cache_manager.used_images = {'123': (0, 1, ['banana-42'])}
image_cache_manager._handle_base_image(img, fname)
mock_mtime.assert_called_once_with(fname)
self.assertEqual(image_cache_manager.unexplained_images, [])
self.assertEqual(image_cache_manager.removable_base_files, [])
self.assertEqual(image_cache_manager.corrupt_base_files, [])
@ -527,9 +526,9 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
self.assertEqual(image_cache_manager.removable_base_files, [])
self.assertEqual(image_cache_manager.corrupt_base_files, [])
def test_handle_base_image_checksum_fails(self):
@mock.patch.object(libvirt_utils, 'update_mtime')
def test_handle_base_image_checksum_fails(self, mock_mtime):
self.flags(checksum_base_images=True, group='libvirt')
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
img = '123'
@ -546,12 +545,15 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
image_cache_manager.used_images = {'123': (1, 0, ['banana-42'])}
image_cache_manager._handle_base_image(img, fname)
mock_mtime.assert_called_once_with(fname)
self.assertEqual(image_cache_manager.unexplained_images, [])
self.assertEqual(image_cache_manager.removable_base_files, [])
self.assertEqual(image_cache_manager.corrupt_base_files,
[fname])
def test_verify_base_images(self):
@mock.patch.object(libvirt_utils, 'update_mtime')
@mock.patch.object(lockutils, 'external_lock')
def test_verify_base_images(self, mock_lock, mock_mtime):
hashed_1 = '356a192b7913b04c54574d18c28d46e6395428ab'
hashed_21 = '472b07b9fcf2c2451e8781e944bf5f77cd8457c8'
hashed_22 = '12c6fc06c99a462375eeb3f43dfd832b08ca9e17'
@ -607,11 +609,6 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
self.stub_out('os.path.exists', lambda x: exists(x))
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
# We need to stub utime as well
self.stub_out('os.utime', lambda x, y: None)
# Fake up some instances in the instances directory
orig_listdir = os.listdir
@ -857,13 +854,13 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
expect_set = set(['swap_123', 'swap_456'])
self.assertEqual(image_cache_manager.back_swap_images, expect_set)
@mock.patch.object(libvirt_utils, 'chown')
@mock.patch.object(lockutils, 'external_lock')
@mock.patch.object(libvirt_utils, 'update_mtime')
@mock.patch('os.path.exists', return_value=True)
@mock.patch('os.utime')
@mock.patch('os.path.getmtime')
@mock.patch('os.remove')
def test_age_and_verify_swap_images(self, mock_remove, mock_getmtime,
mock_utime, mock_exist, mock_chown):
mock_exist, mock_mtime, mock_lock):
image_cache_manager = imagecache.ImageCacheManager()
expected_remove = set()
expected_exist = set(['swap_128', 'swap_256'])
@ -894,6 +891,20 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
self.assertIn('swap_128', expected_exist)
self.assertIn('swap_256', expected_remove)
@mock.patch.object(utils, 'synchronized')
@mock.patch.object(imagecache.ImageCacheManager, '_get_age_of_file',
return_value=(True, 100))
def test_lock_acquired_on_removing_old_enough_files(self, mock_get_age,
mock_synchronized):
base_file = '/tmp_age_test'
lock_path = os.path.join(CONF.instances_path, 'locks')
lock_file = os.path.split(base_file)[-1]
image_cache_manager = imagecache.ImageCacheManager()
image_cache_manager._remove_old_enough_file(
base_file, 60, remove_sig=False, remove_lock=False)
mock_synchronized.assert_called_once_with(lock_file, external=True,
lock_path=lock_path)
class VerifyChecksumTestCase(test.NoDBTestCase):

View File

@ -25,7 +25,7 @@ from nova.virt import images
class QemuTestCase(test.NoDBTestCase):
def test_qemu_info_with_bad_path(self):
self.assertRaises(exception.InvalidDiskInfo,
self.assertRaises(exception.DiskNotFound,
images.qemu_img_info,
'/path/that/does/not/exist')

View File

@ -54,8 +54,7 @@ def qemu_img_info(path, format=None):
CONF.import_opt('images_type', 'nova.virt.libvirt.imagebackend',
group='libvirt')
if not os.path.exists(path) and CONF.libvirt.images_type != 'rbd':
msg = (_("Path does not exist %(path)s") % {'path': path})
raise exception.InvalidDiskInfo(reason=msg)
raise exception.DiskNotFound(location=path)
try:
cmd = ('env', 'LC_ALL=C', 'LANG=C', 'qemu-img', 'info', path)

View File

@ -531,10 +531,15 @@ class Raw(Image):
else:
if not os.path.exists(base):
prepare_template(target=base, max_size=size, *args, **kwargs)
# NOTE(mikal): Update the mtime of the base file so the image
# cache manager knows it is in use.
libvirt_utils.update_mtime(base)
self.verify_base_size(base, size)
if not os.path.exists(self.path):
with fileutils.remove_path_on_error(self.path):
copy_raw_image(base, self.path, size)
self.correct_format()
def resize_image(self, size):
@ -584,6 +589,10 @@ class Qcow2(Image):
# Download the unmodified base image unless we already have a copy.
if not os.path.exists(base):
prepare_template(target=base, max_size=size, *args, **kwargs)
# NOTE(ankit): Update the mtime of the base file so the image
# cache manager knows it is in use.
libvirt_utils.update_mtime(base)
self.verify_base_size(base, size)
legacy_backing_size = None

View File

@ -449,10 +449,18 @@ class ImageCacheManager(imagecache.ImageCacheManager):
if not exists:
return
if age < maxage:
LOG.info(_LI('Base or swap file too young to remove: %s'),
base_file)
else:
lock_file = os.path.split(base_file)[-1]
@utils.synchronized(lock_file, external=True,
lock_path=self.lock_path)
def _inner_remove_old_enough_file():
# NOTE(mikal): recheck that the file is old enough, as a new
# user of the file might have come along while we were waiting
# for the lock
exists, age = self._get_age_of_file(base_file)
if not exists or age < maxage:
return
LOG.info(_LI('Removing base or swap file: %s'), base_file)
try:
os.remove(base_file)
@ -466,6 +474,11 @@ class ImageCacheManager(imagecache.ImageCacheManager):
{'base_file': base_file,
'error': e})
if age < maxage:
LOG.info(_LI('Base or swap file too young to remove: %s'),
base_file)
else:
_inner_remove_old_enough_file()
if remove_lock:
try:
# NOTE(jichenjc) The lock file will be constructed first
@ -473,7 +486,6 @@ class ImageCacheManager(imagecache.ImageCacheManager):
# like nova-9e881789030568a317fad9daae82c5b1c65e0d4a
# or nova-03d8e206-6500-4d91-b47d-ee74897f9b4e
# according to the original file name
lock_file = os.path.split(base_file)[-1]
lockutils.remove_external_lock_file(lock_file,
lock_file_prefix='nova-', lock_path=self.lock_path)
except OSError as e:
@ -562,8 +574,7 @@ class ImageCacheManager(imagecache.ImageCacheManager):
{'id': img_id,
'base_file': base_file})
if os.path.exists(base_file):
libvirt_utils.chown(base_file, os.getuid())
os.utime(base_file, None)
libvirt_utils.update_mtime(base_file)
def _age_and_verify_swap_images(self, context, base_dir):
LOG.debug('Verify swap images')
@ -571,8 +582,7 @@ class ImageCacheManager(imagecache.ImageCacheManager):
for ent in self.back_swap_images:
base_file = os.path.join(base_dir, ent)
if ent in self.used_swap_images and os.path.exists(base_file):
libvirt_utils.chown(base_file, os.getuid())
os.utime(base_file, None)
libvirt_utils.update_mtime(base_file)
elif self.remove_unused_base_images:
self._remove_swap_file(base_file)

View File

@ -248,6 +248,14 @@ def chown(path, owner):
execute('chown', owner, path, run_as_root=True)
def update_mtime(path):
"""Touch a file without being the owner.
:param path: File bump the mtime on
"""
execute('touch', '-c', path, run_as_root=True)
def _id_map_to_config(id_map):
return "%s:%s:%s" % (id_map.start, id_map.target, id_map.count)

View File

@ -0,0 +1,13 @@
---
upgrade:
- Upgrade the rootwrap configuration for the compute service,
so that patches requiring new rootwrap configuration can be
tested with grenade.
fixes:
- In a race condition if base image is deleted by ImageCacheManager
while imagebackend is copying the image to instance path, then the
instance goes in to error state. In this case when libvirt has
changed the base file ownership to libvirt-qemu while imagebackend
is copying the image, then we get permission denied error on updating
the file access time using os.utime. Fixed this issue by updating the
base file access time with root user privileges using 'touch' command.