libvirt: Fix BlockDevice.wait_for_job when qemu reports no job

We were misinterpreting the return value of blockJobInfo. Most
immediately, we were expecting it to return an integer, which has
never been the case: it returns a dict, and raises an exception on
error. Because the implementation of abort_on_error always expected
an integer return value and never handled exceptions, abort_on_error
has always been a no-op, and exceptions have never been swallowed. As
this is also the most intuitive behaviour, we make it explicit by
removing abort_on_error. Any exception raised by blockJobInfo will
continue to propagate unhandled.
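
For illustration, the contract we now rely on looks like this (a
minimal sketch against the libvirt-python API; the connection URI,
domain name and disk are hypothetical):

    import libvirt

    conn = libvirt.open('qemu:///system')         # hypothetical URI
    dom = conn.lookupByName('instance-00000001')  # hypothetical domain

    # blockJobInfo() never returns an integer error code. Failures are
    # raised as libvirt.libvirtError; on success it returns a dict,
    # which is empty when no job exists for the disk.
    try:
        info = dom.blockJobInfo('vda', flags=0)
    except libvirt.libvirtError:
        raise  # propagates, as the reworked code now makes explicit

    if info:
        print('type=%(type)d cur=%(cur)d end=%(end)d' % info)
    else:
        print('no block job on vda')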

We were obfuscating the return value indicating that the job did not
exist, {}, by populating a BlockDeviceJobInfo with fake values. We
de-obfuscate this by returning None instead, which is unambiguous.

wait_for_job() was misnamed, as it does not wait. We rename it to
is_job_complete() to be less confusing. Note that the return value is
inverted: wait_for_job() returned True while the job was still in
progress, whereas is_job_complete() returns True once it has finished.
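
The caller-side change is mechanical; a self-contained sketch of the
polling loop (FakeDev is a stand-in for BlockDevice, for illustration
only; the real loops appear in the driver.py hunks below):

    import time

    class FakeDev(object):
        """Stand-in for BlockDevice, completing after three polls."""
        def __init__(self, polls):
            self._polls = polls

        def is_job_complete(self):
            self._polls -= 1
            return self._polls <= 0

    dev = FakeDev(polls=3)

    # Before: while dev.wait_for_job(): time.sleep(0.5)
    # After: the sense of the loop condition is inverted.
    while not dev.is_job_complete():
        time.sleep(0.5)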

After discussion with Eric Blake of the libvirt team (see review
comments: https://review.openstack.org/#/c/375652/), we are now
confident asserting that if no job exists then it has completed
(although we are still not sure that it succeeded). Consequently we
remove the wait_for_job_clean parameter, and always assume that no job
means it has completed. Previously this was implicit because no job
meant a defaulted BlockDeviceJobInfo.job value of 0.
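
To see why "no job" previously only looked finished to callers passing
wait_for_job_clean=True, here is the old decision condensed from the
removed guest.py logic shown below:

    # Old behaviour, condensed: the {} from blockJobInfo() was padded
    # with defaults, so "no job" was only detectable via job == 0.
    status = {}  # qemu reports no job
    job = status.get('type', 0)
    cur = status.get('cur', 0)
    end = status.get('end', 0)

    job_ended_clean = (job == 0)                   # wait_for_job_clean=True
    job_ended_default = (end != 0 and cur == end)  # default path: never True

    # The default path would therefore poll forever once the job vanished.
    assert job_ended_clean and not job_ended_default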

Co-authored-by: Sławek Kapłoński <slawek@kaplonski.pl>
Closes-Bug: #1627134
Change-Id: I2d0daa32b1d37fa60412ad7a374ee38cebdeb579
(cherry picked from commit 0f4bd24166)
Author:    Matthew Booth, 2016-09-28 16:44:41 +01:00
Committer: Lee Yarwood
Parent:    99f8a3c4e9
Commit:    a021a72dda

4 changed files with 144 additions and 84 deletions

nova/tests/unit/virt/libvirt/test_driver.py

@@ -14461,7 +14461,12 @@ class LibvirtConnTestCase(test.NoDBTestCase):
         mock_dom.XMLDesc.return_value = xmldoc
         mock_dom.isPersistent.return_value = True
-        mock_dom.blockJobInfo.return_value = {'cur': 100, 'end': 100}
+        mock_dom.blockJobInfo.return_value = {
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 100,
+            'end': 100
+        }
         drvr._swap_volume(guest, srcfile, dstfile, 1)
@@ -17570,9 +17575,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
         domain.blockRebase('vda', 'snap.img', 0, flags=0)
-        domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1, 'end': 1000})
-        domain.blockJobInfo('vda', flags=0).AndReturn(
-            {'cur': 1000, 'end': 1000})
+        domain.blockJobInfo('vda', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1,
+            'end': 1000})
+        domain.blockJobInfo('vda', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1000,
+            'end': 1000})
         self.mox.ReplayAll()
@@ -17605,9 +17617,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
         domain.blockRebase('vda', 'snap.img', 0,
                            flags=fakelibvirt.VIR_DOMAIN_BLOCK_REBASE_RELATIVE)
-        domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1, 'end': 1000})
-        domain.blockJobInfo('vda', flags=0).AndReturn(
-            {'cur': 1000, 'end': 1000})
+        domain.blockJobInfo('vda', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1,
+            'end': 1000})
+        domain.blockJobInfo('vda', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1000,
+            'end': 1000})
         self.mox.ReplayAll()
@@ -17762,9 +17781,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
         domain.blockCommit('vda', 'other-snap.img', 'snap.img', 0,
                            flags=fakelibvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE)
-        domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1, 'end': 1000})
-        domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1000,
-                                                       'end': 1000})
+        domain.blockJobInfo('vda', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1,
+            'end': 1000})
+        domain.blockJobInfo('vda', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1000,
+            'end': 1000})
         self.mox.ReplayAll()
@@ -17789,7 +17815,11 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
                               return_value=guest),
             mock.patch.object(domain, 'blockRebase'),
             mock.patch.object(domain, 'blockJobInfo',
-                              return_value={'cur': 1000, 'end': 1000})
+                              return_value={
+                                  'type': 4,  # See virDomainBlockJobType enum
+                                  'bandwidth': 0,
+                                  'cur': 1000,
+                                  'end': 1000})
         ) as (mock_xmldesc, mock_get_guest,
               mock_rebase, mock_job_info):
@@ -17819,7 +17849,11 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
                               return_value=guest),
             mock.patch.object(domain, 'blockRebase'),
             mock.patch.object(domain, 'blockJobInfo',
-                              return_value={'cur': 1000, 'end': 1000})
+                              return_value={
+                                  'type': 0,
+                                  'bandwidth': 0,
+                                  'cur': 1000,
+                                  'end': 1000})
         ) as (mock_xmldesc, mock_get_guest,
               mock_rebase, mock_job_info):
@@ -17946,9 +17980,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
         domain.blockRebase('vdb', 'vdb[1]', 0, flags=0)
-        domain.blockJobInfo('vdb', flags=0).AndReturn({'cur': 1, 'end': 1000})
-        domain.blockJobInfo('vdb', flags=0).AndReturn(
-            {'cur': 1000, 'end': 1000})
+        domain.blockJobInfo('vdb', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1,
+            'end': 1000})
+        domain.blockJobInfo('vdb', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1000,
+            'end': 1000})
         self.mox.ReplayAll()
@@ -17986,9 +18027,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
         domain.blockRebase('vdb', 'vdb[1]', 0,
                            flags=fakelibvirt.VIR_DOMAIN_BLOCK_REBASE_RELATIVE)
-        domain.blockJobInfo('vdb', flags=0).AndReturn({'cur': 1, 'end': 1000})
-        domain.blockJobInfo('vdb', flags=0).AndReturn(
-            {'cur': 1000, 'end': 1000})
+        domain.blockJobInfo('vdb', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1,
+            'end': 1000})
+        domain.blockJobInfo('vdb', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1000,
+            'end': 1000})
         self.mox.ReplayAll()
@@ -18065,9 +18113,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
         domain.blockCommit('vdb', 'vdb[0]', 'vdb[1]', 0,
                            flags=fakelibvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE)
-        domain.blockJobInfo('vdb', flags=0).AndReturn({'cur': 1, 'end': 1000})
-        domain.blockJobInfo('vdb', flags=0).AndReturn(
-            {'cur': 1000, 'end': 1000})
+        domain.blockJobInfo('vdb', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1,
+            'end': 1000})
+        domain.blockJobInfo('vdb', flags=0).AndReturn({
+            'type': 0,
+            'bandwidth': 0,
+            'cur': 1000,
+            'end': 1000})
         self.mox.ReplayAll()

nova/tests/unit/virt/libvirt/test_guest.py

@@ -642,48 +642,42 @@ class GuestBlockTestCase(test.NoDBTestCase):
             'vda', "foo", "top", 0,
             flags=fakelibvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE)

-    def test_wait_for_job_cur_end_zeros(self):
+    def test_is_job_complete_cur_end_zeros(self):
         self.domain.blockJobInfo.return_value = {
             "type": 4,
             "bandwidth": 18,
             "cur": 0,
             "end": 0}
-        in_progress = self.gblock.wait_for_job()
-        self.assertTrue(in_progress)
+        is_complete = self.gblock.is_job_complete()
+        self.assertFalse(is_complete)

-    def test_wait_for_job_current_lower_than_end(self):
+    def test_is_job_complete_current_lower_than_end(self):
         self.domain.blockJobInfo.return_value = {
             "type": 4,
             "bandwidth": 18,
             "cur": 95,
             "end": 100}
-        in_progress = self.gblock.wait_for_job()
-        self.assertTrue(in_progress)
+        is_complete = self.gblock.is_job_complete()
+        self.assertFalse(is_complete)

-    def test_wait_for_job_finished(self):
+    def test_is_job_complete_finished(self):
         self.domain.blockJobInfo.return_value = {
             "type": 4,
             "bandwidth": 18,
             "cur": 100,
             "end": 100}
-        in_progress = self.gblock.wait_for_job()
-        self.assertFalse(in_progress)
+        is_complete = self.gblock.is_job_complete()
+        self.assertTrue(is_complete)

-    def test_wait_for_job_clean(self):
-        self.domain.blockJobInfo.return_value = {"type": 0}
-        in_progress = self.gblock.wait_for_job(wait_for_job_clean=True)
-        self.assertFalse(in_progress)
+    def test_is_job_complete_no_job(self):
+        self.domain.blockJobInfo.return_value = {}
+        is_complete = self.gblock.is_job_complete()
+        self.assertTrue(is_complete)

-    def test_wait_for_job_abort_on_error(self):
-        self.domain.blockJobInfo.return_value = -1
-        self.assertRaises(
-            exception.NovaException,
-            self.gblock.wait_for_job, abort_on_error=True)
-
-    def test_wait_for_job_ignore_on_error(self):
-        self.domain.blockJobInfo.return_value = -1
-        in_progress = self.gblock.wait_for_job()
-        self.assertFalse(in_progress)
+    def test_is_job_complete_exception(self):
+        self.domain.blockJobInfo.side_effect = fakelibvirt.libvirtError('fake')
+        self.assertRaises(fakelibvirt.libvirtError,
+                          self.gblock.is_job_complete)


 class JobInfoTestCase(test.NoDBTestCase):

nova/virt/libvirt/driver.py

@@ -1209,7 +1209,7 @@ class LibvirtDriver(driver.ComputeDriver):
             # allow writing to existing external volume file
             dev.rebase(new_path, copy=True, reuse_ext=True)

-            while dev.wait_for_job():
+            while not dev.is_job_complete():
                 time.sleep(0.5)

             dev.abort_job(pivot=True)
@@ -1217,7 +1217,7 @@ class LibvirtDriver(driver.ComputeDriver):
             # NOTE(alex_xu): domain.blockJobAbort isn't sync call. This
             # is bug in libvirt. So we need waiting for the pivot is
             # finished. libvirt bug #1119173
-            while dev.wait_for_job(wait_for_job_clean=True):
+            while not dev.is_job_complete():
                 time.sleep(0.5)
             dev.resize(resize_to * units.Gi / units.Ki)
         finally:
@@ -1710,7 +1710,7 @@ class LibvirtDriver(driver.ComputeDriver):
             # issue an abort once we have a complete copy.
             dev.rebase(disk_delta, copy=True, reuse_ext=True, shallow=True)

-            while dev.wait_for_job():
+            while not dev.is_job_complete():
                 time.sleep(0.5)

             dev.abort_job()
@@ -2116,7 +2116,7 @@ class LibvirtDriver(driver.ComputeDriver):
                 LOG.debug('blockRebase started successfully',
                           instance=instance)

-            while dev.wait_for_job(abort_on_error=True):
+            while not dev.is_job_complete():
                 LOG.debug('waiting for blockRebase job completion',
                           instance=instance)
                 time.sleep(0.5)
@@ -2177,7 +2177,7 @@ class LibvirtDriver(driver.ComputeDriver):
                 LOG.debug('blockCommit started successfully',
                           instance=instance)

-            while dev.wait_for_job(abort_on_error=True):
+            while not dev.is_job_complete():
                 LOG.debug('waiting for blockCommit job completion',
                           instance=instance)
                 time.sleep(0.5)

nova/virt/libvirt/guest.py

@@ -659,15 +659,24 @@ class BlockDevice(object):
     def get_job_info(self):
         """Returns information about job currently running

-        :returns: BlockDeviceJobInfo or None
+        :returns: BlockDeviceJobInfo, or None if no job exists
+        :raises: libvirt.libvirtError on error fetching block job info
         """
+        # libvirt's blockJobInfo() raises libvirt.libvirtError if there was an
+        # error. It returns {} if the job no longer exists, or a fully
+        # populated dict if the job exists.
         status = self._guest._domain.blockJobInfo(self._disk, flags=0)
-        if status != -1:
-            return BlockDeviceJobInfo(
-                job=status.get("type", 0),
-                bandwidth=status.get("bandwidth", 0),
-                cur=status.get("cur", 0),
-                end=status.get("end", 0))
+
+        # The job no longer exists
+        if not status:
+            return None
+
+        return BlockDeviceJobInfo(
+            job=status['type'],
+            bandwidth=status['bandwidth'],
+            cur=status['cur'],
+            end=status['end'])

     def rebase(self, base, shallow=False, reuse_ext=False,
                copy=False, relative=False):
@@ -701,39 +710,41 @@ class BlockDevice(object):
         """Resizes block device to Kib size."""
         self._guest._domain.blockResize(self._disk, size_kb)

-    def wait_for_job(self, abort_on_error=False, wait_for_job_clean=False):
-        """Wait for libvirt block job to complete.
+    def is_job_complete(self):
+        """Return True if the job is complete, False otherwise

         Libvirt may return either cur==end or an empty dict when
         the job is complete, depending on whether the job has been
         cleaned up by libvirt yet, or not.

         It can also return end=0 if qemu has not yet started the block
         operation.

-        :param abort_on_error: Whether to stop process and raise NovaException
-                               on error (default: False)
-        :param wait_for_job_clean: Whether to force wait to ensure job is
-                                   finished (see bug: RH Bugzilla#1119173)
-
-        :returns: True if still in progress
-                  False if completed
+        :returns: True if the job is complete, False otherwise
+        :raises: libvirt.libvirtError on error fetching block job info
         """
+        # NOTE(mdbooth): This method polls for block job completion. It returns
+        # true if either we get a status which indicates completion, or there
+        # is no longer a record of the job. Ideally this method and its
+        # callers would be rewritten to consume libvirt events from the job.
+        # This would provide a couple of advantages. Firstly, as it would no
+        # longer be polling it would notice completion immediately rather than
+        # at the next 0.5s check, and would also consume fewer resources.
+        # Secondly, with the current method we only know that 'no job'
+        # indicates completion. It does not necessarily indicate successful
+        # completion: the job could have failed, or been cancelled. When
+        # polling for block job info we have no way to detect this, so we
+        # assume success.
         status = self.get_job_info()
-        if not status:
-            if abort_on_error:
-                msg = _('libvirt error while requesting blockjob info.')
-                raise exception.NovaException(msg)
-            return False
-        if wait_for_job_clean:
-            job_ended = status.job == 0
-        else:
-            # NOTE(slaweq): because of bug in libvirt, which is described in
-            # http://www.redhat.com/archives/libvir-list/2016-September/msg00017.html
-            # if status.end == 0 job is not started yet so it is not finished
-            job_ended = status.end != 0 and status.cur == status.end
+
+        # If the job no longer exists, it is because it has completed
+        # NOTE(mdbooth): See comment above: it may not have succeeded.
+        if status is None:
+            return True

-        return not job_ended
+        # NOTE(slaweq): because of bug in libvirt, which is described in
+        # http://www.redhat.com/archives/libvir-list/2016-September/msg00017.html
+        # if status.end == 0 job is not started yet so it is not finished
+        # NOTE(mdbooth): The fix was committed upstream here:
+        # http://libvirt.org/git/?p=libvirt.git;a=commit;h=988218c
+        # The earliest tag which contains this commit is v2.3.0-rc1, so we
+        # should be able to remove this workaround when MIN_LIBVIRT_VERSION
+        # reaches 2.3.0, or we move to handling job events instead.
+        return status.end != 0 and status.cur == status.end


 class VCPUInfo(object):