libvirt: Fix BlockDevice.wait_for_job when qemu reports no job
We were misinterpreting the return value of blockJobInfo. Most
immediately we were expecting it to return an integer, which has never
been the case. blockJobInfo also raises an exception on error. Note
that the implementation of abort_on_error has always expected an
integer return value, and exceptions have never been handled, which
means that abort_on_error has always been a no-op, and exceptions have
never been swallowed. As this is also the most intuitive behaviour, we
make it explicit by removing abort_on_error. Any exception raised by
blockJobInfo will continue to propagate unhandled.
We were obfuscating the return value indicating that the job did not
exist, {}, by populating a BlockDeviceJobInfo with fake values. We
de-obfuscate this by returning None instead, which is unambiguous.
wait_for_job() was misnamed, as it does not wait. This is renamed to
is_job_complete() to be less confusing. Note that the logic is
reversed.
After discussion with Eric Blake of the libvirt team (see review
comments: https://review.openstack.org/#/c/375652/), we are now
confident asserting that if no job exists then it has completed
(although we are still not sure that it succeeded). Consequently we
remove the wait_for_job_clean parameter, and always assume that no job
means it has completed. Previously this was implicit because no job
meant a defaulted BlockDeviceJobInfo.job value of 0.
Co-authored-by: Sławek Kapłoński <slawek@kaplonski.pl>
Closes-Bug: #1627134
Change-Id: I2d0daa32b1d37fa60412ad7a374ee38cebdeb579
(cherry picked from commit 0f4bd24166)
This commit is contained in:
parent
99f8a3c4e9
commit
a021a72dda
|
@ -14461,7 +14461,12 @@ class LibvirtConnTestCase(test.NoDBTestCase):
|
|||
|
||||
mock_dom.XMLDesc.return_value = xmldoc
|
||||
mock_dom.isPersistent.return_value = True
|
||||
mock_dom.blockJobInfo.return_value = {'cur': 100, 'end': 100}
|
||||
mock_dom.blockJobInfo.return_value = {
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 100,
|
||||
'end': 100
|
||||
}
|
||||
|
||||
drvr._swap_volume(guest, srcfile, dstfile, 1)
|
||||
|
||||
|
@ -17570,9 +17575,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
|
|||
|
||||
domain.blockRebase('vda', 'snap.img', 0, flags=0)
|
||||
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1, 'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn(
|
||||
{'cur': 1000, 'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1,
|
||||
'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1000,
|
||||
'end': 1000})
|
||||
|
||||
self.mox.ReplayAll()
|
||||
|
||||
|
@ -17605,9 +17617,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
|
|||
domain.blockRebase('vda', 'snap.img', 0,
|
||||
flags=fakelibvirt.VIR_DOMAIN_BLOCK_REBASE_RELATIVE)
|
||||
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1, 'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn(
|
||||
{'cur': 1000, 'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1,
|
||||
'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1000,
|
||||
'end': 1000})
|
||||
|
||||
self.mox.ReplayAll()
|
||||
|
||||
|
@ -17762,9 +17781,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
|
|||
domain.blockCommit('vda', 'other-snap.img', 'snap.img', 0,
|
||||
flags=fakelibvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE)
|
||||
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1, 'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1000,
|
||||
'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1,
|
||||
'end': 1000})
|
||||
domain.blockJobInfo('vda', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1000,
|
||||
'end': 1000})
|
||||
|
||||
self.mox.ReplayAll()
|
||||
|
||||
|
@ -17789,7 +17815,11 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
|
|||
return_value=guest),
|
||||
mock.patch.object(domain, 'blockRebase'),
|
||||
mock.patch.object(domain, 'blockJobInfo',
|
||||
return_value={'cur': 1000, 'end': 1000})
|
||||
return_value={
|
||||
'type': 4, # See virDomainBlockJobType enum
|
||||
'bandwidth': 0,
|
||||
'cur': 1000,
|
||||
'end': 1000})
|
||||
) as (mock_xmldesc, mock_get_guest,
|
||||
mock_rebase, mock_job_info):
|
||||
|
||||
|
@ -17819,7 +17849,11 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
|
|||
return_value=guest),
|
||||
mock.patch.object(domain, 'blockRebase'),
|
||||
mock.patch.object(domain, 'blockJobInfo',
|
||||
return_value={'cur': 1000, 'end': 1000})
|
||||
return_value={
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1000,
|
||||
'end': 1000})
|
||||
) as (mock_xmldesc, mock_get_guest,
|
||||
mock_rebase, mock_job_info):
|
||||
|
||||
|
@ -17946,9 +17980,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
|
|||
|
||||
domain.blockRebase('vdb', 'vdb[1]', 0, flags=0)
|
||||
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({'cur': 1, 'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn(
|
||||
{'cur': 1000, 'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1,
|
||||
'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1000,
|
||||
'end': 1000})
|
||||
|
||||
self.mox.ReplayAll()
|
||||
|
||||
|
@ -17986,9 +18027,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
|
|||
domain.blockRebase('vdb', 'vdb[1]', 0,
|
||||
flags=fakelibvirt.VIR_DOMAIN_BLOCK_REBASE_RELATIVE)
|
||||
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({'cur': 1, 'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn(
|
||||
{'cur': 1000, 'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1,
|
||||
'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1000,
|
||||
'end': 1000})
|
||||
|
||||
self.mox.ReplayAll()
|
||||
|
||||
|
@ -18065,9 +18113,16 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase):
|
|||
domain.blockCommit('vdb', 'vdb[0]', 'vdb[1]', 0,
|
||||
flags=fakelibvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE)
|
||||
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({'cur': 1, 'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn(
|
||||
{'cur': 1000, 'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1,
|
||||
'end': 1000})
|
||||
domain.blockJobInfo('vdb', flags=0).AndReturn({
|
||||
'type': 0,
|
||||
'bandwidth': 0,
|
||||
'cur': 1000,
|
||||
'end': 1000})
|
||||
|
||||
self.mox.ReplayAll()
|
||||
|
||||
|
|
|
@ -642,48 +642,42 @@ class GuestBlockTestCase(test.NoDBTestCase):
|
|||
'vda', "foo", "top", 0,
|
||||
flags=fakelibvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE)
|
||||
|
||||
def test_wait_for_job_cur_end_zeros(self):
|
||||
def test_is_job_complete_cur_end_zeros(self):
|
||||
self.domain.blockJobInfo.return_value = {
|
||||
"type": 4,
|
||||
"bandwidth": 18,
|
||||
"cur": 0,
|
||||
"end": 0}
|
||||
in_progress = self.gblock.wait_for_job()
|
||||
self.assertTrue(in_progress)
|
||||
is_complete = self.gblock.is_job_complete()
|
||||
self.assertFalse(is_complete)
|
||||
|
||||
def test_wait_for_job_current_lower_than_end(self):
|
||||
def test_is_job_complete_current_lower_than_end(self):
|
||||
self.domain.blockJobInfo.return_value = {
|
||||
"type": 4,
|
||||
"bandwidth": 18,
|
||||
"cur": 95,
|
||||
"end": 100}
|
||||
in_progress = self.gblock.wait_for_job()
|
||||
self.assertTrue(in_progress)
|
||||
is_complete = self.gblock.is_job_complete()
|
||||
self.assertFalse(is_complete)
|
||||
|
||||
def test_wait_for_job_finished(self):
|
||||
def test_is_job_complete_finished(self):
|
||||
self.domain.blockJobInfo.return_value = {
|
||||
"type": 4,
|
||||
"bandwidth": 18,
|
||||
"cur": 100,
|
||||
"end": 100}
|
||||
in_progress = self.gblock.wait_for_job()
|
||||
self.assertFalse(in_progress)
|
||||
is_complete = self.gblock.is_job_complete()
|
||||
self.assertTrue(is_complete)
|
||||
|
||||
def test_wait_for_job_clean(self):
|
||||
self.domain.blockJobInfo.return_value = {"type": 0}
|
||||
in_progress = self.gblock.wait_for_job(wait_for_job_clean=True)
|
||||
self.assertFalse(in_progress)
|
||||
def test_is_job_complete_no_job(self):
|
||||
self.domain.blockJobInfo.return_value = {}
|
||||
is_complete = self.gblock.is_job_complete()
|
||||
self.assertTrue(is_complete)
|
||||
|
||||
def test_wait_for_job_abort_on_error(self):
|
||||
self.domain.blockJobInfo.return_value = -1
|
||||
self.assertRaises(
|
||||
exception.NovaException,
|
||||
self.gblock.wait_for_job, abort_on_error=True)
|
||||
|
||||
def test_wait_for_job_ignore_on_error(self):
|
||||
self.domain.blockJobInfo.return_value = -1
|
||||
in_progress = self.gblock.wait_for_job()
|
||||
self.assertFalse(in_progress)
|
||||
def test_is_job_complete_exception(self):
|
||||
self.domain.blockJobInfo.side_effect = fakelibvirt.libvirtError('fake')
|
||||
self.assertRaises(fakelibvirt.libvirtError,
|
||||
self.gblock.is_job_complete)
|
||||
|
||||
|
||||
class JobInfoTestCase(test.NoDBTestCase):
|
||||
|
|
|
@ -1209,7 +1209,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||
# allow writing to existing external volume file
|
||||
dev.rebase(new_path, copy=True, reuse_ext=True)
|
||||
|
||||
while dev.wait_for_job():
|
||||
while not dev.is_job_complete():
|
||||
time.sleep(0.5)
|
||||
|
||||
dev.abort_job(pivot=True)
|
||||
|
@ -1217,7 +1217,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||
# NOTE(alex_xu): domain.blockJobAbort isn't sync call. This
|
||||
# is bug in libvirt. So we need waiting for the pivot is
|
||||
# finished. libvirt bug #1119173
|
||||
while dev.wait_for_job(wait_for_job_clean=True):
|
||||
while not dev.is_job_complete():
|
||||
time.sleep(0.5)
|
||||
dev.resize(resize_to * units.Gi / units.Ki)
|
||||
finally:
|
||||
|
@ -1710,7 +1710,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||
# issue an abort once we have a complete copy.
|
||||
dev.rebase(disk_delta, copy=True, reuse_ext=True, shallow=True)
|
||||
|
||||
while dev.wait_for_job():
|
||||
while not dev.is_job_complete():
|
||||
time.sleep(0.5)
|
||||
|
||||
dev.abort_job()
|
||||
|
@ -2116,7 +2116,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||
LOG.debug('blockRebase started successfully',
|
||||
instance=instance)
|
||||
|
||||
while dev.wait_for_job(abort_on_error=True):
|
||||
while not dev.is_job_complete():
|
||||
LOG.debug('waiting for blockRebase job completion',
|
||||
instance=instance)
|
||||
time.sleep(0.5)
|
||||
|
@ -2177,7 +2177,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||
LOG.debug('blockCommit started successfully',
|
||||
instance=instance)
|
||||
|
||||
while dev.wait_for_job(abort_on_error=True):
|
||||
while not dev.is_job_complete():
|
||||
LOG.debug('waiting for blockCommit job completion',
|
||||
instance=instance)
|
||||
time.sleep(0.5)
|
||||
|
|
|
@ -659,15 +659,24 @@ class BlockDevice(object):
|
|||
def get_job_info(self):
|
||||
"""Returns information about job currently running
|
||||
|
||||
:returns: BlockDeviceJobInfo or None
|
||||
:returns: BlockDeviceJobInfo, or None if no job exists
|
||||
:raises: libvirt.libvirtError on error fetching block job info
|
||||
"""
|
||||
|
||||
# libvirt's blockJobInfo() raises libvirt.libvirtError if there was an
|
||||
# error. It returns {} if the job no longer exists, or a fully
|
||||
# populated dict if the job exists.
|
||||
status = self._guest._domain.blockJobInfo(self._disk, flags=0)
|
||||
if status != -1:
|
||||
return BlockDeviceJobInfo(
|
||||
job=status.get("type", 0),
|
||||
bandwidth=status.get("bandwidth", 0),
|
||||
cur=status.get("cur", 0),
|
||||
end=status.get("end", 0))
|
||||
|
||||
# The job no longer exists
|
||||
if not status:
|
||||
return None
|
||||
|
||||
return BlockDeviceJobInfo(
|
||||
job=status['type'],
|
||||
bandwidth=status['bandwidth'],
|
||||
cur=status['cur'],
|
||||
end=status['end'])
|
||||
|
||||
def rebase(self, base, shallow=False, reuse_ext=False,
|
||||
copy=False, relative=False):
|
||||
|
@ -701,39 +710,41 @@ class BlockDevice(object):
|
|||
"""Resizes block device to Kib size."""
|
||||
self._guest._domain.blockResize(self._disk, size_kb)
|
||||
|
||||
def wait_for_job(self, abort_on_error=False, wait_for_job_clean=False):
|
||||
"""Wait for libvirt block job to complete.
|
||||
def is_job_complete(self):
|
||||
"""Return True if the job is complete, False otherwise
|
||||
|
||||
Libvirt may return either cur==end or an empty dict when
|
||||
the job is complete, depending on whether the job has been
|
||||
cleaned up by libvirt yet, or not.
|
||||
It can also return end=0 if qemu has not yet started the block
|
||||
operation.
|
||||
|
||||
:param abort_on_error: Whether to stop process and raise NovaException
|
||||
on error (default: False)
|
||||
:param wait_for_job_clean: Whether to force wait to ensure job is
|
||||
finished (see bug: RH Bugzilla#1119173)
|
||||
|
||||
:returns: True if still in progress
|
||||
False if completed
|
||||
:returns: True if the job is complete, False otherwise
|
||||
:raises: libvirt.libvirtError on error fetching block job info
|
||||
"""
|
||||
# NOTE(mdbooth): This method polls for block job completion. It returns
|
||||
# true if either we get a status which indicates completion, or there
|
||||
# is no longer a record of the job. Ideally this method and its
|
||||
# callers would be rewritten to consume libvirt events from the job.
|
||||
# This would provide a couple of advantages. Firstly, as it would no
|
||||
# longer be polling it would notice completion immediately rather than
|
||||
# at the next 0.5s check, and would also consume fewer resources.
|
||||
# Secondly, with the current method we only know that 'no job'
|
||||
# indicates completion. It does not necessarily indicate successful
|
||||
# completion: the job could have failed, or been cancelled. When
|
||||
# polling for block job info we have no way to detect this, so we
|
||||
# assume success.
|
||||
|
||||
status = self.get_job_info()
|
||||
if not status:
|
||||
if abort_on_error:
|
||||
msg = _('libvirt error while requesting blockjob info.')
|
||||
raise exception.NovaException(msg)
|
||||
return False
|
||||
|
||||
if wait_for_job_clean:
|
||||
job_ended = status.job == 0
|
||||
else:
|
||||
# NOTE(slaweq): because of bug in libvirt, which is described in
|
||||
# http://www.redhat.com/archives/libvir-list/2016-September/msg00017.html
|
||||
# if status.end == 0 job is not started yet so it is not finished
|
||||
job_ended = status.end != 0 and status.cur == status.end
|
||||
# If the job no longer exists, it is because it has completed
|
||||
# NOTE(mdbooth): See comment above: it may not have succeeded.
|
||||
if status is None:
|
||||
return True
|
||||
|
||||
return not job_ended
|
||||
# NOTE(slaweq): because of bug in libvirt, which is described in
|
||||
# http://www.redhat.com/archives/libvir-list/2016-September/msg00017.html
|
||||
# if status.end == 0 job is not started yet so it is not finished
|
||||
# NOTE(mdbooth): The fix was committed upstream here:
|
||||
# http://libvirt.org/git/?p=libvirt.git;a=commit;h=988218c
|
||||
# The earliest tag which contains this commit is v2.3.0-rc1, so we
|
||||
# should be able to remove this workaround when MIN_LIBVIRT_VERSION
|
||||
# reaches 2.3.0, or we move to handling job events instead.
|
||||
return status.end != 0 and status.cur == status.end
|
||||
|
||||
|
||||
class VCPUInfo(object):
|
||||
|
|
Loading…
Reference in New Issue