Merge "libvirt: handle code=38 + sigkill (ebusy) in destroy()" into stable/kilo

This commit is contained in:
Jenkins 2015-07-06 15:29:25 +00:00 committed by Gerrit Code Review
commit b8c4f1bce3
2 changed files with 77 additions and 1 deletions

View File

@ -8366,6 +8366,54 @@ class LibvirtConnTestCase(test.NoDBTestCase):
self.assertRaises(fakelibvirt.libvirtError, conn._destroy,
instance)
def test_private_destroy_ebusy_timeout(self):
# Tests that _destroy will retry 3 times to destroy the guest when an
# EBUSY is raised, but eventually times out and raises the libvirtError
ex = fakelibvirt.make_libvirtError(
fakelibvirt.libvirtError,
("Failed to terminate process 26425 with SIGKILL: "
"Device or resource busy"),
error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
int1=errno.EBUSY)
instance = objects.Instance(**self.test_instance)
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
with mock.patch.object(drvr._host, 'get_domain') as mock_get_domain:
mock_domain = mock.MagicMock()
mock_domain.ID.return_value = 1
mock_get_domain.return_value = mock_domain
mock_domain.destroy.side_effect = ex
self.assertRaises(fakelibvirt.libvirtError, drvr._destroy,
instance)
self.assertEqual(3, mock_domain.destroy.call_count)
def test_private_destroy_ebusy_multiple_attempt_ok(self):
# Tests that the _destroy attempt loop is broken when EBUSY is no
# longer raised.
ex = fakelibvirt.make_libvirtError(
fakelibvirt.libvirtError,
("Failed to terminate process 26425 with SIGKILL: "
"Device or resource busy"),
error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
int1=errno.EBUSY)
inst_info = hardware.InstanceInfo(power_state.SHUTDOWN, id=1)
instance = objects.Instance(**self.test_instance)
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
with mock.patch.object(drvr._host, 'get_domain') as mock_get_domain, \
mock.patch.object(drvr, 'get_info', return_value=inst_info):
mock_domain = mock.MagicMock()
mock_domain.ID.return_value = 1
mock_get_domain.return_value = mock_domain
mock_domain.destroy.side_effect = ex, None
drvr._destroy(instance)
self.assertEqual(2, mock_domain.destroy.call_count)
def test_undefine_domain_with_not_found_instance(self):
def fake_get_domain(self, instance):
raise exception.InstanceNotFound(instance_id=instance.name)

View File

@ -617,7 +617,7 @@ class LibvirtDriver(driver.ComputeDriver):
rootfs_dev = instance.system_metadata.get('rootfs_device_name')
disk.teardown_container(container_dir, rootfs_dev)
def _destroy(self, instance):
def _destroy(self, instance, attempt=1):
try:
virt_dom = self._host.get_domain(instance)
except exception.InstanceNotFound:
@ -660,6 +660,34 @@ class LibvirtDriver(driver.ComputeDriver):
instance=instance)
reason = _("operation time out")
raise exception.InstancePowerOffFailure(reason=reason)
elif errcode == libvirt.VIR_ERR_SYSTEM_ERROR:
if e.get_int1() == errno.EBUSY:
# NOTE(danpb): When libvirt kills a process it sends it
# SIGTERM first and waits 10 seconds. If it hasn't gone
# it sends SIGKILL and waits another 5 seconds. If it
# still hasn't gone then you get this EBUSY error.
# Usually when a QEMU process fails to go away upon
# SIGKILL it is because it is stuck in an
# uninterruptable kernel sleep waiting on I/O from
# some non-responsive server.
# Given the CPU load of the gate tests though, it is
# conceivable that the 15 second timeout is too short,
# particularly if the VM running tempest has a high
# steal time from the cloud host. ie 15 wallclock
# seconds may have passed, but the VM might have only
# have a few seconds of scheduled run time.
LOG.warn(_LW('Error from libvirt during destroy. '
'Code=%(errcode)s Error=%(e)s; '
'attempt %(attempt)d of 3'),
{'errcode': errcode, 'e': e,
'attempt': attempt},
instance=instance)
with excutils.save_and_reraise_exception() as ctxt:
# Try up to 3 times before giving up.
if attempt < 3:
ctxt.reraise = False
self._destroy(instance, attempt + 1)
return
if not is_okay:
with excutils.save_and_reraise_exception():