libvirt: handle code=38 + sigkill (ebusy) in destroy()

Handle the libvirt error raised during destroy when the SIGKILL fails
with EBUSY by retrying the destroy, up to three attempts in total. This
is taken from a comment by danpb in the bug report as a potential
workaround.

Co-authored-by: Daniel Berrange (berrange@redhat.com)

Closes-Bug: #1353939

Conflicts:
    nova/tests/unit/virt/libvirt/test_driver.py

    NOTE (kashyapc): The 'stable/kilo' branch doesn't have the
    'libvirt_guest' object, so the unit tests below were adjusted
    accordingly:

        test_private_destroy_ebusy_timeout
        test_private_destroy_ebusy_multiple_attempt_ok

Change-Id: I128bf6b939fbbc85df521fd3fe23c3c6f93b1b2c
(cherry picked from commit 3907867601)
This commit is contained in:
Matt Riedemann 2015-05-10 18:46:37 -07:00 committed by Kashyap Chamarthy
parent 7b14cb8945
commit dc6af6bf86
2 changed files with 77 additions and 1 deletions

View File

@ -8418,6 +8418,54 @@ class LibvirtConnTestCase(test.NoDBTestCase):
self.assertRaises(fakelibvirt.libvirtError, conn._destroy,
instance)
def test_private_destroy_ebusy_timeout(self):
    """_destroy retries on EBUSY, then gives up and re-raises.

    The fake domain's destroy() always fails with a VIR_ERR_SYSTEM_ERROR
    whose int1 is EBUSY, so _destroy is expected to make exactly three
    attempts before the libvirtError propagates to the caller.
    """
    busy_error = fakelibvirt.make_libvirtError(
        fakelibvirt.libvirtError,
        ("Failed to terminate process 26425 with SIGKILL: "
         "Device or resource busy"),
        error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
        int1=errno.EBUSY)
    instance = objects.Instance(**self.test_instance)
    drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)

    # A domain that reports ID 1 and whose destroy() never succeeds.
    fake_domain = mock.MagicMock()
    fake_domain.ID.return_value = 1
    fake_domain.destroy.side_effect = busy_error

    with mock.patch.object(drvr._host, 'get_domain',
                           return_value=fake_domain):
        self.assertRaises(fakelibvirt.libvirtError, drvr._destroy,
                          instance)

    # One initial attempt plus two retries.
    self.assertEqual(3, fake_domain.destroy.call_count)
def test_private_destroy_ebusy_multiple_attempt_ok(self):
    """_destroy stops retrying as soon as destroy() stops raising EBUSY.

    The fake domain's destroy() fails once with EBUSY and then succeeds,
    so exactly two destroy() calls are expected and _destroy must return
    without raising.
    """
    busy_error = fakelibvirt.make_libvirtError(
        fakelibvirt.libvirtError,
        ("Failed to terminate process 26425 with SIGKILL: "
         "Device or resource busy"),
        error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
        int1=errno.EBUSY)
    shutdown_info = hardware.InstanceInfo(power_state.SHUTDOWN, id=1)
    instance = objects.Instance(**self.test_instance)
    drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)

    # First destroy() call raises EBUSY, the second one returns None.
    fake_domain = mock.MagicMock()
    fake_domain.ID.return_value = 1
    fake_domain.destroy.side_effect = (busy_error, None)

    get_domain_patch = mock.patch.object(
        drvr._host, 'get_domain', return_value=fake_domain)
    get_info_patch = mock.patch.object(
        drvr, 'get_info', return_value=shutdown_info)
    with get_domain_patch, get_info_patch:
        drvr._destroy(instance)

    self.assertEqual(2, fake_domain.destroy.call_count)
def test_undefine_domain_with_not_found_instance(self):
def fake_get_domain(self, instance):
raise exception.InstanceNotFound(instance_id=instance.name)

View File

@ -618,7 +618,7 @@ class LibvirtDriver(driver.ComputeDriver):
rootfs_dev = instance.system_metadata.get('rootfs_device_name')
disk.teardown_container(container_dir, rootfs_dev)
def _destroy(self, instance):
def _destroy(self, instance, attempt=1):
try:
virt_dom = self._host.get_domain(instance)
except exception.InstanceNotFound:
@ -661,6 +661,34 @@ class LibvirtDriver(driver.ComputeDriver):
instance=instance)
reason = _("operation time out")
raise exception.InstancePowerOffFailure(reason=reason)
elif errcode == libvirt.VIR_ERR_SYSTEM_ERROR:
if e.get_int1() == errno.EBUSY:
# NOTE(danpb): When libvirt kills a process it sends it
# SIGTERM first and waits 10 seconds. If it hasn't gone
# it sends SIGKILL and waits another 5 seconds. If it
# still hasn't gone then you get this EBUSY error.
# Usually when a QEMU process fails to go away upon
# SIGKILL it is because it is stuck in an
# uninterruptible kernel sleep waiting on I/O from
# some non-responsive server.
# Given the CPU load of the gate tests though, it is
# conceivable that the 15 second timeout is too short,
# particularly if the VM running tempest has a high
# steal time from the cloud host, i.e. 15 wallclock
# seconds may have passed, but the VM might have had
# only a few seconds of scheduled run time.
LOG.warn(_LW('Error from libvirt during destroy. '
'Code=%(errcode)s Error=%(e)s; '
'attempt %(attempt)d of 3'),
{'errcode': errcode, 'e': e,
'attempt': attempt},
instance=instance)
with excutils.save_and_reraise_exception() as ctxt:
# Try up to 3 times before giving up.
if attempt < 3:
ctxt.reraise = False
self._destroy(instance, attempt + 1)
return
if not is_okay:
with excutils.save_and_reraise_exception():