libvirt: wait for interface detach from the guest

The test_reassign_port_between_servers test in Tempest creates
a port in neutron and two servers. It attaches the port to the
first server and then quickly detaches it and waits for the
port.device_id to be unbound from the server. Then it repeats
that for the second server.

The interface detach from the guest is asynchronous and happens
before nova unbinds the port, so there is a race where the port's
device_id is unset but the interface is still on the first guest
when we try to attach to the second guest, which fails.

This is a latent bug, but apparently has been tickled by the
move to our neutron CI jobs to use ubuntu xenial nodes.

The fix is to add a detach and retry loop on the interface detach
on the guest so that we can wait until the interface is gone
from the guest before nova unbinds the port in neutron, which is
what the Tempest test is waiting for. Then the device should be
available for attaching to the second guest.

This is similar to what we do with detaching volumes.

Closes-Bug: #1607714

Change-Id: Ic04aad8923ea2edf1d16e32c208cd41fdf898834
(cherry picked from commit a3b3e8d831)
This commit is contained in:
Matt Riedemann 2017-02-09 15:54:41 -05:00 committed by Matt Riedemann
parent c95cd011d0
commit 02ad4f862a
4 changed files with 77 additions and 15 deletions

View File

@ -16783,7 +16783,8 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
power_state.SHUTDOWN,
expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG))
def _test_detach_interface(self, power_state, expected_flags):
def _test_detach_interface(self, power_state, expected_flags,
device_not_found=False):
# setup some mocks
instance = self._create_instance()
network_info = _fake_network_info(self, 1)
@ -16812,13 +16813,27 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
<target dev='tap12345678'/>
</interface>""")
if device_not_found:
# This will trigger detach_device_with_retry to raise
# DeviceNotFound
get_interface_calls = [expected_cfg, None]
else:
get_interface_calls = [expected_cfg, expected_cfg, None]
with test.nested(
mock.patch.object(host.Host, 'get_guest', return_value=guest),
mock.patch.object(self.drvr.vif_driver, 'get_config',
return_value=expected_cfg),
mock.patch.object(domain, 'detachDeviceFlags')
# This is called multiple times in a retry loop so we use a
# side_effect to simulate the calls to stop the loop.
mock.patch.object(guest, 'get_interface_by_cfg',
side_effect=get_interface_calls),
mock.patch.object(domain, 'detachDeviceFlags'),
mock.patch('nova.virt.libvirt.driver.LOG.warning')
) as (
mock_get_guest, mock_get_config, mock_detach_device_flags
mock_get_guest, mock_get_config,
mock_get_interface, mock_detach_device_flags,
mock_warning
):
# run the detach method
self.drvr.detach_interface(self.context, instance, network_info[0])
@ -16829,8 +16844,16 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
instance, network_info[0], test.MatchType(objects.ImageMeta),
test.MatchType(objects.Flavor), CONF.libvirt.virt_type,
self.drvr._host)
mock_detach_device_flags.assert_called_once_with(
expected_cfg.to_xml(), flags=expected_flags)
mock_get_interface.assert_has_calls(
[mock.call(expected_cfg) for x in range(len(get_interface_calls))])
if device_not_found:
mock_detach_device_flags.assert_not_called()
self.assertTrue(mock_warning.called)
else:
mock_detach_device_flags.assert_called_once_with(
expected_cfg.to_xml(), flags=expected_flags)
mock_warning.assert_not_called()
def test_detach_interface_with_running_instance(self):
self._test_detach_interface(
@ -16838,6 +16861,15 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
def test_detach_interface_with_running_instance_device_not_found(self):
"""Tests that the interface is detached before we try to detach it.
"""
self._test_detach_interface(
power_state.RUNNING,
expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE),
device_not_found=True)
def test_detach_interface_with_pause_instance(self):
self._test_detach_interface(
power_state.PAUSED,
@ -16922,7 +16954,9 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)
domain.detachDeviceFlags(expected.to_xml(), flags=expected_flags)
self.mox.ReplayAll()
self.drvr.detach_interface(self.context, instance, network_info[0])
with mock.patch.object(libvirt_guest.Guest, 'get_interface_by_cfg',
side_effect=[expected, expected, None]):
self.drvr.detach_interface(self.context, instance, network_info[0])
self.mox.VerifyAll()
@mock.patch('nova.virt.libvirt.utils.write_to_file')

View File

@ -18,6 +18,7 @@ import sys
import mock
from oslo_utils import encodeutils
import six
from nova import context
from nova import exception
@ -272,9 +273,19 @@ class GuestTestCase(test.NoDBTestCase):
def test_detach_device_with_retry_device_not_found(self):
get_config = mock.Mock(return_value=None)
self.assertRaises(
ex = self.assertRaises(
exception.DeviceNotFound, self.guest.detach_device_with_retry,
get_config, "/dev/vdb", persistent=True, live=True)
self.assertIn("/dev/vdb", six.text_type(ex))
def test_detach_device_with_retry_device_not_found_alt_name(self):
"""Tests to make sure we use the alternative name in errors."""
get_config = mock.Mock(return_value=None)
ex = self.assertRaises(
exception.DeviceNotFound, self.guest.detach_device_with_retry,
get_config, mock.sentinel.device, persistent=True, live=True,
alternative_device_name='foo')
self.assertIn('foo', six.text_type(ex))
@mock.patch.object(libvirt_guest.Guest, "detach_device")
def test_detach_device_with_retry_operation_failed(self, mock_detach):

View File

@ -1434,7 +1434,17 @@ class LibvirtDriver(driver.ComputeDriver):
state = guest.get_power_state(self._host)
live = state in (power_state.RUNNING, power_state.PAUSED)
guest.detach_device(interface, persistent=True, live=live)
# Now we are going to loop until the interface is detached or we
# timeout.
wait_for_detach = guest.detach_device_with_retry(
guest.get_interface_by_cfg, cfg, persistent=True, live=live,
alternative_device_name=vif.get('address'))
wait_for_detach()
except exception.DeviceNotFound:
# The interface is gone so just log it as a warning.
LOG.warning(_LW('Detaching interface %(mac)s failed because '
'the device is no longer found on the guest.'),
{'mac': vif.get('address')}, instance=instance)
except libvirt.libvirtError as ex:
error_code = ex.get_error_code()
if error_code == libvirt.VIR_ERR_NO_DOMAIN:
@ -1461,7 +1471,7 @@ class LibvirtDriver(driver.ComputeDriver):
instance_uuid=instance.uuid)
# The interface is gone so just log it as a warning.
LOG.warning(_LW('Detaching interface %(mac)s failed because '
LOG.warning(_LW('Detaching interface %(mac)s failed because '
'the device is no longer found on the guest.'),
{'mac': mac}, instance=instance)

View File

@ -364,7 +364,8 @@ class Guest(object):
def detach_device_with_retry(self, get_device_conf_func, device,
persistent, live, max_retry_count=7,
inc_sleep_time=2,
max_sleep_time=30):
max_sleep_time=30,
alternative_device_name=None):
"""Detaches a device from the guest. After the initial detach request,
a function is returned which can be used to ensure the device is
successfully removed from the guest domain (retrying the removal as
@ -385,7 +386,11 @@ class Guest(object):
time will not be incremented using param
inc_sleep_time. On reaching this threshold,
max_sleep_time will be used as the sleep time.
:param alternative_device_name: This is an alternative identifier for
the device if device is not an ID, used solely for error messages.
"""
alternative_device_name = alternative_device_name or device
def _try_detach_device(conf, persistent=False, live=False):
# Raise DeviceNotFound if the device isn't found during detach
try:
@ -398,17 +403,19 @@ class Guest(object):
if 'not found' in errmsg:
# This will be raised if the live domain
# detach fails because the device is not found
raise exception.DeviceNotFound(device=device)
raise exception.DeviceNotFound(
device=alternative_device_name)
elif errcode == libvirt.VIR_ERR_INVALID_ARG:
errmsg = ex.get_error_message()
if 'no target device' in errmsg:
# This will be raised if the persistent domain
# detach fails because the device is not found
raise exception.DeviceNotFound(device=device)
raise exception.DeviceNotFound(
device=alternative_device_name)
conf = get_device_conf_func(device)
if conf is None:
raise exception.DeviceNotFound(device=device)
raise exception.DeviceNotFound(device=alternative_device_name)
_try_detach_device(conf, persistent, live)
@ -424,8 +431,8 @@ class Guest(object):
_try_detach_device(config, persistent=False, live=live)
reason = _("Unable to detach from guest transient domain.")
raise exception.DeviceDetachFailed(device=device,
reason=reason)
raise exception.DeviceDetachFailed(
device=alternative_device_name, reason=reason)
return _do_wait_and_retry_detach