From 02ad4f862a7c5b51100288b6b22f15087788d8d7 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Thu, 9 Feb 2017 15:54:41 -0500 Subject: [PATCH] libvirt: wait for interface detach from the guest The test_reassign_port_between_servers test in Tempest creates a port in neutron and two servers. It attaches the port to the first server and then quickly detaches it and waits for the port.device_id to be unbound from the server. Then it repeats that for the second server. The interface detach from the guest is asynchronous and happens before nova unbinds the port, so there is a race where the port's device_id is unset but the interface is still on the first guest when we try to attach to the second guest, which fails. This is a latent bug, but apparently has been tickled by the move to our neutron CI jobs to use ubuntu xenial nodes. The fix is to add a detach and retry loop on the interface detach on the guest so that we can wait until the interface is gone from the guest before nova unbinds the port in neutron, which is what the Tempest test is waiting for. Then the device should be available for attaching to the second guest. This is similar to what we do with detaching volumes. Closes-Bug: #1607714 Change-Id: Ic04aad8923ea2edf1d16e32c208cd41fdf898834 (cherry picked from commit a3b3e8d8314b0cedc2604be509f0f4d523a35ed5) --- nova/tests/unit/virt/libvirt/test_driver.py | 46 ++++++++++++++++++--- nova/tests/unit/virt/libvirt/test_guest.py | 13 +++++- nova/virt/libvirt/driver.py | 14 ++++++- nova/virt/libvirt/guest.py | 19 ++++++--- 4 files changed, 77 insertions(+), 15 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index cf579d430c7b..93672d7265d1 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -16783,7 +16783,8 @@ class LibvirtDriverTestCase(test.NoDBTestCase): power_state.SHUTDOWN, expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG)) - def _test_detach_interface(self, power_state, expected_flags): + def _test_detach_interface(self, power_state, expected_flags, + device_not_found=False): # setup some mocks instance = self._create_instance() network_info = _fake_network_info(self, 1) @@ -16812,13 +16813,27 @@ class LibvirtDriverTestCase(test.NoDBTestCase): """) + if device_not_found: + # This will trigger detach_device_with_retry to raise + # DeviceNotFound + get_interface_calls = [expected_cfg, None] + else: + get_interface_calls = [expected_cfg, expected_cfg, None] + with test.nested( mock.patch.object(host.Host, 'get_guest', return_value=guest), mock.patch.object(self.drvr.vif_driver, 'get_config', return_value=expected_cfg), - mock.patch.object(domain, 'detachDeviceFlags') + # This is called multiple times in a retry loop so we use a + # side_effect to simulate the calls to stop the loop. + mock.patch.object(guest, 'get_interface_by_cfg', + side_effect=get_interface_calls), + mock.patch.object(domain, 'detachDeviceFlags'), + mock.patch('nova.virt.libvirt.driver.LOG.warning') ) as ( - mock_get_guest, mock_get_config, mock_detach_device_flags + mock_get_guest, mock_get_config, + mock_get_interface, mock_detach_device_flags, + mock_warning ): # run the detach method self.drvr.detach_interface(self.context, instance, network_info[0]) @@ -16829,8 +16844,16 @@ class LibvirtDriverTestCase(test.NoDBTestCase): instance, network_info[0], test.MatchType(objects.ImageMeta), test.MatchType(objects.Flavor), CONF.libvirt.virt_type, self.drvr._host) - mock_detach_device_flags.assert_called_once_with( - expected_cfg.to_xml(), flags=expected_flags) + mock_get_interface.assert_has_calls( + [mock.call(expected_cfg) for x in range(len(get_interface_calls))]) + + if device_not_found: + mock_detach_device_flags.assert_not_called() + self.assertTrue(mock_warning.called) + else: + mock_detach_device_flags.assert_called_once_with( + expected_cfg.to_xml(), flags=expected_flags) + mock_warning.assert_not_called() def test_detach_interface_with_running_instance(self): self._test_detach_interface( @@ -16838,6 +16861,15 @@ class LibvirtDriverTestCase(test.NoDBTestCase): expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) + def test_detach_interface_with_running_instance_device_not_found(self): + """Tests that the interface is detached before we try to detach it. + """ + self._test_detach_interface( + power_state.RUNNING, + expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | + fakelibvirt.VIR_DOMAIN_AFFECT_LIVE), + device_not_found=True) + def test_detach_interface_with_pause_instance(self): self._test_detach_interface( power_state.PAUSED, @@ -16922,7 +16954,9 @@ class LibvirtDriverTestCase(test.NoDBTestCase): fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) domain.detachDeviceFlags(expected.to_xml(), flags=expected_flags) self.mox.ReplayAll() - self.drvr.detach_interface(self.context, instance, network_info[0]) + with mock.patch.object(libvirt_guest.Guest, 'get_interface_by_cfg', + side_effect=[expected, expected, None]): + self.drvr.detach_interface(self.context, instance, network_info[0]) self.mox.VerifyAll() @mock.patch('nova.virt.libvirt.utils.write_to_file') diff --git a/nova/tests/unit/virt/libvirt/test_guest.py b/nova/tests/unit/virt/libvirt/test_guest.py index d2d341d063b4..6d8a65dacca7 100644 --- a/nova/tests/unit/virt/libvirt/test_guest.py +++ b/nova/tests/unit/virt/libvirt/test_guest.py @@ -18,6 +18,7 @@ import sys import mock from oslo_utils import encodeutils +import six from nova import context from nova import exception @@ -272,9 +273,19 @@ class GuestTestCase(test.NoDBTestCase): def test_detach_device_with_retry_device_not_found(self): get_config = mock.Mock(return_value=None) - self.assertRaises( + ex = self.assertRaises( exception.DeviceNotFound, self.guest.detach_device_with_retry, get_config, "/dev/vdb", persistent=True, live=True) + self.assertIn("/dev/vdb", six.text_type(ex)) + + def test_detach_device_with_retry_device_not_found_alt_name(self): + """Tests to make sure we use the alternative name in errors.""" + get_config = mock.Mock(return_value=None) + ex = self.assertRaises( + exception.DeviceNotFound, self.guest.detach_device_with_retry, + get_config, mock.sentinel.device, persistent=True, live=True, + alternative_device_name='foo') + self.assertIn('foo', six.text_type(ex)) @mock.patch.object(libvirt_guest.Guest, "detach_device") def test_detach_device_with_retry_operation_failed(self, mock_detach): diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 303eb2ea9e70..49fdc9d8e965 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -1434,7 +1434,17 @@ class LibvirtDriver(driver.ComputeDriver): state = guest.get_power_state(self._host) live = state in (power_state.RUNNING, power_state.PAUSED) - guest.detach_device(interface, persistent=True, live=live) + # Now we are going to loop until the interface is detached or we + # timeout. + wait_for_detach = guest.detach_device_with_retry( + guest.get_interface_by_cfg, cfg, persistent=True, live=live, + alternative_device_name=vif.get('address')) + wait_for_detach() + except exception.DeviceNotFound: + # The interface is gone so just log it as a warning. + LOG.warning(_LW('Detaching interface %(mac)s failed because ' + 'the device is no longer found on the guest.'), + {'mac': vif.get('address')}, instance=instance) except libvirt.libvirtError as ex: error_code = ex.get_error_code() if error_code == libvirt.VIR_ERR_NO_DOMAIN: @@ -1461,7 +1471,7 @@ class LibvirtDriver(driver.ComputeDriver): instance_uuid=instance.uuid) # The interface is gone so just log it as a warning. - LOG.warning(_LW('Detaching interface %(mac)s failed because ' + LOG.warning(_LW('Detaching interface %(mac)s failed because ' 'the device is no longer found on the guest.'), {'mac': mac}, instance=instance) diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py index 4d2e47e05a59..2d483d599ea0 100644 --- a/nova/virt/libvirt/guest.py +++ b/nova/virt/libvirt/guest.py @@ -364,7 +364,8 @@ class Guest(object): def detach_device_with_retry(self, get_device_conf_func, device, persistent, live, max_retry_count=7, inc_sleep_time=2, - max_sleep_time=30): + max_sleep_time=30, + alternative_device_name=None): """Detaches a device from the guest. After the initial detach request, a function is returned which can be used to ensure the device is successfully removed from the guest domain (retrying the removal as @@ -385,7 +386,11 @@ class Guest(object): time will not be incremented using param inc_sleep_time. On reaching this threshold, max_sleep_time will be used as the sleep time. + :param alternative_device_name: This is an alternative identifier for + the device if device is not an ID, used solely for error messages. """ + alternative_device_name = alternative_device_name or device + def _try_detach_device(conf, persistent=False, live=False): # Raise DeviceNotFound if the device isn't found during detach try: @@ -398,17 +403,19 @@ class Guest(object): if 'not found' in errmsg: # This will be raised if the live domain # detach fails because the device is not found - raise exception.DeviceNotFound(device=device) + raise exception.DeviceNotFound( + device=alternative_device_name) elif errcode == libvirt.VIR_ERR_INVALID_ARG: errmsg = ex.get_error_message() if 'no target device' in errmsg: # This will be raised if the persistent domain # detach fails because the device is not found - raise exception.DeviceNotFound(device=device) + raise exception.DeviceNotFound( + device=alternative_device_name) conf = get_device_conf_func(device) if conf is None: - raise exception.DeviceNotFound(device=device) + raise exception.DeviceNotFound(device=alternative_device_name) _try_detach_device(conf, persistent, live) @@ -424,8 +431,8 @@ class Guest(object): _try_detach_device(config, persistent=False, live=live) reason = _("Unable to detach from guest transient domain.") - raise exception.DeviceDetachFailed(device=device, - reason=reason) + raise exception.DeviceDetachFailed( + device=alternative_device_name, reason=reason) return _do_wait_and_retry_detach