Add a lock to prevent race during detach/attach of interface

Only allow one detach/attach at a time with the same pattern instance-port_id
in order to avoid race condition when multiple detach/attach are run
concurrently.

When multiple detach run concurrently on a specific instance-port_id,
manager consider many of them as valid because info_cache still contains
the port and info_cache is refreshed only once the first request complete.

So during this gap of time, while the first request accomplishes the task,
all subsequent requests are destined to fail and log a warning [1] in
different location of code, depending on the outcome of the first request.
The Issue is that all those caught requests finally run a
deallocate_port_for_instance which will unbind the port.

This may cause a race condition, because a successful attach can pass between
those unbind, and be silently unbound, resulting in an infrastructure/DB
inconsistency.

[1] 'Detaching interface %(mac)s failed because the device is no longer found
     on the guest.'

Closes-Bug: #1892870
Change-Id: Iea5969d0bd16dc9a6f1ba950224b0115e466ce66
This commit is contained in:
Alexandre Arents 2020-08-24 14:05:27 +00:00
parent d09fade87e
commit 39831c5599
4 changed files with 56 additions and 7 deletions

View File

@ -7481,6 +7481,19 @@ class ComputeManager(manager.Manager):
def attach_interface(self, context, instance, network_id, port_id,
requested_ip, tag):
"""Use hotplug to add an network adapter to an instance."""
lockname = 'interface-%s-%s' % (instance.uuid, port_id)
@utils.synchronized(lockname)
def do_attach_interface(context, instance, network_id, port_id,
requested_ip, tag):
return self._attach_interface(context, instance, network_id,
port_id, requested_ip, tag)
return do_attach_interface(context, instance, network_id, port_id,
requested_ip, tag)
def _attach_interface(self, context, instance, network_id, port_id,
requested_ip, tag):
if not self.driver.capabilities.get('supports_attach_interface',
False):
raise exception.AttachInterfaceNotSupported(
@ -7540,6 +7553,18 @@ class ComputeManager(manager.Manager):
@wrap_instance_fault
def detach_interface(self, context, instance, port_id):
"""Detach a network adapter from an instance."""
lockname = 'interface-%s-%s' % (instance.uuid, port_id)
@utils.synchronized(lockname)
def do_detach_interface(context, instance, port_id):
self._detach_interface(context, instance, port_id)
do_detach_interface(context, instance, port_id)
def _detach_interface(self, context, instance, port_id):
# NOTE(aarents): we need to refresh info cache from DB here,
# as previous detach/attach lock holder just updated it.
compute_utils.refresh_info_cache_for_instance(context, instance)
network_info = instance.info_cache.network_info
condemned = None
for vif in network_info:

View File

@ -10285,11 +10285,15 @@ class ComputeAPITestCase(BaseTestCase):
network_id = nwinfo[0]['network']['id']
port_id = nwinfo[0]['id']
req_ip = '1.2.3.4'
lock_name = 'interface-%s-%s' % (instance.uuid, port_id)
mock_allocate = mock.Mock(return_value=nwinfo)
self.compute.network_api.allocate_port_for_instance = mock_allocate
with mock.patch.dict(self.compute.driver.capabilities,
supports_attach_interface=True):
with test.nested(
mock.patch.dict(self.compute.driver.capabilities,
supports_attach_interface=True),
mock.patch('oslo_concurrency.lockutils.lock')
) as (cap, mock_lock):
vif = self.compute.attach_interface(self.context,
instance,
network_id,
@ -10304,6 +10308,9 @@ class ComputeAPITestCase(BaseTestCase):
action='interface_attach', phase='start'),
mock.call(self.context, instance, self.compute.host,
action='interface_attach', phase='end')])
mock_lock.assert_called_once_with(lock_name, mock.ANY, mock.ANY,
mock.ANY, delay=mock.ANY, do_log=mock.ANY, fair=mock.ANY,
semaphores=mock.ANY)
return nwinfo, port_id
@mock.patch.object(compute_utils, 'notify_about_instance_action')
@ -10401,6 +10408,7 @@ class ComputeAPITestCase(BaseTestCase):
self.context, uuids.info_cache_instance)
instance.info_cache.network_info = network_model.NetworkInfo.hydrate(
nwinfo)
lock_name = 'interface-%s-%s' % (instance.uuid, port_id)
port_allocation = {uuids.rp1: {'NET_BW_EGR_KILOBIT_PER_SEC': 10000}}
with test.nested(
@ -10409,8 +10417,9 @@ class ComputeAPITestCase(BaseTestCase):
'remove_resources_from_instance_allocation'),
mock.patch.object(self.compute.network_api,
'deallocate_port_for_instance',
return_value=([], port_allocation))) as (
mock_remove_alloc, mock_deallocate):
return_value=([], port_allocation)),
mock.patch('oslo_concurrency.lockutils.lock')) as (
mock_remove_alloc, mock_deallocate, mock_lock):
self.compute.detach_interface(self.context, instance, port_id)
mock_deallocate.assert_called_once_with(
@ -10424,6 +10433,9 @@ class ComputeAPITestCase(BaseTestCase):
action='interface_detach', phase='start'),
mock.call(self.context, instance, self.compute.host,
action='interface_detach', phase='end')])
mock_lock.assert_called_once_with(lock_name, mock.ANY, mock.ANY,
mock.ANY, delay=mock.ANY, do_log=mock.ANY, fair=mock.ANY,
semaphores=mock.ANY)
@mock.patch('nova.compute.manager.LOG.log')
def test_detach_interface_failed(self, mock_log):

View File

@ -2586,8 +2586,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
@mock.patch.object(compute_utils, 'EventReporter')
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(compute_utils, 'refresh_info_cache_for_instance')
@mock.patch.object(self.compute, '_set_instance_obj_error_state')
def do_test(meth, add_fault, event):
def do_test(meth, refresh, add_fault, event):
self.assertRaises(exception.PortNotFound,
self.compute.detach_interface,
self.context, f_instance, 'port_id')
@ -2596,6 +2597,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
event.assert_called_once_with(
self.context, 'compute_detach_interface', CONF.host,
f_instance.uuid, graceful_exit=False)
refresh.assert_called_once_with(self.context, f_instance)
do_test()
@ -2603,8 +2605,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
@mock.patch.object(compute_utils, 'EventReporter')
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(compute_utils, 'notify_about_instance_action')
def test_detach_interface_instance_not_found(self, mock_notify, mock_fault,
mock_event, mock_log):
@mock.patch.object(compute_utils, 'refresh_info_cache_for_instance')
def test_detach_interface_instance_not_found(self, mock_refresh,
mock_notify, mock_fault, mock_event, mock_log):
nw_info = network_model.NetworkInfo([
network_model.VIF(uuids.port_id)])
info_cache = objects.InstanceInfoCache(network_info=nw_info,
@ -2617,6 +2620,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.assertRaises(exception.InterfaceDetachFailed,
self.compute.detach_interface,
self.context, instance, uuids.port_id)
mock_refresh.assert_called_once_with(self.context, instance)
self.assertEqual(1, mock_log.call_count)
self.assertEqual(logging.DEBUG, mock_log.call_args[0][0])

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Resolve a race condition that may occur during concurrent
``interface detach/attach``, resulting in an interface accidentally unbind
after attached. See `bug 1892870`_ for more details.
.. _bug 1892870: https://bugs.launchpad.net/nova/+bug/1892870