Avoid race condition when deleting trunk bridges

Prior to this change, trunk bridges were created by os-vif but deleted
by Neutron when the last VIF was removed from them. This created race
conditions in some use cases, such as DPDK with vhostuserclient mode,
when VMs are rebooted. To avoid these races, Neutron no longer deletes
trunk bridges; their creation and deletion are now os-vif's
responsibility. Since [1], Nova uses an os-vif version that contains
this functionality.

This patch also changes the trunk status change event handling. During
a live migration, once the trunk parent port has been bound to the
destination host (that is, only one port binding remains associated
with it) and its status has changed to ACTIVE, the handler also
triggers the subport binding to the new host. This addresses a race
condition between the subport binding, triggered by the OVS agent, and
the parent port binding, triggered by Nova: if the parent port is still
bound to the source host when the OVS agent tries to bind the subports,
the subports remain bound to the source host too, instead of moving to
the destination.
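
A minimal sketch of that rebinding step, condensed from the
_update_subport_binding() code added to the OVS trunk driver in this
patch (the standalone function signature below is illustrative only;
error handling is trimmed):

    from neutron_lib.api.definitions import portbindings
    from neutron_lib.services.trunk import constants as trunk_consts

    def update_subport_binding(core_plugin, context, trunk):
        """Rebind every subport to the host of the trunk parent port."""
        parent = core_plugin.get_port(context, trunk.port_id)
        trunk_host = parent.get(portbindings.HOST_ID)
        for subport in trunk.sub_ports:
            # Point the subport binding at the parent's (destination) host
            # so the OVS agent on that host wires the subport there instead
            # of leaving it bound to the source host.
            core_plugin.update_port(
                context, subport.port_id,
                {'port': {portbindings.HOST_ID: trunk_host,
                          'device_owner': trunk_consts.TRUNK_SUBPORT_OWNER}})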

This patch also reverts [2] and [3]. As explained above, this patch
fixes the issue reported in LP#1997025; trunk port live migration with
ML2/OVS should now work correctly.

[1] https://review.opendev.org/c/openstack/nova/+/865031
[2] https://review.opendev.org/c/openstack/neutron/+/865295
[3] https://review.opendev.org/c/openstack/neutron/+/865424

Closes-Bug: #1869244
Closes-Bug: #1997025

Change-Id: I4e16357f3ff214fcf41e418982806c24088a2665
Authored by Miguel Lavalle on 2022-04-13 18:00:12 -05:00; committed by Rodolfo Alonso Hernandez
commit 33de608f04 (parent 970f9fbafa)
11 changed files with 121 additions and 74 deletions


@@ -14,7 +14,6 @@
# under the License.
import functools
import time
import eventlet
from neutron_lib.callbacks import events
@@ -43,7 +42,6 @@ from neutron.services.trunk.rpc import agent
LOG = logging.getLogger(__name__)
DEFAULT_WAIT_FOR_PORT_TIMEOUT = 60
WAIT_BEFORE_TRUNK_DELETE = 6
def lock_on_bridge_name(required_parameter):
@@ -208,21 +206,6 @@ class OVSDBHandler(object):
:param bridge_name: Name of the bridge used for locking purposes.
:param port: Parent port dict.
"""
# TODO(njohnston): In the case of DPDK with trunk ports, if nova
# deletes an interface and then re-adds it we can get a race
# condition where the port is re-added and then the bridge is
# deleted because we did not properly catch the re-addition. To
# solve this would require transitioning to ordered event
# resolution, like the L3 agent does with the
# ResourceProcessingQueue class. Until we can make that happen, we
# try to mitigate the issue by checking if there is a port on the
# bridge and if so then do not remove it.
bridge = ovs_lib.OVSBridge(bridge_name)
time.sleep(WAIT_BEFORE_TRUNK_DELETE)
if bridge_has_instance_port(bridge):
LOG.debug("The bridge %s has instances attached so it will not "
"be deleted.", bridge_name)
return
try:
# TODO(jlibosva): Investigate how to proceed during removal of
# trunk bridge that doesn't have metadata stored.
@@ -304,8 +287,7 @@ class OVSDBHandler(object):
LOG.debug("Added trunk: %s", trunk_id)
return self._get_current_status(subports, subport_ids)
def unwire_subports_for_trunk(self, trunk_id, subport_ids):
"""Destroy OVS ports associated to the logical subports."""
def _remove_sub_ports(self, trunk_id, subport_ids):
ids = []
for subport_id in subport_ids:
try:
@@ -317,6 +299,11 @@
{'subport_id': subport_id,
'trunk_id': trunk_id,
'err': te})
return ids
def unwire_subports_for_trunk(self, trunk_id, subport_ids):
"""Destroy OVS ports associated to the logical subports."""
ids = self._remove_sub_ports(trunk_id, subport_ids)
try:
# OVS bridge and port to be determined by _update_trunk_metadata
bridge = None


@@ -136,7 +136,6 @@ class TrunkParentPort(object):
"""
ovsdb = self.bridge.ovsdb
with ovsdb.transaction() as txn:
txn.add(ovsdb.del_br(self.bridge.br_name))
txn.add(ovsdb.del_port(self.patch_port_int_name,
bridge.br_name))
@@ -186,8 +185,9 @@ class SubPort(TrunkParentPort):
"""
ovsdb = self.bridge.ovsdb
with ovsdb.transaction() as txn:
txn.add(ovsdb.del_port(self.patch_port_trunk_name,
self.bridge.br_name))
if self.bridge.exists():
txn.add(ovsdb.del_port(self.patch_port_trunk_name,
self.bridge.br_name))
txn.add(ovsdb.del_port(self.patch_port_int_name,
bridge.br_name))
@@ -225,10 +225,7 @@ class TrunkManager(object):
"""Remove the trunk bridge."""
trunk = TrunkParentPort(trunk_id, port_id)
try:
if trunk.bridge.exists():
trunk.unplug(self.br_int)
else:
LOG.debug("Trunk bridge with ID %s does not exist.", trunk_id)
trunk.unplug(self.br_int)
except RuntimeError as e:
raise TrunkManagerError(error=e)
@@ -283,10 +280,7 @@ class TrunkManager(object):
# Trunk bridge might have been deleted by calling delete_trunk() before
# remove_sub_port().
try:
if sub_port.bridge.exists():
sub_port.unplug(self.br_int)
else:
LOG.debug("Trunk bridge with ID %s does not exist.", trunk_id)
sub_port.unplug(self.br_int)
except RuntimeError as e:
raise TrunkManagerError(error=e)


@@ -14,14 +14,18 @@
from neutron_lib.api.definitions import portbindings
from neutron_lib.callbacks import events
from neutron_lib.callbacks import registry
from neutron_lib.callbacks import resources
from neutron_lib import constants
from neutron_lib.plugins import directory
from neutron_lib.plugins.ml2 import ovs_constants as agent_consts
from neutron_lib.services.trunk import constants as trunk_consts
from oslo_config import cfg
from oslo_log import log as logging
from neutron.objects import trunk as trunk_obj
from neutron.services.trunk.drivers import base
from neutron.services.trunk.drivers.openvswitch import utils
from neutron.services.trunk import exceptions as trunk_exc
LOG = logging.getLogger(__name__)
@@ -41,6 +45,16 @@ DRIVER = None
class OVSDriver(base.DriverBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._core_plugin = None
@property
def core_plugin(self):
if not self._core_plugin:
self._core_plugin = directory.get_plugin()
return self._core_plugin
@property
def is_loaded(self):
try:
@@ -55,6 +69,61 @@ class OVSDriver(base.DriverBase):
SUPPORTED_SEGMENTATION_TYPES,
constants.AGENT_TYPE_OVS)
@staticmethod
def _get_trunk(context, trunk_id):
"""Return the trunk object or raise if not found."""
obj = trunk_obj.Trunk.get_object(context, id=trunk_id)
if obj is None:
raise trunk_exc.TrunkNotFound(trunk_id=trunk_id)
return obj
def _update_subport_binding(self, context, trunk_id):
"""Update the subport binding host"""
trunk_obj = self._get_trunk(context, trunk_id)
trunk_port = self.core_plugin.get_port(context, trunk_obj.port_id)
trunk_host = trunk_port.get(portbindings.HOST_ID)
for subport in trunk_obj.sub_ports:
port = self.core_plugin.update_port(
context, subport.port_id,
{'port': {portbindings.HOST_ID: trunk_host,
'device_owner': trunk_consts.TRUNK_SUBPORT_OWNER}})
vif_type = port.get(portbindings.VIF_TYPE)
if vif_type == portbindings.VIF_TYPE_BINDING_FAILED:
raise trunk_exc.SubPortBindingError(
port_id=subport.port_id, trunk_id=trunk_obj.id)
@registry.receives(resources.PORT, [events.AFTER_UPDATE])
def _subport_binding(self, resource, event, trigger, payload=None):
"""Bind the subports to the parent port host
This method listens to port AFTER_UPDATE events. If the parent port
is updated and transitions from inactive to active, this method
retrieves the port host ID and binds the subports to this host.
:param resource: neutron_lib.callbacks.resources.PORT
:param event: neutron_lib.callbacks.events.AFTER_UPDATE
:param trigger: the specific driver plugin
"""
updated_port = payload.latest_state
trunk_details = updated_port.get('trunk_details')
vif_details = updated_port.get(portbindings.VIF_DETAILS)
driver = vif_details.get(portbindings.VIF_DETAILS_BOUND_DRIVERS,
{}).get('0')
# If no trunk_details, the port is not the parent of a trunk.
# If this port is not bound to ML2/OVS, we skip this method.
if not trunk_details or not driver == NAME:
return
context = payload.context
orig_status = payload.states[0].get('status')
new_status = updated_port.get('status')
trunk_id = trunk_details['trunk_id']
if (new_status == constants.PORT_STATUS_ACTIVE and
new_status != orig_status):
self._update_subport_binding(context, trunk_id)
def register():
"""Register the driver."""


@@ -44,6 +44,7 @@ from neutron.conf.agent import common as config
from neutron.db import db_base_plugin_common as db_base
from neutron.plugins.ml2.drivers.linuxbridge.agent import \
linuxbridge_neutron_agent as linuxbridge_agent
from neutron.services.trunk.drivers.openvswitch.agent import trunk_manager
from neutron.tests.common import base as common_base
from neutron.tests.common import helpers
from neutron.tests import tools
@@ -812,6 +813,18 @@ class OVSTrunkBridgeFixture(OVSBridgeFixture):
self.addCleanup(self.bridge.destroy)
class OVSTrunkBridgeFixtureTrunkBridge(fixtures.Fixture):
def __init__(self, trunk_id):
super(OVSTrunkBridgeFixtureTrunkBridge, self).__init__()
self.trunk_id = trunk_id
def _setUp(self):
self.bridge = trunk_manager.TrunkBridge(self.trunk_id)
self.bridge.create()
self.addCleanup(self.bridge.destroy)
class OVSPortFixture(PortFixture):
NIC_NAME_LEN = 14


@@ -20,19 +20,12 @@ from oslo_utils import uuidutils
from neutron.common import utils
from neutron.services.trunk.drivers.openvswitch.agent import ovsdb_handler
from neutron.services.trunk.drivers.openvswitch.agent import trunk_manager
from neutron.services.trunk.drivers.openvswitch import utils as trunk_ovs_utils
from neutron.tests.fullstack import base
from neutron.tests.fullstack.resources import environment
from neutron.tests.fullstack.resources import machine
def trunk_bridge_does_not_exist(trunk_id):
"""Return true if trunk bridge for given ID does not exists."""
bridge = trunk_manager.TrunkBridge(trunk_id)
return not bridge.exists()
def make_ip_network(port, network):
"""Make an IPNetwork object from port and network.
@@ -282,16 +275,8 @@ class TestTrunkPlugin(base.BaseFullStackTestCase):
vlan_aware_vm.block_until_ping(trunk_network_vm.ip)
vlan_aware_vm.block_until_ping(vlan2_network_vm.ip)
# Delete vm and check that patch ports and trunk bridge are gone
# Delete vm and check that patch ports are gone
vlan_aware_vm.destroy()
bridge_doesnt_exist_predicate = functools.partial(
trunk_bridge_does_not_exist, trunk_id)
utils.wait_until_true(
bridge_doesnt_exist_predicate,
exception=TrunkTestException(
'Trunk bridge with ID %s has not been removed' %
trunk_id)
)
integration_bridge = self.host.get_bridge(None)
no_patch_ports_predicate = functools.partial(


@@ -140,8 +140,6 @@ class OVSDBHandlerTestCase(base.OVSAgentTestFramework):
self.wait_until_ports_state(self.ports, up=True)
self.trunk_br.delete_port(self.trunk_port_name)
self.wait_until_ports_state(self.ports, up=False)
common_utils.wait_until_true(lambda:
not self.trunk_br.bridge_exists(self.trunk_br.br_name))
def test_trunk_creation_with_subports(self):
ports = self._fill_trunk_dict()
@@ -194,11 +192,3 @@ class OVSDBHandlerTestCase(base.OVSAgentTestFramework):
# Check no resources are left behind.
self.assertFalse(self.trunk_br.exists())
self.assertFalse(ovsdb_handler.bridge_has_service_port(br_int))
def test_do_not_delete_trunk_bridge_with_instance_ports(self):
ports = self._fill_trunk_dict()
self.setup_agent_and_ports(port_dicts=ports)
self.wait_until_ports_state(self.ports, up=True)
self.ovsdb_handler.handle_trunk_remove(self.trunk_br.br_name,
ports.pop())
self.assertTrue(self.trunk_br.exists())


@@ -45,8 +45,7 @@ class TrunkParentPortTestCase(base.BaseSudoTestCase):
port_mac = net.get_random_mac('fa:16:3e:00:00:00'.split(':'))
self.trunk = trunk_manager.TrunkParentPort(trunk_id, port_id, port_mac)
self.trunk.bridge = self.useFixture(
net_helpers.OVSTrunkBridgeFixture(
self.trunk.bridge.br_name)).bridge
net_helpers.OVSTrunkBridgeFixtureTrunkBridge(trunk_id)).bridge
self.br_int = self.useFixture(net_helpers.OVSBridgeFixture()).bridge
def test_plug(self):
@@ -70,8 +69,6 @@ class TrunkParentPortTestCase(base.BaseSudoTestCase):
def test_unplug(self):
self.trunk.plug(self.br_int)
self.trunk.unplug(self.br_int)
self.assertFalse(
self.trunk.bridge.bridge_exists(self.trunk.bridge.br_name))
self.assertNotIn(self.trunk.patch_port_int_name,
self.br_int.get_port_name_list())
@@ -96,9 +93,8 @@ class SubPortTestCase(base.BaseSudoTestCase):
trunk_id = uuidutils.generate_uuid()
port_id = uuidutils.generate_uuid()
port_mac = net.get_random_mac('fa:16:3e:00:00:00'.split(':'))
trunk_bridge_name = utils.gen_trunk_br_name(trunk_id)
trunk_bridge = self.useFixture(
net_helpers.OVSTrunkBridgeFixture(trunk_bridge_name)).bridge
net_helpers.OVSTrunkBridgeFixtureTrunkBridge(trunk_id)).bridge
segmentation_id = helpers.get_not_used_vlan(
trunk_bridge, VLAN_RANGE)
self.subport = trunk_manager.SubPort(


@@ -183,9 +183,7 @@ class TestOVSDBHandler(base.BaseTestCase):
def test_handle_trunk_remove_trunk_manager_failure(self, br):
with mock.patch.object(self.ovsdb_handler, '_get_trunk_metadata',
side_effect=trunk_manager.TrunkManagerError(error='error')):
with mock.patch.object(ovsdb_handler, 'bridge_has_instance_port',
return_value=True):
self.ovsdb_handler.handle_trunk_remove('foo', self.fake_port)
self.ovsdb_handler.handle_trunk_remove('foo', self.fake_port)
@mock.patch('neutron.agent.common.ovs_lib.OVSBridge')
def test_handle_trunk_remove_rpc_failure(self, br):


@@ -13,10 +13,13 @@
from unittest import mock
from neutron_lib.api.definitions import portbindings
from neutron_lib.callbacks import events
from neutron_lib.callbacks import registry
from neutron_lib import constants
from neutron_lib import context
from neutron_lib.plugins.ml2 import ovs_constants as agent_consts
from neutron_lib.services.trunk import constants as trunk_consts
from oslo_config import cfg
from neutron.services.trunk.drivers.openvswitch import driver
@@ -70,3 +73,23 @@ class OVSDriverTestCase(base.BaseTestCase):
}
}))
test_trigger.assert_called_once_with('fake-trunk-br-name')
def test__update_subport_binding(self):
ovs_driver = driver.OVSDriver.create()
core_plugin = mock.Mock()
core_plugin.get_port.return_value = {portbindings.HOST_ID: 'host_id'}
ovs_driver._core_plugin = core_plugin
ctx = context.get_admin_context()
mock_subports = [mock.Mock(port_id='sport_id1'),
mock.Mock(port_id='sport_id2')]
mock_trunk_obj = mock.Mock(sub_ports=mock_subports, port_id='port_id',
id='trunk_id')
with mock.patch.object(ovs_driver, '_get_trunk') as mock_get_trunk:
mock_get_trunk.return_value = mock_trunk_obj
updated_port = {
'port': {portbindings.HOST_ID: 'host_id',
'device_owner': trunk_consts.TRUNK_SUBPORT_OWNER}}
ovs_driver._update_subport_binding(ctx, 'trunk_id')
core_plugin.update_port.assert_has_calls(
[mock.call(ctx, 'sport_id1', updated_port),
mock.call(ctx, 'sport_id2', updated_port)], any_order=True)


@@ -59,7 +59,7 @@ pyOpenSSL>=17.1.0 # Apache-2.0
python-novaclient>=9.1.0 # Apache-2.0
openstacksdk>=0.31.2 # Apache-2.0
python-designateclient>=2.7.0 # Apache-2.0
os-vif>=1.15.1 # Apache-2.0
os-vif>=3.1.0 # Apache-2.0
futurist>=1.2.0 # Apache-2.0
tooz>=1.58.0 # Apache-2.0
wmi>=1.4.9;sys_platform=='win32' # MIT


@@ -41,10 +41,6 @@
- ^zuul.d/(?!(project)).*\.yaml
vars:
tox_envlist: integrated-network
# NOTE(ralonsoh): to be removed once LP#1997025 is fixed.
tempest_exclude_regex: "\
(^tempest.api.compute.admin.test_live_migration.LiveAutoBlockMigrationV225Test.test_live_migration_with_trunk)|\
(^tempest.api.compute.admin.test_live_migration.LiveMigrationTest.test_live_migration_with_trunk)"
devstack_localrc:
CIRROS_VERSION: 0.5.1
DEFAULT_IMAGE_NAME: cirros-0.5.1-x86_64-uec
@@ -171,10 +167,6 @@
timeout: 10800
irrelevant-files: *openvswitch-irrelevant-files
vars:
# NOTE(ralonsoh): to be removed once LP#1997025 is fixed.
tempest_exclude_regex: "\
(^tempest.api.compute.admin.test_live_migration.LiveAutoBlockMigrationV225Test.test_live_migration_with_trunk)|\
(^tempest.api.compute.admin.test_live_migration.LiveMigrationTest.test_live_migration_with_trunk)"
tox_envlist: integrated-network
devstack_localrc:
CIRROS_VERSION: 0.5.1