Fix broken VF-LAG functionality in os-net-config

Because of a race in activating VF-LAG after moving the second
sriov_pf interface to switchdev mode on NVIDIA/Mellanox NICs, we may
bind the sriov_vfs while the LAG is not yet active.
Another reason VF-LAG functionality breaks is that we run
ifdown/ifup on the sriov_pfs after binding the VFs (when the linux_bond
is a member of an ovs_bridge).

To fix this, we bind the VFs only after ensuring that the LAG is
active, and we move the ifdown/ifup ahead of the binding step.

Closes-Bug: #2020085
Change-Id: If0cad8c856ee62064205b9a88f0148980653fcb2
This commit is contained in:
waleedm 2023-05-18 12:54:29 +00:00
parent 0350a82f19
commit b1a7c9c5f0
4 changed files with 120 additions and 5 deletions

View File

@ -1004,6 +1004,9 @@ class LinuxBond(_BaseOpts):
self.bonding_options = bonding_options
self.ethtool_opts = ethtool_opts
for member in self.members:
if isinstance(member, SriovPF):
utils.update_sriov_pf_map(member.name, member.numvfs, False,
lag_candidate=True)
if isinstance(member, SriovVF):
LinuxBond.update_vf_config(member)
member.linux_bond_name = name

View File

@ -50,6 +50,7 @@ MLNX5_VDPA_KMODS = [
]
MAX_RETRIES = 10
MLNX_LAG_PATH = "/sys/kernel/debug/mlx5/{pf_pci}/lag/state"
PF_FUNC_RE = re.compile(r"\.(\d+)$", 0)
VF_PCI_RE = re.compile(r'/[\d]{4}\:(\d+):(\d+)\.(\d+)/net/[^\/]+$')
@ -303,6 +304,7 @@ def configure_sriov_pf(execution_from_cli=False, restart_openvswitch=False):
sriov_map = common.get_sriov_map()
dpdk_vfs_pcis_list = []
vf_lag_sriov_pfs_list = []
trigger_udev_rule = False
# Cleanup the previous config by puppet-tripleo
@ -372,6 +374,10 @@ def configure_sriov_pf(execution_from_cli=False, restart_openvswitch=False):
# Configure switchdev mode
configure_switchdev(item['name'])
# Add sriovpf to vf_lag_sriov_pfs_list if it's
# a linux bond member (lag_candidate)
if item.get('lag_candidate', False):
vf_lag_sriov_pfs_list.append(item['name'])
# Adding a udev rule to rename vf-representors
else:
trigger_udev_rule = add_udev_rule_for_vdpa_representors(
@ -387,6 +393,12 @@ def configure_sriov_pf(execution_from_cli=False, restart_openvswitch=False):
if execution_from_cli:
if_up_interface(item['name'])
if restart_openvswitch:
restart_ovs_and_pfs_netdevs()
if vf_lag_sriov_pfs_list and execution_from_cli:
_wait_for_lag_creation(vf_lag_sriov_pfs_list)
if dpdk_vfs_pcis_list and not vdpa:
sriov_bind_pcis_map = {_MLNX_DRIVER: dpdk_vfs_pcis_list}
if not execution_from_cli:
@ -400,8 +412,6 @@ def configure_sriov_pf(execution_from_cli=False, restart_openvswitch=False):
trigger_udev_rules()
udev_monitor_stop(observer)
if restart_openvswitch:
restart_ovs_and_pfs_netdevs()
def _wait_for_uplink_rep_creation(pf_name):
@ -416,6 +426,26 @@ def _wait_for_uplink_rep_creation(pf_name):
raise RuntimeError(f"{pf_name}: Timeout waiting uplink representor")
def _wait_for_lag_creation(lag_sriov_pf_list):
    """Wait until the LAG state file of every listed PF reads "active".

    For each PF name, the state file path is built from MLNX_LAG_PATH
    using the value returned by get_pf_pci(). The file is read up to
    MAX_RETRIES times, sleeping one second after each read that is not
    "active". PFs whose state file does not exist are skipped with a
    warning.

    :param lag_sriov_pf_list: iterable of SR-IOV PF interface names
    :raises RuntimeError: when a PF's state file never reads "active"
        within MAX_RETRIES reads
    """
    for sriov_pf in lag_sriov_pf_list:
        lag_path = MLNX_LAG_PATH.format(pf_pci=get_pf_pci(sriov_pf))

        # Nothing to poll when the state file is missing.
        if not os.path.exists(lag_path):
            logger.warning(f"Lag path {lag_path} does not exist for this "
                           "kernel, skipping..")
            continue

        for i in range(MAX_RETRIES):
            if common.get_file_data(lag_path).strip() == "active":
                logger.info(f"VF-LAG is enabled for interface {sriov_pf}"
                            f" after {i} retries")
                break
            time.sleep(1)
        else:
            # The loop ran to completion without seeing "active".
            raise RuntimeError("VF-LAG is not created for interface"
                               f" {sriov_pf} after {i} retries")
def create_rep_link_name_script():
with open(_REP_LINK_NAME_FILE, "w") as f:
f.write(_REP_LINK_NAME_DATA)

View File

@ -243,6 +243,82 @@ class TestUtils(base.TestCase):
self.assertEqual(1, len(pf_map))
self.assertListEqual(pf_final, pf_map)
def test_update_sriov_pf_map_new_with_lag_candidate(self):
    """Adding a PF with lag_candidate=True writes the flag to the map.

    The config file is not seeded by this test; get_numvfs is stubbed
    to report 0.
    """
    self.stub_out('os_net_config.sriov_config.get_numvfs',
                  lambda pf_name: 0)

    utils.update_sriov_pf_map('eth1', 10, False, lag_candidate=True)

    expected_map = [{'device_type': 'pf', 'link_mode': 'legacy',
                     'name': 'eth1', 'numvfs': 10, 'vdpa': False,
                     'lag_candidate': True}]
    raw = common.get_file_data(common.SRIOV_CONFIG_FILE)
    written_map = yaml.safe_load(raw) if raw else []
    self.assertEqual(1, len(written_map))
    self.assertListEqual(expected_map, written_map)
def test_update_sriov_pf_map_exist_with_lag_candidate(self):
    """An existing entry's lag_candidate flips from False to True.

    The config file is seeded with an eth1 entry carrying
    lag_candidate=False; get_numvfs is stubbed to report 10.
    """
    self.stub_out('os_net_config.sriov_config.get_numvfs',
                  lambda pf_name: 10)

    seeded_map = [{'device_type': 'pf', 'link_mode': 'legacy',
                   'name': 'eth1', 'numvfs': 10, 'promisc': 'on',
                   'vdpa': False, 'lag_candidate': False}]
    utils.write_yaml_config(common.SRIOV_CONFIG_FILE, seeded_map)

    utils.update_sriov_pf_map('eth1', 10, False, lag_candidate=True)

    expected_map = [{'device_type': 'pf', 'link_mode': 'legacy',
                     'name': 'eth1', 'numvfs': 10, 'promisc': 'on',
                     'vdpa': False, 'lag_candidate': True}]
    raw = common.get_file_data(common.SRIOV_CONFIG_FILE)
    written_map = yaml.safe_load(raw) if raw else []
    self.assertEqual(1, len(written_map))
    self.assertListEqual(expected_map, written_map)
def test_update_sriov_pf_map_exist_with_lag_candidate_not_exist_true(
        self):
    """lag_candidate=True is added to an entry that lacked the key.

    The config file is seeded with an eth1 entry that has no
    lag_candidate key; get_numvfs is stubbed to report 10.
    """
    self.stub_out('os_net_config.sriov_config.get_numvfs',
                  lambda pf_name: 10)

    seeded_map = [{'device_type': 'pf', 'link_mode': 'legacy',
                   'name': 'eth1', 'numvfs': 10, 'promisc': 'on',
                   'vdpa': False}]
    utils.write_yaml_config(common.SRIOV_CONFIG_FILE, seeded_map)

    utils.update_sriov_pf_map('eth1', 10, False, lag_candidate=True)

    expected_map = [{'device_type': 'pf', 'link_mode': 'legacy',
                     'name': 'eth1', 'numvfs': 10, 'promisc': 'on',
                     'vdpa': False, 'lag_candidate': True}]
    raw = common.get_file_data(common.SRIOV_CONFIG_FILE)
    written_map = yaml.safe_load(raw) if raw else []
    self.assertEqual(1, len(written_map))
    self.assertListEqual(expected_map, written_map)
def test_update_sriov_pf_map_exist_with_lag_candidate_not_exist_false(
        self):
    """lag_candidate=False is added to an entry that lacked the key.

    The config file is seeded with an eth1 entry that has no
    lag_candidate key; get_numvfs is stubbed to report 10.
    """
    self.stub_out('os_net_config.sriov_config.get_numvfs',
                  lambda pf_name: 10)

    seeded_map = [{'device_type': 'pf', 'link_mode': 'legacy',
                   'name': 'eth1', 'numvfs': 10, 'promisc': 'on',
                   'vdpa': False}]
    utils.write_yaml_config(common.SRIOV_CONFIG_FILE, seeded_map)

    utils.update_sriov_pf_map('eth1', 10, False, lag_candidate=False)

    expected_map = [{'device_type': 'pf', 'link_mode': 'legacy',
                     'name': 'eth1', 'numvfs': 10, 'promisc': 'on',
                     'vdpa': False, 'lag_candidate': False}]
    raw = common.get_file_data(common.SRIOV_CONFIG_FILE)
    written_map = yaml.safe_load(raw) if raw else []
    self.assertEqual(1, len(written_map))
    self.assertListEqual(expected_map, written_map)
def test_update_sriov_vf_map_minimal_new(self):
utils.update_sriov_vf_map('eth1', 2, 'eth1_2')
contents = common.get_file_data(common.SRIOV_CONFIG_FILE)

View File

@ -375,7 +375,8 @@ def _update_dpdk_map(ifname, pci_address, mac_address, driver):
def update_sriov_pf_map(ifname, numvfs, noop, promisc=None,
link_mode='legacy', vdpa=False, steering_mode=None):
link_mode='legacy', vdpa=False, steering_mode=None,
lag_candidate=None):
if not noop:
cur_numvfs = sriov_config.get_numvfs(ifname)
if cur_numvfs > 0 and cur_numvfs != numvfs:
@ -391,6 +392,8 @@ def update_sriov_pf_map(ifname, numvfs, noop, promisc=None,
item['link_mode'] = link_mode
if steering_mode is not None:
item['steering_mode'] = steering_mode
if lag_candidate is not None:
item['lag_candidate'] = lag_candidate
break
else:
new_item = {}
@ -403,6 +406,8 @@ def update_sriov_pf_map(ifname, numvfs, noop, promisc=None,
new_item['link_mode'] = link_mode
if steering_mode is not None:
new_item['steering_mode'] = steering_mode
if lag_candidate is not None:
new_item['lag_candidate'] = lag_candidate
sriov_map.append(new_item)
write_yaml_config(common.SRIOV_CONFIG_FILE, sriov_map)
@ -516,8 +521,9 @@ def _configure_sriov_config_service():
def configure_sriov_pfs(execution_from_cli=False, restart_openvswitch=False):
logger.info("Configuring PFs now")
sriov_config.configure_sriov_pf(execution_from_cli=execution_from_cli,
restart_openvswitch=restart_openvswitch)
sriov_config.configure_sriov_pf(
execution_from_cli=execution_from_cli,
restart_openvswitch=restart_openvswitch)
_configure_sriov_config_service()