ovn-l3: reschedule lower priorities

If a gateway chassis is removed, we previously only plugged the hole it
left in the priorities of the LRPs. This can lead to bad choices, since we
are bound by all other currently used chassis.
By also allowing the lower priorities to be rescheduled, we get
significantly more freedom in choosing the most appropriate chassis and
prevent overloading an individual one.

As an example from the new test case:
previously all prio 2 schedules would have ended up on chassis3, but with
this change they are now also distributed to chassis4.

Partial-Bug: #2023993
Change-Id: I786ff6c0c4d3403b79819df95f9b1d6ac5e8675f
This commit is contained in:
Felix Huettner 2023-09-01 13:00:44 +02:00
parent b5f5f3def3
commit 3d5d82a197
3 changed files with 102 additions and 7 deletions

View File

@ -414,18 +414,44 @@ class OVNL3RouterPlugin(service_base.ServicePluginBase,
unhosted_gateways = self._nb_ovn.get_unhosted_gateways( unhosted_gateways = self._nb_ovn.get_unhosted_gateways(
port_physnet_dict, chassis_with_physnets, port_physnet_dict, chassis_with_physnets,
all_gw_chassis, chassis_with_azs) all_gw_chassis, chassis_with_azs)
for g_name in unhosted_gateways:
self._reschedule_lrps(unhosted_gateways)
def _reschedule_lrps(self, lrps):
# GW ports and its physnets.
port_physnet_dict = self._get_gateway_port_physnet_mapping()
# All chassis with physnets configured.
chassis_with_physnets = self._sb_ovn.get_chassis_and_physnets()
# All chassis with enable_as_gw_chassis set
all_gw_chassis = self._sb_ovn.get_gateway_chassis_from_cms_options()
chassis_with_azs = self._sb_ovn.get_chassis_and_azs()
for g_name in lrps:
physnet = port_physnet_dict.get(g_name[len(ovn_const.LRP_PREFIX):]) physnet = port_physnet_dict.get(g_name[len(ovn_const.LRP_PREFIX):])
# Remove any invalid gateway chassis from the list, otherwise # Remove any invalid gateway chassis from the list, otherwise
# we can have a situation where all existing_chassis are invalid # we can have a situation where all existing_chassis are invalid
existing_chassis = self._nb_ovn.get_gateway_chassis_binding(g_name) existing_chassis = self._nb_ovn.get_gateway_chassis_binding(g_name)
primary = existing_chassis[0] if existing_chassis else None primary = existing_chassis[0] if existing_chassis else None
az_hints = self._nb_ovn.get_gateway_chassis_az_hints(g_name) az_hints = self._nb_ovn.get_gateway_chassis_az_hints(g_name)
existing_chassis = self.scheduler.filter_existing_chassis( filtered_existing_chassis = \
nb_idl=self._nb_ovn, gw_chassis=all_gw_chassis, self.scheduler.filter_existing_chassis(
physnet=physnet, chassis_physnets=chassis_with_physnets, nb_idl=self._nb_ovn, gw_chassis=all_gw_chassis,
existing_chassis=existing_chassis, az_hints=az_hints, physnet=physnet,
chassis_with_azs=chassis_with_azs) chassis_physnets=chassis_with_physnets,
existing_chassis=existing_chassis, az_hints=az_hints,
chassis_with_azs=chassis_with_azs)
if existing_chassis != filtered_existing_chassis:
first_diff = None
for i in range(len(filtered_existing_chassis)):
if existing_chassis[i] != filtered_existing_chassis[i]:
first_diff = i
break
if first_diff is not None:
LOG.debug(
"A chassis for this gateway has been filtered. "
"Rebalancing priorities %s and lower", first_diff)
filtered_existing_chassis = filtered_existing_chassis[
:max(first_diff, 1)]
candidates = self._ovn_client.get_candidates_for_scheduling( candidates = self._ovn_client.get_candidates_for_scheduling(
physnet, cms=all_gw_chassis, physnet, cms=all_gw_chassis,
@ -433,7 +459,7 @@ class OVNL3RouterPlugin(service_base.ServicePluginBase,
availability_zone_hints=az_hints) availability_zone_hints=az_hints)
chassis = self.scheduler.select( chassis = self.scheduler.select(
self._nb_ovn, g_name, candidates=candidates, self._nb_ovn, g_name, candidates=candidates,
existing_chassis=existing_chassis) existing_chassis=filtered_existing_chassis)
if primary and primary != chassis[0]: if primary and primary != chassis[0]:
if primary not in chassis: if primary not in chassis:
LOG.debug("Primary gateway chassis %(old)s " LOG.debug("Primary gateway chassis %(old)s "

View File

@ -1651,6 +1651,69 @@ class TestOVNL3RouterPlugin(test_mech_driver.Ml2PluginV2TestCase):
self.nb_idl().get_unhosted_gateways.assert_called_once_with( self.nb_idl().get_unhosted_gateways.assert_called_once_with(
{'foo-1': 'physnet1'}, mock.ANY, mock.ANY, mock.ANY) {'foo-1': 'physnet1'}, mock.ANY, mock.ANY, mock.ANY)
@mock.patch('neutron.plugins.ml2.drivers.ovn.mech_driver.mech_driver.'
'OVNMechanismDriver.list_availability_zones', lambda *_: [])
@mock.patch('neutron.services.ovn_l3.plugin.OVNL3RouterPlugin.'
'_get_gateway_port_physnet_mapping')
def test_schedule_unhosted_gateways_rebalances_lower_prios(self, get_gppm):
# Checks that, when a chassis disappears from the existing bindings of an
# LRP, only the bindings ranked above the removed chassis are handed to
# the scheduler as "existing", so that all lower priorities get
# rescheduled (while at least the first remaining chassis is always kept).
unhosted_gws = ['lrp-foo-1', 'lrp-foo-2', 'lrp-foo-3']
# Map every gateway port name (LRP prefix stripped) to 'physnet1'.
get_gppm.return_value = {k[len(ovn_const.LRP_PREFIX):]: 'physnet1'
for k in unhosted_gws}
# chassis2 is left out here since we assume it has been removed
chassis_mappings = {
'chassis1': ['physnet1'],
'chassis3': ['physnet1'],
'chassis4': ['physnet1'],
}
chassis = ['chassis1', 'chassis3', 'chassis4']
self.sb_idl().get_chassis_and_physnets.return_value = (
chassis_mappings)
self.sb_idl().get_gateway_chassis_from_cms_options.return_value = (
chassis)
self.nb_idl().get_unhosted_gateways.return_value = unhosted_gws
self.mock_candidates.return_value = chassis
# All ports have 4 chassis (including chassis2, which has been removed).
# The ports are not perfectly balanced (but this is realistic after a
# few router creations and deletions).
# side_effect yields one binding list per call, in the order of
# unhosted_gws.
existing_port_bindings = [
['chassis1', 'chassis2', 'chassis3', 'chassis4'],
['chassis2', 'chassis4', 'chassis3', 'chassis1'],
['chassis4', 'chassis3', 'chassis1', 'chassis2']]
self.nb_idl().get_gateway_chassis_binding.side_effect = (
existing_port_bindings)
# Port 1: chassis2 held the second priority, so everything besides the
#         first chassis is rescheduled.
# Port 2: chassis2 was the first chassis, so everything besides the new
#         first chassis (chassis4) is rescheduled.
# Port 3: chassis2 was the last chassis, so all remaining chassis are
#         kept and only the last entry is dropped.
self.mock_schedule.side_effect = [
['chassis1', 'chassis4', 'chassis3'],
['chassis4', 'chassis3', 'chassis1'],
['chassis4', 'chassis3', 'chassis1']]
self.l3_inst.schedule_unhosted_gateways()
# Candidates are expected to be requested once per port, each time with
# the same chassis data.
self.mock_candidates.assert_has_calls([
mock.call(mock.ANY,
chassis_physnets=chassis_mappings,
cms=chassis, availability_zone_hints=[])] * 3)
# NOTE(review): presumably mock_schedule stands in for the scheduler's
# select(); the last argument is then the filtered list of existing
# chassis -- confirm against the test class setUp.
self.mock_schedule.assert_has_calls([
mock.call(self.nb_idl(), 'lrp-foo-1',
['chassis1', 'chassis3', 'chassis4'],
['chassis1']),
mock.call(self.nb_idl(), 'lrp-foo-2',
['chassis1', 'chassis3', 'chassis4'],
['chassis4']),
mock.call(self.nb_idl(), 'lrp-foo-3',
['chassis1', 'chassis3', 'chassis4'],
['chassis4', 'chassis3', 'chassis1'])])
# make sure that the primary chassis stays untouched, i.e. each port is
# updated with exactly the list returned by the scheduler
self.nb_idl().update_lrouter_port.assert_has_calls([
mock.call('lrp-foo-1',
gateway_chassis=['chassis1', 'chassis4', 'chassis3']),
mock.call('lrp-foo-2',
gateway_chassis=['chassis4', 'chassis3', 'chassis1']),
mock.call('lrp-foo-3',
gateway_chassis=['chassis4', 'chassis3', 'chassis1'])])
@mock.patch('neutron.plugins.ml2.plugin.Ml2Plugin.get_network') @mock.patch('neutron.plugins.ml2.plugin.Ml2Plugin.get_network')
@mock.patch('neutron.plugins.ml2.plugin.Ml2Plugin.get_networks') @mock.patch('neutron.plugins.ml2.plugin.Ml2Plugin.get_networks')
@mock.patch('neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb.' @mock.patch('neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb.'

View File

@ -0,0 +1,6 @@
---
other:
- |
The OVN L3 scheduler will now update lower priorities of existing LRPs in
case of a chassis change. This can create increased load on OVN during
chassis shutdown, but improves the load distribution of LRPs.