Call update_all_ha_network_port_statuses on agent start
As explained in bug [1], when the l3 agent fails to report state to the server, its state is set to AGENT_REVIVED, triggering fetch_and_sync_all_routers, which will set all its HA network ports to DOWN, resulting in:
1) the ovs agent rewiring these ports and setting their status to ACTIVE;
2) when these ports are active, the server sending a router update to the l3 agent.

As the server, ovs and l3 agents are busy with this processing, the l3 agent may fail again to report state, repeating this process.

As the l3 agent is repeatedly processing the same routers, SIGHUPs are frequently sent to keepalived, resulting in multiple masters.

To fix this, we call update_all_ha_network_port_statuses on l3 agent start instead of calling it from fetch_and_sync_all_routers.

[1] https://bugs.launchpad.net/neutron/+bug/1731595/comments/7

Conflicts:
neutron/agent/l3/agent.py
neutron/api/rpc/handlers/l3_rpc.py

Note: The update_all_ha_network_port_statuses RPC was added only in pike and later branches. In older branches, we were using the get_router_ids RPC to invoke _update_ha_network_port_status. As we need to invoke this functionality during l3 agent start, and get_service_plugin_list() is the only available RPC that is called during l3 agent start, we call _update_ha_network_port_status from get_service_plugin_list.

Change-Id: Ia9d5549f7d53b538c9c9f93fe6aa71ffff15524a
Related-bug: #1597461
Closes-Bug: #1731595
(cherry picked from commit 9ab1ad1433)
(cherry picked from commit a6d985bbca)
This commit is contained in:
parent
609ef51148
commit
385ac553e3
|
@ -136,10 +136,10 @@ class L3PluginApi(object):
|
|||
return cctxt.call(context, 'get_agent_gateway_port',
|
||||
network_id=fip_net, host=self.host)
|
||||
|
||||
def get_service_plugin_list(self, context):
|
||||
def get_service_plugin_list(self, context, host=None):
|
||||
"""Make a call to get the list of activated services."""
|
||||
cctxt = self.client.prepare(version='1.3')
|
||||
return cctxt.call(context, 'get_service_plugin_list')
|
||||
return cctxt.call(context, 'get_service_plugin_list', host=host)
|
||||
|
||||
def update_ha_routers_states(self, context, states):
|
||||
"""Update HA routers states."""
|
||||
|
@ -207,7 +207,8 @@ class L3NATAgent(ha.AgentMixin,
|
|||
while True:
|
||||
try:
|
||||
self.neutron_service_plugins = (
|
||||
self.plugin_rpc.get_service_plugin_list(self.context))
|
||||
self.plugin_rpc.get_service_plugin_list(self.context,
|
||||
host=host))
|
||||
except oslo_messaging.RemoteError as e:
|
||||
with excutils.save_and_reraise_exception() as ctx:
|
||||
ctx.reraise = False
|
||||
|
|
|
@ -21,6 +21,7 @@ from oslo_log import log as logging
|
|||
import oslo_messaging
|
||||
import six
|
||||
|
||||
from neutron._i18n import _LI
|
||||
from neutron.common import constants as n_const
|
||||
from neutron.common import utils
|
||||
from neutron import context as neutron_context
|
||||
|
@ -82,19 +83,7 @@ class L3RpcCallback(object):
|
|||
|
||||
This will autoschedule unhosted routers to l3 agent on <host> and then
|
||||
return all ids of routers scheduled to it.
|
||||
This will also update HA network port status to down for all HA routers
|
||||
hosted on <host>. This is needed to avoid l3 agent spawning keepalived
|
||||
when l2 agent not yet wired the port. This can happen after a system
|
||||
reboot that has wiped out flows, etc and the L2 agent hasn't started up
|
||||
yet. The port will still be ACTIVE in the data model and the L3 agent
|
||||
will use that info to mistakenly think that L2 network is ready.
|
||||
By forcing into DOWN, we will require the L2 agent to essentially ack
|
||||
that the port is indeed ACTIVE by reacting to the port update and
|
||||
calling update_device_up.
|
||||
"""
|
||||
if utils.is_extension_supported(
|
||||
self.plugin, constants.PORT_BINDING_EXT_ALIAS):
|
||||
self._update_ha_network_port_status(context, host)
|
||||
if utils.is_extension_supported(
|
||||
self.l3plugin, constants.L3_AGENT_SCHEDULER_EXT_ALIAS):
|
||||
if cfg.CONF.router_auto_schedule:
|
||||
|
@ -237,7 +226,31 @@ class L3RpcCallback(object):
|
|||
return net_id
|
||||
|
||||
def get_service_plugin_list(self, context, **kwargs):
|
||||
return directory.get_plugins().keys()
|
||||
"""Returns list of activated services.
|
||||
|
||||
This will also update HA network port status to down for all HA routers
|
||||
hosted on <host>. This is needed to avoid l3 agent spawning keepalived
|
||||
when l2 agent not yet wired the port. This can happen after a system
|
||||
reboot that has wiped out flows, etc and the L2 agent hasn't started up
|
||||
yet. The port will still be ACTIVE in the data model and the L3 agent
|
||||
will use that info to mistakenly think that L2 network is ready.
|
||||
By forcing into DOWN, we will require the L2 agent to essentially ack
|
||||
that the port is indeed ACTIVE by reacting to the port update and
|
||||
calling update_device_up.
|
||||
"""
|
||||
host = kwargs.get('host')
|
||||
# "_update_ha_network_port_status()" will result more RPC calls between
|
||||
# L2 agents and server(as l2 agent will try to wire the ports). These
|
||||
# resulting RPC calls shouldn't block(or delay processing of)
|
||||
# get_plugins() and thus current RPC execution. So get_plugins() is
|
||||
# called before _update_ha_network_port_status()
|
||||
plugins = directory.get_plugins().keys()
|
||||
if host and utils.is_extension_supported(
|
||||
self.plugin, constants.PORT_BINDING_EXT_ALIAS):
|
||||
LOG.info(_LI("Host %s requested to set all its HA network "
|
||||
"ports status to DOWN."), host)
|
||||
self._update_ha_network_port_status(context, host)
|
||||
return plugins
|
||||
|
||||
@db_api.retry_db_errors
|
||||
def update_floatingip_statuses(self, context, router_id, fip_statuses):
|
||||
|
|
|
@ -1042,7 +1042,7 @@ class L3HAModeDbTestCase(L3HATestFramework):
|
|||
for port in self._get_router_port_bindings(router['id']):
|
||||
self.assertEqual(self.agent2['host'], port[portbindings.HOST_ID])
|
||||
|
||||
def test_get_router_ids_updates_ha_network_port_status(self):
|
||||
def test_get_service_plugin_list_updates_ha_network_port_status(self):
|
||||
router = self._create_router(ha=True)
|
||||
callback = l3_rpc.L3RpcCallback()
|
||||
callback._l3plugin = self.plugin
|
||||
|
@ -1069,7 +1069,7 @@ class L3HAModeDbTestCase(L3HATestFramework):
|
|||
ctx, port['id'], constants.PORT_STATUS_ACTIVE, host=host)
|
||||
port = self.core_plugin.get_port(ctx, port['id'])
|
||||
self.assertEqual(constants.PORT_STATUS_ACTIVE, port['status'])
|
||||
callback.get_router_ids(ctx, host)
|
||||
callback.get_service_plugin_list(ctx, host=host)
|
||||
port = self.core_plugin.get_port(ctx, port['id'])
|
||||
self.assertEqual(constants.PORT_STATUS_DOWN, port['status'])
|
||||
|
||||
|
|
Loading…
Reference in New Issue