Call update_all_ha_network_port_statuses on agent start

As explained in bug [1] when l3 agent fails to report state to the
server, its state is set to AGENT_REVIVED, triggering
fetch_and_sync_all_routers, which will set all its HA network ports
to DOWN, resulting in
1) ovs agent rewiring these ports and setting status to ACTIVE
2) when these ports are active, server sends router update to l3 agent
As the server, ovs and l3 agents are busy with this processing, the l3 agent
may fail to report state again, repeating this process.

As l3 agent is repeatedly processing same routers, SIGHUPs are
frequently sent to keepalived, resulting in multiple masters.

To fix this, we call update_all_ha_network_port_statuses at l3 agent
start instead of calling it from fetch_and_sync_all_routers.

[1] https://bugs.launchpad.net/neutron/+bug/1731595/comments/7
Conflicts:
	neutron/agent/l3/agent.py
        neutron/api/rpc/handlers/l3_rpc.py

Note: This RPC update_all_ha_network_port_statuses is added in only pike
and later branches. In older branches, we were using get_router_ids RPC
to invoke _update_ha_network_port_status. As we need to invoke this
functionality during l3 agent start and get_service_plugin_list() is the
only available RPC which is called during l3 agent start, we call
_update_ha_network_port_status from get_service_plugin_list.

Change-Id: Ia9d5549f7d53b538c9c9f93fe6aa71ffff15524a
Related-bug: #1597461
Closes-Bug: #1731595
(cherry picked from commit 9ab1ad1433)
(cherry picked from commit a6d985bbca)
This commit is contained in:
venkata anil 2017-11-23 18:40:30 +00:00
parent 609ef51148
commit 385ac553e3
3 changed files with 32 additions and 18 deletions

View File

@@ -136,10 +136,10 @@ class L3PluginApi(object):
return cctxt.call(context, 'get_agent_gateway_port',
network_id=fip_net, host=self.host)
def get_service_plugin_list(self, context):
def get_service_plugin_list(self, context, host=None):
"""Make a call to get the list of activated services."""
cctxt = self.client.prepare(version='1.3')
return cctxt.call(context, 'get_service_plugin_list')
return cctxt.call(context, 'get_service_plugin_list', host=host)
def update_ha_routers_states(self, context, states):
"""Update HA routers states."""
@@ -207,7 +207,8 @@ class L3NATAgent(ha.AgentMixin,
while True:
try:
self.neutron_service_plugins = (
self.plugin_rpc.get_service_plugin_list(self.context))
self.plugin_rpc.get_service_plugin_list(self.context,
host=host))
except oslo_messaging.RemoteError as e:
with excutils.save_and_reraise_exception() as ctx:
ctx.reraise = False

View File

@@ -21,6 +21,7 @@ from oslo_log import log as logging
import oslo_messaging
import six
from neutron._i18n import _LI
from neutron.common import constants as n_const
from neutron.common import utils
from neutron import context as neutron_context
@@ -82,19 +83,7 @@ class L3RpcCallback(object):
This will autoschedule unhosted routers to l3 agent on <host> and then
return all ids of routers scheduled to it.
This will also update HA network port status to down for all HA routers
hosted on <host>. This is needed to avoid l3 agent spawning keepalived
when l2 agent not yet wired the port. This can happen after a system
reboot that has wiped out flows, etc and the L2 agent hasn't started up
yet. The port will still be ACTIVE in the data model and the L3 agent
will use that info to mistakenly think that L2 network is ready.
By forcing into DOWN, we will require the L2 agent to essentially ack
that the port is indeed ACTIVE by reacting to the port update and
calling update_device_up.
"""
if utils.is_extension_supported(
self.plugin, constants.PORT_BINDING_EXT_ALIAS):
self._update_ha_network_port_status(context, host)
if utils.is_extension_supported(
self.l3plugin, constants.L3_AGENT_SCHEDULER_EXT_ALIAS):
if cfg.CONF.router_auto_schedule:
@@ -237,7 +226,31 @@ class L3RpcCallback(object):
return net_id
def get_service_plugin_list(self, context, **kwargs):
return directory.get_plugins().keys()
"""Returns list of activated services.
This will also update HA network port status to down for all HA routers
hosted on <host>. This is needed to avoid l3 agent spawning keepalived
when l2 agent not yet wired the port. This can happen after a system
reboot that has wiped out flows, etc and the L2 agent hasn't started up
yet. The port will still be ACTIVE in the data model and the L3 agent
will use that info to mistakenly think that L2 network is ready.
By forcing into DOWN, we will require the L2 agent to essentially ack
that the port is indeed ACTIVE by reacting to the port update and
calling update_device_up.
"""
host = kwargs.get('host')
# "_update_ha_network_port_status()" will result more RPC calls between
# L2 agents and server(as l2 agent will try to wire the ports). These
# resulting RPC calls shouldn't block(or delay processing of)
# get_plugins() and thus current RPC execution. So get_plugins() is
# called before _update_ha_network_port_status()
plugins = directory.get_plugins().keys()
if host and utils.is_extension_supported(
self.plugin, constants.PORT_BINDING_EXT_ALIAS):
LOG.info(_LI("Host %s requested to set all its HA network "
"ports status to DOWN."), host)
self._update_ha_network_port_status(context, host)
return plugins
@db_api.retry_db_errors
def update_floatingip_statuses(self, context, router_id, fip_statuses):

View File

@@ -1042,7 +1042,7 @@ class L3HAModeDbTestCase(L3HATestFramework):
for port in self._get_router_port_bindings(router['id']):
self.assertEqual(self.agent2['host'], port[portbindings.HOST_ID])
def test_get_router_ids_updates_ha_network_port_status(self):
def test_get_service_plugin_list_updates_ha_network_port_status(self):
router = self._create_router(ha=True)
callback = l3_rpc.L3RpcCallback()
callback._l3plugin = self.plugin
@@ -1069,7 +1069,7 @@ class L3HAModeDbTestCase(L3HATestFramework):
ctx, port['id'], constants.PORT_STATUS_ACTIVE, host=host)
port = self.core_plugin.get_port(ctx, port['id'])
self.assertEqual(constants.PORT_STATUS_ACTIVE, port['status'])
callback.get_router_ids(ctx, host)
callback.get_service_plugin_list(ctx, host=host)
port = self.core_plugin.get_port(ctx, port['id'])
self.assertEqual(constants.PORT_STATUS_DOWN, port['status'])