From 9677cf87cb831cfd344bc63cbee23be52392c300 Mon Sep 17 00:00:00 2001 From: Kevin Benton Date: Wed, 30 Jul 2014 15:49:59 -0700 Subject: [PATCH] Option to remove routers from dead l3 agents Add a configuration-enabled periodic check to examine the status of all L3 agents with routers scheduled to them and admin_state_up set to True. If the agent is dead, the router will be rescheduled to an alive agent. Neutron considers an agent 'dead' when the server doesn't receive any heartbeat messages from the agent over the RPC channel within a given number of seconds (agent_down_time). There are various false positive scenarios where the agent may fail to report even though the node is still forwarding traffic. This is configuration driven because a dead L3 agent with active namespaces forwarding traffic and responding to ARP requests may cause issues. If the network backend does not block the dead agent's node from using the router's IP addresses, there will be a conflict between the old and new namespace. This conflict should not break east-west traffic because both namespaces will be attached to the appropriate networks and either can forward the traffic without state. However, traffic being overloaded onto the router's external network interface IP in north-south traffic will be impacted because the matching translation for port address translation will only be present on one router. Additionally, floating IPs associated to ports after the rescheduling will not work traversing the old namespace because the mapping will not be present. 
DocImpact Partial-Bug: #1174591 Change-Id: Id7d487f54ca54fdd46b7616c0969319afc0bb589 --- etc/neutron.conf | 4 ++ neutron/db/l3_agentschedulers_db.py | 62 +++++++++++++++++++ .../services/l3_router/l3_router_plugin.py | 1 + .../unit/openvswitch/test_agent_scheduler.py | 60 ++++++++++++++++++ 4 files changed, 127 insertions(+) diff --git a/etc/neutron.conf b/etc/neutron.conf index eadcf5e1f93..f552c4325d0 100644 --- a/etc/neutron.conf +++ b/etc/neutron.conf @@ -160,6 +160,10 @@ lock_path = $state_path/lock # routers to first L3 agent which sends sync_routers message to neutron server # router_auto_schedule = True +# Allow automatic rescheduling of routers from dead L3 agents with +# admin_state_up set to True to alive agents. +# allow_automatic_l3agent_failover = False + # Number of DHCP agents scheduled to host a network. This enables redundant # DHCP agents for configured networks. # dhcp_agents_per_network = 1 diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py index 3d6927218ee..fe9f6734c9c 100644 --- a/neutron/db/l3_agentschedulers_db.py +++ b/neutron/db/l3_agentschedulers_db.py @@ -12,6 +12,9 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
+import datetime +import random +import time from oslo.config import cfg import sqlalchemy as sa @@ -21,11 +24,19 @@ from sqlalchemy.orm import exc from sqlalchemy.orm import joinedload from neutron.common import constants +from neutron import context as n_ctx from neutron.db import agents_db from neutron.db import agentschedulers_db from neutron.db import model_base from neutron.extensions import l3agentscheduler from neutron import manager +from neutron.openstack.common.gettextutils import _LI, _LW +from neutron.openstack.common import log as logging +from neutron.openstack.common import loopingcall +from neutron.openstack.common import timeutils + + +LOG = logging.getLogger(__name__) L3_AGENTS_SCHEDULER_OPTS = [ cfg.StrOpt('router_scheduler_driver', @@ -34,6 +45,9 @@ L3_AGENTS_SCHEDULER_OPTS = [ 'router to a default L3 agent')), cfg.BoolOpt('router_auto_schedule', default=True, help=_('Allow auto scheduling of routers to L3 agent.')), + cfg.BoolOpt('allow_automatic_l3agent_failover', default=False, + help=_('Automatically reschedule routers from offline L3 ' + 'agents to online L3 agents.')), ] cfg.CONF.register_opts(L3_AGENTS_SCHEDULER_OPTS) @@ -59,6 +73,54 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase, router_scheduler = None + def start_periodic_agent_status_check(self): + if not cfg.CONF.allow_automatic_l3agent_failover: + LOG.info(_LI("Skipping periodic L3 agent status check because " + "automatic router rescheduling is disabled.")) + return + + self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall( + self.reschedule_routers_from_down_agents) + interval = max(cfg.CONF.agent_down_time / 2, 1) + # add random initial delay to allow agents to check in after the + # neutron server first starts. 
random to offset multiple servers + self.periodic_agent_loop.start(interval=interval, + initial_delay=random.randint(interval, interval * 2)) + + def reschedule_routers_from_down_agents(self): + """Reschedule routers from down l3 agents if admin state is up.""" + + # give agents extra time to handle transient failures + agent_dead_limit = cfg.CONF.agent_down_time * 2 + + # check for an abrupt clock change since last check. if a change is + # detected, sleep for a while to let the agents check in. + tdelta = timeutils.utcnow() - getattr(self, '_clock_jump_canary', + timeutils.utcnow()) + if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time: + LOG.warn(_LW("Time since last L3 agent reschedule check has " + "exceeded the interval between checks. Waiting " + "before check to allow agents to send a heartbeat " + "in case there was a clock adjustment.")) + time.sleep(agent_dead_limit) + self._clock_jump_canary = timeutils.utcnow() + + context = n_ctx.get_admin_context() + cutoff = timeutils.utcnow() - datetime.timedelta( + seconds=agent_dead_limit) + down_bindings = ( + context.session.query(RouterL3AgentBinding). + join(agents_db.Agent). 
+ filter(agents_db.Agent.heartbeat_timestamp < cutoff, + agents_db.Agent.admin_state_up)) + for binding in down_bindings: + LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s " + "because the agent did not report to the server in " + "the last %(dead_time)s seconds."), + {'router': binding.router_id, + 'agent': binding.l3_agent_id, + 'dead_time': agent_dead_limit}) + self.reschedule_router(context, binding.router_id) + def add_router_to_l3_agent(self, context, agent_id, router_id): """Add a l3 agent to host a router.""" router = self.get_router(context, router_id) diff --git a/neutron/services/l3_router/l3_router_plugin.py b/neutron/services/l3_router/l3_router_plugin.py index 61614d684a0..f4f4b779a26 100644 --- a/neutron/services/l3_router/l3_router_plugin.py +++ b/neutron/services/l3_router/l3_router_plugin.py @@ -64,6 +64,7 @@ class L3RouterPlugin(common_db_mixin.CommonDbMixin, self.setup_rpc() self.router_scheduler = importutils.import_object( cfg.CONF.router_scheduler_driver) + self.start_periodic_agent_status_check() def setup_rpc(self): # RPC support diff --git a/neutron/tests/unit/openvswitch/test_agent_scheduler.py b/neutron/tests/unit/openvswitch/test_agent_scheduler.py index c9057ed3363..ab855b61c93 100644 --- a/neutron/tests/unit/openvswitch/test_agent_scheduler.py +++ b/neutron/tests/unit/openvswitch/test_agent_scheduler.py @@ -15,6 +15,7 @@ import contextlib import copy +import datetime import mock from oslo.config import cfg @@ -27,6 +28,7 @@ from neutron.common import constants from neutron import context from neutron.db import agents_db from neutron.db import dhcp_rpc_base +from neutron.db import l3_agentschedulers_db from neutron.db import l3_rpc_base from neutron.extensions import agent from neutron.extensions import dhcpagentscheduler @@ -231,6 +233,9 @@ class OvsAgentSchedulerTestCaseBase(test_l3_plugin.L3NatTestCaseMixin, self.l3_notify_p = mock.patch( 'neutron.extensions.l3agentscheduler.notify') self.patched_l3_notify = 
self.l3_notify_p.start() + self.l3_periodic_p = mock.patch('neutron.db.l3_agentschedulers_db.' + 'L3AgentSchedulerDbMixin.start_periodic_agent_status_check') + self.patched_l3_periodic = self.l3_periodic_p.start() self.dhcp_notify_p = mock.patch( 'neutron.extensions.dhcpagentscheduler.notify') self.patched_dhcp_notify = self.dhcp_notify_p.start() @@ -617,6 +622,61 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase): self.assertEqual(port_list['ports'][0]['device_id'], constants.DEVICE_ID_RESERVED_DHCP_PORT) + def _take_down_agent_and_run_reschedule(self, host): + # take down the agent on host A and ensure B is alive + self.adminContext.session.begin(subtransactions=True) + query = self.adminContext.session.query(agents_db.Agent) + agt = query.filter_by(host=host).first() + agt.heartbeat_timestamp = ( + agt.heartbeat_timestamp - datetime.timedelta(hours=1)) + self.adminContext.session.commit() + + plugin = manager.NeutronManager.get_service_plugins().get( + service_constants.L3_ROUTER_NAT) + + plugin.reschedule_routers_from_down_agents() + + def _set_agent_admin_state_up(self, host, state): + self.adminContext.session.begin(subtransactions=True) + query = self.adminContext.session.query(agents_db.Agent) + agt_db = query.filter_by(host=host).first() + agt_db.admin_state_up = state + self.adminContext.session.commit() + + def test_router_reschedule_from_dead_agent(self): + with self.router(): + l3_rpc = l3_rpc_base.L3RpcCallbackMixin() + self._register_agent_states() + + # schedule the router to host A + ret_a = l3_rpc.sync_routers(self.adminContext, host=L3_HOSTA) + self._take_down_agent_and_run_reschedule(L3_HOSTA) + + # B should now pick up the router + ret_b = l3_rpc.sync_routers(self.adminContext, host=L3_HOSTB) + self.assertEqual(ret_b, ret_a) + + def test_router_no_reschedule_from_dead_admin_down_agent(self): + with self.router() as r: + l3_rpc = l3_rpc_base.L3RpcCallbackMixin() + self._register_agent_states() + + # schedule the router to host A + 
l3_rpc.sync_routers(self.adminContext, host=L3_HOSTA) + self._set_agent_admin_state_up(L3_HOSTA, False) + self._take_down_agent_and_run_reschedule(L3_HOSTA) + + # A should still have it even though it was inactive due to the + # admin_state being down + rab = l3_agentschedulers_db.RouterL3AgentBinding + binding = (self.adminContext.session.query(rab). + filter(rab.router_id == r['router']['id']).first()) + self.assertEqual(binding.l3_agent.host, L3_HOSTA) + + # B should not pick up the router + ret_b = l3_rpc.sync_routers(self.adminContext, host=L3_HOSTB) + self.assertFalse(ret_b) + def test_router_auto_schedule_with_invalid_router(self): with self.router() as router: l3_rpc = l3_rpc_base.L3RpcCallbackMixin()