diff --git a/neutron/agent/l3/agent.py b/neutron/agent/l3/agent.py
index 142b209b6ce..0cd9028c13a 100644
--- a/neutron/agent/l3/agent.py
+++ b/neutron/agent/l3/agent.py
@@ -575,6 +575,10 @@ class L3NATAgent(ha.AgentMixin,
                         ns_manager.keep_ext_net(ext_net_id)
                 elif is_snat_agent:
                     ns_manager.ensure_snat_cleanup(r['id'])
+                # For HA routers check that DB state matches actual state
+                if r.get('ha'):
+                    self.check_ha_state_for_router(
+                        r['id'], r.get(l3_constants.HA_ROUTER_STATE_KEY))
                 update = queue.RouterUpdate(
                     r['id'],
                     queue.PRIORITY_SYNC_ROUTERS_TASK,
diff --git a/neutron/agent/l3/ha.py b/neutron/agent/l3/ha.py
index 178e0f4266a..c9fef216d0b 100644
--- a/neutron/agent/l3/ha.py
+++ b/neutron/agent/l3/ha.py
@@ -22,12 +22,17 @@ import webob
 
 from neutron._i18n import _LI
 from neutron.agent.linux import utils as agent_utils
+from neutron.common import constants
 from neutron.notifiers import batch_notifier
 
 LOG = logging.getLogger(__name__)
 
 KEEPALIVED_STATE_CHANGE_SERVER_BACKLOG = 4096
 
+TRANSLATION_MAP = {'master': constants.HA_ROUTER_STATE_ACTIVE,
+                   'backup': constants.HA_ROUTER_STATE_STANDBY,
+                   'fault': constants.HA_ROUTER_STATE_STANDBY}
+
 
 class KeepalivedStateChangeHandler(object):
     def __init__(self, agent):
@@ -77,6 +82,21 @@ class AgentMixin(object):
             self._calculate_batch_duration(), self.notify_server)
         eventlet.spawn(self._start_keepalived_notifications_server)
 
+    def _get_router_info(self, router_id):
+        try:
+            return self.router_info[router_id]
+        except KeyError:
+            LOG.info(_LI('Router %s is not managed by this agent. It was '
+                         'possibly deleted concurrently.'), router_id)
+
+    def check_ha_state_for_router(self, router_id, current_state):
+        ri = self._get_router_info(router_id)
+        if ri and current_state != TRANSLATION_MAP[ri.ha_state]:
+            LOG.debug("Updating server with state %(state)s for router "
+                      "%(router_id)s", {'router_id': router_id,
+                                        'state': ri.ha_state})
+            self.state_change_notifier.queue_event((router_id, ri.ha_state))
+
     def _start_keepalived_notifications_server(self):
         state_change_server = (
             L3AgentKeepalivedStateChangeServer(self, self.conf))
@@ -97,11 +117,8 @@ class AgentMixin(object):
                  {'router_id': router_id,
                   'state': state})
 
-        try:
-            ri = self.router_info[router_id]
-        except KeyError:
-            LOG.info(_LI('Router %s is not managed by this agent. It was '
-                         'possibly deleted concurrently.'), router_id)
+        ri = self._get_router_info(router_id)
+        if ri is None:
             return
 
         self._configure_ipv6_ra_on_ext_gw_port_if_necessary(ri, state)
@@ -144,11 +161,7 @@ class AgentMixin(object):
             ri.disable_radvd()
 
     def notify_server(self, batched_events):
-        translation_map = {'master': 'active',
-                           'backup': 'standby',
-                           'fault': 'standby'}
-
-        translated_states = dict((router_id, translation_map[state]) for
+        translated_states = dict((router_id, TRANSLATION_MAP[state]) for
                                  router_id, state in batched_events)
         LOG.debug('Updating server with HA routers states %s',
                   translated_states)
diff --git a/neutron/db/l3_hamode_db.py b/neutron/db/l3_hamode_db.py
index c68f88440ce..2854c90436e 100644
--- a/neutron/db/l3_hamode_db.py
+++ b/neutron/db/l3_hamode_db.py
@@ -620,16 +620,19 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
         """
         with context.session.begin(subtransactions=True):
             bindings = self.get_ha_router_port_bindings(context, [router_id])
-            dead_agents = [
-                binding.agent for binding in bindings
-                if binding.state == n_const.HA_ROUTER_STATE_ACTIVE and
-                not (binding.agent.is_active and binding.agent.admin_state_up)]
-
-            for dead_agent in dead_agents:
-                self.update_routers_states(
-                    context, {router_id: n_const.HA_ROUTER_STATE_STANDBY},
-                    dead_agent.host)
-
+            dead_agents = []
+            active = [binding for binding in bindings
+                      if binding.state == n_const.HA_ROUTER_STATE_ACTIVE]
+            # Check dead agents only if we have more than one active agent
+            if len(active) > 1:
+                dead_agents = [binding.agent for binding in active
+                               if not (binding.agent.is_active and
+                                       binding.agent.admin_state_up)]
+                for dead_agent in dead_agents:
+                    self.update_routers_states(
+                        context,
+                        {router_id: n_const.HA_ROUTER_STATE_STANDBY},
+                        dead_agent.host)
             if dead_agents:
                 return self.get_ha_router_port_bindings(context, [router_id])
             return bindings
diff --git a/neutron/tests/unit/agent/l3/test_agent.py b/neutron/tests/unit/agent/l3/test_agent.py
index 7d934198289..3df1cdeabc7 100644
--- a/neutron/tests/unit/agent/l3/test_agent.py
+++ b/neutron/tests/unit/agent/l3/test_agent.py
@@ -211,6 +211,48 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
         agent.enqueue_state_change(router.id, 'master')
         self.assertFalse(agent._update_metadata_proxy.call_count)
 
+    def test_check_ha_state_for_router_master_standby(self):
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
+        router = mock.Mock()
+        router.id = '1234'
+        router_info = mock.MagicMock()
+        agent.router_info[router.id] = router_info
+        router_info.ha_state = 'master'
+        with mock.patch.object(agent.state_change_notifier,
+                               'queue_event') as queue_event:
+            agent.check_ha_state_for_router(router.id,
+                                            n_const.HA_ROUTER_STATE_STANDBY)
+            queue_event.assert_called_once_with((router.id, 'master'))
+
+    def test_check_ha_state_for_router_standby_standby(self):
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
+        router = mock.Mock()
+        router.id = '1234'
+        router_info = mock.MagicMock()
+        agent.router_info[router.id] = router_info
+        router_info.ha_state = 'backup'
+        with mock.patch.object(agent.state_change_notifier,
+                               'queue_event') as queue_event:
+            agent.check_ha_state_for_router(router.id,
+                                            n_const.HA_ROUTER_STATE_STANDBY)
+            queue_event.assert_not_called()
+
+    def test_periodic_sync_routers_task_call_check_ha_state_for_router(self):
+        agent = l3_agent.L3NATAgentWithStateReport(HOSTNAME, self.conf)
+        ha_id = _uuid()
+        active_routers = [
+            {'id': ha_id,
+             n_const.HA_ROUTER_STATE_KEY: n_const.HA_ROUTER_STATE_STANDBY,
+             'ha': True},
+            {'id': _uuid()}]
+        self.plugin_api.get_router_ids.return_value = [r['id'] for r
+                                                       in active_routers]
+        self.plugin_api.get_routers.return_value = active_routers
+        with mock.patch.object(agent, 'check_ha_state_for_router') as check:
+            agent.periodic_sync_routers_task(agent.context)
+            check.assert_called_once_with(ha_id,
+                                          n_const.HA_ROUTER_STATE_STANDBY)
+
     def test_periodic_sync_routers_task_raise_exception(self):
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
         self.plugin_api.get_router_ids.return_value = ['fake_id']
diff --git a/neutron/tests/unit/db/test_l3_hamode_db.py b/neutron/tests/unit/db/test_l3_hamode_db.py
index 522b9c59b5d..74addfdab1b 100644
--- a/neutron/tests/unit/db/test_l3_hamode_db.py
+++ b/neutron/tests/unit/db/test_l3_hamode_db.py
@@ -190,29 +190,51 @@ class L3HATestCase(L3HATestFramework):
             self.admin_ctx, router['id'])
         self.assertEqual([], bindings)
 
-    def _assert_ha_state_for_agent_is_standby(self, router, agent):
+    def _assert_ha_state_for_agent(self, router, agent,
+                                   state=n_const.HA_ROUTER_STATE_STANDBY):
         bindings = (
             self.plugin.get_l3_bindings_hosting_router_with_ha_states(
                 self.admin_ctx, router['id']))
         agent_ids = [(a[0]['id'], a[1]) for a in bindings]
-        self.assertIn((agent['id'], 'standby'), agent_ids)
+        self.assertIn((agent['id'], state), agent_ids)
 
     def test_get_l3_bindings_hosting_router_with_ha_states_active_and_dead(
             self):
         router = self._create_router()
         self.plugin.update_routers_states(
-            self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent1['host'])
+        self.plugin.update_routers_states(
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent2['host'])
         with mock.patch.object(agent_utils, 'is_agent_down',
                                return_value=True):
-            self._assert_ha_state_for_agent_is_standby(router, self.agent1)
+            self._assert_ha_state_for_agent(router, self.agent1)
 
     def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_false(
             self):
         router = self._create_router()
         self.plugin.update_routers_states(
-            self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent1['host'])
+        self.plugin.update_routers_states(
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent2['host'])
         helpers.set_agent_admin_state(self.agent1['id'])
-        self._assert_ha_state_for_agent_is_standby(router, self.agent1)
+        self._assert_ha_state_for_agent(router, self.agent1)
+
+    def test_get_l3_bindings_hosting_router_with_ha_states_one_dead(self):
+        router = self._create_router()
+        self.plugin.update_routers_states(
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent1['host'])
+        self.plugin.update_routers_states(
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_STANDBY},
+            self.agent2['host'])
+        with mock.patch.object(agent_utils, 'is_agent_down',
+                               return_value=True):
+            self._assert_ha_state_for_agent(
+                router, self.agent1, state=n_const.HA_ROUTER_STATE_ACTIVE)
 
     def test_router_created_in_active_state(self):
         router = self._create_router()