Add check for ha state

If all agents are shown as standby, it is possible that a state change
was lost due to problems with RabbitMQ. This change adds a check of the
HA state in fetch_and_sync_all_routers. If the state differs, the agent
notifies the server that the state should be changed.

Also change _get_bindings_and_update_router_state_for_dead_agents
to set standby for a dead agent only when there is more than one
active agent.

Change-Id: If5596eb24041ea9fae1d5d2563dcaf655c5face7
Closes-Bug: #1648242
This commit is contained in:
AKamyshnikova 2016-12-16 16:00:59 +04:00 committed by Ann Taraday
parent 12191ed9cd
commit 1927da1bc7
5 changed files with 110 additions and 25 deletions

View File

@ -575,6 +575,10 @@ class L3NATAgent(ha.AgentMixin,
ns_manager.keep_ext_net(ext_net_id)
elif is_snat_agent:
ns_manager.ensure_snat_cleanup(r['id'])
# For HA routers check that DB state matches actual state
if r.get('ha'):
self.check_ha_state_for_router(
r['id'], r.get(l3_constants.HA_ROUTER_STATE_KEY))
update = queue.RouterUpdate(
r['id'],
queue.PRIORITY_SYNC_ROUTERS_TASK,

View File

@ -22,12 +22,17 @@ import webob
from neutron._i18n import _LI
from neutron.agent.linux import utils as agent_utils
from neutron.common import constants
from neutron.notifiers import batch_notifier
LOG = logging.getLogger(__name__)
KEEPALIVED_STATE_CHANGE_SERVER_BACKLOG = 4096
TRANSLATION_MAP = {'master': constants.HA_ROUTER_STATE_ACTIVE,
'backup': constants.HA_ROUTER_STATE_STANDBY,
'fault': constants.HA_ROUTER_STATE_STANDBY}
class KeepalivedStateChangeHandler(object):
def __init__(self, agent):
@ -77,6 +82,21 @@ class AgentMixin(object):
self._calculate_batch_duration(), self.notify_server)
eventlet.spawn(self._start_keepalived_notifications_server)
def _get_router_info(self, router_id):
    """Return the RouterInfo for router_id, or None if not managed here.

    A missing entry is expected when the router was deleted
    concurrently, so it is only logged at INFO level.
    """
    try:
        ri = self.router_info[router_id]
    except KeyError:
        LOG.info(_LI('Router %s is not managed by this agent. It was '
                     'possibly deleted concurrently.'), router_id)
        return None
    return ri
def check_ha_state_for_router(self, router_id, current_state):
    """Notify the server if its recorded HA state disagrees with ours.

    :param router_id: ID of the HA router to check.
    :param current_state: HA state the server currently has for this
        router (an HA_ROUTER_STATE_* constant), or None.
    """
    ri = self._get_router_info(router_id)
    if ri is None:
        return
    # Use .get() so an unexpected keepalived state (one with no entry
    # in TRANSLATION_MAP, e.g. an empty/corrupt state file) does not
    # raise KeyError out of the periodic sync task; we simply skip it.
    translated_state = TRANSLATION_MAP.get(ri.ha_state)
    if translated_state is None or translated_state == current_state:
        return
    LOG.debug("Updating server with state %(state)s for router "
              "%(router_id)s", {'router_id': router_id,
                                'state': ri.ha_state})
    self.state_change_notifier.queue_event((router_id, ri.ha_state))
def _start_keepalived_notifications_server(self):
state_change_server = (
L3AgentKeepalivedStateChangeServer(self, self.conf))
@ -97,11 +117,8 @@ class AgentMixin(object):
{'router_id': router_id,
'state': state})
try:
ri = self.router_info[router_id]
except KeyError:
LOG.info(_LI('Router %s is not managed by this agent. It was '
'possibly deleted concurrently.'), router_id)
ri = self._get_router_info(router_id)
if ri is None:
return
self._configure_ipv6_ra_on_ext_gw_port_if_necessary(ri, state)
@ -144,10 +161,7 @@ class AgentMixin(object):
ri.disable_radvd()
def notify_server(self, batched_events):
translation_map = {'master': 'active',
'backup': 'standby',
'fault': 'standby'}
translated_states = dict((router_id, translation_map[state]) for
translated_states = dict((router_id, TRANSLATION_MAP[state]) for
router_id, state in batched_events)
LOG.debug('Updating server with HA routers states %s',
translated_states)

View File

@ -620,16 +620,19 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
"""
with context.session.begin(subtransactions=True):
bindings = self.get_ha_router_port_bindings(context, [router_id])
dead_agents = [
binding.agent for binding in bindings
if binding.state == n_const.HA_ROUTER_STATE_ACTIVE and
not (binding.agent.is_active and binding.agent.admin_state_up)]
for dead_agent in dead_agents:
self.update_routers_states(
context, {router_id: n_const.HA_ROUTER_STATE_STANDBY},
dead_agent.host)
dead_agents = []
active = [binding for binding in bindings
if binding.state == n_const.HA_ROUTER_STATE_ACTIVE]
# Check dead agents only if we have more than one active agent
if len(active) > 1:
dead_agents = [binding.agent for binding in active
if not (binding.agent.is_active and
binding.agent.admin_state_up)]
for dead_agent in dead_agents:
self.update_routers_states(
context,
{router_id: n_const.HA_ROUTER_STATE_STANDBY},
dead_agent.host)
if dead_agents:
return self.get_ha_router_port_bindings(context, [router_id])
return bindings

View File

@ -211,6 +211,48 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
agent.enqueue_state_change(router.id, 'master')
self.assertFalse(agent._update_metadata_proxy.call_count)
def test_check_ha_state_for_router_master_standby(self):
    # The server believes the router is standby while the local
    # keepalived state is master: a correction must be queued.
    agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
    router_id = '1234'
    ri = mock.MagicMock()
    ri.ha_state = 'master'
    agent.router_info[router_id] = ri
    with mock.patch.object(agent.state_change_notifier,
                           'queue_event') as queue_event:
        agent.check_ha_state_for_router(router_id,
                                        n_const.HA_ROUTER_STATE_STANDBY)
    queue_event.assert_called_once_with((router_id, 'master'))
def test_check_ha_state_for_router_standby_standby(self):
    # 'backup' translates to standby, which matches the server's
    # view: nothing should be queued.
    agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
    router_id = '1234'
    ri = mock.MagicMock()
    ri.ha_state = 'backup'
    agent.router_info[router_id] = ri
    with mock.patch.object(agent.state_change_notifier,
                           'queue_event') as queue_event:
        agent.check_ha_state_for_router(router_id,
                                        n_const.HA_ROUTER_STATE_STANDBY)
    queue_event.assert_not_called()
def test_periodic_sync_routers_task_call_check_ha_state_for_router(self):
    # The periodic sync must verify the HA state of HA routers only.
    agent = l3_agent.L3NATAgentWithStateReport(HOSTNAME, self.conf)
    ha_id = _uuid()
    routers = [
        {'id': ha_id,
         n_const.HA_ROUTER_STATE_KEY: n_const.HA_ROUTER_STATE_STANDBY,
         'ha': True},
        {'id': _uuid()},
    ]
    self.plugin_api.get_router_ids.return_value = [r['id']
                                                  for r in routers]
    self.plugin_api.get_routers.return_value = routers
    with mock.patch.object(agent, 'check_ha_state_for_router') as check:
        agent.periodic_sync_routers_task(agent.context)
    check.assert_called_once_with(ha_id, n_const.HA_ROUTER_STATE_STANDBY)
def test_periodic_sync_routers_task_raise_exception(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.plugin_api.get_router_ids.return_value = ['fake_id']

View File

@ -190,29 +190,51 @@ class L3HATestCase(L3HATestFramework):
self.admin_ctx, router['id'])
self.assertEqual([], bindings)
def _assert_ha_state_for_agent_is_standby(self, router, agent):
def _assert_ha_state_for_agent(self, router, agent,
state=n_const.HA_ROUTER_STATE_STANDBY):
bindings = (
self.plugin.get_l3_bindings_hosting_router_with_ha_states(
self.admin_ctx, router['id']))
agent_ids = [(a[0]['id'], a[1]) for a in bindings]
self.assertIn((agent['id'], 'standby'), agent_ids)
self.assertIn((agent['id'], state), agent_ids)
def test_get_l3_bindings_hosting_router_with_ha_states_active_and_dead(
self):
router = self._create_router()
self.plugin.update_routers_states(
self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
self.agent1['host'])
self.plugin.update_routers_states(
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
self.agent2['host'])
with mock.patch.object(agent_utils, 'is_agent_down',
return_value=True):
self._assert_ha_state_for_agent_is_standby(router, self.agent1)
self._assert_ha_state_for_agent(router, self.agent1)
def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_false(
self):
router = self._create_router()
self.plugin.update_routers_states(
self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
self.agent1['host'])
self.plugin.update_routers_states(
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
self.agent2['host'])
helpers.set_agent_admin_state(self.agent1['id'])
self._assert_ha_state_for_agent_is_standby(router, self.agent1)
self._assert_ha_state_for_agent(router, self.agent1)
def test_get_l3_bindings_hosting_router_with_ha_states_one_dead(self):
    # With only one active binding, a dead agent must not be demoted
    # to standby; it has to keep being reported as active.
    router = self._create_router()
    states = (
        (self.agent1, n_const.HA_ROUTER_STATE_ACTIVE),
        (self.agent2, n_const.HA_ROUTER_STATE_STANDBY),
    )
    for agent, state in states:
        self.plugin.update_routers_states(
            self.admin_ctx, {router['id']: state}, agent['host'])
    with mock.patch.object(agent_utils, 'is_agent_down',
                           return_value=True):
        self._assert_ha_state_for_agent(
            router, self.agent1, state=n_const.HA_ROUTER_STATE_ACTIVE)
def test_router_created_in_active_state(self):
router = self._create_router()