Auto reschedule loadbalancers from dead agents

Similarly to what was done for 'allow_automatic_l3agent_failover',
this patch adds a periodic check to examine the status of all LBaaS
agents with loadbalancers scheduled to them.
If the agent is dead, the loadbalancer will be rescheduled to an alive
agent.

The periodic LBaaS agents status check is invoked by
add_agent_status_check_worker() which was introduced in neutron codebase in
I3a32a95489831f0d862930384309eefdc881d8f6 to allow safe process forking.

Closes-Bug: #1565511

Depends-On: I652ab029b7427c8783e4b2f0443a89ee884bf064
Change-Id: Id8d3218bf1e52722cc10ddcd34e3e734eef90658
This commit is contained in:
Nir Magnezi 2016-03-31 17:36:04 +03:00
parent 375201eddd
commit 6ef87fe033
6 changed files with 250 additions and 2 deletions

View File

@ -97,6 +97,21 @@ class LbaasAgentSchedulerDbMixin(agentschedulers_db.AgentSchedulerDbMixin,
candidates.append(agent)
return candidates
def get_down_loadbalancer_bindings(self, context, agent_dead_limit):
    """Query the loadbalancer bindings that belong to dead agents.

    A binding is selected when its agent's last heartbeat is older than
    the cutoff computed from ``agent_dead_limit`` while the agent is
    still administratively up.

    :param context: request context providing the DB ``session``
    :param agent_dead_limit: value handed to ``get_cutoff_time`` to
                             compute the heartbeat cutoff
    :returns: the (unevaluated) query of LoadbalancerAgentBinding rows
    """
    heartbeat_cutoff = self.get_cutoff_time(agent_dead_limit)
    bindings = context.session.query(LoadbalancerAgentBinding)
    bindings = bindings.join(agents_db.Agent)
    return bindings.filter(
        agents_db.Agent.heartbeat_timestamp < heartbeat_cutoff,
        agents_db.Agent.admin_state_up)
def _unschedule_loadbalancer(self, context, loadbalancer_id, agent_id):
    """Delete the binding between a loadbalancer and an agent.

    :param context: request context providing the DB ``session``
    :param loadbalancer_id: ID of the loadbalancer to unbind
    :param agent_id: ID of the agent the loadbalancer is bound to
    """
    with context.session.begin(subtransactions=True):
        # Only the binding row matching both IDs is removed.
        context.session.query(LoadbalancerAgentBinding).filter(
            LoadbalancerAgentBinding.loadbalancer_id == loadbalancer_id,
            LoadbalancerAgentBinding.agent_id == agent_id,
        ).delete()
class ChanceScheduler(object):
"""Allocate a loadbalancer agent for a vip in a random way."""

View File

@ -49,3 +49,8 @@ class MisMatchedKey(TLSException):
class CertificateStorageException(TLSException):
message = _LE('Could not store certificate: %(msg)s')
class LoadbalancerReschedulingFailed(exceptions.Conflict):
# Signals that a loadbalancer could not be rescheduled to an lbaas agent.
# The message template expects a ``loadbalancer_id`` keyword argument.
message = _LE("Failed rescheduling loadbalancer %(loadbalancer_id)s: "
"no eligible lbaas agent found.")

View File

@ -14,19 +14,25 @@
from neutron.common import rpc as n_rpc
from neutron.db import agents_db
from neutron.db import common_db_mixin
from neutron.services import provider_configuration as provconf
from neutron_lib import exceptions as n_exc
from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging as messaging
from oslo_utils import importutils
from neutron_lbaas._i18n import _
from neutron_lbaas import agent_scheduler as agent_scheduler_v2
from neutron_lbaas.common import exceptions
from neutron_lbaas.db.loadbalancer import loadbalancer_dbv2 as ldbv2
from neutron_lbaas.drivers.common import agent_callbacks
from neutron_lbaas.drivers import driver_base
from neutron_lbaas.extensions import lbaas_agentschedulerv2
from neutron_lbaas.services.loadbalancer import constants as lb_const
from neutron_lbaas.services.loadbalancer import data_models
LOG = logging.getLogger(__name__)
LB_SCHEDULERS = 'loadbalancer_schedulers'
@ -35,6 +41,11 @@ AGENT_SCHEDULER_OPTS = [
default='neutron_lbaas.agent_scheduler.ChanceScheduler',
help=_('Driver to use for scheduling '
'to a default loadbalancer agent')),
cfg.BoolOpt('allow_automatic_lbaas_agent_failover',
default=False,
help=_('Automatically reschedule loadbalancer from offline '
'to online lbaas agents. This is only supported for '
'drivers who use the neutron LBaaSv2 agent')),
]
cfg.CONF.register_opts(AGENT_SCHEDULER_OPTS)
@ -144,7 +155,46 @@ class LoadBalancerAgentApi(object):
healthmonitor=healthmonitor)
class LoadBalancerManager(driver_base.BaseLoadBalancerManager):
class LoadBalancerManager(driver_base.BaseLoadBalancerManager,
agent_scheduler_v2.LbaasAgentSchedulerDbMixin,
common_db_mixin.CommonDbMixin):
def __init__(self, driver):
# Remember the owning driver and create a LoadBalancerPluginDbv2 handle
# that this manager uses for loadbalancer DB lookups.
self.driver = driver
self.db = ldbv2.LoadBalancerPluginDbv2()
def reschedule_lbaas_from_down_agents(self):
    """Move loadbalancers off down lbaasv2 agents whose admin state is up.

    Delegates to ``reschedule_resources_from_down_agents``, supplying the
    loadbalancer-specific binding query, attribute names, reschedule
    callback and failure exception.
    """
    lbaas_hooks = dict(
        agent_type=lb_const.AGENT_TYPE_LOADBALANCERV2,
        get_down_bindings=self.get_down_loadbalancer_bindings,
        agent_id_attr='agent_id',
        resource_id_attr='loadbalancer_id',
        resource_name='loadbalancer',
        reschedule_resource=self.reschedule_loadbalancer,
        rescheduling_failed=exceptions.LoadbalancerReschedulingFailed,
    )
    self.reschedule_resources_from_down_agents(**lbaas_hooks)
def reschedule_loadbalancer(self, context, loadbalancer_id):
    """Reschedule loadbalancer to a new lbaas agent

    Remove the loadbalancer from the agent currently hosting it (when
    there is one) and schedule it again.

    :param context: request context providing the DB ``session``
    :param loadbalancer_id: ID of the loadbalancer to reschedule
    :raises LoadbalancerReschedulingFailed: if no agent is hosting the
        loadbalancer after scheduling was attempted
    """
    cur_agent = self.get_agent_hosting_loadbalancer(context,
                                                    loadbalancer_id)
    with context.session.begin(subtransactions=True):
        # The lookup may return nothing (the check further down relies on
        # exactly that), so only unbind when a binding actually exists
        # instead of failing with a TypeError on cur_agent['agent'].
        if cur_agent:
            self._unschedule_loadbalancer(context, loadbalancer_id,
                                          cur_agent['agent']['id'])
        self._schedule_loadbalancer(context, loadbalancer_id)
        new_agent = self.get_agent_hosting_loadbalancer(context,
                                                        loadbalancer_id)
        # Raised inside the transaction block so the unbind above is not
        # committed on its own when no new agent picked the resource up.
        if not new_agent:
            raise exceptions.LoadbalancerReschedulingFailed(
                loadbalancer_id=loadbalancer_id)
def _schedule_loadbalancer(self, context, loadbalancer_id):
lb_db = self.db.get_loadbalancer(context, loadbalancer_id)
self.create(context, lb_db)
def update(self, context, old_loadbalancer, loadbalancer):
super(LoadBalancerManager, self).update(context, old_loadbalancer,
@ -334,6 +384,13 @@ class AgentDriverBase(driver_base.LoadBalancerBaseDriver):
self.loadbalancer_scheduler = importutils.import_object(
lb_sched_driver)
def get_periodic_jobs(self):
    """Return the callables this driver wants run periodically.

    The list holds the loadbalancer failover job only when the
    ``allow_automatic_lbaas_agent_failover`` option is enabled;
    otherwise it is empty.
    """
    if not cfg.CONF.allow_automatic_lbaas_agent_failover:
        return []
    return [self.load_balancer.reschedule_lbaas_from_down_agents]
def start_rpc_listeners(self):
# other agent based plugin driver might already set callbacks on plugin
if hasattr(self.plugin, 'agent_callbacks'):

View File

@ -21,9 +21,11 @@ from neutron_lib.plugins import directory
from neutron.api.v2 import attributes as attrs
from neutron.api.v2 import base as napi_base
from neutron import context as ncontext
from neutron.db import agentschedulers_db
from neutron.db import servicetype_db as st_db
from neutron.extensions import flavors
from neutron.plugins.common import constants
from neutron import service
from neutron.services.flavors import flavors_plugin
from neutron.services import provider_configuration as pconf
from neutron.services import service_base
@ -55,7 +57,8 @@ def add_provider_configuration(type_manager, service_type):
pconf.ProviderConfiguration('neutron_lbaas'))
class LoadBalancerPluginv2(loadbalancerv2.LoadBalancerPluginBaseV2):
class LoadBalancerPluginv2(loadbalancerv2.LoadBalancerPluginBaseV2,
agentschedulers_db.AgentSchedulerDbMixin):
"""Implementation of the Neutron Loadbalancer Service Plugin.
This class manages the workflow of LBaaS request/response.
@ -82,8 +85,17 @@ class LoadBalancerPluginv2(loadbalancerv2.LoadBalancerPluginBaseV2):
add_provider_configuration(
self.service_type_manager, constants.LOADBALANCERV2)
self._load_drivers()
self.start_periodic_jobs()
self.start_rpc_listeners()
self.db.subscribe()
rpc_worker = service.RpcWorker([self], worker_process_count=0)
self.add_worker(rpc_worker)
def start_periodic_jobs(self):
    """Register every driver-provided periodic job as a status check.

    Drivers that do not expose ``get_periodic_jobs`` are skipped.
    """
    for driver in self.drivers.values():
        if not hasattr(driver, 'get_periodic_jobs'):
            continue
        for periodic_job in driver.get_periodic_jobs():
            self.add_agent_status_check_worker(periodic_job)
def start_rpc_listeners(self):
listeners = []

View File

@ -14,16 +14,21 @@
import mock
from neutron_lib.plugins import directory
from oslo_utils import importutils
from neutron import context
from neutron.db import servicetype_db as st_db
from neutron.plugins.common import constants
from neutron.tests.common import helpers
from neutron_lbaas.common import exceptions
from neutron_lbaas.db.loadbalancer import models
from neutron_lbaas.drivers.common import agent_driver_base
from neutron_lbaas.extensions import loadbalancerv2
from neutron_lbaas.services.loadbalancer import constants as lb_const
from neutron_lbaas.tests import base
from neutron_lbaas.tests.unit.db.loadbalancer import test_db_loadbalancerv2
from neutron_lbaas.tests.unit import test_agent_scheduler
class TestLoadBalancerPluginBase(test_db_loadbalancerv2.LbaasPluginDbTestCase):
@ -578,3 +583,145 @@ class TestLoadBalancerPluginNotificationWrapper(TestLoadBalancerPluginBase):
loadbalancerv2.EntityNotFound,
self.plugin_instance.db.get_healthmonitor,
ctx, hm_id)
class TestLoadBalancerManager(test_agent_scheduler.
LBaaSAgentSchedulerTestCase):
def setUp(self):
# The test case itself is handed to LoadBalancerManager as its "driver",
# so the attributes assigned below live on that driver object.
# NOTE(review): presumably these mirror the attributes the manager reads
# from a real agent driver -- confirm against AgentDriverBase.
super(TestLoadBalancerManager, self).setUp()
self.load_balancer = agent_driver_base.LoadBalancerManager(self)
self.agent_rpc = agent_driver_base.LoadBalancerAgentApi(
lb_const.LOADBALANCER_AGENTV2)
self.plugin = self.lbaas_plugin
self.device_driver = 'haproxy_ns'
self.loadbalancer_scheduler = importutils.import_object(
'neutron_lbaas.agent_scheduler.ChanceScheduler')
def test_reschedule_lbaas_from_down_agents(self):
# Checks that the manager delegates to
# reschedule_resources_from_down_agents() with the loadbalancer-specific
# keyword arguments; the delegate itself is patched out.
with mock.patch(
'neutron_lbaas.drivers.common.agent_driver_base.'
'LoadBalancerManager.reschedule_resources_from_down_agents'
) as mock_reschedule_resources:
self.load_balancer.reschedule_lbaas_from_down_agents()
# NOTE(review): assert_called_once_with() below already implies .called.
self.assertTrue(mock_reschedule_resources.called)
mock_reschedule_resources.assert_called_once_with(
agent_type=lb_const.AGENT_TYPE_LOADBALANCERV2,
get_down_bindings=(self.load_balancer.
get_down_loadbalancer_bindings),
agent_id_attr='agent_id',
resource_id_attr='loadbalancer_id',
resource_name='loadbalancer',
reschedule_resource=self.load_balancer.reschedule_loadbalancer,
rescheduling_failed=exceptions.LoadbalancerReschedulingFailed)
def test_loadbalancer_reschedule_from_dead_lbaas_agent(self):
# Scenario: register lbaas agents, create a loadbalancer, kill the agent
# hosting it, run the periodic reschedule and expect the hosting agent to
# differ from the original one.
self._register_agent_states(lbaas_agents=True)
with self.loadbalancer() as loadbalancer:
loadbalancer_data = loadbalancer['loadbalancer']
self.plugin.db.update_loadbalancer_provisioning_status(
self.adminContext, loadbalancer_data['id'])
original_agent = self._get_lbaas_agent_hosting_loadbalancer(
loadbalancer_data['id'])
self.assertIsNotNone(original_agent)
helpers.kill_agent(original_agent['agent']['id'])
self.load_balancer.reschedule_lbaas_from_down_agents()
rescheduled_agent = self._get_lbaas_agent_hosting_loadbalancer(
loadbalancer_data['id'])
# NOTE(review): this comparison would also hold if rescheduled_agent were
# None; consider asserting it is not None as well.
self.assertNotEqual(original_agent, rescheduled_agent)
def test_reschedule_loadbalancer_succeeded(self):
# Success path: get_agent_hosting_loadbalancer() is patched to return the
# same hosting agent for both lookups (before and after rescheduling), so
# reschedule_loadbalancer() must complete without raising.
self._register_agent_states(lbaas_agents=True)
with self.loadbalancer() as loadbalancer:
loadbalancer_data = loadbalancer['loadbalancer']
self.plugin.db.update_loadbalancer_provisioning_status(
self.adminContext, loadbalancer_data['id'])
hosting_agent = self.load_balancer.get_agent_hosting_loadbalancer(
self.adminContext, loadbalancer_data['id'])
with mock.patch(
'neutron_lbaas.drivers.common.agent_driver_base.'
'LoadBalancerManager.get_agent_hosting_loadbalancer',
side_effect=(hosting_agent, hosting_agent)
) as mock_get_agent_hosting_lb, mock.patch(
'neutron_lbaas.drivers.common.agent_driver_base.'
'LoadBalancerManager._unschedule_loadbalancer',
side_effect=self.load_balancer._unschedule_loadbalancer
) as mock_unschedule_lb, mock.patch(
'neutron_lbaas.drivers.common.agent_driver_base.'
'LoadBalancerManager._schedule_loadbalancer',
side_effect=self.load_balancer._schedule_loadbalancer
) as mock_schedule_lb:
# rescheduling is expected to succeed
self.load_balancer.reschedule_loadbalancer(
self.adminContext, loadbalancer_data['id'])
# check the usage of get_agent_hosting_loadbalancer()
self.assertTrue(mock_get_agent_hosting_lb.called)
mock_get_agent_hosting_lb.assert_called_with(
self.adminContext, loadbalancer_data['id'])
# check the usage of _unschedule_loadbalancer()
self.assertTrue(mock_unschedule_lb.called)
mock_unschedule_lb.assert_called_once_with(
self.adminContext, loadbalancer_data['id'],
hosting_agent['agent']['id'])
# check the usage of _schedule_loadbalancer()
self.assertTrue(mock_schedule_lb.called)
mock_schedule_lb.assert_called_once_with(
self.adminContext, loadbalancer_data['id'])
def test_reschedule_loadbalancer_failed(self):
# Failure path: the second get_agent_hosting_loadbalancer() lookup is
# patched to return None, so reschedule_loadbalancer() must raise
# LoadbalancerReschedulingFailed after unscheduling and scheduling once.
self._register_agent_states(lbaas_agents=True)
with self.loadbalancer() as loadbalancer:
loadbalancer_data = loadbalancer['loadbalancer']
self.plugin.db.update_loadbalancer_provisioning_status(
self.adminContext, loadbalancer_data['id'])
hosting_agent = self.load_balancer.get_agent_hosting_loadbalancer(
self.adminContext, loadbalancer_data['id'])
with mock.patch(
'neutron_lbaas.drivers.common.agent_driver_base.'
'LoadBalancerManager.get_agent_hosting_loadbalancer',
side_effect=(hosting_agent, None)
) as mock_get_agent_hosting_lb, mock.patch(
'neutron_lbaas.drivers.common.agent_driver_base.'
'LoadBalancerManager._unschedule_loadbalancer',
side_effect=self.load_balancer._unschedule_loadbalancer
) as mock_unschedule_lb, mock.patch(
'neutron_lbaas.drivers.common.agent_driver_base.'
'LoadBalancerManager._schedule_loadbalancer',
side_effect=self.load_balancer._schedule_loadbalancer
) as mock_schedule_lb:
# rescheduling is expected to fail
self.assertRaises(exceptions.LoadbalancerReschedulingFailed,
self.load_balancer.reschedule_loadbalancer,
self.adminContext, loadbalancer_data['id'])
# check the usage of get_agent_hosting_loadbalancer()
self.assertTrue(mock_get_agent_hosting_lb.called)
mock_get_agent_hosting_lb.assert_called_with(
self.adminContext, loadbalancer_data['id'])
# check the usage of _unschedule_loadbalancer()
self.assertTrue(mock_unschedule_lb.called)
mock_unschedule_lb.assert_called_once_with(
self.adminContext, loadbalancer_data['id'],
hosting_agent['agent']['id'])
# check the usage of _schedule_loadbalancer()
self.assertTrue(mock_schedule_lb.called)
mock_schedule_lb.assert_called_once_with(
self.adminContext, loadbalancer_data['id'])
def test__schedule_loadbalancer(self):
# Checks that _schedule_loadbalancer() looks the loadbalancer up through
# LoadBalancerPluginDbv2.get_loadbalancer() and then calls create(); both
# collaborators are patched out.
self._register_agent_states(lbaas_agents=True)
with self.loadbalancer() as loadbalancer:
loadbalancer_data = loadbalancer['loadbalancer']
self.plugin.db.update_loadbalancer_provisioning_status(
self.adminContext, loadbalancer_data['id'])
with mock.patch(
'neutron_lbaas.db.loadbalancer.loadbalancer_dbv2.'
'LoadBalancerPluginDbv2.get_loadbalancer') as mock_get_lb,\
mock.patch(
'neutron_lbaas.drivers.common.agent_driver_base.'
'LoadBalancerManager.create') as mock_create:
self.load_balancer._schedule_loadbalancer(
self.adminContext, loadbalancer_data['id'])
self.assertTrue(mock_get_lb.called)
mock_get_lb.assert_called_once_with(self.adminContext,
loadbalancer_data['id'])
# NOTE(review): only the fact that create() was called is verified here,
# not the arguments it received.
self.assertTrue(mock_create.called)

View File

@ -0,0 +1,12 @@
---
features:
- Adds the ability to automatically reschedule load
balancers from LBaaS agents the server detects to have
died. Previously, load balancers could be scheduled
and realized across multiple LBaaS agents; however,
if a hypervisor died, the load balancers scheduled
to that node would cease operation. Now, these load
balancers will be automatically rescheduled to a
different agent. This feature is turned off by
default and is controlled via the
allow_automatic_lbaas_agent_failover configuration option.