From 3c97dc59f53a301bae0f9ef8b4d01e8b1062ff8e Mon Sep 17 00:00:00 2001 From: Alan Meadows Date: Mon, 29 Jul 2013 08:04:01 -0700 Subject: [PATCH] Add high availability tool for neutron networking * Setup optional cron jobs to call the tool in order to find offline l3 agents and migrate them to another online agent * Supports a rebalance feature to shuffle tenants around given the scheduler for neutron is very simplistic right now * Supports rebalancing DHCP configuration to all DHCP agents Change-Id: Iaf0686c2fa93ec1dcdd644b75cb3b4528d4cabf0 --- files/default/quantum-ha-tool.py | 417 +++++++++++++++++++++++++++++++ recipes/server.rb | 19 ++ spec/server-opensuse_spec.rb | 9 +- spec/server_spec.rb | 2 + 4 files changed, 444 insertions(+), 3 deletions(-) create mode 100755 files/default/quantum-ha-tool.py diff --git a/files/default/quantum-ha-tool.py b/files/default/quantum-ha-tool.py new file mode 100755 index 00000000..291e6528 --- /dev/null +++ b/files/default/quantum-ha-tool.py @@ -0,0 +1,417 @@ +#! /usr/bin/env python +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +# Copyright 2013 AT&T Services, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +import logging +import os +import sys +import json +import argparse +from collections import OrderedDict +from random import choice +from quantumclient.quantum import client + +LOG = logging.getLogger('quantum-ha-tool') +LOG_FORMAT='%(asctime)s %(name)-12s %(levelname)-8s %(message)s' +LOG_DATE = '%m-%d %H:%M' +DESCRIPTION = "Quantum High Availability Tool" + +def parse_args(): + + # ensure environment has necessary items to authenticate + for key in ['OS_TENANT_NAME', 'OS_USERNAME', 'OS_PASSWORD', + 'OS_AUTH_URL']: + if key not in os.environ.keys(): + LOG.exception("Your environment is missing '%s'") + + ap = argparse.ArgumentParser(description=DESCRIPTION) + ap.add_argument('-d', '--debug', action='store_true', + default=False, help='Show debugging output') + ap.add_argument('-n', '--noop', action='store_true', + default=False, help='Do not do any modifying operations (dry-run)') + ap.add_argument('--l3-agent-check', action='store_true', + default=False, help='Show routers associated with offline l3 agents') + ap.add_argument('--l3-agent-migrate', action='store_true', + default=False, help='Migrate routers away from offline l3 agents') + ap.add_argument('--l3-agent-rebalance', action='store_true', + default=False, help='Rebalance router count on all l3 agents') + ap.add_argument('--replicate-dhcp', action='store_true', + default=False, help='Replicate DHCP configuration to all agents') + return ap.parse_args() + +def setup_logging(args): + level = logging.INFO + if args.debug: + level = logging.DEBUG + logging.basicConfig(level=level, format=LOG_FORMAT, date_fmt=LOG_DATE) + +def run(args): + + # instantiate client + qclient = client.Client('2.0', auth_url=os.environ['OS_AUTH_URL'], + username=os.environ['OS_USERNAME'], + tenant_name=os.environ['OS_TENANT_NAME'], + password=os.environ['OS_PASSWORD']) + + # set json return type + qclient.format = 'json' + + if args.l3_agent_check: + LOG.info("Performing L3 Agent Health Check") + l3_agent_check(qclient, args.noop) + + if args.l3_agent_migrate: + LOG.info("Performing L3 Agent Migration for Offline L3 Agents") + l3_agent_check(qclient, args.noop) + + if args.l3_agent_rebalance: + LOG.info("Rebalancing L3 Agent Router Count") + l3_agent_rebalance(qclient, args.noop) + + if args.replicate_dhcp: + LOG.info("Performing DHCP Replication of Networks to Agents") + replicate_dhcp(qclient, args.noop) + +def l3_agent_rebalance(qclient, noop=False): + """ + Rebalance l3 agent router count across agents. The number of routers + on each l3 agent will be as close as possible which should help + distribute load as new l3 agents come online. + + :param qclient: A quantumclient + :param noop: Optional noop flag + """ + + # {u'binary': u'quantum-l3-agent', u'description': None, u'admin_state_up': True, u'heartbeat_timestamp': u'2013-07-02 22:20:23', u'alive': True, u'topic': + # u'l3_agent', u'host': u'o3r3.int.san3.attcompute.com', u'agent_type': u'L3 agent', u'created_at': u'2013-07-02 14:50:58', u'started_at': u'2013-07-02 18:00:55', + # u'id': u'6efe494a-616c-41ea-9c8f-2c592f4d46ff', u'configurations': {u'router_id': u'', u'gateway_external_network_id': u'', u'handle_internal_only_routers': True, + # u'use_namespaces': True, u'routers': 5, u'interfaces': 3, u'floating_ips': 9, u'interface_driver': u'quantum.agent.linux.interface.OVSInterfaceDriver', u'ex_gw_ports': 3}}, + + l3_agent_dict={} + agents = list_agents(qclient, agent_type='L3 agent') + num_agents = len(agents) + if num_agents <= 1: + LOG.info("No rebalancing required for 1 or fewer agents") + return + + for l3_agent in agents: + num_routers=l3_agent['configurations']['routers'] + l3_agent_dict[l3_agent['id']] = list_routers_on_l3_agent(qclient, l3_agent['id']) + + ordered_l3_agent_dict = OrderedDict(sorted(l3_agent_dict.items(), key=lambda t: len(t[0]))) + ordered_l3_agent_list = list(ordered_l3_agent_dict) + num_agents = len(ordered_l3_agent_list) + LOG.info("Agent list: %s", ordered_l3_agent_list[0:(num_agents-1/2)+1]) + i=0 + for agent in ordered_l3_agent_list[0:num_agents-1/2]: + low_agent_id=ordered_l3_agent_list[i] + hgh_agent_id=ordered_l3_agent_list[-(i+1)] + + # do nothing if we end up comparing the same router + if low_agent_id == hgh_agent_id: + continue + + LOG.info("Examining low_agent=%s, high_agent=%s", low_agent_id, hgh_agent_id) + + low_agent_router_count = len(l3_agent_dict[low_agent_id]) + hgh_agent_router_count = len(l3_agent_dict[hgh_agent_id]) + + LOG.info("Low Count=%s, High Count=%s", low_agent_router_count, hgh_agent_router_count) + + for router_id in l3_agent_dict[hgh_agent_id]: + if low_agent_router_count >= hgh_agent_router_count: + break + else: + LOG.info("Migrating router=%s from agent=%s to agent=%s", router_id, hgh_agent_id, low_agent_id) + try: + if not noop: + migrate_router(router_id, hgh_agent_id, low_agent_id) + low_agent_router_count += 1 + hgh_agent_router_count -= 1 + except: + LOG.traceback("Failed to migrate router=%s from agent=%s to agent=%s" % (router_id, hgh_agent_id, low_agent_id)) + continue + i+=1 + +def l3_agent_check(qclient, noop=False): + """ + Walk the l3 agents searching for agents that are offline. Show routers + that are offline and where we would migrate them too. + + :param qclient: A quantumclient + :param noop: Optional noop flag + + """ + + migration_count = 0 + agent_list = list_agents(qclient) + agent_dead_list = agent_dead_id_list(agent_list, 'L3 agent') + agent_alive_list = agent_alive_id_list(agent_list, 'L3 agent') + LOG.info("There are %s offline L3 agents and %s online L3 agents", len(agent_dead_list), len(agent_alive_list)) + + if len(agent_dead_list) > 0: + + for agent_id in agent_dead_list: + + LOG.info("Querying agent_id=%s for routers to migrate", agent_id) + router_id_list = list_routers_on_l3_agent(qclient, agent_id) + + for router_id in router_id_list: + + try: + target_id = choice(agent_alive_list) + except: + LOG.warn("There are no l3 agents alive we could migrate routers onto") + target_id = None + + LOG.info("Would like to migrate router=%s to agent=%s", router_id, target_id) + +def l3_agent_migrate(qclient, noop=False): + """ + Walk the l3 agents searching for agents that are offline. For those that are + offline, we will retrieve a list of routers on them and migrate them to a + random l3 agent that is online. + + :param qclient: A quantumclient + :param noop: Optional noop flag + + """ + + migration_count = 0 + agent_list = list_agents(qclient) + agent_dead_list = agent_dead_id_list(agent_list, 'L3 agent') + agent_alive_list = agent_alive_id_list(agent_list, 'L3 agent') + LOG.info("There are %s offline L3 agents and %s online L3 agents", len(agent_dead_list), len(agent_alive_list)) + + if len(agent_dead_list) > 0: + + if len(agents_alive_list) < 1: + LOG.exception("There are no l3 agents alive to migrate routers onto") + + for agent_id in agent_dead_list: + + LOG.info("Querying agent_id=%s for routers to migrate", agent_id) + router_id_list = list_routers_on_l3_agent(qclient, agent_id) + + for router_id in router_id_list: + + target_id = choice(agent_alive_list) + LOG.info("Migrating router=%s to agent=%s", router_id, target_id) + router_body = {'router_id': router_id} + + try: + + if not noop: + migrate_router(router_id, agent_id, target_id) + migration_count+=1 + + except: + + LOG.exception("There was an error migrating a router") + continue + + LOG.info("%s routers required migration from offline L3 agents", migration_count) + +def replicate_dhcp(qclient, noop=False): + """ + Retrieve a network list and then probe each DHCP agent to ensure they have that + network assigned. + + :param qclient: A quantumclient + :param noop: Optional noop flag + """ + + added=0 + networks = list_networks(qclient) + network_id_list = [n['id'] for n in networks] + agents = list_agents(qclient, agent_type='DHCP agent') + LOG.info("Replicating %s networks to %s DHCP agents", len(networks), len(agents)) + for dhcp_agent_id in [a['id'] for a in agents]: + networks_on_agent = qclient.list_networks_on_dhcp_agent(dhcp_agent_id)['networks'] + network_id_on_agent = [n['id'] for n in networks_on_agent] + for network_id in network_id_list: + if network_id not in network_id_on_agent: + try: + dhcp_body = {'network_id': network_id} + if not noop: + qclient.add_network_to_dhcp_agent(dhcp_agent_id, dhcp_body) + LOG.info("Added missing network=%s to dhcp agent=%s", network_id, dhcp_agent_id) + added+=1 + except: + LOG.exception("Failed to add network_id=%s to dhcp_agent=%s", network_id, dhcp_agent_id) + continue + + LOG.info("Added %s networks to DHCP agents", added) + + +def migrate_router(qclient, router, old_agent, new_agent): + """ + Returns nothing, and raises on exception + + :param qclient: A quantumclient + :param router: The id of the router to migrate + :param old_agent: The id of the l3 agent to migrate from + :param new_agent: The id of the l3 agent to migrate to + """ + + # N.B. The quantum API will return "success" even when there is a subsequent + # failure during the add or remove process so we must check to ensure the + # router has been added or removed + + # remove the router from the dead agent + qclient.remove_router_from_l3_agent(agent_id, router_id) + + # ensure it is removed or log an error + if router_id in list_routers_on_l3_agent(qclient, agent_id): + LOG.exception("Failed to remove router_id=%s from agent_id=%s", router_id, agent_id) + + # add the router id to a live agent + qclient.add_router_to_l3_agent(target_id, router_body) + + # ensure it is removed or log an error + if router_id not in list_routers_on_l3_agent(qclient, target_id): + LOG.exception("Failed to add router_id=%s from agent_id=%s", router_id, agent_id) + + +def list_networks(qclient): + """ + Return a list of network objects + + :param qclient: A quantumclient + """ + + resp = qclient.list_networks() + LOG.debug("list_networks: %s", resp) + return resp['networks'] + +def list_dhcp_agent_networks(qclient, agent_id): + """ + Return a list of network ids assigned to a particular DHCP agent + + :param qclient: A quantumclient + :param agent_id: A DHCP agent id + """ + + resp = qclient.list_networks_on_dhcp_agent(agent_id) + LOG.debug("list_networks_on_dhcp_agent: %s", resp) + return [s['id'] for s in resp['networks']] + + + +def list_routers(qclient): + """ + Return a list of router objects + + :param qclient: A quantumclient + + # {'routers': [{u'status': u'ACTIVE', u'external_gateway_info': {u'network_id': u'b970297c-d80e-4527-86d7-e49d2da9fdef'}, u'name': u'router1', + # u'admin_state_up': True, u'tenant_id': u'5603b97ee7f047ea999e25492c7fcb23', u'routes': [], u'id': u'0a122e5c-1623-412e-8c53-a1e21d1daff8'}, + + """ + + resp = qclient.list_routers() + LOG.debug("list_routers: %s", resp) + return resp['routers'] + +def list_routers_on_l3_agent(qclient, agent_id): + """ + Return a list of router ids on an agent + + :param qclient: A quantumclient + """ + + resp = qclient.list_routers_on_l3_agent(agent_id) + LOG.debug("list_routers_on_l3_agent: %s", resp) + return [r['id'] for r in resp['routers']] + +def list_agents(qclient, agent_type=None): + """ + Return a list of agent objects + + :param qclient: A quantumclient + + # openvswitch + # + # {u'agents': [{u'binary': u'quantum-openvswitch-agent', u'description': None, u'admin_state_up': True, u'heartbeat_timestamp': u'2013-07-02 22:20:25', + # u'alive': True, u'topic': u'N/A', u'host': u'o3r3.int.san3.attcompute.com', u'agent_type': u'Open vSwitch agent', u'created_at': u'2013-07-02 14:50:57', + # u'started_at': u'2013-07-02 14:50:57', u'id': u'3a577f1d-d86e-4f1a-a395-8d4c8e4df1e2', u'configurations': {u'devices': 10}}, + # + # dhcp + # + # {u'binary': u'quantum-dhcp-agent', u'description': None, u'admin_state_up': True, u'heartbeat_timestamp': u'2013-07-02 22:20:23', u'alive': True, + # u'topic': u'dhcp_agent', u'host': u'o5r4.int.san3.attcompute.com', u'agent_type': u'DHCP agent', u'created_at': u'2013-06-26 16:21:02', u'started_at': + # u'2013-06-28 13:32:52', u'id': u'3e8be28e-05a0-472b-9288-a59f8d8d2271', u'configurations': {u'subnets': 4, u'use_namespaces': True, u'dhcp_driver': + # u'quantum.agent.linux.dhcp.Dnsmasq', u'networks': 4, u'dhcp_lease_time': 120, u'ports': 38}}, + # + # l3 + # + # {u'binary': u'quantum-l3-agent', u'description': None, u'admin_state_up': True, u'heartbeat_timestamp': u'2013-07-02 22:20:23', u'alive': True, u'topic': + # u'l3_agent', u'host': u'o3r3.int.san3.attcompute.com', u'agent_type': u'L3 agent', u'created_at': u'2013-07-02 14:50:58', u'started_at': u'2013-07-02 18:00:55', + # u'id': u'6efe494a-616c-41ea-9c8f-2c592f4d46ff', u'configurations': {u'router_id': u'', u'gateway_external_network_id': u'', u'handle_internal_only_routers': True, + # u'use_namespaces': True, u'routers': 5, u'interfaces': 3, u'floating_ips': 9, u'interface_driver': u'quantum.agent.linux.interface.OVSInterfaceDriver', u'ex_gw_ports': 3}}, + """ + + resp = qclient.list_agents() + LOG.debug("list_agents: %s", resp) + if agent_type: + filtered=[] + for agent in resp['agents']: + if agent['agent_type'] == agent_type: + filtered.append(agent) + return filtered + return resp['agents'] + +def agent_alive_id_list(agent_list, agent_type): + """ + Return a list of agents that are alive from an API list of agents + + :param agent_list: API response for list_agents() + + """ + live_list=[] + for agent in agent_list: + if agent['agent_type'] == agent_type and agent['alive'] is True: + live_list.append(agent['id']) + return live_list + +def agent_dead_id_list(agent_list, agent_type): + """ + Return a list of agents that are dead from an API list of agents + + :param agent_list: API response for list_agents() + + """ + dead_list=[] + for agent in agent_list: + if agent['agent_type'] == agent_type and agent['alive'] is False: + dead_list.append(agent['id']) + return dead_list + +if __name__ == '__main__': + + args = parse_args() + setup_logging(args) + + try: + run(args) + sys.exit(0) + except Exception as err: + print "ERROR: %s" % err + sys.exit(1) + except KeyboardInterrupt: + sys.exit(1) diff --git a/recipes/server.rb b/recipes/server.rb index 6255f50b..fd56fc38 100644 --- a/recipes/server.rb +++ b/recipes/server.rb @@ -43,3 +43,22 @@ service "quantum-server" do supports :status => true, :restart => true action :enable end + +if node["openstack"]["network"]["quantum_ha_cmd_cron"] + # ensure period checks are offset between multiple l3 agent nodes + # and assumes splay will remain constant (i.e. based on hostname) + # Generate a uniformly distributed unique number to sleep. + checksum = Digest::MD5.hexdigest(node['fqdn'] || 'unknown-hostname') + splay = node['chef_client']['splay'].to_i || 3000 + sleep_time = checksum.to_s.hex % splay + + cron "quantum-ha-healthcheck" do + minute node["openstack"]["network"]["cron_l3_healthcheck"] + command "sleep #{sleep_time} ; . /root/openrc && #{node["openstack"]["network"]["quantum_ha_cmd"]} --l3-agent-migrate" + end + + cron "quantum-ha-replicate-dhcp" do + minute node["openstack"]["network"]["cron_replicate_dhcp"] + command "sleep #{sleep_time} ; . /root/openrc && #{node["openstack"]["network"]["quantum_ha_cmd"]} --replicate-dhcp" + end +end diff --git a/spec/server-opensuse_spec.rb b/spec/server-opensuse_spec.rb index 681986d9..89b326d5 100644 --- a/spec/server-opensuse_spec.rb +++ b/spec/server-opensuse_spec.rb @@ -4,9 +4,10 @@ describe 'openstack-network::server' do describe "opensuse" do before do quantum_stubs - @chef_run = ::ChefSpec::ChefRunner.new ::OPENSUSE_OPTS + @chef_run = ::ChefSpec::ChefRunner.new ::OPENSUSE_OPTS do |n| + n.set["chef_client"]["splay"] = 300 + end @node = @chef_run.node - @chef_run.converge "openstack-network::server" end @@ -20,7 +21,9 @@ describe 'openstack-network::server' do it "does not install openvswitch package" do opts = ::OPENSUSE_OPTS.merge(:evaluate_guards => true) - chef_run = ::ChefSpec::ChefRunner.new opts + chef_run = ::ChefSpec::ChefRunner.new opts do |n| + n.set["chef_client"]["splay"] = 300 + end chef_run.converge "openstack-network::server" expect(chef_run).not_to install_package "openstack-quantum-openvswitch" diff --git a/spec/server_spec.rb b/spec/server_spec.rb index eb78485a..2aa12625 100644 --- a/spec/server_spec.rb +++ b/spec/server_spec.rb @@ -7,6 +7,7 @@ describe 'openstack-network::server' do n.set["openstack"]["mq"] = { "host" => "127.0.0.1" } + n.set["chef_client"]["splay"] = 300 end @chef_run.converge "openstack-network::server" end @@ -134,6 +135,7 @@ describe 'openstack-network::server' do before do @chef_run = ::ChefSpec::ChefRunner.new(::UBUNTU_OPTS) do |n| n.set["openstack"]["network"]["rabbit"]["ha"] = true + n.set["chef_client"]["splay"] = 300 end @chef_run.converge "openstack-network::server" end