From 27aee4a9c53ad9e8916e074f5cb7675871964448 Mon Sep 17 00:00:00 2001 From: Hynek Mlnarik Date: Wed, 10 Aug 2016 10:07:42 +0200 Subject: [PATCH] Connectivity tests for OVS agent failures/restarts Adding two tests: * A test that, for the native OpenFlow interface (i.e. not the ovs-ofctl CLI), verifies that stopping the neutron-openvswitch-agent does not disrupt network traffic. Stopping the agent also stops the OVS bridge controller, so OVS may decide to take over management of the OpenFlow rules and clear them, thereby disrupting network traffic. * A test that creates two ports in a single network, then starts pinging one from the other while restarting OVS agents. The test verifies that no packet is lost during OVS agent restarts. Change-Id: I2cd1195fc0622c8c8d614f00e9dd6884ad388d69 Related-Bug: 1514056 Related-Bug: 1607787 --- neutron/tests/common/net_helpers.py | 5 +- neutron/tests/fullstack/resources/client.py | 12 +- neutron/tests/fullstack/resources/config.py | 9 +- .../tests/fullstack/resources/environment.py | 7 + neutron/tests/fullstack/resources/process.py | 41 ++++- neutron/tests/fullstack/test_connectivity.py | 155 +++++++++++++++++- 6 files changed, 210 insertions(+), 19 deletions(-) diff --git a/neutron/tests/common/net_helpers.py b/neutron/tests/common/net_helpers.py index 3917d2ef89f..58907635ab3 100644 --- a/neutron/tests/common/net_helpers.py +++ b/neutron/tests/common/net_helpers.py @@ -128,9 +128,10 @@ def assert_async_ping(src_namespace, dst_ip, timeout=1, count=1, interval=1): @contextlib.contextmanager -def async_ping(namespace, ips): +def async_ping(namespace, ips, timeout=1, count=10): with futures.ThreadPoolExecutor(max_workers=len(ips)) as executor: - fs = [executor.submit(assert_async_ping, namespace, ip, count=10) + fs = [executor.submit(assert_async_ping, namespace, ip, count=count, + timeout=timeout) for ip in ips] yield lambda: all(f.done() for f in fs) futures.wait(fs) diff --git a/neutron/tests/fullstack/resources/client.py 
b/neutron/tests/fullstack/resources/client.py index 7f14f3a4428..dc07b8f2b0a 100644 --- a/neutron/tests/fullstack/resources/client.py +++ b/neutron/tests/fullstack/resources/client.py @@ -61,12 +61,22 @@ class ClientFixture(fixtures.Fixture): return self._create_resource(resource_type, spec) - def create_network(self, tenant_id, name=None, external=False): + def create_network(self, tenant_id, name=None, external=False, + network_type=None, segmentation_id=None, + physical_network=None): resource_type = 'network' name = name or utils.get_rand_name(prefix=resource_type) spec = {'tenant_id': tenant_id, 'name': name} spec['router:external'] = external + + if segmentation_id is not None: + spec['provider:segmentation_id'] = segmentation_id + if network_type is not None: + spec['provider:network_type'] = network_type + if physical_network is not None: + spec['provider:physical_network'] = physical_network + return self._create_resource(resource_type, spec) def create_subnet(self, tenant_id, network_id, diff --git a/neutron/tests/fullstack/resources/config.py b/neutron/tests/fullstack/resources/config.py index fc51067da95..f1723a8edcd 100644 --- a/neutron/tests/fullstack/resources/config.py +++ b/neutron/tests/fullstack/resources/config.py @@ -25,6 +25,8 @@ from neutron.tests.common import config_fixtures from neutron.tests.common.exclusive_resources import port from neutron.tests.common import helpers as c_helpers +PHYSICAL_NETWORK_NAME = "physnet1" + class ConfigFixture(fixtures.Fixture): """A fixture that holds an actual Neutron configuration. 
@@ -129,7 +131,7 @@ class ML2ConfigFixture(ConfigFixture): 'mechanism_drivers': mechanism_drivers, }, 'ml2_type_vlan': { - 'network_vlan_ranges': 'physnet1:1000:2999', + 'network_vlan_ranges': PHYSICAL_NETWORK_NAME + ':1000:2999', }, 'ml2_type_gre': { 'tunnel_id_ranges': '1:1000', @@ -191,7 +193,8 @@ class OVSConfigFixture(ConfigFixture): super(OVSConfigFixture, self)._setUp() def _generate_bridge_mappings(self): - return 'physnet1:%s' % utils.get_rand_device_name(prefix='br-eth') + return '%s:%s' % (PHYSICAL_NETWORK_NAME, + utils.get_rand_device_name(prefix='br-eth')) def _generate_integration_bridge(self): return utils.get_rand_device_name(prefix='br-int') @@ -258,7 +261,7 @@ class LinuxBridgeConfigFixture(ConfigFixture): }) def _generate_bridge_mappings(self, device_name): - return 'physnet1:%s' % device_name + return '%s:%s' % (PHYSICAL_NETWORK_NAME, device_name) class L3ConfigFixture(ConfigFixture): diff --git a/neutron/tests/fullstack/resources/environment.py b/neutron/tests/fullstack/resources/environment.py index 614601df188..8a5b3248d02 100644 --- a/neutron/tests/fullstack/resources/environment.py +++ b/neutron/tests/fullstack/resources/environment.py @@ -297,6 +297,13 @@ class Host(fixtures.Fixture): def linuxbridge_agent(self, agent): self.agents['linuxbridge'] = agent + @property + def l2_agent(self): + if self.host_desc.l2_agent_type == constants.AGENT_TYPE_LINUXBRIDGE: + return self.linuxbridge_agent + elif self.host_desc.l2_agent_type == constants.AGENT_TYPE_OVS: + return self.ovs_agent + class Environment(fixtures.Fixture): """Represents a deployment topology. 
diff --git a/neutron/tests/fullstack/resources/process.py b/neutron/tests/fullstack/resources/process.py index 983ebbef0d8..1e7c3f58e41 100644 --- a/neutron/tests/fullstack/resources/process.py +++ b/neutron/tests/fullstack/resources/process.py @@ -21,6 +21,7 @@ import signal import fixtures from neutronclient.common import exceptions as nc_exc from neutronclient.v2_0 import client +from oslo_log import log as logging from oslo_utils import fileutils from neutron.agent.linux import async_process @@ -31,6 +32,8 @@ from neutron.tests import base from neutron.tests.common import net_helpers from neutron.tests.fullstack import base as fullstack_base +LOG = logging.getLogger(__name__) + class ProcessFixture(fixtures.Fixture): def __init__(self, test_name, process_name, exec_name, config_filenames, @@ -66,13 +69,28 @@ class ProcessFixture(fixtures.Fixture): cmd, run_as_root=run_as_root, namespace=self.namespace ) self.process.start(block=True) + LOG.debug("Process started: %s", self.process_name) - def stop(self): + def stop(self, kill_signal=None): + kill_signal = kill_signal or self.kill_signal try: - self.process.stop(block=True, kill_signal=self.kill_signal) + self.process.stop(block=True, kill_signal=kill_signal) except async_process.AsyncProcessException as e: if "Process is not running" not in str(e): raise + LOG.debug("Process stopped: %s", self.process_name) + + def restart(self, executor=None): + def _restart(): + self.stop() + self.start() + + LOG.debug("Restarting process: %s", self.process_name) + + if executor is None: + _restart() + else: + return executor.submit(_restart) class RabbitmqEnvironmentFixture(fixtures.Fixture): @@ -101,7 +119,18 @@ class RabbitmqEnvironmentFixture(fixtures.Fixture): utils.execute(cmd, run_as_root=True) -class NeutronServerFixture(fixtures.Fixture): +class ServiceFixture(fixtures.Fixture): + def restart(self, executor=None): + return self.process_fixture.restart(executor=executor) + + def start(self): + return 
self.process_fixture.start() + + def stop(self, kill_signal=None): + return self.process_fixture.stop(kill_signal=kill_signal) + + +class NeutronServerFixture(ServiceFixture): NEUTRON_SERVER = "neutron-server" @@ -141,7 +170,7 @@ class NeutronServerFixture(fixtures.Fixture): return client.Client(auth_strategy="noauth", endpoint_url=url) -class OVSAgentFixture(fixtures.Fixture): +class OVSAgentFixture(ServiceFixture): NEUTRON_OVS_AGENT = "neutron-openvswitch-agent" @@ -174,7 +203,7 @@ class OVSAgentFixture(fixtures.Fixture): kill_signal=signal.SIGTERM)) -class LinuxBridgeAgentFixture(fixtures.Fixture): +class LinuxBridgeAgentFixture(ServiceFixture): NEUTRON_LINUXBRIDGE_AGENT = "neutron-linuxbridge-agent" @@ -206,7 +235,7 @@ class LinuxBridgeAgentFixture(fixtures.Fixture): ) -class L3AgentFixture(fixtures.Fixture): +class L3AgentFixture(ServiceFixture): NEUTRON_L3_AGENT = "neutron-l3-agent" diff --git a/neutron/tests/fullstack/test_connectivity.py b/neutron/tests/fullstack/test_connectivity.py index 9cda06abb0c..54d65746693 100644 --- a/neutron/tests/fullstack/test_connectivity.py +++ b/neutron/tests/fullstack/test_connectivity.py @@ -12,11 +12,18 @@ # License for the specific language governing permissions and limitations # under the License. 
+from concurrent import futures +import signal + from neutron_lib import constants +from oslo_log import log as logging from oslo_utils import uuidutils import testscenarios +from neutron.common import utils as common_utils +from neutron.tests.common import net_helpers from neutron.tests.fullstack import base +from neutron.tests.fullstack.resources import config from neutron.tests.fullstack.resources import environment from neutron.tests.fullstack.resources import machine from neutron.tests.fullstack import utils @@ -24,6 +31,10 @@ from neutron.tests.unit import testlib_api load_tests = testlib_api.module_load_tests +SEGMENTATION_ID = 1234 + +LOG = logging.getLogger(__name__) + class BaseConnectivitySameNetworkTest(base.BaseFullStackTestCase): @@ -31,6 +42,8 @@ class BaseConnectivitySameNetworkTest(base.BaseFullStackTestCase): ovsdb_interface = None arp_responder = False + num_hosts = 3 + def setUp(self): host_descriptions = [ # There's value in enabling L3 agents registration when l2pop @@ -40,7 +53,8 @@ class BaseConnectivitySameNetworkTest(base.BaseFullStackTestCase): l3_agent=self.l2_pop, of_interface=self.of_interface, ovsdb_interface=self.ovsdb_interface, - l2_agent_type=self.l2_agent_type) for _ in range(3)] + l2_agent_type=self.l2_agent_type) + for _ in range(self.num_hosts)] env = environment.Environment( environment.EnvironmentDescription( network_type=self.network_type, @@ -49,23 +63,39 @@ class BaseConnectivitySameNetworkTest(base.BaseFullStackTestCase): host_descriptions) super(BaseConnectivitySameNetworkTest, self).setUp(env) - def _test_connectivity(self): - tenant_uuid = uuidutils.generate_uuid() + def _prepare_network(self, tenant_uuid): + net_args = {'network_type': self.network_type} + if self.network_type in ['flat', 'vlan']: + net_args['physical_network'] = config.PHYSICAL_NETWORK_NAME + if self.network_type in ['vlan', 'gre', 'vxlan']: + net_args['segmentation_id'] = SEGMENTATION_ID - network = self.safe_client.create_network(tenant_uuid) + 
network = self.safe_client.create_network(tenant_uuid, **net_args) self.safe_client.create_subnet( tenant_uuid, network['id'], '20.0.0.0/24') - vms = machine.FakeFullstackMachinesList([ + return network + + def _prepare_vms_in_net(self, tenant_uuid, network): + vms = machine.FakeFullstackMachinesList( self.useFixture( machine.FakeFullstackMachine( - self.environment.hosts[i], + host, network['id'], tenant_uuid, self.safe_client)) - for i in range(3)]) + for host in self.environment.hosts) vms.block_until_all_boot() + return vms + + def _prepare_vms_in_single_network(self): + tenant_uuid = uuidutils.generate_uuid() + network = self._prepare_network(tenant_uuid) + return self._prepare_vms_in_net(tenant_uuid, network) + + def _test_connectivity(self): + vms = self._prepare_vms_in_single_network() vms.ping_all() @@ -87,6 +117,61 @@ class TestOvsConnectivitySameNetwork(BaseConnectivitySameNetworkTest): self._test_connectivity() +class TestOvsConnectivitySameNetworkOnOvsBridgeControllerStop( + BaseConnectivitySameNetworkTest): + + num_hosts = 2 + + l2_agent_type = constants.AGENT_TYPE_OVS + network_scenarios = [ + ('VXLAN', {'network_type': 'vxlan', + 'l2_pop': False}), + ('GRE and l2pop', {'network_type': 'gre', + 'l2_pop': True}), + ('VLANs', {'network_type': 'vlan', + 'l2_pop': False})] + + # Do not test for CLI ofctl interface as controller is irrelevant for CLI + scenarios = testscenarios.multiply_scenarios( + network_scenarios, + [(m, v) for (m, v) in utils.get_ovs_interface_scenarios() + if v['of_interface'] != 'ovs-ofctl']) + + def _test_controller_timeout_does_not_break_connectivity(self, + kill_signal=None): + # Environment preparation is effectively the same as connectivity test + vms = self._prepare_vms_in_single_network() + + ns0 = vms[0].namespace + ip1 = vms[1].ip + + LOG.debug("Stopping agents (hence also OVS bridge controllers)") + for host in self.environment.hosts: + if kill_signal is not None: + host.l2_agent.stop(kill_signal=kill_signal) + else: + 
+ host.l2_agent.stop() + + # Ping to make sure that 3 x 5 seconds is overcome even under a high + # load. The time was chosen to match three times inactivity_probe time, + # which is the time after which the OVS vswitchd + # treats the controller as dead and starts managing the bridge + # by itself when the fail mode setting is not set to secure (see + # ovs-vsctl man page for further details) + with net_helpers.async_ping(ns0, [ip1], timeout=2, count=25) as done: + common_utils.wait_until_true( + done, + exception=RuntimeError("Networking interrupted after " + "controllers have vanished")) + + def test_controller_timeout_does_not_break_connectivity_sigterm(self): + self._test_controller_timeout_does_not_break_connectivity() + + def test_controller_timeout_does_not_break_connectivity_sigkill(self): + self._test_controller_timeout_does_not_break_connectivity( + signal.SIGKILL) + + class TestLinuxBridgeConnectivitySameNetwork(BaseConnectivitySameNetworkTest): l2_agent_type = constants.AGENT_TYPE_LINUXBRIDGE @@ -101,3 +186,59 @@ class TestLinuxBridgeConnectivitySameNetwork(BaseConnectivitySameNetworkTest): def test_connectivity(self): self._test_connectivity() + + +class TestUninterruptedConnectivityOnL2AgentRestart( + BaseConnectivitySameNetworkTest): + + num_hosts = 2 + + ovs_agent_scenario = [('OVS', + {'l2_agent_type': constants.AGENT_TYPE_OVS})] + lb_agent_scenario = [('LB', + {'l2_agent_type': constants.AGENT_TYPE_LINUXBRIDGE})] + + network_scenarios = [ + ('Flat network', {'network_type': 'flat', + 'l2_pop': False}), + ('VLANs', {'network_type': 'vlan', + 'l2_pop': False}), + ('VXLAN', {'network_type': 'vxlan', + 'l2_pop': False}), + ] + scenarios = ( + testscenarios.multiply_scenarios(ovs_agent_scenario, network_scenarios, + utils.get_ovs_interface_scenarios()) + + testscenarios.multiply_scenarios(lb_agent_scenario, network_scenarios) + ) + + def test_l2_agent_restart(self, agent_restart_timeout=20): + # Environment preparation is effectively the same as 
connectivity test + vms = self._prepare_vms_in_single_network() + + ns0 = vms[0].namespace + ip1 = vms[1].ip + agents = [host.l2_agent for host in self.environment.hosts] + + # Restart agents on all nodes simultaneously while pinging across + # the hosts. The ping has to cross int and phys bridges and travel + # via the central bridge as the VMs are on separate hosts. + with net_helpers.async_ping(ns0, [ip1], timeout=2, + count=agent_restart_timeout) as done: + LOG.debug("Restarting agents") + executor = futures.ThreadPoolExecutor(max_workers=len(agents)) + restarts = [agent.restart(executor=executor) + for agent in agents] + + futures.wait(restarts, timeout=agent_restart_timeout) + + self.assertTrue(all([r.done() for r in restarts])) + LOG.debug("Restarting agents - done") + + # It is necessary to give agents time to initialize + # because some crucial steps (e.g. setting up bridge flows) + # happen only after RPC is established + common_utils.wait_until_true( + done, + exception=RuntimeError("Could not ping the other VM, L2 agent " + "restart leads to network disruption"))