Connectivity tests for OVS agent failures/restarts

Adding two tests:

* A test that verifies, for the native (non-CLI) OpenFlow interface, that
  stopping the ovs-neutron-agent does not disrupt network traffic. Stopping
  the agent also stops the OVS bridge controller, so OVS may decide to
  take over management of the OpenFlow rules and clear them, thereby
  causing network traffic disruption.

* A test that creates two ports in a single network, then starts
  pinging one from the other while restarting OVS agents. The test verifies
  that no packet is lost during OVS agent restarts.

Change-Id: I2cd1195fc0622c8c8d614f00e9dd6884ad388d69
Related-Bug: 1514056
Related-Bug: 1607787
(cherry picked from commit 27aee4a9c5)
This commit is contained in:
Hynek Mlnarik 2016-08-10 10:07:42 +02:00 committed by Terry Wilson
parent d963792542
commit 81bb6aa348
6 changed files with 210 additions and 19 deletions

View File

@ -127,9 +127,10 @@ def assert_async_ping(src_namespace, dst_ip, timeout=1, count=1, interval=1):
@contextlib.contextmanager
def async_ping(namespace, ips):
def async_ping(namespace, ips, timeout=1, count=10):
with futures.ThreadPoolExecutor(max_workers=len(ips)) as executor:
fs = [executor.submit(assert_async_ping, namespace, ip, count=10)
fs = [executor.submit(assert_async_ping, namespace, ip, count=count,
timeout=timeout)
for ip in ips]
yield lambda: all(f.done() for f in fs)
futures.wait(fs)

View File

@ -61,12 +61,22 @@ class ClientFixture(fixtures.Fixture):
return self._create_resource(resource_type, spec)
def create_network(self, tenant_id, name=None, external=False):
def create_network(self, tenant_id, name=None, external=False,
network_type=None, segmentation_id=None,
physical_network=None):
resource_type = 'network'
name = name or utils.get_rand_name(prefix=resource_type)
spec = {'tenant_id': tenant_id, 'name': name}
spec['router:external'] = external
if segmentation_id is not None:
spec['provider:segmentation_id'] = segmentation_id
if network_type is not None:
spec['provider:network_type'] = network_type
if physical_network is not None:
spec['provider:physical_network'] = physical_network
return self._create_resource(resource_type, spec)
def create_subnet(self, tenant_id, network_id,

View File

@ -25,6 +25,8 @@ from neutron.tests.common import config_fixtures
from neutron.tests.common.exclusive_resources import port
from neutron.tests.common import helpers as c_helpers
PHYSICAL_NETWORK_NAME = "physnet1"
class ConfigFixture(fixtures.Fixture):
"""A fixture that holds an actual Neutron configuration.
@ -129,7 +131,7 @@ class ML2ConfigFixture(ConfigFixture):
'mechanism_drivers': mechanism_drivers,
},
'ml2_type_vlan': {
'network_vlan_ranges': 'physnet1:1000:2999',
'network_vlan_ranges': PHYSICAL_NETWORK_NAME + ':1000:2999',
},
'ml2_type_gre': {
'tunnel_id_ranges': '1:1000',
@ -192,7 +194,8 @@ class OVSConfigFixture(ConfigFixture):
super(OVSConfigFixture, self)._setUp()
def _generate_bridge_mappings(self):
return 'physnet1:%s' % utils.get_rand_device_name(prefix='br-eth')
return '%s:%s' % (PHYSICAL_NETWORK_NAME,
utils.get_rand_device_name(prefix='br-eth'))
def _generate_integration_bridge(self):
return utils.get_rand_device_name(prefix='br-int')
@ -259,7 +262,7 @@ class LinuxBridgeConfigFixture(ConfigFixture):
})
def _generate_bridge_mappings(self, device_name):
return 'physnet1:%s' % device_name
return '%s:%s' % (PHYSICAL_NETWORK_NAME, device_name)
class L3ConfigFixture(ConfigFixture):

View File

@ -297,6 +297,13 @@ class Host(fixtures.Fixture):
def linuxbridge_agent(self, agent):
self.agents['linuxbridge'] = agent
@property
def l2_agent(self):
    """Return the L2 agent fixture selected by the host description.

    The Linux Bridge agent is returned when the host's ``l2_agent_type`` is
    ``AGENT_TYPE_LINUXBRIDGE`` and the OVS agent when it is
    ``AGENT_TYPE_OVS``. Any other agent type results in ``None``.
    """
    agent_type = self.host_desc.l2_agent_type
    if agent_type == constants.AGENT_TYPE_LINUXBRIDGE:
        return self.linuxbridge_agent
    if agent_type == constants.AGENT_TYPE_OVS:
        return self.ovs_agent
    # Unknown agent type: no matching fixture.
    return None
class Environment(fixtures.Fixture):
"""Represents a deployment topology.

View File

@ -21,6 +21,7 @@ import signal
import fixtures
from neutronclient.common import exceptions as nc_exc
from neutronclient.v2_0 import client
from oslo_log import log as logging
from oslo_utils import fileutils
from neutron.agent.linux import async_process
@ -31,6 +32,8 @@ from neutron.tests import base
from neutron.tests.common import net_helpers
from neutron.tests.fullstack import base as fullstack_base
LOG = logging.getLogger(__name__)
class ProcessFixture(fixtures.Fixture):
def __init__(self, test_name, process_name, exec_name, config_filenames,
@ -66,13 +69,28 @@ class ProcessFixture(fixtures.Fixture):
cmd, run_as_root=run_as_root, namespace=self.namespace
)
self.process.start(block=True)
LOG.debug("Process started: %s", self.process_name)
def stop(self):
def stop(self, kill_signal=None):
kill_signal = kill_signal or self.kill_signal
try:
self.process.stop(block=True, kill_signal=self.kill_signal)
self.process.stop(block=True, kill_signal=kill_signal)
except async_process.AsyncProcessException as e:
if "Process is not running" not in str(e):
raise
LOG.debug("Process stopped: %s", self.process_name)
def restart(self, executor=None):
    """Stop the process and start it again.

    :param executor: optional object exposing ``submit``; when given, the
        stop/start sequence is handed to it instead of being run inline.
    :returns: whatever ``executor.submit`` returns when an executor is
        supplied, otherwise ``None`` after the inline restart finished.
    """
    def _stop_then_start():
        self.stop()
        self.start()

    LOG.debug("Restarting process: %s", self.process_name)

    if executor is not None:
        return executor.submit(_stop_then_start)

    # No executor supplied: perform the restart synchronously.
    _stop_then_start()
class RabbitmqEnvironmentFixture(fixtures.Fixture):
@ -101,7 +119,18 @@ class RabbitmqEnvironmentFixture(fixtures.Fixture):
utils.execute(cmd, run_as_root=True)
class NeutronServerFixture(fixtures.Fixture):
class ServiceFixture(fixtures.Fixture):
    """Base fixture forwarding lifecycle calls to ``self.process_fixture``.

    This class does not assign ``process_fixture`` itself; the attribute
    must already be set on the instance before any of these methods is
    called.
    """

    def start(self):
        """Start the underlying process and return the call's result."""
        process = self.process_fixture
        return process.start()

    def stop(self, kill_signal=None):
        """Stop the underlying process.

        :param kill_signal: forwarded unchanged to the process fixture's
            ``stop`` as the ``kill_signal`` keyword argument.
        """
        process = self.process_fixture
        return process.stop(kill_signal=kill_signal)

    def restart(self, executor=None):
        """Restart the underlying process.

        :param executor: forwarded unchanged to the process fixture's
            ``restart`` as the ``executor`` keyword argument.
        """
        process = self.process_fixture
        return process.restart(executor=executor)
class NeutronServerFixture(ServiceFixture):
NEUTRON_SERVER = "neutron-server"
@ -141,7 +170,7 @@ class NeutronServerFixture(fixtures.Fixture):
return client.Client(auth_strategy="noauth", endpoint_url=url)
class OVSAgentFixture(fixtures.Fixture):
class OVSAgentFixture(ServiceFixture):
NEUTRON_OVS_AGENT = "neutron-openvswitch-agent"
@ -174,7 +203,7 @@ class OVSAgentFixture(fixtures.Fixture):
kill_signal=signal.SIGTERM))
class LinuxBridgeAgentFixture(fixtures.Fixture):
class LinuxBridgeAgentFixture(ServiceFixture):
NEUTRON_LINUXBRIDGE_AGENT = "neutron-linuxbridge-agent"
@ -206,7 +235,7 @@ class LinuxBridgeAgentFixture(fixtures.Fixture):
)
class L3AgentFixture(fixtures.Fixture):
class L3AgentFixture(ServiceFixture):
NEUTRON_L3_AGENT = "neutron-l3-agent"

View File

@ -12,11 +12,18 @@
# License for the specific language governing permissions and limitations
# under the License.
from concurrent import futures
import signal
from neutron_lib import constants
from oslo_log import log as logging
from oslo_utils import uuidutils
import testscenarios
from neutron.common import utils as common_utils
from neutron.tests.common import net_helpers
from neutron.tests.fullstack import base
from neutron.tests.fullstack.resources import config
from neutron.tests.fullstack.resources import environment
from neutron.tests.fullstack.resources import machine
from neutron.tests.fullstack import utils
@ -24,6 +31,10 @@ from neutron.tests.unit import testlib_api
load_tests = testlib_api.module_load_tests
SEGMENTATION_ID = 1234
LOG = logging.getLogger(__name__)
class BaseConnectivitySameNetworkTest(base.BaseFullStackTestCase):
@ -31,6 +42,8 @@ class BaseConnectivitySameNetworkTest(base.BaseFullStackTestCase):
ovsdb_interface = None
arp_responder = False
num_hosts = 3
def setUp(self):
host_descriptions = [
# There's value in enabling L3 agents registration when l2pop
@ -40,7 +53,8 @@ class BaseConnectivitySameNetworkTest(base.BaseFullStackTestCase):
l3_agent=self.l2_pop,
of_interface=self.of_interface,
ovsdb_interface=self.ovsdb_interface,
l2_agent_type=self.l2_agent_type) for _ in range(3)]
l2_agent_type=self.l2_agent_type)
for _ in range(self.num_hosts)]
env = environment.Environment(
environment.EnvironmentDescription(
network_type=self.network_type,
@ -49,23 +63,39 @@ class BaseConnectivitySameNetworkTest(base.BaseFullStackTestCase):
host_descriptions)
super(BaseConnectivitySameNetworkTest, self).setUp(env)
def _test_connectivity(self):
tenant_uuid = uuidutils.generate_uuid()
def _prepare_network(self, tenant_uuid):
net_args = {'network_type': self.network_type}
if self.network_type in ['flat', 'vlan']:
net_args['physical_network'] = config.PHYSICAL_NETWORK_NAME
if self.network_type in ['vlan', 'gre', 'vxlan']:
net_args['segmentation_id'] = SEGMENTATION_ID
network = self.safe_client.create_network(tenant_uuid)
network = self.safe_client.create_network(tenant_uuid, **net_args)
self.safe_client.create_subnet(
tenant_uuid, network['id'], '20.0.0.0/24')
vms = machine.FakeFullstackMachinesList([
return network
def _prepare_vms_in_net(self, tenant_uuid, network):
vms = machine.FakeFullstackMachinesList(
self.useFixture(
machine.FakeFullstackMachine(
self.environment.hosts[i],
host,
network['id'],
tenant_uuid,
self.safe_client))
for i in range(3)])
for host in self.environment.hosts)
vms.block_until_all_boot()
return vms
def _prepare_vms_in_single_network(self):
tenant_uuid = uuidutils.generate_uuid()
network = self._prepare_network(tenant_uuid)
return self._prepare_vms_in_net(tenant_uuid, network)
def _test_connectivity(self):
vms = self._prepare_vms_in_single_network()
vms.ping_all()
@ -87,6 +117,61 @@ class TestOvsConnectivitySameNetwork(BaseConnectivitySameNetworkTest):
self._test_connectivity()
class TestOvsConnectivitySameNetworkOnOvsBridgeControllerStop(
        BaseConnectivitySameNetworkTest):
    """Check that stopping the L2 agents does not interrupt VM traffic.

    Two hosts with OVS agents are used. The L2 agent on every host is
    stopped and connectivity between the two VMs is then verified by a
    prolonged ping.
    """

    num_hosts = 2

    l2_agent_type = constants.AGENT_TYPE_OVS
    network_scenarios = [
        ('VXLAN', {'network_type': 'vxlan',
                   'l2_pop': False}),
        ('GRE and l2pop', {'network_type': 'gre',
                           'l2_pop': True}),
        ('VLANs', {'network_type': 'vlan',
                   'l2_pop': False})]

    # Do not test for CLI ofctl interface as controller is irrelevant for CLI
    scenarios = testscenarios.multiply_scenarios(
        network_scenarios,
        [(m, v) for (m, v) in utils.get_ovs_interface_scenarios()
         if v['of_interface'] != 'ovs-ofctl'])

    def _test_controller_timeout_does_not_break_connectivity(self,
                                                             kill_signal=None):
        """Stop all L2 agents, then ping between the two VMs.

        :param kill_signal: signal passed to each agent's ``stop``; when
            ``None`` the agent's ``stop`` is called without arguments.
        :raises RuntimeError: via ``wait_until_true`` when the ping does not
            complete.
        """
        # Environment preparation is effectively the same as connectivity test
        machines = self._prepare_vms_in_single_network()

        src_namespace = machines[0].namespace
        dst_ip = machines[1].ip

        # Forward the signal only when one was requested, so that a plain
        # stop() is issued otherwise.
        stop_kwargs = {}
        if kill_signal is not None:
            stop_kwargs['kill_signal'] = kill_signal

        LOG.debug("Stopping agents (hence also OVS bridge controllers)")
        for host in self.environment.hosts:
            host.l2_agent.stop(**stop_kwargs)

        # Keep pinging long enough to outlast 3 x 5 seconds even under a
        # high load. That span is three times the inactivity_probe time,
        # after which the OVS vswitchd treats the controller as dead and
        # starts managing the bridge by itself unless the fail mode is set
        # to secure (see the ovs-vsctl man page for further details).
        failure = RuntimeError("Networking interrupted after "
                               "controllers have vanished")
        with net_helpers.async_ping(src_namespace, [dst_ip],
                                    timeout=2, count=25) as done:
            common_utils.wait_until_true(done, exception=failure)

    def test_controller_timeout_does_not_break_connectivity_sigterm(self):
        """Run the check with the agents' default stop signal."""
        self._test_controller_timeout_does_not_break_connectivity()

    def test_controller_timeout_does_not_break_connectivity_sigkill(self):
        """Run the check with the agents stopped by SIGKILL."""
        self._test_controller_timeout_does_not_break_connectivity(
            signal.SIGKILL)
class TestLinuxBridgeConnectivitySameNetwork(BaseConnectivitySameNetworkTest):
l2_agent_type = constants.AGENT_TYPE_LINUXBRIDGE
@ -101,3 +186,59 @@ class TestLinuxBridgeConnectivitySameNetwork(BaseConnectivitySameNetworkTest):
def test_connectivity(self):
self._test_connectivity()
class TestUninterruptedConnectivityOnL2AgentRestart(
        BaseConnectivitySameNetworkTest):
    """Check that restarting the L2 agents does not interrupt VM traffic.

    Two hosts are used. While one VM pings the other, the L2 agent on every
    host is restarted concurrently; the test fails if a restart does not
    finish in time, if a restart raises, or if the ping does not complete.
    """

    num_hosts = 2

    ovs_agent_scenario = [('OVS',
                           {'l2_agent_type': constants.AGENT_TYPE_OVS})]
    lb_agent_scenario = [('LB',
                          {'l2_agent_type': constants.AGENT_TYPE_LINUXBRIDGE})]

    network_scenarios = [
        ('Flat network', {'network_type': 'flat',
                          'l2_pop': False}),
        ('VLANs', {'network_type': 'vlan',
                   'l2_pop': False}),
        ('VXLAN', {'network_type': 'vxlan',
                   'l2_pop': False}),
    ]
    scenarios = (
        testscenarios.multiply_scenarios(ovs_agent_scenario, network_scenarios,
                                         utils.get_ovs_interface_scenarios()) +
        testscenarios.multiply_scenarios(lb_agent_scenario, network_scenarios)
    )

    def test_l2_agent_restart(self, agent_restart_timeout=20):
        """Restart all L2 agents at once while pinging between the VMs.

        :param agent_restart_timeout: seconds to wait for the restarts to
            finish; also used as the number of pings sent.
        :raises RuntimeError: via ``wait_until_true`` when the ping does not
            complete.
        """
        # Environment preparation is effectively the same as connectivity test
        vms = self._prepare_vms_in_single_network()

        ns0 = vms[0].namespace
        ip1 = vms[1].ip
        agents = [host.l2_agent for host in self.environment.hosts]

        # Restart agents on all nodes simultaneously while pinging across
        # the hosts. The ping has to cross int and phys bridges and travels
        # via central bridge as the vms are on separate hosts.
        with net_helpers.async_ping(ns0, [ip1], timeout=2,
                                    count=agent_restart_timeout) as done:
            LOG.debug("Restarting agents")
            executor = futures.ThreadPoolExecutor(max_workers=len(agents))
            try:
                restarts = [agent.restart(executor=executor)
                            for agent in agents]

                futures.wait(restarts, timeout=agent_restart_timeout)

                self.assertTrue(all(r.done() for r in restarts))
                # A future is "done" even when its callable raised; asking
                # for the result re-raises such an error here instead of
                # letting a failed restart go unnoticed.
                for restart in restarts:
                    restart.result()
            finally:
                # Release the pool without blocking, so that a hung restart
                # cannot stall the test beyond the timeout above.
                executor.shutdown(wait=False)
            LOG.debug("Restarting agents - done")

            # It is necessary to give agents time to initialize
            # because some crucial steps (e.g. setting up bridge flows)
            # happen only after RPC is established
            common_utils.wait_until_true(
                done,
                exception=RuntimeError("Could not ping the other VM, L2 agent "
                                       "restart leads to network disruption"))