Packets getting lost during SNAT with too many connections

We have a problem with SNAT with too many connections using the
same source and destination on the network nodes.

In addition we can see in the conntrack table that the who
"instert_failed" increases.

This might be a generic problem with conntrack and linux.
We suspect that we encounter the following "limitation / bug"
in the kernel.

There seems to be a workaround to alleviate this behavior by
setting the -random-fully flag in iptables for port consumption.

This patch fixes the problem by adding the --random-fully to
the SNAT rules.

Change-Id: I246c1f56df889bad9c7e140b56c3614124d80a19
Closes-Bug: #1814002
This commit is contained in:
Swaminathan Vasudevan 2019-02-12 11:27:51 -08:00 committed by Brian Haley
parent f8b990736b
commit 30f35e08f9
9 changed files with 122 additions and 30 deletions

View File

@ -341,12 +341,14 @@ class DvrEdgeRouter(dvr_local_router.DvrLocalRouter):
return lib_constants.FLOATINGIP_STATUS_ACTIVE
def _centralized_floating_forward_rules(self, floating_ip, fixed_ip):
to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
if self.snat_iptables_manager.random_fully:
to_source += ' --random-fully'
return [('PREROUTING', '-d %s/32 -j DNAT --to-destination %s' %
(floating_ip, fixed_ip)),
('OUTPUT', '-d %s/32 -j DNAT --to-destination %s' %
(floating_ip, fixed_ip)),
('float-snat', '-s %s/32 -j SNAT --to-source %s' %
(fixed_ip, floating_ip))]
('float-snat', to_source)]
def _set_floating_ip_nat_rules_for_centralized_floatingip(self, fip):
if fip.get(lib_constants.DVR_SNAT_BOUND):

View File

@ -74,9 +74,10 @@ class DvrLocalRouter(dvr_router_base.DvrRouterBase):
dnat_from_floatingip_to_fixedip = (
'PREROUTING', '-d %s/32 -i %s -j DNAT --to-destination %s' % (
floating_ip, rtr_2_fip_name, fixed_ip))
snat_from_fixedip_to_floatingip = (
'float-snat', '-s %s/32 -j SNAT --to-source %s' % (
fixed_ip, floating_ip))
to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
if self.iptables_manager.random_fully:
to_source += ' --random-fully'
snat_from_fixedip_to_floatingip = ('float-snat', to_source)
return [dnat_from_floatingip_to_fixedip,
snat_from_fixedip_to_floatingip]

View File

@ -172,12 +172,14 @@ class RouterInfo(object):
def floating_forward_rules(self, fip):
fixed_ip = fip['fixed_ip_address']
floating_ip = fip['floating_ip_address']
to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
if self.iptables_manager.random_fully:
to_source += ' --random-fully'
return [('PREROUTING', '-d %s/32 -j DNAT --to-destination %s' %
(floating_ip, fixed_ip)),
('OUTPUT', '-d %s/32 -j DNAT --to-destination %s' %
(floating_ip, fixed_ip)),
('float-snat', '-s %s/32 -j SNAT --to-source %s' %
(fixed_ip, floating_ip))]
('float-snat', to_source)]
def floating_mangle_rules(self, floating_ip, fixed_ip, internal_mark):
mark_traffic_to_floating_ip = (
@ -854,19 +856,21 @@ class RouterInfo(object):
self._prevent_snat_for_internal_traffic_rule(interface_name))
# Makes replies come back through the router to reverse DNAT
ext_in_mark = self.agent_conf.external_ingress_mark
snat_internal_traffic_to_floating_ip = (
'snat', '-m mark ! --mark %s/%s '
'-m conntrack --ctstate DNAT '
'-j SNAT --to-source %s'
% (ext_in_mark, lib_constants.ROUTER_MARK_MASK, ex_gw_ip))
to_source = ('-m mark ! --mark %s/%s '
'-m conntrack --ctstate DNAT '
'-j SNAT --to-source %s'
% (ext_in_mark, lib_constants.ROUTER_MARK_MASK, ex_gw_ip))
if self.iptables_manager.random_fully:
to_source += ' --random-fully'
snat_internal_traffic_to_floating_ip = ('snat', to_source)
return [dont_snat_traffic_to_internal_ports_if_not_to_floating_ip,
snat_internal_traffic_to_floating_ip]
def external_gateway_nat_snat_rules(self, ex_gw_ip, interface_name):
snat_normal_external_traffic = (
'snat', '-o %s -j SNAT --to-source %s' %
(interface_name, ex_gw_ip))
return [snat_normal_external_traffic]
to_source = '-o %s -j SNAT --to-source %s' % (interface_name, ex_gw_ip)
if self.iptables_manager.random_fully:
to_source += ' --random-fully'
return [('snat', to_source)]
def external_gateway_mangle_rules(self, interface_name):
mark = self.agent_conf.external_ingress_mark

View File

@ -38,6 +38,8 @@ from neutron._i18n import _
from neutron.agent.linux import ip_lib
from neutron.agent.linux import iptables_comments as ic
from neutron.agent.linux import utils as linux_utils
from neutron.common import _constants as n_const
from neutron.common import utils
from neutron.conf.agent import common as config
LOG = logging.getLogger(__name__)
@ -302,6 +304,9 @@ class IptablesManager(object):
# run iptables-restore without it.
use_table_lock = False
# Flag to denote iptables supports --random-fully argument
_random_fully = None
def __init__(self, _execute=None, state_less=False, use_ipv6=False,
namespace=None, binary_name=binary_name):
if _execute:
@ -473,6 +478,23 @@ class IptablesManager(object):
args = ['ip', 'netns', 'exec', self.namespace] + args
return self.execute(args, run_as_root=True).split('\n')
def _get_version(self):
# Output example is "iptables v1.6.2"
args = ['iptables', '--version']
version = str(self.execute(args, run_as_root=True).split()[1][1:])
LOG.debug("IPTables version installed: %s", version)
return version
@property
def random_fully(self):
if self._random_fully is not None:
return self._random_fully
version = self._get_version()
self.__class__._random_fully = utils.is_version_greater_equal(
version, n_const.IPTABLES_RANDOM_FULLY_VERSION)
return self._random_fully
@property
def xlock_wait_time(self):
# give agent some time to report back to server

View File

@ -56,3 +56,7 @@ AGENT_RES_PROCESSING_STEP = 100
# Number of resources for neutron to divide the large RPC
# call data sets.
RPC_RES_PROCESSING_STEP = 20
# IPtables version to support --random-fully option.
# Do not move this constant to neutron-lib, since it is temporary
IPTABLES_RANDOM_FULLY_VERSION = '1.6.2'

View File

@ -41,6 +41,7 @@ from oslo_config import cfg
from oslo_db import exception as db_exc
from oslo_log import log as logging
from oslo_utils import excutils
import pkg_resources
import six
import neutron
@ -312,6 +313,12 @@ def get_socket_address_family(ip_version):
else socket.AF_INET6))
def is_version_greater_equal(version1, version2):
"""Returns True if version1 is greater or equal than version2 else False"""
return (pkg_resources.parse_version(version1) >=
pkg_resources.parse_version(version2))
class DelayedStringRenderer(object):
"""Takes a callable and its args and calls when __str__ is called

View File

@ -18,6 +18,7 @@ from itertools import chain as iter_chain
from itertools import combinations as iter_combinations
import eventlet
import fixtures
import mock
import netaddr
from neutron_lib.agent import constants as agent_consts
@ -192,10 +193,28 @@ class BasicRouterOperationsFramework(base.BaseTestCase):
ri.process()
class IptablesFixture(fixtures.Fixture):
def _setUp(self):
# We MUST save and restore random_fully because it is a class
# attribute and could change state in some tests, which can cause
# the other router test cases to randomly fail due to race conditions.
self.random_fully = iptables_manager.IptablesManager.random_fully
iptables_manager.IptablesManager.random_fully = True
self.addCleanup(self._reset)
def _reset(self):
iptables_manager.IptablesManager.random_fully = self.random_fully
class TestBasicRouterOperations(BasicRouterOperationsFramework):
def setUp(self):
super(TestBasicRouterOperations, self).setUp()
self.useFixture(IptablesFixture())
def test_request_id_changes(self):
a = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.assertNotEqual(a.context.request_id, a.context.request_id)
self.useFixture(IptablesFixture())
def test_init_ha_conf(self):
with mock.patch('os.path.dirname', return_value='/etc/ha/'):
@ -1022,7 +1041,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
self._test_external_gateway_action('remove', router, dual_stack=True)
def _verify_snat_mangle_rules(self, nat_rules, mangle_rules, router,
negate=False):
random_fully, negate=False):
interfaces = router[lib_constants.INTERFACE_KEY]
source_cidrs = []
for iface in interfaces:
@ -1033,13 +1052,18 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
source_cidrs.append(source_cidr)
source_nat_ip = router['gw_port']['fixed_ips'][0]['ip_address']
interface_name = ('qg-%s' % router['gw_port']['id'])[:14]
mask_rule = ('-m mark ! --mark 0x2/%s -m conntrack --ctstate DNAT '
'-j SNAT --to-source %s' %
(lib_constants.ROUTER_MARK_MASK, source_nat_ip))
snat_rule = ('-o %s -j SNAT --to-source %s' %
(interface_name, source_nat_ip))
if random_fully:
mask_rule += ' --random-fully'
snat_rule += ' --random-fully'
expected_rules = [
'! -i %s ! -o %s -m conntrack ! --ctstate DNAT -j ACCEPT' %
(interface_name, interface_name),
'-o %s -j SNAT --to-source %s' % (interface_name, source_nat_ip),
'-m mark ! --mark 0x2/%s -m conntrack --ctstate DNAT '
'-j SNAT --to-source %s' %
(lib_constants.ROUTER_MARK_MASK, source_nat_ip)]
mask_rule, snat_rule]
for r in nat_rules:
if negate:
self.assertNotIn(r.rule, expected_rules)
@ -1631,7 +1655,8 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
ri.get_external_device_name = mock.Mock(return_value='exgw')
self._test_process_floating_ip_addresses_add(ri, agent)
def test_process_router_snat_disabled(self):
def _test_process_router_snat_disabled(self, random_fully):
iptables_manager.IptablesManager.random_fully = random_fully
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
router = l3_test_common.prepare_router_data(enable_snat=True)
ri = l3router.RouterInfo(agent, router['id'], router, **self.ri_kwargs)
@ -1655,10 +1680,17 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
if r not in ri.iptables_manager.ipv4['mangle'].rules]
self.assertEqual(1, len(mangle_rules_delta))
self._verify_snat_mangle_rules(nat_rules_delta, mangle_rules_delta,
router)
router, random_fully)
self.assertEqual(1, self.send_adv_notif.call_count)
def test_process_router_snat_enabled(self):
def test_process_router_snat_disabled_random_fully(self):
self._test_process_router_snat_disabled(True)
def test_process_router_snat_disabled_random_fully_false(self):
self._test_process_router_snat_disabled(False)
def _test_process_router_snat_enabled(self, random_fully):
iptables_manager.IptablesManager.random_fully = random_fully
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
router = l3_test_common.prepare_router_data(enable_snat=False)
ri = l3router.RouterInfo(agent, router['id'], router, **self.ri_kwargs)
@ -1682,9 +1714,15 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
if r not in orig_mangle_rules]
self.assertEqual(1, len(mangle_rules_delta))
self._verify_snat_mangle_rules(nat_rules_delta, mangle_rules_delta,
router)
router, random_fully)
self.assertEqual(1, self.send_adv_notif.call_count)
def test_process_router_snat_enabled_random_fully(self):
self._test_process_router_snat_enabled(True)
def test_process_router_snat_enabled_random_fully_false(self):
self._test_process_router_snat_enabled(False)
def _test_update_routing_table(self, is_snat_host=True):
router = l3_test_common.prepare_router_data()
uuid = router['id']
@ -2292,11 +2330,12 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
jump_float_rule = "-A %s-snat -j %s-float-snat" % (wrap_name,
wrap_name)
snat_rule1 = ("-A %s-snat -o iface -j SNAT --to-source %s") % (
snat_rule1 = ("-A %s-snat -o iface -j SNAT --to-source %s "
"--random-fully") % (
wrap_name, ex_gw_port['fixed_ips'][0]['ip_address'])
snat_rule2 = ("-A %s-snat -m mark ! --mark 0x2/%s "
"-m conntrack --ctstate DNAT "
"-j SNAT --to-source %s") % (
"-j SNAT --to-source %s --random-fully") % (
wrap_name, lib_constants.ROUTER_MARK_MASK,
ex_gw_port['fixed_ips'][0]['ip_address'])

View File

@ -259,9 +259,11 @@ class TestDvrRouterOperations(base.BaseTestCase):
dnat_from_floatingip_to_fixedip = (
'PREROUTING', '-d %s/32 -i %s -j DNAT --to-destination %s' % (
floating_ip, rtr_2_fip_name, fixed_ip))
snat_from_fixedip_to_floatingip = (
'float-snat', '-s %s/32 -j SNAT --to-source %s' % (
fixed_ip, floating_ip))
to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
if ri.iptables_manager.random_fully:
to_source += ' --random-fully'
snat_from_fixedip_to_floatingip = ('float-snat', to_source)
actual = ri.floating_forward_rules(fip)
expected = [dnat_from_floatingip_to_fixedip,
snat_from_fixedip_to_floatingip]

View File

@ -281,6 +281,17 @@ class TestIpVersionFromInt(base.BaseTestCase):
8)
class TestIsVersionGreaterEqual(base.BaseTestCase):
def test_is_version_greater_equal_greater(self):
self.assertTrue(utils.is_version_greater_equal('1.6.2', '1.6.0'))
def test_is_version_greater_equal_equal(self):
self.assertTrue(utils.is_version_greater_equal('1.6.2', '1.6.2'))
def test_is_version_greater_equal_less(self):
self.assertFalse(utils.is_version_greater_equal('1.6.0', '1.6.2'))
class TestDelayedStringRenderer(base.BaseTestCase):
def test_call_deferred_until_str(self):
my_func = mock.MagicMock(return_value='Brie cheese!')