Fix netns_cleanup interrupted on rwd I/O

Functional tests for netns_cleanup have been failing intermittently
in the gate lately. After thorough testing we've seen that the issue was
related to using rootwrap-daemon inside a wait_until_true loop. When
the timeout fired while utils.execute() was reading from rootwrap-daemon,
the read got interrupted and the output of the last command was not consumed.
Therefore, subsequent calls to utils.execute() would read the output of
the previous command rather than their own, leading to unexpected
results.

This fix will poll existing processes in the namespace without making
use of the wait_until_true loop. Instead, it will check elapsed time
and raise the exception if the timeout is exceeded.

Also, I'm removing the debug traces introduced in
327f7fc4d5, which helped find the root
cause of this bug.

Change-Id: Ie233261e4be36eecaf6ec6d0532f0f5e2e996cd2
Closes-Bug: #1654287
This commit is contained in:
Daniel Alvarez 2017-01-12 01:06:01 +00:00
parent ada4237905
commit 3f9f740d81
3 changed files with 24 additions and 27 deletions

View File

@ -131,14 +131,10 @@ class IPWrapper(SubProcessBase):
cmd = ['ip', 'netns', 'exec', self.namespace,
'find', SYS_NET_PATH, '-maxdepth', '1',
'-type', 'l', '-printf', '%f ']
output_str = utils.execute(
output = utils.execute(
cmd,
run_as_root=True,
log_fail_as_error=self.log_fail_as_error)
# NOTE(dalvarez): Logging the output of this call due to
# bug1654287.
LOG.debug('get_devices(): %s', output_str)
output = output_str.split()
log_fail_as_error=self.log_fail_as_error).split()
except RuntimeError:
# We could be racing with a cron job deleting namespaces.
# Just return an empty list if the namespace is deleted.

View File

@ -35,7 +35,6 @@ from neutron.agent.linux import interface
from neutron.agent.linux import ip_lib
from neutron.agent.linux import utils
from neutron.common import config
from neutron.common import utils as common_utils
from neutron.conf.agent import cmd
from neutron.conf.agent import dhcp as dhcp_config
@ -162,14 +161,19 @@ def wait_until_no_listen_pids_namespace(namespace, timeout=SIGTERM_WAITTIME):
If after timeout seconds, there are remaining processes in the namespace,
then a PidsInNamespaceException will be thrown.
"""
# Would be better to handle an eventlet.timeout.Timeout exception
# but currently there's a problem importing eventlet since it's
# doing a local import from cmd/eventlet which doesn't have a
# timeout module
common_utils.wait_until_true(
lambda: not find_listen_pids_namespace(namespace),
timeout=SIGTERM_WAITTIME,
exception=PidsInNamespaceException)
# NOTE(dalvarez): This function can block forever if
# find_listen_pids_namespace never returns, which is really unlikely. We
# can't use wait_until_true because we might get interrupted by an eventlet
# Timeout during our I/O with the rootwrap daemon, and that would lead to
# subsequent calls to utils.execute always grabbing the output of the
# previous command.
start = end = time.time()
while end - start < timeout:
if not find_listen_pids_namespace(namespace):
return
time.sleep(1)
end = time.time()
raise PidsInNamespaceException
def _kill_listen_processes(namespace, force=False):

View File

@ -274,11 +274,9 @@ class TestNetnsCleanup(base.BaseTestCase):
def test_kill_listen_processes(self):
with mock.patch.object(util, '_kill_listen_processes',
return_value=1) as mock_kill_listen:
with mock.patch('neutron.common.utils.wait_until_true')\
as wait_until_true_mock:
wait_until_true_mock.side_effect = [
util.PidsInNamespaceException,
None]
with mock.patch.object(util, 'wait_until_no_listen_pids_namespace',
side_effect=[util.PidsInNamespaceException,
None]):
namespace = mock.ANY
util.kill_listen_processes(namespace)
mock_kill_listen.assert_has_calls(
@ -288,10 +286,8 @@ class TestNetnsCleanup(base.BaseTestCase):
def test_kill_listen_processes_still_procs(self):
with mock.patch.object(util, '_kill_listen_processes',
return_value=1):
with mock.patch('neutron.common.utils.wait_until_true')\
as wait_until_true_mock:
wait_until_true_mock.side_effect = (
util.PidsInNamespaceException)
with mock.patch.object(util, 'wait_until_no_listen_pids_namespace',
side_effect=util.PidsInNamespaceException):
namespace = mock.ANY
with testtools.ExpectedException(
util.PidsInNamespaceException):
@ -300,13 +296,14 @@ class TestNetnsCleanup(base.BaseTestCase):
def test_kill_listen_processes_no_procs(self):
with mock.patch.object(util, '_kill_listen_processes',
return_value=0) as mock_kill_listen:
with mock.patch('neutron.common.utils.wait_until_true')\
as wait_until_true_mock:
with mock.patch.object(util,
'wait_until_no_listen_pids_namespace')\
as wait_until_mock:
namespace = mock.ANY
util.kill_listen_processes(namespace)
mock_kill_listen.assert_called_once_with(namespace,
force=False)
self.assertFalse(wait_until_true_mock.called)
self.assertFalse(wait_until_mock.called)
def _test_destroy_namespace_helper(self, force, num_devices):
ns = 'qrouter-6e322ac7-ab50-4f53-9cdc-d1d3c1164b6d'