From 1d38f30555384257445f0d119a307e74e88d7fbf Mon Sep 17 00:00:00 2001 From: Daniel Alvarez Date: Thu, 24 Nov 2016 16:32:50 +0000 Subject: [PATCH] Kill processes when cleaning up namespaces This patch will kill processes that are listening on any port/UNIX socket within the namespace to be cleaned up. To kill them it will issue a SIGTERM to them (or to their parents if they were forked) and, if they don't die after a few seconds, a SIGKILL to them and all their children. This is intended for those cases when there's no specific cleanup and serves as a fallback method. Change-Id: I4195f633ef4a1788496d1293846f19eef89416aa Partial-Bug: #1403455 --- etc/neutron/rootwrap.d/dhcp.filters | 4 +- etc/neutron/rootwrap.d/l3.filters | 8 +- etc/neutron/rootwrap.d/netns-cleanup.filters | 12 ++ neutron/agent/linux/async_process.py | 14 +- neutron/agent/linux/utils.py | 59 +++++- neutron/cmd/netns_cleanup.py | 100 +++++++++- .../tests/contrib/functional-testing.filters | 6 + neutron/tests/functional/cmd/process_spawn.py | 180 ++++++++++++++++++ .../functional/cmd/test_netns_cleanup.py | 76 ++++++++ .../unit/agent/linux/test_async_process.py | 12 +- neutron/tests/unit/agent/linux/test_utils.py | 103 ++++++++++ neutron/tests/unit/cmd/test_netns_cleanup.py | 177 +++++++++++++++-- ...s_cleanup_kill_procs-af88d8c47c07dd9c.yaml | 8 + setup.cfg | 1 + 14 files changed, 720 insertions(+), 40 deletions(-) create mode 100644 etc/neutron/rootwrap.d/netns-cleanup.filters create mode 100644 neutron/tests/functional/cmd/process_spawn.py create mode 100644 releasenotes/notes/netns_cleanup_kill_procs-af88d8c47c07dd9c.yaml diff --git a/etc/neutron/rootwrap.d/dhcp.filters b/etc/neutron/rootwrap.d/dhcp.filters index ab87abb2940..3f06b4ae26e 100644 --- a/etc/neutron/rootwrap.d/dhcp.filters +++ b/etc/neutron/rootwrap.d/dhcp.filters @@ -13,8 +13,8 @@ dnsmasq: CommandFilter, dnsmasq, root # dhcp-agent uses kill as well, that's handled by the generic KillFilter # it looks like these are the only signals needed, per # neutron/agent/linux/dhcp.py -kill_dnsmasq: KillFilter, root, /sbin/dnsmasq, -9, -HUP -kill_dnsmasq_usr: KillFilter, root, /usr/sbin/dnsmasq, -9, -HUP +kill_dnsmasq: KillFilter, root, /sbin/dnsmasq, -9, -HUP, -15 +kill_dnsmasq_usr: KillFilter, root, /usr/sbin/dnsmasq, -9, -HUP, -15 ovs-vsctl: CommandFilter, ovs-vsctl, root ivs-ctl: CommandFilter, ivs-ctl, root diff --git a/etc/neutron/rootwrap.d/l3.filters b/etc/neutron/rootwrap.d/l3.filters index 0fdf60cd1ec..789a16f80ed 100644 --- a/etc/neutron/rootwrap.d/l3.filters +++ b/etc/neutron/rootwrap.d/l3.filters @@ -19,10 +19,10 @@ radvd: CommandFilter, radvd, root # metadata proxy metadata_proxy: CommandFilter, neutron-ns-metadata-proxy, root # RHEL invocation of the metadata proxy will report /usr/bin/python -kill_metadata: KillFilter, root, python, -9 -kill_metadata7: KillFilter, root, python2.7, -9 -kill_radvd_usr: KillFilter, root, /usr/sbin/radvd, -9, -HUP -kill_radvd: KillFilter, root, /sbin/radvd, -9, -HUP +kill_metadata: KillFilter, root, python, -15, -9 +kill_metadata7: KillFilter, root, python2.7, -15, -9 +kill_radvd_usr: KillFilter, root, /usr/sbin/radvd, -15, -9, -HUP +kill_radvd: KillFilter, root, /sbin/radvd, -15, -9, -HUP # ip_lib ip: IpFilter, ip, root diff --git a/etc/neutron/rootwrap.d/netns-cleanup.filters b/etc/neutron/rootwrap.d/netns-cleanup.filters new file mode 100644 index 00000000000..1ee142e54c1 --- /dev/null +++ b/etc/neutron/rootwrap.d/netns-cleanup.filters @@ -0,0 +1,12 @@ +# neutron-rootwrap command filters for nodes on which neutron is +# expected to control network +# +# This file should be owned by (and only-writeable by) the root user + +# format seems to be +# cmd-name: filter-name, raw-command, user, args + +[Filters] + +# netns-cleanup +netstat: CommandFilter, netstat, root diff --git a/neutron/agent/linux/async_process.py b/neutron/agent/linux/async_process.py index 5d08a6104e8..bcc8b7f5e4d 100644 --- a/neutron/agent/linux/async_process.py +++ b/neutron/agent/linux/async_process.py @@ -175,15 +175,11 @@ class AsyncProcess(object): try: # A process started by a root helper will be running as # root and need to be killed via the same helper. - utils.execute(['kill', '-%d' % kill_signal, pid], - run_as_root=self.run_as_root) - except Exception as ex: - stale_pid = (isinstance(ex, RuntimeError) and - 'No such process' in str(ex)) - if not stale_pid: - LOG.exception(_LE('An error occurred while killing [%s].'), - self.cmd) - return False + utils.kill_process(pid, kill_signal, self.run_as_root) + except Exception: + LOG.exception(_LE('An error occurred while killing [%s].'), + self.cmd) + return False if self._process: self._process.wait() diff --git a/neutron/agent/linux/utils.py b/neutron/agent/linux/utils.py index 936a22afd7e..5d26118e2d6 100644 --- a/neutron/agent/linux/utils.py +++ b/neutron/agent/linux/utils.py @@ -171,9 +171,11 @@ def get_interface_mac(interface): return ':'.join(["%02x" % b for b in iterbytes(info[MAC_START:MAC_END])]) -def find_child_pids(pid): - """Retrieve a list of the pids of child processes of the given pid.""" +def find_child_pids(pid, recursive=False): + """Retrieve a list of the pids of child processes of the given pid. + It can also find all children through the hierarchy if recursive=True + """ try: raw_pids = execute(['ps', '--ppid', pid, '-o', 'pid='], log_fail_as_error=False) @@ -185,7 +187,58 @@ def find_child_pids(pid): if no_children_found: ctxt.reraise = False return [] - return [x.strip() for x in raw_pids.split('\n') if x.strip()] + child_pids = [x.strip() for x in raw_pids.split('\n') if x.strip()] + if recursive: + for child in child_pids: + child_pids = child_pids + find_child_pids(child, True) + return child_pids + + +def find_parent_pid(pid): + """Retrieve the pid of the parent process of the given pid. + + If the pid doesn't exist in the system, this function will return + None + """ + try: + ppid = execute(['ps', '-o', 'ppid=', pid], + log_fail_as_error=False) + except ProcessExecutionError as e: + # Unexpected errors are the responsibility of the caller + with excutils.save_and_reraise_exception() as ctxt: + # Exception has already been logged by execute + no_such_pid = e.returncode == 1 + if no_such_pid: + ctxt.reraise = False + return + return ppid.strip() + + +def find_fork_top_parent(pid): + """Retrieve the pid of the top parent of the given pid through a fork. + + This function will search the top parent with its same cmdline. If the + given pid has no parent, its own pid will be returned + """ + while True: + ppid = find_parent_pid(pid) + if (ppid and ppid != pid and + pid_invoked_with_cmdline(ppid, get_cmdline_from_pid(pid))): + pid = ppid + else: + return pid + + +def kill_process(pid, signal, run_as_root=False): + """Kill the process with the given pid using the given signal.""" + try: + execute(['kill', '-%d' % signal, pid], run_as_root=run_as_root) + except ProcessExecutionError as ex: + # TODO(dalvarez): this check has i18n issues. Maybe we can use + # use gettext module setting a global locale or just pay attention + # to returncode instead of checking the ex message. + if 'No such process' not in str(ex): + raise def _get_conf_base(cfg_root, uuid, ensure_conf_dir): diff --git a/neutron/cmd/netns_cleanup.py b/neutron/cmd/netns_cleanup.py index 4a142d11def..0458fd082c7 100644 --- a/neutron/cmd/netns_cleanup.py +++ b/neutron/cmd/netns_cleanup.py @@ -15,6 +15,7 @@ import itertools import re +import signal import time from neutron_lib import constants @@ -22,7 +23,7 @@ from oslo_config import cfg from oslo_log import log as logging from oslo_utils import importutils -from neutron._i18n import _LE +from neutron._i18n import _LE, _LW from neutron.agent.common import config as agent_config from neutron.agent.common import ovs_lib from neutron.agent.l3 import dvr_fip_ns @@ -32,7 +33,9 @@ from neutron.agent.linux import dhcp from neutron.agent.linux import external_process from neutron.agent.linux import interface from neutron.agent.linux import ip_lib +from neutron.agent.linux import utils from neutron.common import config +from neutron.common import utils as common_utils from neutron.conf.agent import cmd from neutron.conf.agent import dhcp as dhcp_config @@ -45,6 +48,12 @@ NS_PREFIXES = { dvr_fip_ns.FIP_NS_PREFIX], 'lbaas': [LB_NS_PREFIX], } +SIGTERM_WAITTIME = 10 +NETSTAT_PIDS_REGEX = re.compile(r'.* (?P\d{2,6})/.*') + + +class PidsInNamespaceException(Exception): + pass class FakeDhcpPlugin(object): @@ -131,6 +140,87 @@ def unplug_device(conf, device): device.set_log_fail_as_error(orig_log_fail_as_error) +def find_listen_pids_namespace(namespace): + """Retrieve a list of pids of listening processes within the given netns. + + It executes netstat -nlp and returns a set of unique pairs + """ + ip = ip_lib.IPWrapper(namespace=namespace) + pids = set() + cmd = ['netstat', '-nlp'] + output = ip.netns.execute(cmd, run_as_root=True) + for line in output.splitlines(): + m = NETSTAT_PIDS_REGEX.match(line) + if m: + pids.add(m.group('pid')) + return pids + + +def wait_until_no_listen_pids_namespace(namespace, timeout=SIGTERM_WAITTIME): + """Poll listening processes within the given namespace. + + If after timeout seconds, there are remaining processes in the namespace, + then a PidsInNamespaceException will be thrown. + """ + # Would be better to handle an eventlet.timeout.Timeout exception + # but currently there's a problem importing eventlet since it's + # doing a local import from cmd/eventlet which doesn't have a + # timeout module + common_utils.wait_until_true( + lambda: not find_listen_pids_namespace(namespace), + timeout=SIGTERM_WAITTIME, + exception=PidsInNamespaceException) + + +def _kill_listen_processes(namespace, force=False): + """Identify all listening processes within the given namespace. + + Then, for each one, find its top parent with same cmdline (in case this + process forked) and issue a SIGTERM to all of them. If force is True, + then a SIGKILL will be issued to all parents and all their children. Also, + this function returns the number of listening processes. + """ + pids = find_listen_pids_namespace(namespace) + pids_to_kill = {utils.find_fork_top_parent(pid) for pid in pids} + kill_signal = signal.SIGTERM + if force: + kill_signal = signal.SIGKILL + children = [utils.find_child_pids(pid, True) for pid in pids_to_kill] + pids_to_kill.update(itertools.chain.from_iterable(children)) + + for pid in pids_to_kill: + # Throw a warning since this particular cleanup may need a specific + # implementation in the right module. Ideally, netns_cleanup wouldn't + # kill any processes as the responsible module should've killed them + # before cleaning up the namespace + LOG.warning(_LW("Killing (%(signal)d) [%(pid)s] %(cmdline)s"), + {'signal': kill_signal, + 'pid': pid, + 'cmdline': ' '.join(utils.get_cmdline_from_pid(pid))[:80] + }) + try: + utils.kill_process(pid, kill_signal, run_as_root=True) + except Exception as ex: + LOG.error(_LE('An error occurred while killing ' + '[%(pid)s]: %(msg)s'), {'pid': pid, 'msg': ex}) + return len(pids) + + +def kill_listen_processes(namespace): + """Kill all processes listening within the given namespace. + + First it tries to kill them using SIGTERM, waits until they die gracefully + and then kills remaining processes (if any) with SIGKILL + """ + if _kill_listen_processes(namespace, force=False): + try: + wait_until_no_listen_pids_namespace(namespace) + except PidsInNamespaceException: + _kill_listen_processes(namespace, force=True) + # Allow some time for remaining processes to die + wait_until_no_listen_pids_namespace(namespace) + + def destroy_namespace(conf, namespace, force=False): """Destroy a given namespace. @@ -146,6 +236,14 @@ def destroy_namespace(conf, namespace, force=False): # NOTE: The dhcp driver will remove the namespace if is it empty, # so a second check is required here. if ip.netns.exists(namespace): + try: + kill_listen_processes(namespace) + except PidsInNamespaceException: + # This is unlikely since, at this point, we have SIGKILLed + # all remaining processes but if there are still some, log + # the error and continue with the cleanup + LOG.error(_LE('Not all processes were killed in %s'), + namespace) for device in ip.get_devices(exclude_loopback=True): unplug_device(conf, device) diff --git a/neutron/tests/contrib/functional-testing.filters b/neutron/tests/contrib/functional-testing.filters index 657ae802b11..af90e36bfc6 100644 --- a/neutron/tests/contrib/functional-testing.filters +++ b/neutron/tests/contrib/functional-testing.filters @@ -41,3 +41,9 @@ ovstrace_filter: RegExpFilter, ovs-appctl, root, ovs-appctl, ofproto/trace, .*, # needed for TestGetRootHelperChildPid bash_filter: RegExpFilter, /bin/bash, root, bash, -c, \(sleep 100\) sleep_kill: KillFilter, root, sleep, -9 + +#needed by test_netns_cleanup +process_spawn: EnvFilter, env, root, PATH=, python +ip_exec: IpNetnsExecFilter, ip, root +ps: CommandFilter, ps, root +pid_kill: RegExpFilter, kill, root, kill, -\d+, .* diff --git a/neutron/tests/functional/cmd/process_spawn.py b/neutron/tests/functional/cmd/process_spawn.py new file mode 100644 index 00000000000..7eddd1322e2 --- /dev/null +++ b/neutron/tests/functional/cmd/process_spawn.py @@ -0,0 +1,180 @@ +# Copyright 2016 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import os +import signal +import socket +import sys +import time + +from neutron_lib import constants as n_const +from oslo_config import cfg + +from neutron._i18n import _ +from neutron.agent.linux import daemon +from neutron.tests.common import net_helpers + +UNIX_FAMILY = 'UNIX' + +OPTS = [ + cfg.IntOpt('num_children', + short='n', + default=0, + help=_('Number of children to spawn'), + required=False), + cfg.StrOpt('family', + short='f', + default=n_const.IPv4, + choices=[n_const.IPv4, n_const.IPv6, UNIX_FAMILY], + help=_('Listen socket family (%(v4)s, %(v6)s or %(unix)s)') % + { + 'v4': n_const.IPv4, + 'v6': n_const.IPv6, + 'unix': UNIX_FAMILY + }, + required=False), + cfg.StrOpt('proto', + short='p', + default=n_const.PROTO_NAME_TCP, + choices=[n_const.PROTO_NAME_TCP, n_const.PROTO_NAME_UDP], + help=_('Protocol (%(tcp)s or %(udp)s)') % + { + 'tcp': n_const.PROTO_NAME_TCP, + 'udp': n_const.PROTO_NAME_UDP + }, + required=False), + cfg.BoolOpt('parent_listen', + short='pl', + default=True, + help=_('Parent process must listen too'), + required=False), + cfg.BoolOpt('ignore_sigterm', + short='i', + default=False, + help=_('Ignore SIGTERM'), + required=False) +] + + +class ProcessSpawn(daemon.Daemon): + """ + This class is part of the functional test of the netns_cleanup module. + + It allows spawning processes that listen on random ports either on + tcp(6), udp(6) or unix sockets. Also it allows handling or ignoring + SIGTERM to check whether the cleanup works as expected by getting rid + of the spawned processes. + """ + + MAX_BIND_RETRIES = 5 + + DCT_FAMILY = { + n_const.IPv4: socket.AF_INET, + n_const.IPv6: socket.AF_INET6, + UNIX_FAMILY: socket.AF_UNIX + } + DCT_PROTO = { + n_const.PROTO_NAME_TCP: socket.SOCK_STREAM, + n_const.PROTO_NAME_UDP: socket.SOCK_DGRAM, + } + + def __init__(self, pidfile=None, + family=n_const.IPv4, + proto=n_const.PROTO_NAME_TCP, + ignore_sigterm=False, num_children=0, + parent_must_listen=True): + self.family = family + self.proto = proto + self.ignore_sigterm = ignore_sigterm + self.num_children = num_children + self.listen_socket = None + self.parent_must_listen = parent_must_listen + self.child_pids = [] + + super(ProcessSpawn, self).__init__(pidfile) + + def start_listening(self): + socket_family = self.DCT_FAMILY[self.family] + socket_type = self.DCT_PROTO[self.proto] + + self.listen_socket = socket.socket(socket_family, socket_type) + + # Try to listen in a random port which is not currently in use + retries = 0 + while retries < ProcessSpawn.MAX_BIND_RETRIES: + # NOTE(dalvarez): not finding a free port on a freshly created + # namespace is very unlikely but if problems show up, retries can + # be increased to avoid tests failing + try: + if self.family == UNIX_FAMILY: + self.listen_socket.bind('') + else: + port = net_helpers.get_free_namespace_port(self.proto) + self.listen_socket.bind(('', port)) + except socket.error: + retries += 1 + else: + if n_const.PROTO_NAME_TCP in self.proto: + self.listen_socket.listen(0) + break + + def do_sleep(self): + while True: + time.sleep(10) + + def run(self): + # Spawn as many children as requested + children = [] + while len(children) != self.num_children: + child_pid = os.fork() + if child_pid == 0: + # Listen and do nothing else + self.start_listening() + self.do_sleep() + return + children.append(child_pid) + + # Install a SIGTERM handler if requested + if not self.ignore_sigterm: + signal.signal(signal.SIGTERM, self.sigterm_handler) + + self.child_pids = children + if self.parent_must_listen: + self.start_listening() + self.do_sleep() + + def sigterm_handler(self, signum, frame): + if self.listen_socket: + self.listen_socket.close() + for child in self.child_pids: + try: + os.kill(child, signal.SIGTERM) + except OSError: + pass + sys.exit(0) + + +def main(): + cfg.CONF.register_cli_opts(OPTS) + cfg.CONF(project='neutron', default_config_files=[]) + proc_spawn = ProcessSpawn(num_children=cfg.CONF.num_children, + family=cfg.CONF.family, + proto=cfg.CONF.proto, + parent_must_listen=cfg.CONF.parent_listen, + ignore_sigterm=cfg.CONF.ignore_sigterm) + proc_spawn.start() + + +if __name__ == "__main__": + main() diff --git a/neutron/tests/functional/cmd/test_netns_cleanup.py b/neutron/tests/functional/cmd/test_netns_cleanup.py index e5f88f755f1..7b409515d95 100644 --- a/neutron/tests/functional/cmd/test_netns_cleanup.py +++ b/neutron/tests/functional/cmd/test_netns_cleanup.py @@ -13,19 +13,26 @@ # License for the specific language governing permissions and limitations # under the License. +import os + import mock +from neutron_lib import constants as n_const from neutron.agent.l3 import namespaces from neutron.agent.linux import dhcp from neutron.agent.linux import ip_lib +from neutron.agent.linux import utils from neutron.cmd import netns_cleanup +from neutron.common import utils as common_utils from neutron.conf.agent import cmd from neutron.tests import base as basetest from neutron.tests.common import net_helpers from neutron.tests.functional import base +from neutron.tests.functional.cmd import process_spawn GET_NAMESPACES = 'neutron.agent.linux.ip_lib.IPWrapper.get_namespaces' TEST_INTERFACE_DRIVER = 'neutron.agent.linux.interface.OVSInterfaceDriver' +NUM_SUBPROCESSES = 6 class NetnsCleanupTest(base.BaseSudoTestCase): @@ -60,13 +67,82 @@ class NetnsCleanupTest(base.BaseSudoTestCase): # tests, as otherwise cleanup will kill them all self.get_namespaces.return_value = [l3_namespace, dhcp_namespace] + # launch processes in each namespace to make sure they're + # killed during cleanup + procs_launched = self._launch_processes([l3_namespace, dhcp_namespace]) + self.assertIsNot(procs_launched, 0) + common_utils.wait_until_true( + lambda: self._get_num_spawned_procs() == procs_launched, + timeout=15, + exception=Exception("Didn't spawn expected number of processes")) + netns_cleanup.cleanup_network_namespaces(self.conf) self.get_namespaces_p.stop() namespaces_now = ip_lib.IPWrapper.get_namespaces() + procs_after = self._get_num_spawned_procs() + self.assertEqual(procs_after, 0) self.assertNotIn(l3_namespace, namespaces_now) self.assertNotIn(dhcp_namespace, namespaces_now) + @staticmethod + def _launch_processes(namespaces): + """ + Launch processes in the specified namespaces. + + This function will spawn processes inside the given namespaces: + - 6 processes listening on tcp ports (parent + 5 children) + - 1 process + 5 subprocesses listening on unix sockets + - 1 process + 5 subprocesses listening on udp6 sockets + + First two sets of processes will process SIGTERM so when the parent + gets killed, it will kill all spawned children + The last set of processes will ignore SIGTERM. This will allow us + to test the cleanup functionality which will issue a SIGKILL + to all remaining processes after the SIGTERM attempt + """ + commands = [['python', process_spawn.__file__, + '-n', NUM_SUBPROCESSES, + '-f', n_const.IPv4, + '-p', n_const.PROTO_NAME_TCP, + '--noignore_sigterm', + '--parent_listen'], + ['python', process_spawn.__file__, + '-n', NUM_SUBPROCESSES, + '-f', process_spawn.UNIX_FAMILY, + '-p', n_const.PROTO_NAME_TCP, + '--noignore_sigterm', + '--noparent_listen'], + ['python', process_spawn.__file__, + '-n', NUM_SUBPROCESSES, + '-f', n_const.IPv4, + '-p', n_const.PROTO_NAME_UDP, + '--ignore_sigterm', + '--noparent_listen']] + + proc_count = 0 + for netns in namespaces: + ip = ip_lib.IPWrapper(namespace=netns) + for command in commands: + # The total amount of processes per command is + # the process itself plus the number of subprocesses spawned by + # it + proc_count += (1 + NUM_SUBPROCESSES) + # We need to pass the PATH env variable so that python + # interpreter runs under the same virtual environment. + # Otherwise, it won't find the necessary packages such as + # oslo_config + ip.netns.execute(command, + addl_env={'PATH': + os.environ.get('PATH')}) + return proc_count + + @staticmethod + def _get_num_spawned_procs(): + cmd = ['ps', '-f', '-u', 'root'] + out = utils.execute(cmd, run_as_root=True) + return sum([1 for line in out.splitlines() if 'process_spawn' in line]) + class TestNETNSCLIConfig(basetest.BaseTestCase): diff --git a/neutron/tests/unit/agent/linux/test_async_process.py b/neutron/tests/unit/agent/linux/test_async_process.py index db0321de457..0512556d715 100644 --- a/neutron/tests/unit/agent/linux/test_async_process.py +++ b/neutron/tests/unit/agent/linux/test_async_process.py @@ -198,20 +198,18 @@ class TestAsyncProcess(base.BaseTestCase): exc = RuntimeError(exception_message) else: exc = None - with mock.patch.object(utils, 'execute', - side_effect=exc) as mock_execute: + with mock.patch.object(utils, 'kill_process', + side_effect=exc) as mock_kill_process: actual = self.proc._kill_process(pid, kill_signal) self.assertEqual(expected, actual) - mock_execute.assert_called_with(['kill', '-%d' % kill_signal, pid], - run_as_root=self.proc.run_as_root) + mock_kill_process.assert_called_with(pid, + kill_signal, + self.proc.run_as_root) def test__kill_process_returns_true_for_valid_pid(self): self._test__kill_process('1', True) - def test__kill_process_returns_true_for_stale_pid(self): - self._test__kill_process('1', True, 'No such process') - def test__kill_process_returns_false_for_execute_exception(self): self._test__kill_process('1', False, 'Invalid') diff --git a/neutron/tests/unit/agent/linux/test_utils.py b/neutron/tests/unit/agent/linux/test_utils.py index 9244ab05bcf..a0e7a1fbd96 100644 --- a/neutron/tests/unit/agent/linux/test_utils.py +++ b/neutron/tests/unit/agent/linux/test_utils.py @@ -12,6 +12,7 @@ # License for the specific language governing permissions and limitations # under the License. +import signal import socket import mock @@ -181,6 +182,100 @@ class AgentUtilsGetInterfaceMAC(base.BaseTestCase): self.assertEqual(actual_val, expect_val) +class TestFindParentPid(base.BaseTestCase): + def setUp(self): + super(TestFindParentPid, self).setUp() + self.m_execute = mock.patch.object(utils, 'execute').start() + + def test_returns_none_for_no_valid_pid(self): + self.m_execute.side_effect = utils.ProcessExecutionError('', + returncode=1) + self.assertIsNone(utils.find_parent_pid(-1)) + + def test_returns_parent_id_for_good_ouput(self): + self.m_execute.return_value = '123 \n' + self.assertEqual(utils.find_parent_pid(-1), '123') + + def test_raises_exception_returncode_0(self): + with testtools.ExpectedException(utils.ProcessExecutionError): + self.m_execute.side_effect = \ + utils.ProcessExecutionError('', returncode=0) + utils.find_parent_pid(-1) + + def test_raises_unknown_exception(self): + with testtools.ExpectedException(RuntimeError): + self.m_execute.side_effect = RuntimeError() + utils.find_parent_pid(-1) + + +class TestFindForkTopParent(base.BaseTestCase): + def _test_find_fork_top_parent(self, expected=_marker, + find_parent_pid_retvals=None, + pid_invoked_with_cmdline_retvals=None): + def _find_parent_pid(x): + if find_parent_pid_retvals: + return find_parent_pid_retvals.pop(0) + + pid_invoked_with_cmdline = {} + if pid_invoked_with_cmdline_retvals: + pid_invoked_with_cmdline['side_effect'] = ( + pid_invoked_with_cmdline_retvals) + else: + pid_invoked_with_cmdline['return_value'] = False + with mock.patch.object(utils, 'find_parent_pid', + side_effect=_find_parent_pid), \ + mock.patch.object(utils, 'pid_invoked_with_cmdline', + **pid_invoked_with_cmdline): + actual = utils.find_fork_top_parent(_marker) + self.assertEqual(expected, actual) + + def test_returns_own_pid_no_parent(self): + self._test_find_fork_top_parent() + + def test_returns_own_pid_nofork(self): + self._test_find_fork_top_parent(find_parent_pid_retvals=['2', '3']) + + def test_returns_first_parent_pid_fork(self): + self._test_find_fork_top_parent( + expected='2', + find_parent_pid_retvals=['2', '3', '4'], + pid_invoked_with_cmdline_retvals=[True, False, False]) + + def test_returns_top_parent_pid_fork(self): + self._test_find_fork_top_parent( + expected='4', + find_parent_pid_retvals=['2', '3', '4'], + pid_invoked_with_cmdline_retvals=[True, True, True]) + + +class TestKillProcess(base.BaseTestCase): + def _test_kill_process(self, pid, exception_message=None, + kill_signal=signal.SIGKILL): + if exception_message: + exc = utils.ProcessExecutionError(exception_message, returncode=0) + else: + exc = None + with mock.patch.object(utils, 'execute', + side_effect=exc) as mock_execute: + utils.kill_process(pid, kill_signal, run_as_root=True) + + mock_execute.assert_called_with(['kill', '-%d' % kill_signal, pid], + run_as_root=True) + + def test_kill_process_returns_none_for_valid_pid(self): + self._test_kill_process('1') + + def test_kill_process_returns_none_for_stale_pid(self): + self._test_kill_process('1', 'No such process') + + def test_kill_process_raises_exception_for_execute_exception(self): + with testtools.ExpectedException(utils.ProcessExecutionError): + self._test_kill_process('1', 'Invalid') + + def test_kill_process_with_different_signal(self): + self._test_kill_process('1', kill_signal=signal.SIGTERM) + + class TestFindChildPids(base.BaseTestCase): def test_returns_empty_list_for_exit_code_1(self): @@ -197,6 +292,14 @@ class TestFindChildPids(base.BaseTestCase): with mock.patch.object(utils, 'execute', return_value=' 123 \n 185\n'): self.assertEqual(utils.find_child_pids(-1), ['123', '185']) + def test_returns_list_of_child_process_ids_recursively(self): + with mock.patch.object(utils, 'execute', + side_effect=[' 123 \n 185\n', + ' 40 \n', '\n', + '41\n', '\n']): + actual = utils.find_child_pids(-1, True) + self.assertEqual(actual, ['123', '185', '40', '41']) + def test_raises_unknown_exception(self): with testtools.ExpectedException(RuntimeError): with mock.patch.object(utils, 'execute', diff --git a/neutron/tests/unit/cmd/test_netns_cleanup.py b/neutron/tests/unit/cmd/test_netns_cleanup.py index fb70e88561a..354de2bf4e9 100644 --- a/neutron/tests/unit/cmd/test_netns_cleanup.py +++ b/neutron/tests/unit/cmd/test_netns_cleanup.py @@ -13,11 +13,47 @@ # License for the specific language governing permissions and limitations # under the License. +import signal + import mock +import testtools from neutron.cmd import netns_cleanup as util from neutron.tests import base +NETSTAT_NETNS_OUTPUT = (""" +Active Internet connections (only servers) +Proto Recv-Q Send-Q Local Address Foreign Address State\ + PID/Program name +tcp 0 0 0.0.0.0:9697 0.0.0.0:* LISTEN\ + 1347/python +raw 0 0 0.0.0.0:112 0.0.0.0:* 7\ + 1279/keepalived +raw 0 0 0.0.0.0:112 0.0.0.0:* 7\ + 1279/keepalived +raw6 0 0 :::58 :::* 7\ + 1349/radvd +Active UNIX domain sockets (only servers) +Proto RefCnt Flags Type State I-Node PID/Program name\ + Path +unix 2 [ ACC ] STREAM LISTENING 82039530 1353/python\ + /tmp/rootwrap-VKSm8a/rootwrap.sock +""") + +NETSTAT_NO_NAMESPACE = (""" +Cannot open network namespace "qrouter-e6f206b2-4e8d-4597-a7e1-c3a20337e9c6":\ + No such file or directory +""") + +NETSTAT_NO_LISTEN_PROCS = (""" +Active Internet connections (only servers) +Proto Recv-Q Send-Q Local Address Foreign Address State\ + PID/Program name +Active UNIX domain sockets (only servers) +Proto RefCnt Flags Type State I-Node PID/Program name\ + Path +""") + class TestNetnsCleanup(base.BaseTestCase): def setUp(self): @@ -163,6 +199,115 @@ class TestNetnsCleanup(base.BaseTestCase): self.assertEqual([], ovs_br_cls.mock_calls) self.assertTrue(debug.called) + def _test_find_listen_pids_namespace_helper(self, expected, + netstat_output=None): + with mock.patch('neutron.agent.linux.ip_lib.IPWrapper') as ip_wrap: + ip_wrap.return_value.netns.execute.return_value = netstat_output + observed = util.find_listen_pids_namespace(mock.ANY) + self.assertEqual(expected, observed) + + def test_find_listen_pids_namespace_correct_output(self): + expected = set(['1347', '1279', '1349', '1353']) + self._test_find_listen_pids_namespace_helper(expected, + NETSTAT_NETNS_OUTPUT) + + def test_find_listen_pids_namespace_no_procs(self): + expected = set() + self._test_find_listen_pids_namespace_helper(expected, + NETSTAT_NO_LISTEN_PROCS) + + def test_find_listen_pids_namespace_no_namespace(self): + expected = set() + self._test_find_listen_pids_namespace_helper(expected, + NETSTAT_NO_NAMESPACE) + + def _test__kill_listen_processes_helper(self, pids, parents, children, + kills_expected, force): + def _get_element(dct, x): + return dct.get(x, []) + + def _find_childs(x, recursive): + return _get_element(children, x) + + def _find_parent(x): + return _get_element(parents, x) + + utils_mock = dict( + find_fork_top_parent=mock.DEFAULT, + find_child_pids=mock.DEFAULT, + get_cmdline_from_pid=mock.DEFAULT, + kill_process=mock.DEFAULT) + + self.log_mock = mock.patch.object(util, 'LOG').start() + with mock.patch.multiple('neutron.agent.linux.utils', **utils_mock)\ + as mocks: + mocks['find_fork_top_parent'].side_effect = _find_parent + mocks['find_child_pids'].side_effect = _find_childs + + with mock.patch.object(util, 'find_listen_pids_namespace', + return_value=pids): + calls = [] + for pid, sig in kills_expected: + calls.append(mock.call(pid, sig, run_as_root=True)) + util._kill_listen_processes(mock.ANY, force=force) + mocks['kill_process'].assert_has_calls(calls, any_order=True) + + def test__kill_listen_processes_only_parents_force_false(self): + pids = ['4', '5', '6'] + parents = {'4': '1', '5': '5', '6': '2'} + children = {} + kills_expected = [('1', signal.SIGTERM), + ('5', signal.SIGTERM), + ('2', signal.SIGTERM)] + + self._test__kill_listen_processes_helper(pids, parents, children, + kills_expected, False) + + def test__kill_listen_processes_parents_and_childs(self): + pids = ['4', '5', '6'] + parents = {'4': '1', '5': '2', '6': '3'} + children = {'1': ['4'], '2': ['5'], '3': ['6', '8', '7']} + kills_expected = [(str(x), signal.SIGKILL) for x in range(1, 9)] + self._test__kill_listen_processes_helper(pids, parents, children, + kills_expected, True) + + def test_kill_listen_processes(self): + with mock.patch.object(util, '_kill_listen_processes', + return_value=1) as mock_kill_listen: + with mock.patch('neutron.common.utils.wait_until_true')\ + as wait_until_true_mock: + wait_until_true_mock.side_effect = [ + util.PidsInNamespaceException, + None] + namespace = mock.ANY + util.kill_listen_processes(namespace) + mock_kill_listen.assert_has_calls( + [mock.call(namespace, force=False), + mock.call(namespace, force=True)]) + + def test_kill_listen_processes_still_procs(self): + with mock.patch.object(util, '_kill_listen_processes', + return_value=1): + with mock.patch('neutron.common.utils.wait_until_true')\ + as wait_until_true_mock: + wait_until_true_mock.side_effect = ( + util.PidsInNamespaceException) + namespace = mock.ANY + with testtools.ExpectedException( + util.PidsInNamespaceException): + util.kill_listen_processes(namespace) + + def test_kill_listen_processes_no_procs(self): + with mock.patch.object(util, '_kill_listen_processes', + return_value=0) as mock_kill_listen: + with mock.patch('neutron.common.utils.wait_until_true')\ + as wait_until_true_mock: + namespace = mock.ANY + util.kill_listen_processes(namespace) + mock_kill_listen.assert_called_once_with(namespace, + force=False) + self.assertFalse(wait_until_true_mock.called) + def _test_destroy_namespace_helper(self, force, num_devices): ns = 'qrouter-6e322ac7-ab50-4f53-9cdc-d1d3c1164b6d' conf = mock.Mock() @@ -182,23 +327,27 @@ class TestNetnsCleanup(base.BaseTestCase): ip_wrap.return_value.get_devices.return_value = devices ip_wrap.return_value.netns.exists.return_value = True - with mock.patch.object(util, 'unplug_device') as unplug: + with mock.patch.object(util, 'kill_listen_processes'): - with mock.patch.object(util, 'kill_dhcp') as kill_dhcp: - util.destroy_namespace(conf, ns, force) - expected = [mock.call(namespace=ns)] + with mock.patch.object(util, 'unplug_device') as unplug: - if force: - expected.extend([ - mock.call().netns.exists(ns), - mock.call().get_devices(exclude_loopback=True)]) - self.assertTrue(kill_dhcp.called) - unplug.assert_has_calls( - [mock.call(conf, d) for d in - devices[1:]]) + with mock.patch.object(util, 'kill_dhcp') as kill_dhcp: + util.destroy_namespace(conf, ns, force) + expected = [mock.call(namespace=ns)] - expected.append(mock.call().garbage_collect_namespace()) - ip_wrap.assert_has_calls(expected) + if force: + expected.extend([ + mock.call().netns.exists(ns), + mock.call().get_devices( + exclude_loopback=True)]) + self.assertTrue(kill_dhcp.called) + unplug.assert_has_calls( + [mock.call(conf, d) for d in + devices[1:]]) + + expected.append( + mock.call().garbage_collect_namespace()) + ip_wrap.assert_has_calls(expected) def test_destroy_namespace_empty(self): self._test_destroy_namespace_helper(False, 0) diff --git a/releasenotes/notes/netns_cleanup_kill_procs-af88d8c47c07dd9c.yaml b/releasenotes/notes/netns_cleanup_kill_procs-af88d8c47c07dd9c.yaml new file mode 100644 index 00000000000..b231a698632 --- /dev/null +++ b/releasenotes/notes/netns_cleanup_kill_procs-af88d8c47c07dd9c.yaml @@ -0,0 +1,8 @@ +--- +features: + - A new mechanism has been added to netns_cleanup to + kill processes that are listening on any port/unix + socket within the namespace. This will try to kill + them gracefully via SIGTERM and, if they don't die, + then a SIGKILL will be sent to the remaining + processes to ensure a proper cleanup. diff --git a/setup.cfg b/setup.cfg index f34cd0704b2..a210ff4810e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,6 +36,7 @@ data_files = etc/neutron/rootwrap.d/ipset-firewall.filters etc/neutron/rootwrap.d/l3.filters etc/neutron/rootwrap.d/linuxbridge-plugin.filters + etc/neutron/rootwrap.d/netns-cleanup.filters etc/neutron/rootwrap.d/openvswitch-plugin.filters scripts = bin/neutron-rootwrap-xen-dom0