From 8e8072c16a63ee652b6cc383f6ad81808b319c2f Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Tue, 11 Aug 2020 16:45:38 -0700 Subject: [PATCH] Make wait_for_fork() more robust against infinite deadlock There is something about the process handling in the functional test base that opens the scenario where we might signal our API master child, waitpid() on it and never return (see bug 1891190). Specifically the test_reload() case, where the API master is being sent SIGHUP various times, which may cause it to temporarily mask out SIGTERM or something, which means it ignores our request. Either way, a deadlock in the tests can occur when the cleanup() routine goes to waitpid() on that master, forever. This change makes our wait_for_fork() helper call waitpid() with WNOHANG and progressively ramp up the aggression on asking it to stop. First, it waits for the original signal to be honored, then it starts sending SIGTERM, followed by SIGKILLs until the overall timeout expires, after which it fails the test. The latter should only ever happen if something gets wedged in uninterruptible sleep. Related-Bug: #1891190 Change-Id: Ie0e91e2ee86f33bcdb63eed74f2d9948108acfd4 --- glance/tests/functional/__init__.py | 3 +- glance/tests/utils.py | 62 +++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/glance/tests/functional/__init__.py b/glance/tests/functional/__init__.py index c51ef58798..358e450641 100644 --- a/glance/tests/functional/__init__.py +++ b/glance/tests/functional/__init__.py @@ -290,7 +290,8 @@ class PosixServer(BaseServer): if self.stop_kill: os.kill(self.process_pid, signal.SIGTERM) - rc = test_utils.wait_for_fork(self.process_pid, raise_error=False) + rc = test_utils.wait_for_fork(self.process_pid, raise_error=False, + force=self.stop_kill) return (rc, '', '') diff --git a/glance/tests/utils.py b/glance/tests/utils.py index f267226c53..67f6111d03 100644 --- a/glance/tests/utils.py +++ b/glance/tests/utils.py @@ -20,6 +20,7 @@ import functools import os import shlex import shutil +import signal import socket import subprocess import threading @@ -32,6 +33,7 @@ from oslo_config import cfg from oslo_config import fixture as cfg_fixture from oslo_log.fixture import logging_error as log_fixture from oslo_log import log +from oslo_utils import timeutils import six from six.moves import BaseHTTPServer from six.moves import http_client as http @@ -49,6 +51,7 @@ from glance.db.sqlalchemy import api as db_api from glance.tests.unit import fixtures as glance_fixtures CONF = cfg.CONF +LOG = log.getLogger(__name__) try: CONF.debug except cfg.NoSuchOptError: @@ -280,7 +283,8 @@ def fork_exec(cmd, def wait_for_fork(pid, raise_error=True, - expected_exitcode=0): + expected_exitcode=0, + force=True): """ Wait for a process to complete @@ -289,18 +293,52 @@ def wait_for_fork(pid, is raised. """ - rc = 0 - try: - (pid, rc) = os.waitpid(pid, 0) - rc = os.WEXITSTATUS(rc) - if rc != expected_exitcode: - raise RuntimeError('The exit code %d is not %d' - % (rc, expected_exitcode)) - except Exception: - if raise_error: - raise + # For the first period, we wait without being pushy, but after + # this timer expires, we start sending SIGTERM + term_timer = timeutils.StopWatch(5) + term_timer.start() - return rc + # After this timer expires we start sending SIGKILL + nice_timer = timeutils.StopWatch(7) + nice_timer.start() + + # Process gets a maximum amount of time to exit before we fail the + # test + total_timer = timeutils.StopWatch(10) + total_timer.start() + + while not total_timer.expired(): + try: + cpid, rc = os.waitpid(pid, force and os.WNOHANG or 0) + if cpid == 0 and force: + if not term_timer.expired(): + # Waiting for exit on first signal + pass + elif not nice_timer.expired(): + # Politely ask the process to GTFO + LOG.warning('Killing child %i with SIGTERM' % pid) + os.kill(pid, signal.SIGTERM) + else: + # No more Mr. Nice Guy + LOG.warning('Killing child %i with SIGKILL' % pid) + os.kill(pid, signal.SIGKILL) + expected_exitcode = signal.SIGKILL + time.sleep(1) + continue + LOG.info('waitpid(%i) returned %i,%i' % (pid, cpid, rc)) + if rc != expected_exitcode: + raise RuntimeError('The exit code %d is not %d' + % (rc, expected_exitcode)) + return rc + except ChildProcessError: + # Nothing to wait for + return 0 + except Exception as e: + LOG.error('Got wait error: %s' % e) + if raise_error: + raise + + raise RuntimeError('Gave up waiting for %i to exit!' % pid) def execute(cmd,