Make wait_for_fork() more robust against infinite deadlock

There is something about the process handling in the functional test base
that opens the scenario where we might signal our API master child,
waitpid() on it and never return (see bug 1891190). Specifically the
test_reload() case, where the API master is sent SIGHUP several times,
which may cause it to temporarily mask SIGTERM (or similar), so it
ignores our request. Either way, a deadlock in the tests can
occur when the cleanup() routine goes to waitpid() on that master,
forever.

This change makes our wait_for_fork() helper call waitpid() with WNOHANG
and progressively ramp up the aggression on asking it to stop. First, it
waits for the original signal to be honored, then it starts sending SIGTERM,
followed by SIGKILLs until the overall timeout expires, after which it
fails the test. The latter should only ever happen if something gets
wedged in uninterruptible sleep.

Related-Bug: #1891190
Change-Id: Ie0e91e2ee86f33bcdb63eed74f2d9948108acfd4
This commit is contained in:
Dan Smith 2020-08-11 16:45:38 -07:00
parent ed3296b23a
commit 8e8072c16a
2 changed files with 52 additions and 13 deletions

View File

@ -290,7 +290,8 @@ class PosixServer(BaseServer):
if self.stop_kill:
os.kill(self.process_pid, signal.SIGTERM)
rc = test_utils.wait_for_fork(self.process_pid, raise_error=False)
rc = test_utils.wait_for_fork(self.process_pid, raise_error=False,
force=self.stop_kill)
return (rc, '', '')

View File

@ -20,6 +20,7 @@ import functools
import os
import shlex
import shutil
import signal
import socket
import subprocess
import threading
@ -32,6 +33,7 @@ from oslo_config import cfg
from oslo_config import fixture as cfg_fixture
from oslo_log.fixture import logging_error as log_fixture
from oslo_log import log
from oslo_utils import timeutils
import six
from six.moves import BaseHTTPServer
from six.moves import http_client as http
@ -49,6 +51,7 @@ from glance.db.sqlalchemy import api as db_api
from glance.tests.unit import fixtures as glance_fixtures
CONF = cfg.CONF
LOG = log.getLogger(__name__)
try:
CONF.debug
except cfg.NoSuchOptError:
@ -280,7 +283,8 @@ def fork_exec(cmd,
def wait_for_fork(pid,
                  raise_error=True,
                  expected_exitcode=0,
                  force=True):
    """Wait for a child process to complete, escalating signals as needed.

    This waits for the given pid to exit. When force is True, waitpid()
    is polled with WNOHANG so we can never block forever: after the first
    timer expires we start sending SIGTERM once per second, after the
    second timer we escalate to SIGKILL, and once the overall timeout
    expires we give up and raise.

    :param pid: The pid of the child process to wait for
    :param raise_error: If True, re-raise any error hit while waiting
                        (including an unexpected exit status)
    :param expected_exitcode: The wait status waitpid() is expected to
                              report for the child
    :param force: If True, poll with WNOHANG and escalate to SIGTERM and
                  then SIGKILL instead of blocking indefinitely
    :returns: The wait status reported by waitpid() (0 if the child was
              already reaped)
    :raises RuntimeError: If the exit status does not match
                          expected_exitcode, or the child never exits
                          within the overall timeout
    """
    # For the first period, we wait without being pushy, but after
    # this timer expires, we start sending SIGTERM
    term_timer = timeutils.StopWatch(5)
    term_timer.start()

    # After this timer expires we start sending SIGKILL
    nice_timer = timeutils.StopWatch(7)
    nice_timer.start()

    # Process gets a maximum amount of time to exit before we fail the
    # test
    total_timer = timeutils.StopWatch(10)
    total_timer.start()

    while not total_timer.expired():
        try:
            # With WNOHANG, waitpid() returns a pid of 0 when the child
            # has not exited yet instead of blocking.
            cpid, rc = os.waitpid(pid, os.WNOHANG if force else 0)
            if cpid == 0 and force:
                if not term_timer.expired():
                    # Waiting for exit on first signal
                    pass
                elif not nice_timer.expired():
                    # Politely ask the process to GTFO
                    LOG.warning('Killing child %i with SIGTERM' % pid)
                    os.kill(pid, signal.SIGTERM)
                else:
                    # No more Mr. Nice Guy
                    LOG.warning('Killing child %i with SIGKILL' % pid)
                    os.kill(pid, signal.SIGKILL)
                    # A child killed by a signal reports that signal
                    # number as its raw wait status, so adjust what we
                    # expect waitpid() to hand back.
                    expected_exitcode = signal.SIGKILL
                time.sleep(1)
                continue
            LOG.info('waitpid(%i) returned %i,%i' % (pid, cpid, rc))
            if rc != expected_exitcode:
                raise RuntimeError('The exit code %d is not %d'
                                   % (rc, expected_exitcode))
            return rc
        except ChildProcessError:
            # Nothing to wait for: the child was already reaped.
            return 0
        except Exception as e:
            LOG.error('Got wait error: %s' % e)
            if raise_error:
                raise

    # Only reachable if the child got wedged (e.g. uninterruptible
    # sleep) and survived even SIGKILL for the whole timeout.
    raise RuntimeError('Gave up waiting for %i to exit!' % pid)
def execute(cmd,