Make wait_for_fork() more robust against infinite deadlock

There is something about the process handling in the functional test base
that opens the scenario where we might signal our API master child,
waitpid() on it and never return (see bug 1891190). Specifically the
test_reload() case, where the API master is sent SIGHUP several times,
which may cause it to temporarily mask SIGTERM (or similar), so it
ignores our request. Either way, a deadlock in the tests can
occur when the cleanup() routine goes to waitpid() on that master,
forever.

This change makes our wait_for_fork() helper call waitpid() with WNOHANG
and progressively ramp up the aggression on asking it to stop. First, it
waits for the original signal to be honored, then it starts sending SIGTERM,
followed by SIGKILLs until the overall timeout expires, after which it
fails the test. The latter should only ever happen if something gets
wedged in uninterruptible sleep.

Related-Bug: #1891190
Change-Id: Ie0e91e2ee86f33bcdb63eed74f2d9948108acfd4
This commit is contained in:
Dan Smith 2020-08-11 16:45:38 -07:00
parent ed3296b23a
commit 8e8072c16a
2 changed files with 52 additions and 13 deletions

View File

@ -290,7 +290,8 @@ class PosixServer(BaseServer):
if self.stop_kill:
os.kill(self.process_pid, signal.SIGTERM)
rc = test_utils.wait_for_fork(self.process_pid, raise_error=False)
rc = test_utils.wait_for_fork(self.process_pid, raise_error=False,
force=self.stop_kill)
return (rc, '', '')

View File

@ -20,6 +20,7 @@ import functools
import os
import shlex
import shutil
import signal
import socket
import subprocess
import threading
@ -32,6 +33,7 @@ from oslo_config import cfg
from oslo_config import fixture as cfg_fixture
from oslo_log.fixture import logging_error as log_fixture
from oslo_log import log
from oslo_utils import timeutils
import six
from six.moves import BaseHTTPServer
from six.moves import http_client as http
@ -49,6 +51,7 @@ from glance.db.sqlalchemy import api as db_api
from glance.tests.unit import fixtures as glance_fixtures
CONF = cfg.CONF
LOG = log.getLogger(__name__)
try:
CONF.debug
except cfg.NoSuchOptError:
@ -280,7 +283,8 @@ def fork_exec(cmd,
def wait_for_fork(pid,
                  raise_error=True,
                  expected_exitcode=0,
                  force=True):
    """Wait for a child process to complete, escalating signals as needed.

    This waits for the given pid to exit. When force is True, waitpid()
    is polled with WNOHANG so we can never block forever: after the first
    timer expires we start sending SIGTERM once per second, after the
    second timer we escalate to SIGKILL, and once the overall timeout
    expires we give up and raise.

    :param pid: The pid of the child process to wait for
    :param raise_error: If True, re-raise any error hit while waiting
                        (including an unexpected exit status)
    :param expected_exitcode: The wait status waitpid() is expected to
                              report for the child
    :param force: If True, poll with WNOHANG and escalate to SIGTERM and
                  then SIGKILL instead of blocking indefinitely
    :returns: The wait status reported by waitpid() (0 if the child was
              already reaped)
    :raises RuntimeError: If the exit status does not match
                          expected_exitcode, or the child never exits
                          within the overall timeout
    """
    # For the first period, we wait without being pushy, but after
    # this timer expires, we start sending SIGTERM
    term_timer = timeutils.StopWatch(5)
    term_timer.start()

    # After this timer expires we start sending SIGKILL
    nice_timer = timeutils.StopWatch(7)
    nice_timer.start()

    # Process gets a maximum amount of time to exit before we fail the
    # test
    total_timer = timeutils.StopWatch(10)
    total_timer.start()

    while not total_timer.expired():
        try:
            # With WNOHANG, waitpid() returns a pid of 0 when the child
            # has not exited yet instead of blocking.
            cpid, rc = os.waitpid(pid, os.WNOHANG if force else 0)
            if cpid == 0 and force:
                if not term_timer.expired():
                    # Waiting for exit on first signal
                    pass
                elif not nice_timer.expired():
                    # Politely ask the process to GTFO
                    LOG.warning('Killing child %i with SIGTERM' % pid)
                    os.kill(pid, signal.SIGTERM)
                else:
                    # No more Mr. Nice Guy
                    LOG.warning('Killing child %i with SIGKILL' % pid)
                    os.kill(pid, signal.SIGKILL)
                    # A child killed by a signal reports that signal
                    # number as its raw wait status, so adjust what we
                    # expect waitpid() to hand back.
                    expected_exitcode = signal.SIGKILL
                time.sleep(1)
                continue
            LOG.info('waitpid(%i) returned %i,%i' % (pid, cpid, rc))
            if rc != expected_exitcode:
                raise RuntimeError('The exit code %d is not %d'
                                   % (rc, expected_exitcode))
            return rc
        except ChildProcessError:
            # Nothing to wait for: the child was already reaped.
            return 0
        except Exception as e:
            LOG.error('Got wait error: %s' % e)
            if raise_error:
                raise

    # Only reachable if the child got wedged (e.g. uninterruptible
    # sleep) and survived even SIGKILL for the whole timeout.
    raise RuntimeError('Gave up waiting for %i to exit!' % pid)
def execute(cmd,