Implement a --timeout for killing stalled scripts

This implements the feature required for bug #1595722 but the tripleo
bug is not closed until os-refresh-config is invoked with an
appropriate --timeout value.

Change-Id: Ibcbb2090aed126abec8dac49efa53ecbdb2b9b2c
DependsOn: If31f0d5d60e8585720c4c9c95cffa202f059f6f7
Partial-Bug: #1595722
This commit is contained in:
Steve Baker 2016-06-24 15:45:29 +12:00
parent d0cf563c2d
commit 1d828fa0dd
3 changed files with 50 additions and 5 deletions

View File

@ -18,10 +18,13 @@ import argparse
import fcntl
import logging
import os
import signal
import subprocess
import sys
import time
import psutil
OLD_BASE_DIR = '/opt/stack/os-config-refresh'
DEFAULT_BASE_DIR = '/usr/libexec/os-refresh-config'
@ -55,6 +58,21 @@ PHASES = ['pre-configure',
'migration']
def timeout():
p = psutil.Process()
children = list(p.get_children(recursive=True))
for child in children:
child.kill()
def exit(lock, statuscode=0):
signal.alarm(0)
if lock:
lock.truncate(0)
lock.close()
return statuscode
def main(argv=sys.argv):
parser = argparse.ArgumentParser(
description="""Runs through all of the phases to ensure
@ -72,6 +90,10 @@ def main(argv=sys.argv):
parser.add_argument('--lockfile',
default='/var/run/os-refresh-config.lock',
help='Lock file to prevent multiple running copies.')
parser.add_argument('--timeout',
type=int,
help='Seconds until the current run will be '
'terminated.')
options = parser.parse_args(argv[1:])
if options.print_base:
@ -101,6 +123,15 @@ def main(argv=sys.argv):
lock.truncate(0)
lock.write("Locked by pid==%d at %s\n" % (os.getpid(), time.localtime()))
def timeout_handler(signum, frame):
log.error('Timeout reached: %ss. Sending SIGKILL to all children' %
options.timeout)
timeout()
if options.timeout:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(options.timeout)
for phase in PHASES:
phase_dir = os.path.join(BASE_DIR, '%s.d' % phase)
log.debug('Checking %s' % phase_dir)
@ -124,13 +155,11 @@ def main(argv=sys.argv):
except OSError:
pass
log.error("Aborting...")
return 1
return exit(lock, 1)
else:
log.debug('No dir for phase %s' % phase)
lock.truncate(0)
lock.close()
return 0
return exit(lock)
if __name__ == '__main__':

View File

@ -116,7 +116,6 @@ exit %(returncode)s
self._write_script('pre-configure', '20-pre-second', 99)
self._write_script('configure', '10-conf-first', 0)
returncode, stdout, stderr = self._run_orc()
print(stderr)
self.assertEqual('\n'.join([
'10-pre-first starting',
'10-pre-first done',
@ -126,6 +125,22 @@ exit %(returncode)s
]), stdout)
self.assertEqual(1, returncode)
def test_cmd_with_timeout(self):
self._write_script('pre-configure', '10-pre-first', 0, 5)
self._write_script('pre-configure', '20-pre-second', 0, 5)
self._write_script('configure', '10-conf-first', 0, 5)
now = time.time()
returncode, stdout, stderr = self._run_orc('--timeout', '2',
'--log-level', 'DEBUG')
# check run time accounts for the 2 seconds timeout
self.assertTrue(time.time() - now >= 2.0)
self.assertEqual('\n'.join([
'10-pre-first starting',
'',
]), stdout)
self.assertEqual(1, returncode)
def test_debug(self):
returncode, stdout, stderr = self._run_orc('--log-level', 'DEBUG')
self.assertEqual('', stdout)

View File

@ -3,3 +3,4 @@
# process, which may cause wedges in the gate later.
pbr>=1.6 # Apache-2.0
dib-utils # Apache-2.0
psutil>=1.1.1,<2.0.0 # BSD