From 0256e1e69f4dd38cf0a80532337fa30c89502cb4 Mon Sep 17 00:00:00 2001 From: Dmitry Galkin Date: Wed, 19 Sep 2018 14:02:48 +0000 Subject: [PATCH] Fix node exclusive lock not released on console start/restart. This patch forces kill of console process with SIGKILL if it did not terminate on SIGTERM within the CONF.console.kill_timeout and reads the shellinabox subprocess stdout/stderr after CONF.console.subprocess_timeout or if subprocess exited with non zero code. Change-Id: I55a112d877d94f31d27487846ff59fe27f602f8b Closes-Bug: 1587313 Story: 1587313 Task: 9654 --- ironic/conf/console.py | 4 ++ ironic/drivers/modules/console_utils.py | 55 ++++++++++++------- .../drivers/modules/test_console_utils.py | 30 ++++++++-- ...llinabox-locking-fix-2fae2a451a8a489a.yaml | 8 +++ 4 files changed, 71 insertions(+), 26 deletions(-) create mode 100644 releasenotes/notes/shellinabox-locking-fix-2fae2a451a8a489a.yaml diff --git a/ironic/conf/console.py b/ironic/conf/console.py index 2713511132..2c361c1816 100644 --- a/ironic/conf/console.py +++ b/ironic/conf/console.py @@ -45,6 +45,10 @@ opts = [ default=10, help=_('Time (in seconds) to wait for the console subprocess ' 'to start.')), + cfg.IntOpt('kill_timeout', + default=1, + help=_('Time (in seconds) to wait for the shellinabox console ' + 'subprocess to exit before sending SIGKILL signal.')), cfg.IPOpt('socat_address', default='$my_ip', help=_('IP address of Socat service running on the host of ' diff --git a/ironic/drivers/modules/console_utils.py b/ironic/drivers/modules/console_utils.py index f28b1d4e4f..84eae03e62 100644 --- a/ironic/drivers/modules/console_utils.py +++ b/ironic/drivers/modules/console_utils.py @@ -101,6 +101,23 @@ def _stop_console(node_uuid): console_pid = _get_console_pid(node_uuid) os.kill(console_pid, signal.SIGTERM) + + # make sure the process gets killed hard if required + attempt = 0 + max_attempts = CONF.console.kill_timeout // 0.2 + + while attempt < max_attempts: + if psutil.pid_exists(console_pid): + if attempt == max_attempts - 1: + os.kill(console_pid, signal.SIGKILL) + LOG.debug("Waiting for the console process with PID %(pid)s " + "to exit. Node: %(node)s.", + {'pid': console_pid, 'node': node_uuid}) + time.sleep(0.2) + attempt += 1 + else: + break + except OSError as exc: if exc.errno != errno.ESRCH: msg = (_("Could not stop the console for node '%(node)s'. " @@ -198,28 +215,24 @@ def start_shellinabox_console(node_uuid, port, console_cmd): # check if the console pid is created and the process is running. # if it is, then the shellinaboxd is invoked successfully as a daemon. # otherwise check the error. - if locals['returncode'] is not None: - if (locals['returncode'] == 0 and os.path.exists(pid_file) - and psutil.pid_exists(_get_console_pid(node_uuid))): - raise loopingcall.LoopingCallDone() - else: - (stdout, stderr) = popen_obj.communicate() - locals['errstr'] = _( - "Command: %(command)s.\n" - "Exit code: %(return_code)s.\n" - "Stdout: %(stdout)r\n" - "Stderr: %(stderr)r") % { - 'command': ' '.join(args), - 'return_code': locals['returncode'], - 'stdout': stdout, - 'stderr': stderr} - LOG.warning(locals['errstr']) - raise loopingcall.LoopingCallDone() + if (locals['returncode'] == 0 and os.path.exists(pid_file) + and psutil.pid_exists(_get_console_pid(node_uuid))): + raise loopingcall.LoopingCallDone() - if (time.time() > expiration): - locals['errstr'] = (_("Timeout while waiting for console " - "subprocess to start for node %s.") % - node_uuid) + if (time.time() > expiration or locals['returncode'] is not None): + (stdout, stderr) = popen_obj.communicate() + locals['errstr'] = _( + "Timeout or error while waiting for console " + "subprocess to start for node: %(node)s.\n" + "Command: %(command)s.\n" + "Exit code: %(return_code)s.\n" + "Stdout: %(stdout)r\n" + "Stderr: %(stderr)r") % { + 'node': node_uuid, + 'command': ' '.join(args), + 'return_code': locals['returncode'], + 'stdout': stdout, + 'stderr': stderr} LOG.warning(locals['errstr']) raise loopingcall.LoopingCallDone() diff --git a/ironic/tests/unit/drivers/modules/test_console_utils.py b/ironic/tests/unit/drivers/modules/test_console_utils.py index 8f998bb3d9..6355ad0494 100644 --- a/ironic/tests/unit/drivers/modules/test_console_utils.py +++ b/ironic/tests/unit/drivers/modules/test_console_utils.py @@ -145,8 +145,28 @@ class ConsoleUtilsTestCase(db_base.DbTestCase): console_utils._stop_console(self.info['uuid']) mock_pid.assert_called_once_with(self.info['uuid']) - mock_kill.assert_called_once_with(mock_pid.return_value, - signal.SIGTERM) + + # a check if process still exist (signal 0) in a loop + mock_kill.assert_any_call(mock_pid.return_value, signal.SIG_DFL) + # and that it receives the SIGTERM + mock_kill.assert_any_call(mock_pid.return_value, signal.SIGTERM) + mock_unlink.assert_called_once_with(pid_file) + + @mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True) + @mock.patch.object(os, 'kill', autospec=True) + @mock.patch.object(psutil, 'pid_exists', autospec=True, return_value=True) + @mock.patch.object(console_utils, '_get_console_pid', autospec=True) + def test__stop_console_forced_kill(self, mock_pid, mock_psutil, mock_kill, + mock_unlink): + pid_file = console_utils._get_console_pid_file(self.info['uuid']) + mock_pid.return_value = 12345 + + console_utils._stop_console(self.info['uuid']) + + mock_pid.assert_called_once_with(self.info['uuid']) + + # Make sure console process receives hard SIGKILL + mock_kill.assert_any_call(mock_pid.return_value, signal.SIGKILL) mock_unlink.assert_called_once_with(pid_file) @mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True) @@ -322,7 +342,7 @@ class ConsoleUtilsTestCase(db_base.DbTestCase): mock_popen.assert_called_once_with(mock.ANY, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - mock_popen.return_value.poll.assert_called_once_with() + mock_popen.return_value.poll.assert_called_with() @mock.patch.object(console_utils, 'open', mock.mock_open(read_data='12345\n')) @@ -350,11 +370,11 @@ class ConsoleUtilsTestCase(db_base.DbTestCase): mock_stop.assert_called_once_with(self.info['uuid']) mock_dir_exists.assert_called_once_with() - mock_pid_exists.assert_called_once_with(12345) + mock_pid_exists.assert_called_with(12345) mock_popen.assert_called_once_with(mock.ANY, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - mock_popen.return_value.poll.assert_called_once_with() + mock_popen.return_value.poll.assert_called_with() @mock.patch.object(subprocess, 'Popen', autospec=True) @mock.patch.object(console_utils, '_ensure_console_pid_dir_exists', diff --git a/releasenotes/notes/shellinabox-locking-fix-2fae2a451a8a489a.yaml b/releasenotes/notes/shellinabox-locking-fix-2fae2a451a8a489a.yaml new file mode 100644 index 0000000000..12ee688b6a --- /dev/null +++ b/releasenotes/notes/shellinabox-locking-fix-2fae2a451a8a489a.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes a locking issue where ``ipmitool-shellinabox`` console + interface users may encounter a situation where the bare metal node is + locked until the conductor is restarted. + See story `1587313 `_ + for additional information.