agent: poll long-running commands till completion

Currently, install_bootloader is invoked with wait=True and a longer timeout. As a more robust alternative, poll the agent until the command completes; this avoids having to guess how long the command will actually take. Change-Id: I62e9086441fa2b164aee42f7489d12aed4076f49 Story: #2006963
2020-06-07 13:05:13 +02:00 · 2020-06-07 13:05:13 +02:00 · 7828fe8b64
parent a2ad31ddef
commit 7828fe8b64
5 changed files with 157 additions and 56 deletions
--- a/ironic/common/exception.py
+++ b/ironic/common/exception.py
@ -664,6 +664,10 @@ class AgentConnectionFailed(IronicException):
    _msg_fmt = _("Connection to agent failed: %(reason)s")


+class AgentCommandTimeout(IronicException):
+    # Signals that a command sent to the agent has not completed. The
+    # message expects the 'command' and 'node' format arguments.
+    _msg_fmt = _("Timeout executing command %(command)s on node %(node)s")
+
+
 class NodeProtected(HTTPForbidden):
    _msg_fmt = _("Node %(node)s is protected and cannot be undeployed, "
                 "rebuilt or deleted")
--- a/ironic/conf/agent.py
+++ b/ironic/conf/agent.py
@ -113,16 +113,20 @@ opts = [
    cfg.IntOpt('command_timeout',
               default=60,
               mutable=True,
-               help=_('Timeout (in seconds) for IPA commands. '
-                      'Please note, the bootloader installation command '
-                      'to the agent is permitted a timeout of twice the '
-                      'value set here as these are IO heavy operations '
-                      'depending on the configuration of the instance.')),
+               help=_('Timeout (in seconds) for IPA commands.')),
    cfg.IntOpt('max_command_attempts',
               default=3,
               help=_('This is the maximum number of attempts that will be '
                      'done for IPA commands that fails due to network '
                      'problems.')),
+    cfg.IntOpt('command_wait_attempts',
+               default=100,
+               help=_('Number of attempts to check for asynchronous commands '
+                      'completion before timing out.')),
+    cfg.IntOpt('command_wait_interval',
+               default=6,
+               help=_('Number of seconds to wait for between checks for '
+                      'asynchronous commands completion.')),
    cfg.IntOpt('neutron_agent_poll_interval',
               default=2,
               mutable=True,
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@ -56,13 +56,60 @@ class AgentClient(object):
            'params': params,
        })

+    def _raise_if_typeerror(self, result, node, method):
+        """Raise AgentAPIError if the agent reported an internal TypeError.
+
+        :param result: A dict with the command result returned by the agent;
+                       its 'command_error' and 'faultstring' entries are
+                       read.
+        :param node: A Node object.
+        :param method: A string represents the command executed by agent.
+        :raises: AgentAPIError when 'command_error' is set and its 'type'
+                 field equals 'TypeError'.
+        """
+        error = result.get('command_error')
+        if error and error.get('type') == 'TypeError':
+            LOG.error('Agent command %(method)s for node %(node)s failed. '
+                      'Internal TypeError detected: Error %(error)s',
+                      {'method': method, 'node': node.uuid, 'error': error})
+            raise exception.AgentAPIError(node=node.uuid,
+                                          status=error.get('code'),
+                                          error=result.get('faultstring'))
+
+    @METRICS.timer('AgentClient._wait_for_command')
+    @retrying.retry(
+        # Only AgentCommandTimeout (raised below while the command is
+        # missing or still RUNNING) triggers another attempt.
+        retry_on_exception=(
+            lambda e: isinstance(e, exception.AgentCommandTimeout)),
+        # NOTE(review): these CONF values are evaluated once, when the
+        # class body is executed, not on every call.
+        stop_max_attempt_number=CONF.agent.command_wait_attempts,
+        # command_wait_interval is in seconds; wait_fixed takes
+        # milliseconds.
+        wait_fixed=CONF.agent.command_wait_interval * 1000)
+    def _wait_for_command(self, node, method):
+        """Check once whether a command has completed on the agent.
+
+        Looks up the command in the agent's command list. While it is
+        missing or still RUNNING, AgentCommandTimeout is raised so that the
+        retry decorator calls this method again.
+
+        :param node: A Node object.
+        :param method: A string represents the command executed by agent.
+        :raises: AgentCommandTimeout when the command is not in the list
+                 returned by get_commands_status or its status is RUNNING.
+        :raises: AgentAPIError when the finished command reports an
+                 internal TypeError.
+        :returns: A dict with the matching entry from the command list.
+        """
+        # Drop the extension prefix (e.g. 'standby.run_image' becomes
+        # 'run_image'); a name without a dot is used as-is.
+        try:
+            method = method.split('.', 1)[1]
+        except IndexError:
+            pass
+
+        commands = self.get_commands_status(node)
+        # Scan from the end of the list so the last matching entry wins.
+        try:
+            result = next(c for c in reversed(commands)
+                          if c.get('command_name') == method)
+        except StopIteration:
+            LOG.debug('Command %(cmd)s is not in the executing commands list '
+                      'for node %(node)s',
+                      {'cmd': method, 'node': node.uuid})
+            raise exception.AgentCommandTimeout(command=method, node=node.uuid)
+
+        # Any status other than RUNNING is treated as finished.
+        if result.get('command_status') == 'RUNNING':
+            LOG.debug('Command %(cmd)s has not finished yet for node %(node)s',
+                      {'cmd': method, 'node': node.uuid})
+            raise exception.AgentCommandTimeout(command=method, node=node.uuid)
+        else:
+            LOG.debug('Command %(cmd)s has finished for node %(node)s with '
+                      'result %(result)s',
+                      {'cmd': method, 'node': node.uuid, 'result': result})
+            self._raise_if_typeerror(result, node, method)
+            return result
+
    @METRICS.timer('AgentClient._command')
    @retrying.retry(
        retry_on_exception=(
            lambda e: isinstance(e, exception.AgentConnectionFailed)),
        stop_max_attempt_number=CONF.agent.max_command_attempts)
-    def _command(self, node, method, params, wait=False,
-                 command_timeout_factor=1):
+    def _command(self, node, method, params, wait=False, poll=False):
        """Sends command to agent.

        :param node: A Node object.
@ -72,19 +119,16 @@ class AgentClient(object):
                       body.
        :param wait: True to wait for the command to finish executing, False
                     otherwise.
-        :param command_timeout_factor: An integer, default 1, by which to
-                                       multiply the [agent]command_timeout
-                                       value. This is intended for use with
-                                       extremely long running commands to
-                                       the agent ramdisk where a general
-                                       timeout value should not be extended
-                                       in all cases.
+        :param poll: Whether to poll the command until completion. Provides
+                     a better alternative to `wait` for long-running commands.
        :raises: IronicException when failed to issue the request or there was
                 a malformed response from the agent.
        :raises: AgentAPIError when agent failed to execute specified command.
        :returns: A dict containing command result from agent, see
                  get_commands_status for a sample.
        """
+        assert not (wait and poll)
+
        url = self._get_command_url(node)
        body = self._get_command_body(method, params)
        request_params = {
@ -99,7 +143,7 @@ class AgentClient(object):
        try:
            response = self.session.post(
                url, params=request_params, data=body,
-                timeout=CONF.agent.command_timeout * command_timeout_factor)
+                timeout=CONF.agent.command_timeout)
        except (requests.ConnectionError, requests.Timeout) as e:
            msg = (_('Failed to connect to the agent running on node %(node)s '
                     'for invoking command %(method)s. Error: %(error)s') %
@ -128,12 +172,6 @@ class AgentClient(object):
            raise exception.IronicException(msg)

        error = result.get('command_error')
-        exc_type = None
-        if error:
-            # if an error, we should see if a type field exists. This type
-            # field may signal an exception that is compatability based.
-            exc_type = error.get('type')
-
        LOG.debug('Agent command %(method)s for node %(node)s returned '
                  'result %(res)s, error %(error)s, HTTP status code %(code)d',
                  {'node': node.uuid, 'method': method,
@ -149,14 +187,11 @@ class AgentClient(object):
            raise exception.AgentAPIError(node=node.uuid,
                                          status=response.status_code,
                                          error=result.get('faultstring'))
-        if exc_type == 'TypeError':
-            LOG.error('Agent command %(method)s for node %(node)s failed. '
-                      'Internal %(exc_type)s error detected: Error %(error)s',
-                      {'method': method, 'node': node.uuid,
-                       'exc_type': exc_type, 'error': error})
-            raise exception.AgentAPIError(node=node.uuid,
-                                          status=error.get('code'),
-                                          error=result.get('faultstring'))
+
+        self._raise_if_typeerror(result, node, method)
+
+        if poll:
+            result = self._wait_for_command(node, method)

        return result

@ -245,7 +280,7 @@ class AgentClient(object):
        return self._command(node=node,
                             method='standby.prepare_image',
                             params=params,
-                             wait=wait)
+                             poll=wait)

    @METRICS.timer('AgentClient.start_iscsi_target')
    def start_iscsi_target(self, node, iqn,
@ -313,8 +348,7 @@ class AgentClient(object):
            return self._command(node=node,
                                 method='image.install_bootloader',
                                 params=params,
-                                 wait=True,
-                                 command_timeout_factor=2)
+                                 poll=True)
        except exception.AgentAPIError:
            # NOTE(arne_wiebalck): If for software RAID and 'uefi' as the boot
            # mode, we find that the IPA does not yet support the additional
@ -338,8 +372,7 @@ class AgentClient(object):
                return self._command(node=node,
                                     method='image.install_bootloader',
                                     params=params,
-                                     wait=True,
-                                     command_timeout_factor=2)
+                                     poll=True)

    @METRICS.timer('AgentClient.get_clean_steps')
    def get_clean_steps(self, node, ports):
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@ -29,13 +29,29 @@ CONF = conf.CONF


 class MockResponse(object):
-    def __init__(self, text, status_code=http_client.OK):
-        assert isinstance(text, str)
+    def __init__(self, data=None, status_code=http_client.OK, text=None):
+        assert not (data and text)
        self.text = text
+        self.data = data
        self.status_code = status_code

    def json(self):
-        return json.loads(self.text)
+        if self.text:
+            return json.loads(self.text)
+        else:
+            return self.data
+
+
+class MockCommandStatus(MockResponse):
+    """Mock response whose JSON data is a one-entry 'commands' list."""
+
+    def __init__(self, status, name='fake', error=None):
+        # Only the status, name and error of the single command entry are
+        # configurable; the command result is a fixed placeholder string.
+        super().__init__({
+            'commands': [
+                {'command_name': name,
+                 'command_status': status,
+                 'command_result': 'I did something',
+                 'command_error': error}
+            ]
+        })


 class MockNode(object):
@ -87,8 +103,7 @@ class TestAgentClient(base.TestCase):

    def test__command(self):
        response_data = {'status': 'ok'}
-        response_text = json.dumps(response_data)
-        self.client.session.post.return_value = MockResponse(response_text)
+        self.client.session.post.return_value = MockResponse(response_data)
        method = 'standby.run_image'
        image_info = {'image_id': 'test_image'}
        params = {'image_info': image_info}
@ -106,7 +121,8 @@ class TestAgentClient(base.TestCase):

    def test__command_fail_json(self):
        response_text = 'this be not json matey!'
-        self.client.session.post.return_value = MockResponse(response_text)
+        self.client.session.post.return_value = MockResponse(
+            text=response_text)
        method = 'standby.run_image'
        image_info = {'image_id': 'test_image'}
        params = {'image_info': image_info}
@ -159,7 +175,7 @@ class TestAgentClient(base.TestCase):
                          'error': error}, str(e))

    def test__command_error_code(self):
-        response_text = '{"faultstring": "you dun goofd"}'
+        response_text = {"faultstring": "you dun goofd"}
        self.client.session.post.return_value = MockResponse(
            response_text, status_code=http_client.BAD_REQUEST)
        method = 'standby.run_image'
@ -179,10 +195,9 @@ class TestAgentClient(base.TestCase):
            timeout=60)

    def test__command_error_code_okay_error_typeerror_embedded(self):
-        response_text = ('{"faultstring": "you dun goofd", '
-                         '"command_error": {"type": "TypeError"}}')
-        self.client.session.post.return_value = MockResponse(
-            response_text)
+        response_data = {"faultstring": "you dun goofd",
+                         "command_error": {"type": "TypeError"}}
+        self.client.session.post.return_value = MockResponse(response_data)
        method = 'standby.run_image'
        image_info = {'image_id': 'test_image'}
        params = {'image_info': image_info}
@ -199,6 +214,36 @@ class TestAgentClient(base.TestCase):
            params={'wait': 'false'},
            timeout=60)

+    @mock.patch('time.sleep', lambda seconds: None)
+    def test__command_poll(self):
+        """_command(poll=True) returns the polled command status entry."""
+        # NOTE(review): time.sleep is patched out above, presumably so the
+        # wait between polling attempts does not slow the test down.
+        response_data = {'status': 'ok'}
+        final_status = MockCommandStatus('SUCCEEDED', name='run_image')
+        self.client.session.post.return_value = MockResponse(response_data)
+        # The first status check reports RUNNING, the second one SUCCEEDED.
+        self.client.session.get.side_effect = [
+            MockCommandStatus('RUNNING', name='run_image'),
+            final_status,
+        ]
+
+        method = 'standby.run_image'
+        image_info = {'image_id': 'test_image'}
+        params = {'image_info': image_info}
+        # The expected result is the command entry from the final status
+        # response, not the body of the initial POST response.
+        expected = {'command_error': None,
+                    'command_name': 'run_image',
+                    'command_result': 'I did something',
+                    'command_status': 'SUCCEEDED'}
+
+        url = self.client._get_command_url(self.node)
+        body = self.client._get_command_body(method, params)
+
+        response = self.client._command(self.node, method, params, poll=True)
+        self.assertEqual(expected, response)
+        # The command itself is posted once, with wait=false.
+        self.client.session.post.assert_called_once_with(
+            url,
+            data=body,
+            params={'wait': 'false'},
+            timeout=60)
+        self.client.session.get.assert_called_with(url, timeout=60)
+
    def test_get_commands_status(self):
        with mock.patch.object(self.client.session, 'get',
                               autospec=True) as mock_get:
@ -234,7 +279,7 @@ class TestAgentClient(base.TestCase):
                                  wait=False)
        self.client._command.assert_called_once_with(
            node=self.node, method='standby.prepare_image',
-            params=params, wait=False)
+            params=params, poll=False)

    def test_prepare_image_with_configdrive(self):
        self.client._command = mock.MagicMock(spec_set=[])
@ -251,7 +296,19 @@ class TestAgentClient(base.TestCase):
                                  wait=False)
        self.client._command.assert_called_once_with(
            node=self.node, method='standby.prepare_image',
-            params=params, wait=False)
+            params=params, poll=False)
+
+    def test_prepare_image_with_wait(self):
+        """prepare_image(wait=True) is passed to _command as poll=True."""
+        self.client._command = mock.MagicMock(spec_set=[])
+        image_info = {'image_id': 'image'}
+        params = {'image_info': image_info}
+
+        self.client.prepare_image(self.node,
+                                  image_info,
+                                  wait=True)
+        self.client._command.assert_called_once_with(
+            node=self.node, method='standby.prepare_image',
+            params=params, poll=True)

    def test_start_iscsi_target(self):
        self.client._command = mock.MagicMock(spec_set=[])
@ -305,9 +362,8 @@ class TestAgentClient(base.TestCase):
            self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid,
            prep_boot_part_uuid=prep_boot_part_uuid, target_boot_mode='hello')
        self.client._command.assert_called_once_with(
-            command_timeout_factor=2, node=self.node,
-            method='image.install_bootloader', params=params,
-            wait=True)
+            node=self.node, method='image.install_bootloader', params=params,
+            poll=True)

    def test_install_bootloader(self):
        self._test_install_bootloader(root_uuid='fake-root-uuid',
@ -415,8 +471,7 @@ class TestAgentClient(base.TestCase):

    def test__command_agent_client(self):
        response_data = {'status': 'ok'}
-        response_text = json.dumps(response_data)
-        self.client.session.post.return_value = MockResponse(response_text)
+        self.client.session.post.return_value = MockResponse(response_data)
        method = 'standby.run_image'
        image_info = {'image_id': 'test_image'}
        params = {'image_info': image_info}
@ -472,13 +527,12 @@ class TestAgentClientAttempts(base.TestCase):
        mock_sleep.return_value = None
        error = 'Connection Timeout'
        response_data = {'status': 'ok'}
-        response_text = json.dumps(response_data)
        method = 'standby.run_image'
        image_info = {'image_id': 'test_image'}
        params = {'image_info': image_info}
        self.client.session.post.side_effect = [requests.Timeout(error),
                                                requests.Timeout(error),
-                                                MockResponse(response_text)]
+                                                MockResponse(response_data)]

        response = self.client._command(self.node, method, params)
        self.assertEqual(3, self.client.session.post.call_count)
@ -494,12 +548,11 @@ class TestAgentClientAttempts(base.TestCase):
        mock_sleep.return_value = None
        error = 'Connection Timeout'
        response_data = {'status': 'ok'}
-        response_text = json.dumps(response_data)
        method = 'standby.run_image'
        image_info = {'image_id': 'test_image'}
        params = {'image_info': image_info}
        self.client.session.post.side_effect = [requests.Timeout(error),
-                                                MockResponse(response_text),
+                                                MockResponse(response_data),
                                                requests.Timeout(error)]

        response = self.client._command(self.node, method, params)
--- a/releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml
+++ b/releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml
@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Instead of increasing the timeout when running long synchronous tasks on
+    ironic-python-agent, ironic now runs them asynchronously and polls
+    the agent until completion. It is no longer necessary to account for
+    long-running tasks when setting ``[agent]command_timeout``.