From 3e5198225267c9f6ccc732e2a8f12c704a895215 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 9 Sep 2019 17:26:21 +0100 Subject: [PATCH] Don't resume deployment or cleaning on heartbeat when polling Some drivers use a periodic task to poll for completion of a deploy or clean step. The iDRAC RAID driver is one example of this. In https://review.opendev.org/#/c/676152, the agent heartbeat handler was modified to resume deployment if not currently in the core deploy step. This makes sense for the ilo driver, which does not poll for completion of RAID configuration, but the iDRAC driver polls the lifecycle controller's job queue, and expects to be able to resume deployment once the job is complete. However, there is now a race between the agent heartbeat as the node boots up, and the job queue poller. This change adds new flags, cleaning_polling and deployment_polling, which can be used by a driver to signal that they are polling for completion of a deploy step, and that the agent heartbeat should not be used for this purpose. We also add here some more cleanup of the cleaning and deployment step metadata in driver_internal_info, since if these fields are left in place they may affect subsequent cleaning or deployment steps. Change-Id: I34591440ab993a80a0cc88be6e10e33f1ae4a660 Story: 2003817 Task: 36563 --- ironic/conductor/manager.py | 19 ++++++ ironic/conductor/utils.py | 8 ++- ironic/drivers/modules/agent_base_vendor.py | 16 ++++- ironic/drivers/modules/deploy_utils.py | 15 ++++- ironic/tests/unit/conductor/test_manager.py | 58 ++++++++++++++++- ironic/tests/unit/conductor/test_utils.py | 14 +++++ .../drivers/modules/test_agent_base_vendor.py | 63 +++++++++++++++++++ .../unit/drivers/modules/test_deploy_utils.py | 12 ++-- ...leaning-polling-flag-be13a866a7c302d7.yaml | 8 +++ 9 files changed, 199 insertions(+), 14 deletions(-) create mode 100644 releasenotes/notes/deployment-cleaning-polling-flag-be13a866a7c302d7.yaml diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index b8ceb00a94..8964ce5e52 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -916,12 +916,17 @@ class ConductorManager(base_manager.BaseConductorManager): 'state': node.provision_state, 'deploy_state': ', '.join(expected_states)}) + save_required = False info = node.driver_internal_info try: skip_current_step = info.pop('skip_current_deploy_step') except KeyError: skip_current_step = True else: + save_required = True + if info.pop('deployment_polling', None) is not None: + save_required = True + if save_required: node.driver_internal_info = info node.save() @@ -1227,12 +1232,17 @@ class ConductorManager(base_manager.BaseConductorManager): 'state': node.provision_state, 'clean_state': states.CLEANWAIT}) + save_required = False info = node.driver_internal_info try: skip_current_step = info.pop('skip_current_clean_step') except KeyError: skip_current_step = True else: + save_required = True + if info.pop('cleaning_polling', None) is not None: + save_required = True + if save_required: node.driver_internal_info = info node.save() @@ -1460,6 +1470,7 @@ class ConductorManager(base_manager.BaseConductorManager): driver_internal_info['clean_steps'] = None driver_internal_info.pop('clean_step_index', None) driver_internal_info.pop('cleaning_reboot', None) + driver_internal_info.pop('cleaning_polling', None) node.driver_internal_info = driver_internal_info node.save() try: @@ -1543,6 +1554,13 @@ class ConductorManager(base_manager.BaseConductorManager): node.last_error = last_error node.clean_step = None + info = node.driver_internal_info + # Clear any leftover metadata about cleaning + info.pop('clean_step_index', None) + info.pop('cleaning_reboot', None) + info.pop('cleaning_polling', None) + info.pop('skip_current_clean_step', None) + node.driver_internal_info = info node.save() LOG.info(info_message) @@ -3958,6 +3976,7 @@ def _do_next_deploy_step(task, step_index, conductor_id): driver_internal_info['deploy_steps'] = None driver_internal_info.pop('deploy_step_index', None) driver_internal_info.pop('deployment_reboot', None) + driver_internal_info.pop('deployment_polling', None) node.driver_internal_info = driver_internal_info node.save() diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 51e1bf6ea3..428c07002d 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -408,9 +408,11 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, # Clear clean step, msg should already include current step node.clean_step = {} info = node.driver_internal_info + # Clear any leftover metadata about cleaning info.pop('clean_step_index', None) - # Clear any leftover metadata about cleaning reboots info.pop('cleaning_reboot', None) + info.pop('cleaning_polling', None) + info.pop('skip_current_clean_step', None) node.driver_internal_info = info # For manual cleaning, the target provision state is MANAGEABLE, whereas # for automated cleaning, it is AVAILABLE. @@ -466,9 +468,11 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False, # in node.driver_internal_info for debugging purposes. node.deploy_step = {} info = node.driver_internal_info + # Clear any leftover metadata about deployment. info.pop('deploy_step_index', None) - # Clear any leftover metadata about deployment reboots info.pop('deployment_reboot', None) + info.pop('deployment_polling', None) + info.pop('skip_current_deploy_step', None) node.driver_internal_info = info if cleanup_err: diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index 3a1c507827..51a33fd401 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -377,7 +377,7 @@ class HeartbeatMixin(object): # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we # are currently in the core deploy.deploy step. Other deploy steps # may cause the agent to boot, but we should not trigger deployment - # at that point. + # at that point if the driver is polling for completion of a step. if node.provision_state == states.DEPLOYWAIT: if self.in_core_deploy_step(task): if not self.deploy_has_started(task): @@ -391,7 +391,12 @@ class HeartbeatMixin(object): else: # The exceptions from RPC are not possible as we using cast # here - manager_utils.notify_conductor_resume_deploy(task) + # Check if the driver is polling for completion of a step, + # via the 'deployment_polling' flag. + polling = node.driver_internal_info.get( + 'deployment_polling', False) + if not polling: + manager_utils.notify_conductor_resume_deploy(task) node.touch_provisioning() elif node.provision_state == states.CLEANWAIT: node.touch_provisioning() @@ -408,7 +413,12 @@ class HeartbeatMixin(object): manager_utils.notify_conductor_resume_clean(task) else: msg = _('Node failed to check cleaning progress.') - self.continue_cleaning(task) + # Check if the driver is polling for completion of a step, + # via the 'cleaning_polling' flag. + polling = node.driver_internal_info.get( + 'cleaning_polling', False) + if not polling: + self.continue_cleaning(task) elif (node.provision_state == states.RESCUEWAIT): msg = _('Node failed to perform rescue operation.') self._finalize_rescue(task) diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index c96c3eb378..e29b3668cb 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -1475,7 +1475,8 @@ def get_async_step_return_state(node): return states.CLEANWAIT if node.clean_step else states.DEPLOYWAIT -def set_async_step_flags(node, reboot=None, skip_current_step=None): +def set_async_step_flags(node, reboot=None, skip_current_step=None, + polling=None): """Sets appropriate reboot flags in driver_internal_info based on operation :param node: an ironic node object. @@ -1488,16 +1489,24 @@ def set_async_step_flags(node, reboot=None, skip_current_step=None): skip_current_deploy_step based on cleaning or deployment operation in progress. If it is None, corresponding skip step flag is not set in node's driver_internal_info. + :param polling: Boolean value to set for node's driver_internal_info flag + deployment_polling or cleaning_polling. If it is None, the + corresponding polling flag is not set in the node's + driver_internal_info. """ info = node.driver_internal_info cleaning = {'reboot': 'cleaning_reboot', - 'skip': 'skip_current_clean_step'} + 'skip': 'skip_current_clean_step', + 'polling': 'cleaning_polling'} deployment = {'reboot': 'deployment_reboot', - 'skip': 'skip_current_deploy_step'} + 'skip': 'skip_current_deploy_step', + 'polling': 'deployment_polling'} fields = cleaning if node.clean_step else deployment if reboot is not None: info[fields['reboot']] = reboot if skip_current_step is not None: info[fields['skip']] = skip_current_step + if polling is not None: + info[fields['polling']] = polling node.driver_internal_info = info node.save() diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index e0211367b0..90152cc8da 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -1878,6 +1878,26 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, def test_continue_node_deploy_no_skip_step(self): self._continue_node_deploy_skip_step(skip=False) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_continue_node_deploy_polling(self, mock_spawn): + # test that deployment_polling flag is cleared + driver_info = {'deploy_steps': self.deploy_steps, + 'deploy_step_index': 0, + 'deployment_polling': True} + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYWAIT, + target_provision_state=states.MANAGEABLE, + driver_internal_info=driver_info, deploy_step=self.deploy_steps[0]) + self._start_service() + self.service.continue_node_deploy(self.context, node.uuid) + self._stop_service() + node.refresh() + self.assertNotIn('deployment_polling', node.driver_internal_info) + mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step, + mock.ANY, 1, mock.ANY) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', autospec=True) def test_do_next_deploy_step_oob_reboot(self, mock_execute): @@ -3257,7 +3277,12 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin, self.context, driver='fake-hardware', provision_state=states.CLEANFAIL, target_provision_state=states.AVAILABLE, - clean_step={'step': 'foo', 'abortable': True}) + clean_step={'step': 'foo', 'abortable': True}, + driver_internal_info={ + 'clean_step_index': 2, + 'cleaning_reboot': True, + 'cleaning_polling': True, + 'skip_current_clean_step': True}) with task_manager.acquire(self.context, node.uuid) as task: self.service._do_node_clean_abort(task, step_name=step_name) @@ -3265,8 +3290,16 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin, tear_mock.assert_called_once_with(task.driver.deploy, task) if step_name: self.assertIn(step_name, task.node.last_error) - # assert node's clean_step was cleaned up + # assert node's clean_step and metadata was cleaned up self.assertEqual({}, task.node.clean_step) + self.assertNotIn('clean_step_index', + task.node.driver_internal_info) + self.assertNotIn('cleaning_reboot', + task.node.driver_internal_info) + self.assertNotIn('cleaning_polling', + task.node.driver_internal_info) + self.assertNotIn('skip_current_clean_step', + task.node.driver_internal_info) def test__do_node_clean_abort(self): self._test__do_node_clean_abort(None) @@ -3543,6 +3576,27 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test_continue_node_clean_no_skip_step(self): self._continue_node_clean_skip_step(skip=False) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_continue_node_clean_polling(self, mock_spawn): + # test that cleaning_polling flag is cleared + driver_info = {'clean_steps': self.clean_steps, + 'clean_step_index': 0, + 'cleaning_polling': True} + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.CLEANWAIT, + target_provision_state=states.MANAGEABLE, + driver_internal_info=driver_info, clean_step=self.clean_steps[0]) + self._start_service() + self.service.continue_node_clean(self.context, node.uuid) + self._stop_service() + node.refresh() + self.assertNotIn('cleaning_polling', node.driver_internal_info) + mock_spawn.assert_called_with(self.service, + self.service._do_next_clean_step, + mock.ANY, 1) + def _continue_node_clean_abort(self, manual=False): last_clean_step = self.clean_steps[0] last_clean_step['abortable'] = False diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 7653ed391e..1526b2ec98 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -915,6 +915,11 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase): self.task) def test_deploying_error_handler(self): + info = self.node.driver_internal_info + info['deploy_step_index'] = 2 + info['deployment_reboot'] = True + info['deployment_polling'] = True + info['skip_current_deploy_step'] = True conductor_utils.deploying_error_handler(self.task, self.logmsg, self.errmsg) @@ -924,6 +929,9 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase): self.assertEqual({}, self.node.deploy_step) self.assertNotIn('deploy_step_index', self.node.driver_internal_info) self.assertNotIn('deployment_reboot', self.node.driver_internal_info) + self.assertNotIn('deployment_polling', self.node.driver_internal_info) + self.assertNotIn('skip_current_deploy_step', + self.node.driver_internal_info) self.task.process_event.assert_called_once_with('fail') def _test_deploying_error_handler_cleanup(self, exc, expected_str): @@ -1049,12 +1057,18 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.node.clean_step = {'key': 'val'} self.node.driver_internal_info = { 'cleaning_reboot': True, + 'cleaning_polling': True, + 'skip_current_clean_step': True, 'clean_step_index': 0} msg = 'error bar' conductor_utils.cleaning_error_handler(self.task, msg) self.node.save.assert_called_once_with() self.assertEqual({}, self.node.clean_step) self.assertNotIn('clean_step_index', self.node.driver_internal_info) + self.assertNotIn('cleaning_reboot', self.node.driver_internal_info) + self.assertNotIn('cleaning_polling', self.node.driver_internal_info) + self.assertNotIn('skip_current_clean_step', + self.node.driver_internal_info) self.assertEqual(msg, self.node.last_error) self.assertTrue(self.node.maintenance) self.assertEqual(msg, self.node.maintenance_reason) diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index 4ac6497d45..11ecd83b9b 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -175,6 +175,46 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertFalse(rti_mock.called) self.assertTrue(in_resume_deploy_mock.called) + @mock.patch.object(manager_utils, + 'notify_conductor_resume_deploy', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'in_core_deploy_step', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'deploy_has_started', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'deploy_is_done', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', + autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'reboot_to_instance', autospec=True) + def test_heartbeat_not_in_core_deploy_step_polling(self, rti_mock, cd_mock, + deploy_is_done_mock, + deploy_started_mock, + in_deploy_mock, + in_resume_deploy_mock): + # Check that heartbeats do not trigger deployment actions when not in + # the deploy.deploy step. + in_deploy_mock.return_value = False + self.node.provision_state = states.DEPLOYWAIT + info = self.node.driver_internal_info + info['deployment_polling'] = True + self.node.driver_internal_info = info + self.node.save() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + self.deploy.heartbeat(task, 'url', '3.2.0') + self.assertFalse(task.shared) + self.assertEqual( + 'url', task.node.driver_internal_info['agent_url']) + self.assertEqual( + '3.2.0', + task.node.driver_internal_info['agent_version']) + self.assertFalse(deploy_started_mock.called) + self.assertFalse(deploy_is_done_mock.called) + self.assertFalse(cd_mock.called) + self.assertFalse(rti_mock.called) + self.assertFalse(in_resume_deploy_mock.called) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', autospec=True) @mock.patch.object(agent_base_vendor.HeartbeatMixin, @@ -451,6 +491,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): mock_touch.assert_called_once_with(mock.ANY) mock_continue.assert_called_once_with(mock.ANY, task) + @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'continue_cleaning', autospec=True) + def test_heartbeat_continue_cleaning_polling(self, mock_continue, + mock_touch): + info = self.node.driver_internal_info + info['cleaning_polling'] = True + self.node.driver_internal_info = info + self.node.clean_step = { + 'priority': 10, + 'interface': 'deploy', + 'step': 'foo', + 'reboot_requested': False + } + self.node.provision_state = states.CLEANWAIT + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0') + + mock_touch.assert_called_once_with(mock.ANY) + self.assertFalse(mock_continue.called) + @mock.patch.object(manager_utils, 'cleaning_error_handler') @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_cleaning', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index 72e903207f..7f448df304 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -2889,15 +2889,17 @@ class AsyncStepTestCase(db_base.DbTestCase): self.node.save() self._test_get_async_step_return_state() - def test_set_async_step_flags_cleaning_set_both(self): + def test_set_async_step_flags_cleaning_set_all(self): self.node.clean_step = {'step': 'create_configuration', 'interface': 'raid'} self.node.driver_internal_info = {} expected = {'cleaning_reboot': True, + 'cleaning_polling': True, 'skip_current_clean_step': True} self.node.save() utils.set_async_step_flags(self.node, reboot=True, - skip_current_step=True) + skip_current_step=True, + polling=True) self.assertEqual(expected, self.node.driver_internal_info) def test_set_async_step_flags_cleaning_set_one(self): @@ -2909,15 +2911,17 @@ class AsyncStepTestCase(db_base.DbTestCase): self.assertEqual({'cleaning_reboot': True}, self.node.driver_internal_info) - def test_set_async_step_flags_deploying_set_both(self): + def test_set_async_step_flags_deploying_set_all(self): self.node.deploy_step = {'step': 'create_configuration', 'interface': 'raid'} self.node.driver_internal_info = {} expected = {'deployment_reboot': True, + 'deployment_polling': True, 'skip_current_deploy_step': True} self.node.save() utils.set_async_step_flags(self.node, reboot=True, - skip_current_step=True) + skip_current_step=True, + polling=True) self.assertEqual(expected, self.node.driver_internal_info) def test_set_async_step_flags_deploying_set_one(self): diff --git a/releasenotes/notes/deployment-cleaning-polling-flag-be13a866a7c302d7.yaml b/releasenotes/notes/deployment-cleaning-polling-flag-be13a866a7c302d7.yaml new file mode 100644 index 0000000000..6e30ead3bc --- /dev/null +++ b/releasenotes/notes/deployment-cleaning-polling-flag-be13a866a7c302d7.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes an issue with asynchronous deploy steps that poll for completion + where the step could fail to execute. The ``deployment_polling`` and + ``cleaning_polling`` flags may be used by driver implementations to signal + that the driver is polling for completion. See `story 2003817 + `__ for details.