diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 2085ed5be8..b8ceb00a94 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -1300,6 +1300,14 @@ class ConductorManager(base_manager.BaseConductorManager):
                       'successfully moved to AVAILABLE state.', node.uuid)
             return
 
+        # NOTE(dtantsur): this is only reachable during automated cleaning,
+        # for manual cleaning we verify maintenance mode earlier on.
+        if (not CONF.conductor.allow_provisioning_in_maintenance
+                and node.maintenance):
+            msg = _('Cleaning a node in maintenance mode is not allowed')
+            return utils.cleaning_error_handler(task, msg,
+                                                tear_down_cleaning=False)
+
         try:
             # NOTE(ghe): Valid power and network values are needed to perform
             # a cleaning.
@@ -1543,7 +1551,8 @@ class ConductorManager(base_manager.BaseConductorManager):
                                    exception.NodeLocked,
                                    exception.InvalidParameterValue,
                                    exception.InvalidStateRequested,
-                                   exception.UnsupportedDriverExtension)
+                                   exception.UnsupportedDriverExtension,
+                                   exception.NodeInMaintenance)
     def do_provisioning_action(self, context, node_id, action):
         """RPC method to initiate certain provisioning state transitions.
 
@@ -1556,7 +1565,7 @@ class ConductorManager(base_manager.BaseConductorManager):
         :raises: InvalidParameterValue
         :raises: InvalidStateRequested
         :raises: NoFreeConductorWorker
-
+        :raises: NodeInMaintenance
         """
         with task_manager.acquire(context, node_id, shared=False,
                                   purpose='provision action %s'
@@ -1564,6 +1573,11 @@ class ConductorManager(base_manager.BaseConductorManager):
             node = task.node
             if (action == states.VERBS['provide']
                     and node.provision_state == states.MANAGEABLE):
+                # NOTE(dtantsur): do this early to avoid entering cleaning.
+                if (not CONF.conductor.allow_provisioning_in_maintenance
+                        and node.maintenance):
+                    raise exception.NodeInMaintenance(op=_('providing'),
+                                                      node=node.uuid)
                 task.process_event(
                     'provide',
                     callback=self._spawn_worker,
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 4cea2f2c75..51e1bf6ea3 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -416,7 +416,9 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
     # for automated cleaning, it is AVAILABLE.
     manual_clean = node.target_provision_state == states.MANAGEABLE
     node.last_error = msg
-    node.maintenance_reason = msg
+    # NOTE(dtantsur): avoid overwriting existing maintenance_reason
+    if not node.maintenance_reason:
+        node.maintenance_reason = msg
     node.save()
 
     if set_fail_state and node.provision_state != states.CLEANFAIL:
diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py
index 1dd21355b8..3ed0fc4f45 100644
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -178,6 +178,18 @@ opts = [
                        'longer. In an environment where all tenants are '
                        'trusted (eg, because there is only one tenant), '
                        'this option could be safely disabled.')),
+    cfg.BoolOpt('allow_provisioning_in_maintenance',
+                default=True,
+                mutable=True,
+                help=_('Whether to allow nodes to enter or undergo deploy or '
+                       'cleaning when in maintenance mode. If this option is '
+                       'set to False, and a node enters maintenance during '
+                       'deploy or cleaning, the process will be aborted '
+                       'after the next heartbeat. Automated cleaning or '
+                       'making a node available will also fail. If True '
+                       '(the default), the process will begin and will pause '
+                       'after the node starts heartbeating. Moving it from '
+                       'maintenance will make the process continue.')),
     cfg.IntOpt('clean_callback_timeout',
                default=1800,
                help=_('Timeout (seconds) to wait for a callback from the '
diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py
index 0fa2d45487..3a1c507827 100644
--- a/ironic/drivers/modules/agent_base_vendor.py
+++ b/ironic/drivers/modules/agent_base_vendor.py
@@ -296,6 +296,31 @@ class HeartbeatMixin(object):
             return FASTTRACK_HEARTBEAT_ALLOWED
         return HEARTBEAT_ALLOWED
 
+    def _heartbeat_in_maintenance(self, task):
+        node = task.node
+        if (node.provision_state in (states.CLEANING, states.CLEANWAIT)
+                and not CONF.conductor.allow_provisioning_in_maintenance):
+            LOG.error('Aborting cleaning for node %s, as it is in maintenance '
+                      'mode', node.uuid)
+            last_error = _('Cleaning aborted as node is in maintenance mode')
+            manager_utils.cleaning_error_handler(task, last_error)
+        elif (node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT)
+                and not CONF.conductor.allow_provisioning_in_maintenance):
+            LOG.error('Aborting deployment for node %s, as it is in '
+                      'maintenance mode', node.uuid)
+            last_error = _('Deploy aborted as node is in maintenance mode')
+            deploy_utils.set_failed_state(task, last_error, collect_logs=False)
+        elif (node.provision_state in (states.RESCUING, states.RESCUEWAIT)
+                and not CONF.conductor.allow_provisioning_in_maintenance):
+            LOG.error('Aborting rescuing for node %s, as it is in '
+                      'maintenance mode', node.uuid)
+            last_error = _('Rescue aborted as node is in maintenance mode')
+            manager_utils.rescuing_error_handler(task, last_error)
+        else:
+            LOG.warning('Heartbeat from node %(node)s in '
+                        'maintenance mode; not taking any action.',
+                        {'node': node.uuid})
+
     @METRICS.timer('HeartbeatMixin.heartbeat')
     def heartbeat(self, task, callback_url, agent_version):
         """Process a heartbeat.
@@ -342,20 +367,18 @@ class HeartbeatMixin(object):
                           'node as on-line.', {'node': task.node.uuid})
             return
 
+        if node.maintenance:
+            return self._heartbeat_in_maintenance(task)
+
         # Async call backs don't set error state on their own
         # TODO(jimrollenhagen) improve error messages here
         msg = _('Failed checking if deploy is done.')
         try:
-            if node.maintenance:
-                # this shouldn't happen often, but skip the rest if it does.
-                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
-                          'not taking any action.', {'node': node.uuid})
-                return
             # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
             # are currently in the core deploy.deploy step. Other deploy steps
             # may cause the agent to boot, but we should not trigger deployment
             # at that point.
-            elif node.provision_state == states.DEPLOYWAIT:
+            if node.provision_state == states.DEPLOYWAIT:
                 if self.in_core_deploy_step(task):
                     if not self.deploy_has_started(task):
                         msg = _('Node failed to deploy.')
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 1622aac180..e0211367b0 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -3161,6 +3161,31 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin,
         mock_spawn.assert_called_with(self.service,
                                       self.service._do_node_clean, mock.ANY)
 
+    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
+                autospec=True)
+    def test_do_provision_action_provide_in_maintenance(self, mock_spawn):
+        CONF.set_override('allow_provisioning_in_maintenance', False,
+                          group='conductor')
+        # test when a node is cleaned going from manageable to available
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            provision_state=states.MANAGEABLE,
+            target_provision_state=None,
+            maintenance=True)
+
+        self._start_service()
+        mock_spawn.reset_mock()
+        exc = self.assertRaises(messaging.rpc.ExpectedException,
+                                self.service.do_provisioning_action,
+                                self.context, node.uuid, 'provide')
+        # Compare true exception hidden by @messaging.expected_exceptions
+        self.assertEqual(exception.NodeInMaintenance, exc.exc_info[0])
+        node.refresh()
+        self.assertEqual(states.MANAGEABLE, node.provision_state)
+        self.assertIsNone(node.target_provision_state)
+        self.assertIsNone(node.last_error)
+        self.assertFalse(mock_spawn.called)
+
     @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
                 autospec=True)
     def test_do_provision_action_manage(self, mock_spawn):
@@ -3822,6 +3847,31 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         self.assertTrue(mock_validate.called)
         self.assertIn('clean_steps', node.driver_internal_info)
 
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.tear_down_cleaning',
+                autospec=True)
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_cleaning',
+                autospec=True)
+    def test__do_node_clean_maintenance(self, mock_prep, mock_tear_down):
+        CONF.set_override('allow_provisioning_in_maintenance', False,
+                          group='conductor')
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            provision_state=states.CLEANING,
+            target_provision_state=states.AVAILABLE,
+            maintenance=True,
+            maintenance_reason='Original reason')
+        with task_manager.acquire(
+                self.context, node.uuid, shared=False) as task:
+            self.service._do_node_clean(task)
+        node.refresh()
+        self.assertEqual(states.CLEANFAIL, node.provision_state)
+        self.assertEqual(states.AVAILABLE, node.target_provision_state)
+        self.assertIn('is not allowed', node.last_error)
+        self.assertTrue(node.maintenance)
+        self.assertEqual('Original reason', node.maintenance_reason)
+        self.assertFalse(mock_prep.called)
+        self.assertFalse(mock_tear_down.called)
+
     @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
                 autospec=True)
     @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_cleaning',
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index 0606dd5101..7653ed391e 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -986,9 +986,9 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         # strict typing of the node power state fields and would fail if passed
         # a Mock object in constructors. A task context is also required for
         # notifications.
-        power_attrs = {'power_state': states.POWER_OFF,
-                       'target_power_state': states.POWER_ON}
-        self.node.configure_mock(**power_attrs)
+        self.node.configure_mock(power_state=states.POWER_OFF,
+                                 target_power_state=states.POWER_ON,
+                                 maintenance=False, maintenance_reason=None)
         self.task.context = self.context
 
     @mock.patch.object(conductor_utils, 'LOG')
@@ -1116,7 +1116,6 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         self.assertEqual('clean failure', self.node.fault)
 
     def test_abort_on_conductor_take_over_cleaning(self):
-        self.node.maintenance = False
         self.node.provision_state = states.CLEANFAIL
         conductor_utils.abort_on_conductor_take_over(self.task)
         self.assertTrue(self.node.maintenance)
@@ -1128,7 +1127,6 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         self.node.save.assert_called_once_with()
 
     def test_abort_on_conductor_take_over_deploying(self):
-        self.node.maintenance = False
         self.node.provision_state = states.DEPLOYFAIL
         conductor_utils.abort_on_conductor_take_over(self.task)
         self.assertFalse(self.node.maintenance)
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py
index 185e063600..4ac6497d45 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py
@@ -200,6 +200,44 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
                 self.assertEqual(
                     '3.2.0', task.node.driver_internal_info['agent_version'])
+                self.assertEqual(state, task.node.provision_state)
+                self.assertIsNone(task.node.last_error)
+            self.assertEqual(0, ncrc_mock.call_count)
+            self.assertEqual(0, rti_mock.call_count)
+            self.assertEqual(0, cd_mock.call_count)
+
+    @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
+                       autospec=True)
+    @mock.patch.object(agent_base_vendor.HeartbeatMixin,
+                       'reboot_to_instance', autospec=True)
+    @mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
+                       autospec=True)
+    def test_heartbeat_in_maintenance_abort(self, ncrc_mock, rti_mock,
+                                            cd_mock):
+        CONF.set_override('allow_provisioning_in_maintenance', False,
+                          group='conductor')
+        for state, expected in [(states.DEPLOYWAIT, states.DEPLOYFAIL),
+                                (states.CLEANWAIT, states.CLEANFAIL),
+                                (states.RESCUEWAIT, states.RESCUEFAIL)]:
+            for m in (ncrc_mock, rti_mock, cd_mock):
+                m.reset_mock()
+            self.node.provision_state = state
+            self.node.maintenance = True
+            self.node.save()
+            agent_url = 'url-%s' % state
+            with task_manager.acquire(self.context, self.node.uuid,
+                                      shared=True) as task:
+                self.deploy.heartbeat(task, agent_url, '3.2.0')
+                self.assertFalse(task.shared)
+                self.assertEqual(
+                    agent_url,
+                    task.node.driver_internal_info['agent_url'])
+                self.assertEqual(
+                    '3.2.0',
+                    task.node.driver_internal_info['agent_version'])
+            self.node.refresh()
+            self.assertEqual(expected, self.node.provision_state)
+            self.assertIn('aborted', self.node.last_error)
             self.assertEqual(0, ncrc_mock.call_count)
             self.assertEqual(0, rti_mock.call_count)
             self.assertEqual(0, cd_mock.call_count)
 
@@ -252,6 +290,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
             self.assertEqual(0, rti_mock.call_count)
             self.assertEqual(0, cd_mock.call_count)
 
+    @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
+                       autospec=True)
+    @mock.patch.object(agent_base_vendor.HeartbeatMixin,
+                       'reboot_to_instance', autospec=True)
+    @mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
+                       autospec=True)
+    def test_heartbeat_noops_in_wrong_state2(self, ncrc_mock, rti_mock,
+                                             cd_mock):
+        CONF.set_override('allow_provisioning_in_maintenance', False,
+                          group='conductor')
+        allowed = {states.DEPLOYWAIT, states.CLEANWAIT}
+        for state in set(states.machine.states) - allowed:
+            for m in (ncrc_mock, rti_mock, cd_mock):
+                m.reset_mock()
+            with task_manager.acquire(self.context, self.node.uuid,
+                                      shared=True) as task:
+                self.node.provision_state = state
+                self.deploy.heartbeat(task, 'url', '1.0.0')
+                self.assertTrue(task.shared)
+            self.assertEqual(0, ncrc_mock.call_count)
+            self.assertEqual(0, rti_mock.call_count)
+            self.assertEqual(0, cd_mock.call_count)
+
     @mock.patch.object(agent_base_vendor.HeartbeatMixin,
                        'in_core_deploy_step', autospec=True)
     @mock.patch.object(agent_base_vendor.HeartbeatMixin,
diff --git a/releasenotes/notes/cleaning-maintenance-7ae83b1e4ff992b0.yaml b/releasenotes/notes/cleaning-maintenance-7ae83b1e4ff992b0.yaml
new file mode 100644
index 0000000000..b5b1f71250
--- /dev/null
+++ b/releasenotes/notes/cleaning-maintenance-7ae83b1e4ff992b0.yaml
@@ -0,0 +1,10 @@
+---
+fixes:
+  - |
+    Currently ironic allows entering deployment or cleaning for nodes in
+    maintenance mode. However, heartbeats do not cause any actions for such
+    nodes, so deployment or cleaning never finishes if the nodes are not moved
+    out of maintenance. A new configuration option
+    ``[conductor]allow_provisioning_in_maintenance`` (defaulting to ``True``)
+    is added to configure this behavior. If it is set to ``False``, deployment
+    and cleaning will be prevented for nodes in maintenance mode.
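
Example (not part of the patch above): the stricter behaviour introduced here is opt-in, so a deployer who wants deploy, cleaning, and rescue blocked for nodes in maintenance would set the new option under the ``[conductor]`` group in ironic.conf. Because the option is registered with ``mutable=True``, it should be reloadable on a running conductor without a full restart (e.g. via the usual oslo.config mutable-option reload on SIGHUP), though that reload path is an assumption about the deployment, not something this patch adds. A minimal illustrative snippet:

    [conductor]
    allow_provisioning_in_maintenance = False

With this setting, ``provide`` on a node in maintenance fails immediately with ``NodeInMaintenance``, automated cleaning fails via ``cleaning_error_handler``, and a heartbeat from a node that entered maintenance mid-process aborts the operation instead of pausing it.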