Clean nodes stuck in CLEANING state when ir-cond restarts

When a conductor managing a node dies abruptly mid cleaing, the
node will get stuck in the CLEANING state.

This also moves _start_service() before creating CLEANING nodes
in tests. Finally, it adds autospec to a few places where the tests
fail in a mysterious way otherwise.

Change-Id: Ia7bce4dff57569707de4fcf3002eac241a5aa85b
Co-Authored-By: Dmitry Tantsur <dtantsur@redhat.com>
Partial-Bug: #1651092
This commit is contained in:
Zhenguo Niu 2016-08-02 20:24:54 +08:00 committed by Dmitry Tantsur
parent 5b75b6e4f2
commit 2921fe685d
4 changed files with 64 additions and 41 deletions

View File

@ -223,21 +223,14 @@ class BaseConductorManager(object):
self._periodic_tasks_worker.add_done_callback(
self._on_periodic_tasks_stop)
# NOTE(lucasagomes): If the conductor server dies abruptly
# mid deployment (OMM Killer, power outage, etc...) we
# can not resume the deployment even if the conductor
# comes back online. Cleaning the reservation of the nodes
# (dbapi.clear_node_reservations_for_conductor) is not enough to
# unstick it, so let's gracefully fail the deployment so the node
# can go through the steps (deleting & cleaning) to make itself
# available again.
filters = {'reserved': False,
'provision_state': states.DEPLOYING}
last_error = (_("The deployment can't be resumed by conductor "
"%s. Moving to fail state.") % self.host)
self._fail_if_in_state(ironic_context.get_admin_context(), filters,
states.DEPLOYING, 'provision_updated_at',
last_error=last_error)
self._fail_transient_state(
states.DEPLOYING,
_("The deployment can't be resumed by conductor "
"%s. Moving to fail state.") % self.host)
self._fail_transient_state(
states.CLEANING,
_("The cleaning can't be resumed by conductor "
"%s. Moving to fail state.") % self.host)
# Start consoles if it set enabled in a greenthread.
try:
@ -259,6 +252,20 @@ class BaseConductorManager(object):
self._started = True
def _fail_transient_state(self, state, last_error):
"""Apply "fail" transition to nodes in a transient state.
If the conductor server dies abruptly mid deployment or cleaning
(OMM Killer, power outage, etc...) we can not resume the process even
if the conductor comes back online. Cleaning the reservation of
the nodes (dbapi.clear_node_reservations_for_conductor) is not enough
to unstick it, so let's gracefully fail the process.
"""
filters = {'reserved': False, 'provision_state': state}
self._fail_if_in_state(ironic_context.get_admin_context(), filters,
state, 'provision_updated_at',
last_error=last_error)
def del_host(self, deregister=True):
# Conductor deregistration fails if called on non-initialized
# conductor (e.g. when rpc server is unreachable).

View File

@ -184,7 +184,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
ht_mock.assert_called_once_with()
@mock.patch.object(base_manager, 'LOG')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host',
autospec=True)
@mock.patch.object(driver_factory, 'DriverFactory')
def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock,
log_mock):
@ -197,7 +198,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertFalse(del_mock.called)
@mock.patch.object(base_manager, 'LOG')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host',
autospec=True)
@mock.patch.object(driver_factory, 'HardwareTypesFactory')
def test_starts_with_only_classic_drivers(self, ht_mock, del_mock,
log_mock):

View File

@ -2298,13 +2298,13 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_clean_automated_disabled(self, mock_validate):
self.config(automated_clean=False, group='conductor')
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
target_provision_state=states.AVAILABLE,
last_error=None)
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task)
@ -2416,6 +2416,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
else:
tgt_prov_state = states.AVAILABLE
driver_info = {'clean_steps': self.clean_steps}
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2424,7 +2426,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
power_state=states.POWER_OFF,
driver_internal_info=driver_info)
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task, clean_steps=clean_steps)
@ -2462,6 +2463,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
tgt_prov_state = states.AVAILABLE
driver_internal_info['clean_steps'] = self.clean_steps
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2472,8 +2475,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_execute.return_value = return_state
expected_first_step = node.driver_internal_info['clean_steps'][0]
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -2500,6 +2501,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
manual=False):
# Resume an in-progress cleaning after the first async step
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2510,8 +2513,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step=self.clean_steps[0])
mock_execute.return_value = return_state
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, self.next_clean_step_index)
@ -2538,6 +2539,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
info = {'clean_steps': self.clean_steps,
'clean_step_index': len(self.clean_steps) - 1}
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2546,8 +2549,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
driver_internal_info=info,
clean_step=self.clean_steps[-1])
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None)
@ -2575,6 +2576,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_power_execute, manual=False):
# Run all steps from start to finish (all synchronous)
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2586,8 +2589,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_deploy_execute.return_value = None
mock_power_execute.return_value = None
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -2620,6 +2621,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
manual=False):
# When a clean step fails, go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2630,8 +2633,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step={})
mock_execute.side_effect = Exception()
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -2665,6 +2666,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock,
manual=True):
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2678,8 +2681,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
power_exec_mock.return_value = None
tear_mock.side_effect = Exception('boom')
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -2722,6 +2723,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
{'clean_steps': None}):
# Resume where there are no steps, should be a noop
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
uuid=uuidutils.generate_uuid(),
@ -2731,8 +2734,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
driver_internal_info=info,
clean_step={})
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None)
@ -2760,6 +2761,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self, deploy_exec_mock, power_exec_mock, manual=False):
# When a clean step fails, go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2770,8 +2773,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step={})
deploy_exec_mock.return_value = "foo"
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -3372,11 +3373,19 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
self.assertEqual([(nodes[0].uuid, 'fake', 0)], result)
mock_nodeinfo_list.assert_called_once_with(
columns=self.columns, filters=mock.sentinel.filters)
mock_fail_if_state.assert_called_once_with(
mock.ANY, mock.ANY,
{'provision_state': 'deploying', 'reserved': False},
'deploying', 'provision_updated_at',
last_error=mock.ANY)
expected_calls = [mock.call(mock.ANY, mock.ANY,
{'provision_state': 'deploying',
'reserved': False},
'deploying',
'provision_updated_at',
last_error=mock.ANY),
mock.call(mock.ANY, mock.ANY,
{'provision_state': 'cleaning',
'reserved': False},
'cleaning',
'provision_updated_at',
last_error=mock.ANY)]
mock_fail_if_state.assert_has_calls(expected_calls)
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
def test_iter_nodes_shutdown(self, mock_nodeinfo_list):

View File

@ -0,0 +1,5 @@
---
fixes:
- When a conductor managing a node dies mid-cleaning the node would get stuck
in the CLEANING state. Now upon conductor startup nodes in the CLEANING state
will be moved to the CLEANFAIL state.