summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Zhenguo Niu <Niu.ZGlinux@gmail.com> 2016-08-02 20:24:54 +0800
committer: Dmitry Tantsur <divius.inside@gmail.com> 2018-02-20 09:04:07 +0000
commit: 8ad8c874d208e2c80be05bc64afe67d9a9c7a9ec (patch)
tree: d1b9f16eb7a41605cd214019a8eb73c9b16667c8
parent: 504a67b46d883545c320847aef6da96f6fa2c607 (diff)
Clean nodes stuck in CLEANING state when ir-cond restarts (branch: stable/pike)
When a conductor managing a node dies abruptly mid cleaning, the node will get stuck in the CLEANING state. This also moves _start_service() before creating CLEANING nodes in tests. Finally, it adds autospec to a few places where the tests fail in a mysterious way otherwise. Change-Id: Ia7bce4dff57569707de4fcf3002eac241a5aa85b Co-Authored-By: Dmitry Tantsur <dtantsur@redhat.com> Partial-Bug: #1651092 (cherry picked from commit 2921fe685d8f096717f8795494c1032025407fe4)
Notes
Notes (review): Code-Review+2: Ruby Loo <ruby.loo@intel.com> Workflow+1: Ruby Loo <ruby.loo@intel.com> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Wed, 21 Feb 2018 01:26:34 +0000 Reviewed-on: https://review.openstack.org/546083 Project: openstack/ironic Branch: refs/heads/stable/pike
-rw-r--r-- ironic/conductor/base_manager.py 37
-rw-r--r-- ironic/tests/unit/conductor/test_base_manager.py 6
-rw-r--r-- ironic/tests/unit/conductor/test_manager.py 57
-rw-r--r-- releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml 5
4 files changed, 64 insertions, 41 deletions
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 34319ba..a75a516 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -223,21 +223,14 @@ class BaseConductorManager(object):
223 self._periodic_tasks_worker.add_done_callback( 223 self._periodic_tasks_worker.add_done_callback(
224 self._on_periodic_tasks_stop) 224 self._on_periodic_tasks_stop)
225 225
226 # NOTE(lucasagomes): If the conductor server dies abruptly 226 self._fail_transient_state(
227 # mid deployment (OMM Killer, power outage, etc...) we 227 states.DEPLOYING,
228 # can not resume the deployment even if the conductor 228 _("The deployment can't be resumed by conductor "
229 # comes back online. Cleaning the reservation of the nodes 229 "%s. Moving to fail state.") % self.host)
230 # (dbapi.clear_node_reservations_for_conductor) is not enough to 230 self._fail_transient_state(
231 # unstick it, so let's gracefully fail the deployment so the node 231 states.CLEANING,
232 # can go through the steps (deleting & cleaning) to make itself 232 _("The cleaning can't be resumed by conductor "
233 # available again. 233 "%s. Moving to fail state.") % self.host)
234 filters = {'reserved': False,
235 'provision_state': states.DEPLOYING}
236 last_error = (_("The deployment can't be resumed by conductor "
237 "%s. Moving to fail state.") % self.host)
238 self._fail_if_in_state(ironic_context.get_admin_context(), filters,
239 states.DEPLOYING, 'provision_updated_at',
240 last_error=last_error)
241 234
242 # Start consoles if it set enabled in a greenthread. 235 # Start consoles if it set enabled in a greenthread.
243 try: 236 try:
@@ -259,6 +252,20 @@ class BaseConductorManager(object):
259 252
260 self._started = True 253 self._started = True
261 254
255 def _fail_transient_state(self, state, last_error):
256 """Apply "fail" transition to nodes in a transient state.
257
258 If the conductor server dies abruptly mid deployment or cleaning
259 (OMM Killer, power outage, etc...) we can not resume the process even
260 if the conductor comes back online. Cleaning the reservation of
261 the nodes (dbapi.clear_node_reservations_for_conductor) is not enough
262 to unstick it, so let's gracefully fail the process.
263 """
264 filters = {'reserved': False, 'provision_state': state}
265 self._fail_if_in_state(ironic_context.get_admin_context(), filters,
266 state, 'provision_updated_at',
267 last_error=last_error)
268
262 def del_host(self, deregister=True): 269 def del_host(self, deregister=True):
263 # Conductor deregistration fails if called on non-initialized 270 # Conductor deregistration fails if called on non-initialized
264 # conductor (e.g. when rpc server is unreachable). 271 # conductor (e.g. when rpc server is unreachable).
diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py
index ea2ff86..ff62fdb 100644
--- a/ironic/tests/unit/conductor/test_base_manager.py
+++ b/ironic/tests/unit/conductor/test_base_manager.py
@@ -184,7 +184,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
184 ht_mock.assert_called_once_with() 184 ht_mock.assert_called_once_with()
185 185
186 @mock.patch.object(base_manager, 'LOG') 186 @mock.patch.object(base_manager, 'LOG')
187 @mock.patch.object(base_manager.BaseConductorManager, 'del_host') 187 @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
188 autospec=True)
188 @mock.patch.object(driver_factory, 'DriverFactory') 189 @mock.patch.object(driver_factory, 'DriverFactory')
189 def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock, 190 def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock,
190 log_mock): 191 log_mock):
@@ -197,7 +198,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
197 self.assertFalse(del_mock.called) 198 self.assertFalse(del_mock.called)
198 199
199 @mock.patch.object(base_manager, 'LOG') 200 @mock.patch.object(base_manager, 'LOG')
200 @mock.patch.object(base_manager.BaseConductorManager, 'del_host') 201 @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
202 autospec=True)
201 @mock.patch.object(driver_factory, 'HardwareTypesFactory') 203 @mock.patch.object(driver_factory, 'HardwareTypesFactory')
202 def test_starts_with_only_classic_drivers(self, ht_mock, del_mock, 204 def test_starts_with_only_classic_drivers(self, ht_mock, del_mock,
203 log_mock): 205 log_mock):
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 24f467f..b743d42 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -2208,13 +2208,13 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2208 @mock.patch('ironic.drivers.modules.fake.FakePower.validate') 2208 @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
2209 def test__do_node_clean_automated_disabled(self, mock_validate): 2209 def test__do_node_clean_automated_disabled(self, mock_validate):
2210 self.config(automated_clean=False, group='conductor') 2210 self.config(automated_clean=False, group='conductor')
2211
2212 self._start_service()
2211 node = obj_utils.create_test_node( 2213 node = obj_utils.create_test_node(
2212 self.context, driver='fake', 2214 self.context, driver='fake',
2213 provision_state=states.CLEANING, 2215 provision_state=states.CLEANING,
2214 target_provision_state=states.AVAILABLE, 2216 target_provision_state=states.AVAILABLE,
2215 last_error=None) 2217 last_error=None)
2216
2217 self._start_service()
2218 with task_manager.acquire( 2218 with task_manager.acquire(
2219 self.context, node.uuid, shared=False) as task: 2219 self.context, node.uuid, shared=False) as task:
2220 self.service._do_node_clean(task) 2220 self.service._do_node_clean(task)
@@ -2326,6 +2326,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2326 else: 2326 else:
2327 tgt_prov_state = states.AVAILABLE 2327 tgt_prov_state = states.AVAILABLE
2328 driver_info = {'clean_steps': self.clean_steps} 2328 driver_info = {'clean_steps': self.clean_steps}
2329
2330 self._start_service()
2329 node = obj_utils.create_test_node( 2331 node = obj_utils.create_test_node(
2330 self.context, driver='fake', 2332 self.context, driver='fake',
2331 provision_state=states.CLEANING, 2333 provision_state=states.CLEANING,
@@ -2334,7 +2336,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2334 power_state=states.POWER_OFF, 2336 power_state=states.POWER_OFF,
2335 driver_internal_info=driver_info) 2337 driver_internal_info=driver_info)
2336 2338
2337 self._start_service()
2338 with task_manager.acquire( 2339 with task_manager.acquire(
2339 self.context, node.uuid, shared=False) as task: 2340 self.context, node.uuid, shared=False) as task:
2340 self.service._do_node_clean(task, clean_steps=clean_steps) 2341 self.service._do_node_clean(task, clean_steps=clean_steps)
@@ -2372,6 +2373,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2372 tgt_prov_state = states.AVAILABLE 2373 tgt_prov_state = states.AVAILABLE
2373 driver_internal_info['clean_steps'] = self.clean_steps 2374 driver_internal_info['clean_steps'] = self.clean_steps
2374 2375
2376 self._start_service()
2377
2375 node = obj_utils.create_test_node( 2378 node = obj_utils.create_test_node(
2376 self.context, driver='fake', 2379 self.context, driver='fake',
2377 provision_state=states.CLEANING, 2380 provision_state=states.CLEANING,
@@ -2382,8 +2385,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2382 mock_execute.return_value = return_state 2385 mock_execute.return_value = return_state
2383 expected_first_step = node.driver_internal_info['clean_steps'][0] 2386 expected_first_step = node.driver_internal_info['clean_steps'][0]
2384 2387
2385 self._start_service()
2386
2387 with task_manager.acquire( 2388 with task_manager.acquire(
2388 self.context, node.uuid, shared=False) as task: 2389 self.context, node.uuid, shared=False) as task:
2389 self.service._do_next_clean_step(task, 0) 2390 self.service._do_next_clean_step(task, 0)
@@ -2410,6 +2411,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2410 manual=False): 2411 manual=False):
2411 # Resume an in-progress cleaning after the first async step 2412 # Resume an in-progress cleaning after the first async step
2412 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE 2413 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
2414
2415 self._start_service()
2413 node = obj_utils.create_test_node( 2416 node = obj_utils.create_test_node(
2414 self.context, driver='fake', 2417 self.context, driver='fake',
2415 provision_state=states.CLEANING, 2418 provision_state=states.CLEANING,
@@ -2420,8 +2423,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2420 clean_step=self.clean_steps[0]) 2423 clean_step=self.clean_steps[0])
2421 mock_execute.return_value = return_state 2424 mock_execute.return_value = return_state
2422 2425
2423 self._start_service()
2424
2425 with task_manager.acquire( 2426 with task_manager.acquire(
2426 self.context, node.uuid, shared=False) as task: 2427 self.context, node.uuid, shared=False) as task:
2427 self.service._do_next_clean_step(task, self.next_clean_step_index) 2428 self.service._do_next_clean_step(task, self.next_clean_step_index)
@@ -2448,6 +2449,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2448 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE 2449 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
2449 info = {'clean_steps': self.clean_steps, 2450 info = {'clean_steps': self.clean_steps,
2450 'clean_step_index': len(self.clean_steps) - 1} 2451 'clean_step_index': len(self.clean_steps) - 1}
2452
2453 self._start_service()
2451 node = obj_utils.create_test_node( 2454 node = obj_utils.create_test_node(
2452 self.context, driver='fake', 2455 self.context, driver='fake',
2453 provision_state=states.CLEANING, 2456 provision_state=states.CLEANING,
@@ -2456,8 +2459,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2456 driver_internal_info=info, 2459 driver_internal_info=info,
2457 clean_step=self.clean_steps[-1]) 2460 clean_step=self.clean_steps[-1])
2458 2461
2459 self._start_service()
2460
2461 with task_manager.acquire( 2462 with task_manager.acquire(
2462 self.context, node.uuid, shared=False) as task: 2463 self.context, node.uuid, shared=False) as task:
2463 self.service._do_next_clean_step(task, None) 2464 self.service._do_next_clean_step(task, None)
@@ -2485,6 +2486,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2485 mock_power_execute, manual=False): 2486 mock_power_execute, manual=False):
2486 # Run all steps from start to finish (all synchronous) 2487 # Run all steps from start to finish (all synchronous)
2487 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE 2488 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
2489
2490 self._start_service()
2488 node = obj_utils.create_test_node( 2491 node = obj_utils.create_test_node(
2489 self.context, driver='fake', 2492 self.context, driver='fake',
2490 provision_state=states.CLEANING, 2493 provision_state=states.CLEANING,
@@ -2496,8 +2499,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2496 mock_deploy_execute.return_value = None 2499 mock_deploy_execute.return_value = None
2497 mock_power_execute.return_value = None 2500 mock_power_execute.return_value = None
2498 2501
2499 self._start_service()
2500
2501 with task_manager.acquire( 2502 with task_manager.acquire(
2502 self.context, node.uuid, shared=False) as task: 2503 self.context, node.uuid, shared=False) as task:
2503 self.service._do_next_clean_step(task, 0) 2504 self.service._do_next_clean_step(task, 0)
@@ -2530,6 +2531,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2530 manual=False): 2531 manual=False):
2531 # When a clean step fails, go to CLEANFAIL 2532 # When a clean step fails, go to CLEANFAIL
2532 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE 2533 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
2534
2535 self._start_service()
2533 node = obj_utils.create_test_node( 2536 node = obj_utils.create_test_node(
2534 self.context, driver='fake', 2537 self.context, driver='fake',
2535 provision_state=states.CLEANING, 2538 provision_state=states.CLEANING,
@@ -2540,8 +2543,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2540 clean_step={}) 2543 clean_step={})
2541 mock_execute.side_effect = Exception() 2544 mock_execute.side_effect = Exception()
2542 2545
2543 self._start_service()
2544
2545 with task_manager.acquire( 2546 with task_manager.acquire(
2546 self.context, node.uuid, shared=False) as task: 2547 self.context, node.uuid, shared=False) as task:
2547 self.service._do_next_clean_step(task, 0) 2548 self.service._do_next_clean_step(task, 0)
@@ -2575,6 +2576,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2575 self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock, 2576 self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock,
2576 manual=True): 2577 manual=True):
2577 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE 2578 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
2579
2580 self._start_service()
2578 node = obj_utils.create_test_node( 2581 node = obj_utils.create_test_node(
2579 self.context, driver='fake', 2582 self.context, driver='fake',
2580 provision_state=states.CLEANING, 2583 provision_state=states.CLEANING,
@@ -2588,8 +2591,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2588 power_exec_mock.return_value = None 2591 power_exec_mock.return_value = None
2589 tear_mock.side_effect = Exception('boom') 2592 tear_mock.side_effect = Exception('boom')
2590 2593
2591 self._start_service()
2592
2593 with task_manager.acquire( 2594 with task_manager.acquire(
2594 self.context, node.uuid, shared=False) as task: 2595 self.context, node.uuid, shared=False) as task:
2595 self.service._do_next_clean_step(task, 0) 2596 self.service._do_next_clean_step(task, 0)
@@ -2632,6 +2633,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2632 {'clean_steps': None}): 2633 {'clean_steps': None}):
2633 # Resume where there are no steps, should be a noop 2634 # Resume where there are no steps, should be a noop
2634 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE 2635 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
2636
2637 self._start_service()
2635 node = obj_utils.create_test_node( 2638 node = obj_utils.create_test_node(
2636 self.context, driver='fake', 2639 self.context, driver='fake',
2637 uuid=uuidutils.generate_uuid(), 2640 uuid=uuidutils.generate_uuid(),
@@ -2641,8 +2644,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2641 driver_internal_info=info, 2644 driver_internal_info=info,
2642 clean_step={}) 2645 clean_step={})
2643 2646
2644 self._start_service()
2645
2646 with task_manager.acquire( 2647 with task_manager.acquire(
2647 self.context, node.uuid, shared=False) as task: 2648 self.context, node.uuid, shared=False) as task:
2648 self.service._do_next_clean_step(task, None) 2649 self.service._do_next_clean_step(task, None)
@@ -2670,6 +2671,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2670 self, deploy_exec_mock, power_exec_mock, manual=False): 2671 self, deploy_exec_mock, power_exec_mock, manual=False):
2671 # When a clean step fails, go to CLEANFAIL 2672 # When a clean step fails, go to CLEANFAIL
2672 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE 2673 tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
2674
2675 self._start_service()
2673 node = obj_utils.create_test_node( 2676 node = obj_utils.create_test_node(
2674 self.context, driver='fake', 2677 self.context, driver='fake',
2675 provision_state=states.CLEANING, 2678 provision_state=states.CLEANING,
@@ -2680,8 +2683,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
2680 clean_step={}) 2683 clean_step={})
2681 deploy_exec_mock.return_value = "foo" 2684 deploy_exec_mock.return_value = "foo"
2682 2685
2683 self._start_service()
2684
2685 with task_manager.acquire( 2686 with task_manager.acquire(
2686 self.context, node.uuid, shared=False) as task: 2687 self.context, node.uuid, shared=False) as task:
2687 self.service._do_next_clean_step(task, 0) 2688 self.service._do_next_clean_step(task, 0)
@@ -2930,11 +2931,19 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
2930 self.assertEqual([(nodes[0].uuid, 'fake', 0)], result) 2931 self.assertEqual([(nodes[0].uuid, 'fake', 0)], result)
2931 mock_nodeinfo_list.assert_called_once_with( 2932 mock_nodeinfo_list.assert_called_once_with(
2932 columns=self.columns, filters=mock.sentinel.filters) 2933 columns=self.columns, filters=mock.sentinel.filters)
2933 mock_fail_if_state.assert_called_once_with( 2934 expected_calls = [mock.call(mock.ANY, mock.ANY,
2934 mock.ANY, mock.ANY, 2935 {'provision_state': 'deploying',
2935 {'provision_state': 'deploying', 'reserved': False}, 2936 'reserved': False},
2936 'deploying', 'provision_updated_at', 2937 'deploying',
2937 last_error=mock.ANY) 2938 'provision_updated_at',
2939 last_error=mock.ANY),
2940 mock.call(mock.ANY, mock.ANY,
2941 {'provision_state': 'cleaning',
2942 'reserved': False},
2943 'cleaning',
2944 'provision_updated_at',
2945 last_error=mock.ANY)]
2946 mock_fail_if_state.assert_has_calls(expected_calls)
2938 2947
2939 @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list') 2948 @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
2940 def test_iter_nodes_shutdown(self, mock_nodeinfo_list): 2949 def test_iter_nodes_shutdown(self, mock_nodeinfo_list):
diff --git a/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml b/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml
new file mode 100644
index 0000000..09d2a01
--- /dev/null
+++ b/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml
@@ -0,0 +1,5 @@
1---
2fixes:
3 - When a conductor managing a node dies mid-cleaning the node would get stuck
4 in the CLEANING state. Now upon conductor startup nodes in the CLEANING state
5 will be moved to the CLEANFAIL state.