Power fault recovery: apply fault

This patch implements setting and using the fault field.

For each case where maintenance is currently set to True, the fault is
now set accordingly. A periodic task is added to check the power state
of nodes that are in maintenance due to power failure; maintenance is
cleared once the power state of a node can be retrieved again.

When a node is taken out of maintenance by the user, the fault (if
any) is cleared.

Story: #1596107
Task: #10469

Change-Id: Ic4ab20af9022a2d06bdac567e7a098f3ba08570a
Partial-Bug: #1596107
Kaifeng Wang 2018-03-27 13:20:48 +08:00
parent 96aeaa8d9b
commit 0a1b165ba5
7 changed files with 314 additions and 2 deletions
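At a glance, a condensed sketch of the behaviour this commit adds, based on the
conductor changes below (the standalone helper and its framing are illustrative
only, not the actual implementation):

# Simplified illustration of the new recovery flow; module names and node
# fields are taken from the diff in this commit.
from ironic.common import faults
from ironic.common import states


def maybe_recover(task):
    node = task.node
    # Only nodes that were put into maintenance for a power failure are
    # considered, so maintenance set for any other reason is left alone.
    if not node.maintenance or node.fault != faults.POWER_FAILURE:
        return
    power_state = task.driver.power.get_power_state(task)
    if power_state == states.ERROR:
        # Still unreachable: keep the node in maintenance.
        return
    # The power state is readable again: clear maintenance, the reason and
    # the fault together.
    node.power_state = power_state
    node.maintenance = False
    node.maintenance_reason = None
    node.fault = None
    node.save()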

ironic/common/faults.py
@@ -0,0 +1,25 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Fault definitions."""
POWER_FAILURE = 'power failure'
""" Node is moved to maintenance due to power synchronization failure. """
CLEAN_FAILURE = 'clean failure'
""" Node is moved to maintenance due to failure of a cleaning
operation. """
RESCUE_ABORT_FAILURE = 'rescue abort failure'
""" Node is moved to maintenance due to failure of cleaning up during
rescue abort. """
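The rest of the patch records these constants whenever it flips a node into
maintenance. A condensed example of that pattern, mirroring the
cleaning_error_handler() change further down (the helper name here is
hypothetical):

from ironic.common import faults


def _fail_cleaning(node, msg):
    # Mirrors the conductor error handlers in this commit: keep the error
    # message, put the node into maintenance and tag why, so the fault can
    # later be inspected or cleared when the user removes maintenance.
    node.last_error = msg
    node.maintenance = True
    node.maintenance_reason = msg
    node.fault = faults.CLEAN_FAILURE
    node.save()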

ironic/conductor/manager.py

@@ -59,6 +59,7 @@ from six.moves import queue
from ironic.common import driver_factory
from ironic.common import exception
from ironic.common import faults
from ironic.common.glance_service import service_utils as glance_utils
from ironic.common.i18n import _
from ironic.common import images
@@ -164,9 +165,11 @@ class ConductorManager(base_manager.BaseConductorManager):
# NOTE(jroll) clear maintenance_reason if node.update sets
# maintenance to False for backwards compatibility, for tools
# not using the maintenance endpoint.
# NOTE(kaifeng) also clear fault when out of maintenance.
delta = node_obj.obj_what_changed()
if 'maintenance' in delta and not node_obj.maintenance:
node_obj.maintenance_reason = None
node_obj.fault = None
# TODO(dtantsur): reconsider allowing changing some (but not all)
# interfaces for active nodes in the future.
@@ -744,6 +747,7 @@ class ConductorManager(base_manager.BaseConductorManager):
node.last_error = error_msg
node.maintenance = True
node.maintenance_reason = error_msg
node.fault = faults.RESCUE_ABORT_FAILURE
node.save()
return
@@ -1564,6 +1568,96 @@ class ConductorManager(base_manager.BaseConductorManager):
# Yield on every iteration
eventlet.sleep(0)
@METRICS.timer('ConductorManager._power_failure_recovery')
@periodics.periodic(spacing=CONF.conductor.power_failure_recovery_interval,
enabled=bool(
CONF.conductor.power_failure_recovery_interval))
def _power_failure_recovery(self, context):
"""Periodic task to check power states for nodes in maintenance.
Attempt to grab a lock and sync only if the following
conditions are met:
1) Node is mapped to this conductor.
2) Node is in maintenance with maintenance type of power failure.
3) Node is not reserved.
4) Node is not in the ENROLL state.
"""
def should_sync_power_state_for_recovery(task):
"""Check if ironic should sync power state for recovery."""
# NOTE(dtantsur): it's also pointless (and dangerous) to
# sync power state when a power action is in progress
if (task.node.provision_state == states.ENROLL or
not task.node.maintenance or
task.node.fault != faults.POWER_FAILURE or
task.node.target_power_state or
task.node.reservation):
return False
return True
def handle_recovery(task, actual_power_state):
"""Handle recovery when power sync is succeeded."""
task.upgrade_lock()
node = task.node
# Update power state
old_power_state = node.power_state
node.power_state = actual_power_state
# Clear maintenance related fields
node.maintenance = False
node.maintenance_reason = None
node.fault = None
node.save()
LOG.info("Node %(node)s is recovered from power failure "
"with actual power state '%(state)s'.",
{'node': node.uuid, 'state': actual_power_state})
if old_power_state != actual_power_state:
notify_utils.emit_power_state_corrected_notification(
task, old_power_state)
# NOTE(kaifeng) To avoid conflicts with periodic task of the
# regular power state checking, maintenance is still a required
# condition.
filters = {'maintenance': True,
'fault': faults.POWER_FAILURE}
node_iter = self.iter_nodes(fields=['id'], filters=filters)
for (node_uuid, driver, node_id) in node_iter:
try:
with task_manager.acquire(context, node_uuid,
purpose='power failure recovery',
shared=True) as task:
if not should_sync_power_state_for_recovery(task):
continue
try:
# Validate driver info in case parameters were changed
# while the node was in maintenance.
task.driver.power.validate(task)
# The driver may raise an exception, or may return
# ERROR. Handle both the same way.
power_state = task.driver.power.get_power_state(task)
if power_state == states.ERROR:
raise exception.PowerStateFailure(
_("Power driver returned ERROR state "
"while trying to get power state."))
except Exception as e:
LOG.debug("During power_failure_recovery, could "
"not get power state for node %(node)s, "
"Error: %(err)s.",
{'node': task.node.uuid, 'err': e})
else:
handle_recovery(task, power_state)
except exception.NodeNotFound:
LOG.info("During power_failure_recovery, node %(node)s was "
"not found and presumed deleted by another process.",
{'node': node_uuid})
except exception.NodeLocked:
LOG.info("During power_failure_recovery, node %(node)s was "
"already locked by another process. Skip.",
{'node': node_uuid})
finally:
# Yield on every iteration
eventlet.sleep(0)
@METRICS.timer('ConductorManager._check_deploy_timeouts')
@periodics.periodic(spacing=CONF.conductor.check_provision_state_interval)
def _check_deploy_timeouts(self, context):
@@ -3391,6 +3485,7 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state,
node.last_error = msg
node.maintenance = True
node.maintenance_reason = msg
node.fault = faults.POWER_FAILURE
node.save()
if old_power_state != actual_power_state:
notify_utils.emit_power_state_corrected_notification(

ironic/conductor/utils.py

@@ -19,6 +19,7 @@ from oslo_utils import excutils
import six
from ironic.common import exception
from ironic.common import faults
from ironic.common.i18n import _
from ironic.common import network
from ironic.common import states
@@ -375,6 +376,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
node.last_error = msg
node.maintenance = True
node.maintenance_reason = msg
node.fault = faults.CLEAN_FAILURE
node.save()
if set_fail_state:

ironic/conf/conductor.py

@@ -169,6 +169,14 @@ opts = [
'complete, i.e., so that a baremetal node is in the '
'desired power state. If timed out, the power operation '
'is considered a failure.')),
cfg.IntOpt('power_failure_recovery_interval',
min=0, default=300,
help=_('Interval (in seconds) between checking the power '
'state for nodes previously put into maintenance mode '
'due to power synchronization failure. A node is '
'automatically moved out of maintenance mode once its '
'power state is retrieved successfully. Set to 0 to '
'disable this check.')),
]
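A minimal sketch of how the new option would be read, assuming ironic's usual
oslo.config setup; the comment shows the equivalent ironic.conf stanza, with
values mirroring the defaults added above:

# Equivalent deployment configuration (assumed layout):
#   [conductor]
#   power_failure_recovery_interval = 300   # seconds; 0 disables the check
from ironic.conf import CONF

interval = CONF.conductor.power_failure_recovery_interval
if interval:
    print('power failure recovery runs every %d seconds' % interval)
else:
    print('power failure recovery is disabled')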

ironic/tests/unit/conductor/test_manager.py

@@ -477,9 +477,11 @@ class UpdateNodeTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
res = self.service.update_node(self.context, node)
self.assertEqual({'test': 'two'}, res['extra'])
def test_update_node_clears_maintenance_reason(self):
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
def test_update_node_maintenance_set_false(self):
node = obj_utils.create_test_node(self.context,
driver='fake-hardware',
maintenance=True,
fault='clean failure',
maintenance_reason='reason')
# check that ManagerService.update_node actually updates the node
@@ -487,6 +489,7 @@ class UpdateNodeTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
res = self.service.update_node(self.context, node)
self.assertFalse(res['maintenance'])
self.assertIsNone(res['maintenance_reason'])
self.assertIsNone(res['fault'])
def test_update_node_already_locked(self):
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
@@ -2110,6 +2113,7 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
self.assertIsNotNone(task.node.last_error)
self.assertIsNotNone(task.node.maintenance_reason)
self.assertTrue(task.node.maintenance)
self.assertEqual('clean failure', task.node.fault)
@mgr_utils.mock_record_keepalive
@@ -3356,6 +3360,8 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
self.assertIsNotNone(task.node.last_error)
self.assertIsNotNone(task.node.maintenance_reason)
self.assertTrue(task.node.maintenance)
self.assertEqual('rescue abort failure',
task.node.fault)
@mgr_utils.mock_record_keepalive
@@ -5157,6 +5163,7 @@ class ManagerDoSyncPowerStateTestCase(db_base.DbTestCase):
self.service.power_state_sync_count[self.node.uuid])
self.assertTrue(self.node.maintenance)
self.assertIsNotNone(self.node.maintenance_reason)
self.assertEqual('power failure', self.node.fault)
def test_max_retries_exceeded2(self, node_power_action):
self.config(force_power_state_during_sync=True, group='conductor')
@@ -5176,6 +5183,7 @@ class ManagerDoSyncPowerStateTestCase(db_base.DbTestCase):
self.assertEqual(3,
self.service.power_state_sync_count[self.node.uuid])
self.assertTrue(self.node.maintenance)
self.assertEqual('power failure', self.node.fault)
@mock.patch('ironic.objects.node.NodeCorrectedPowerStateNotification')
def test_max_retries_exceeded_notify(self, mock_notif, node_power_action):
@@ -5505,6 +5513,158 @@ class ManagerSyncPowerStatesTestCase(mgr_utils.CommonMixIn,
self.assertEqual(sync_calls, sync_mock.call_args_list)
@mock.patch.object(task_manager, 'acquire')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
class ManagerPowerRecoveryTestCase(mgr_utils.CommonMixIn,
db_base.DbTestCase):
def setUp(self):
super(ManagerPowerRecoveryTestCase, self).setUp()
self.service = manager.ConductorManager('hostname', 'test-topic')
self.service.dbapi = self.dbapi
self.driver = mock.Mock(spec_set=drivers_base.BaseDriver)
self.power = self.driver.power
self.task = mock.Mock(spec_set=['context', 'driver', 'node',
'upgrade_lock', 'shared'])
self.node = self._create_node(maintenance=True,
fault='power failure',
maintenance_reason='Unreachable BMC')
self.task.node = self.node
self.task.driver = self.driver
self.filters = {'maintenance': True,
'fault': 'power failure'}
self.columns = ['uuid', 'driver', 'id']
def test_node_not_mapped(self, get_nodeinfo_mock,
mapped_mock, acquire_mock):
get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
mapped_mock.return_value = False
self.service._power_failure_recovery(self.context)
get_nodeinfo_mock.assert_called_once_with(
columns=self.columns, filters=self.filters)
mapped_mock.assert_called_once_with(self.node.uuid,
self.node.driver)
self.assertFalse(acquire_mock.called)
self.assertFalse(self.power.validate.called)
def _power_failure_recovery(self, node_dict, get_nodeinfo_mock,
mapped_mock, acquire_mock):
get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
mapped_mock.return_value = True
task = self._create_task(node_attrs=node_dict)
acquire_mock.side_effect = self._get_acquire_side_effect(task)
self.service._power_failure_recovery(self.context)
get_nodeinfo_mock.assert_called_once_with(
columns=self.columns, filters=self.filters)
mapped_mock.assert_called_once_with(self.node.uuid,
self.node.driver)
acquire_mock.assert_called_once_with(self.context, self.node.uuid,
purpose=mock.ANY,
shared=True)
self.assertFalse(self.power.validate.called)
def test_node_locked_on_acquire(self, get_nodeinfo_mock, mapped_mock,
acquire_mock):
node_dict = dict(reservation='host1', uuid=self.node.uuid)
self._power_failure_recovery(node_dict, get_nodeinfo_mock,
mapped_mock, acquire_mock)
def test_node_in_enroll_on_acquire(self, get_nodeinfo_mock, mapped_mock,
acquire_mock):
node_dict = dict(provision_state=states.ENROLL,
target_provision_state=states.NOSTATE,
maintenance=True, uuid=self.node.uuid)
self._power_failure_recovery(node_dict, get_nodeinfo_mock,
mapped_mock, acquire_mock)
def test_node_in_power_transition_on_acquire(self, get_nodeinfo_mock,
mapped_mock, acquire_mock):
node_dict = dict(target_power_state=states.POWER_ON,
maintenance=True, uuid=self.node.uuid)
self._power_failure_recovery(node_dict, get_nodeinfo_mock,
mapped_mock, acquire_mock)
def test_node_not_in_maintenance_on_acquire(self, get_nodeinfo_mock,
mapped_mock, acquire_mock):
node_dict = dict(maintenance=False, uuid=self.node.uuid)
self._power_failure_recovery(node_dict, get_nodeinfo_mock,
mapped_mock, acquire_mock)
def test_node_disappears_on_acquire(self, get_nodeinfo_mock,
mapped_mock, acquire_mock):
get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
mapped_mock.return_value = True
acquire_mock.side_effect = exception.NodeNotFound(node=self.node.uuid,
host='fake')
self.service._power_failure_recovery(self.context)
get_nodeinfo_mock.assert_called_once_with(
columns=self.columns, filters=self.filters)
mapped_mock.assert_called_once_with(self.node.uuid,
self.node.driver)
acquire_mock.assert_called_once_with(self.context, self.node.uuid,
purpose=mock.ANY,
shared=True)
self.assertFalse(self.power.validate.called)
@mock.patch.object(notification_utils,
'emit_power_state_corrected_notification')
def test_node_recovery_success(self, notify_mock, get_nodeinfo_mock,
mapped_mock, acquire_mock):
self.node.power_state = states.POWER_ON
get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
mapped_mock.return_value = True
acquire_mock.side_effect = self._get_acquire_side_effect(self.task)
self.power.get_power_state.return_value = states.POWER_OFF
self.service._power_failure_recovery(self.context)
get_nodeinfo_mock.assert_called_once_with(
columns=self.columns, filters=self.filters)
mapped_mock.assert_called_once_with(self.node.uuid,
self.node.driver)
acquire_mock.assert_called_once_with(self.context, self.node.uuid,
purpose=mock.ANY,
shared=True)
self.power.validate.assert_called_once_with(self.task)
self.power.get_power_state.assert_called_once_with(self.task)
self.task.upgrade_lock.assert_called_once_with()
self.assertFalse(self.node.maintenance)
self.assertIsNone(self.node.fault)
self.assertIsNone(self.node.maintenance_reason)
self.assertEqual(states.POWER_OFF, self.node.power_state)
notify_mock.assert_called_once_with(self.task, states.POWER_ON)
def test_node_recovery_failed(self, get_nodeinfo_mock,
mapped_mock, acquire_mock):
get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
mapped_mock.return_value = True
acquire_mock.side_effect = self._get_acquire_side_effect(self.task)
self.power.get_power_state.return_value = states.ERROR
self.service._power_failure_recovery(self.context)
get_nodeinfo_mock.assert_called_once_with(
columns=self.columns, filters=self.filters)
mapped_mock.assert_called_once_with(self.node.uuid,
self.node.driver)
acquire_mock.assert_called_once_with(self.context, self.node.uuid,
purpose=mock.ANY,
shared=True)
self.power.validate.assert_called_once_with(self.task)
self.power.get_power_state.assert_called_once_with(self.task)
self.assertFalse(self.task.upgrade_lock.called)
self.assertTrue(self.node.maintenance)
self.assertEqual('power failure', self.node.fault)
self.assertEqual('Unreachable BMC', self.node.maintenance_reason)
@mock.patch.object(task_manager, 'acquire')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')

ironic/tests/unit/conductor/test_utils.py

@@ -1182,6 +1182,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
target_state=None)
self.assertTrue(self.node.maintenance)
self.assertEqual(clean_error, self.node.maintenance_reason)
self.assertEqual('clean failure', self.node.fault)
def test_cleaning_error_handler(self):
self.node.provision_state = states.CLEANING
@@ -1196,6 +1197,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.assertEqual(msg, self.node.last_error)
self.assertTrue(self.node.maintenance)
self.assertEqual(msg, self.node.maintenance_reason)
self.assertEqual('clean failure', self.node.fault)
driver = self.task.driver.deploy
driver.tear_down_cleaning.assert_called_once_with(self.task)
self.task.process_event.assert_called_once_with('fail',
@@ -1238,6 +1240,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.assertTrue(log_mock.exception.called)
self.assertIn(msg, self.node.last_error)
self.assertIn(msg, self.node.maintenance_reason)
self.assertEqual('clean failure', self.node.fault)
def test_abort_on_conductor_take_over_cleaning(self):
self.node.maintenance = False
@@ -1246,6 +1249,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.assertTrue(self.node.maintenance)
self.assertIn('take over', self.node.maintenance_reason)
self.assertIn('take over', self.node.last_error)
self.assertEqual('clean failure', self.node.fault)
self.task.driver.deploy.tear_down_cleaning.assert_called_once_with(
self.task)
self.node.save.assert_called_once_with()

New release note (under releasenotes/notes/)

@@ -0,0 +1,18 @@
---
features:
  - |
    Adds power failure recovery to ironic. For nodes that ironic had put into
    maintenance mode due to power failure, ironic periodically checks their
    power state, and moves them out of maintenance mode when the power state
    can be retrieved. The interval of this check is configured via the
    ``[conductor]power_failure_recovery_interval`` configuration option; the
    default value is 300 seconds. Set it to 0 to disable this behavior.
upgrade:
  - |
    Power failure recovery introduces a new configuration option
    ``[conductor]power_failure_recovery_interval``, which is enabled and set
    to 300 seconds by default. If the default value is not suitable for the
    needs or scale of a deployment, adjust it or disable it during the
    upgrade. Power failure recovery does not apply to nodes that were in
    maintenance mode due to power failure before the upgrade; they have to
    be moved out of maintenance mode manually.
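As a hypothetical illustration of that last point (not part of this patch):
nodes that entered maintenance because of power failures before the upgrade
carry no fault value, so the new periodic task skips them. They can be
released through the normal maintenance API, for example with
python-ironicclient, where an already-authenticated client object named
ironic is assumed:

# Hypothetical post-upgrade cleanup sketch; adjust the reason matching to
# whatever the deployment recorded in maintenance_reason.
for node in ironic.node.list(maintenance=True,
                             fields=['uuid', 'maintenance_reason']):
    reason = (node.maintenance_reason or '').lower()
    if 'power' in reason:
        ironic.node.set_maintenance(node.uuid, False)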