Add rescuewait timeout periodic task

Ensure nodes don't get stuck in rescuewait forever when
a rescue ramdisk fails to boot and start heartbeating.

Change-Id: I15a92c0f619505e25768dc2fbc1b2a796f0b38fa
Related-Bug: #1526449
Co-Authored-By: Jay Faulkner <jay@jvf.cc>
Co-Authored-By: Mario Villaplana <mario.villaplana@gmail.com>
Co-Authored-By: Jesse J. Cook <jesse.j.cook@member.fsf.org>
Co-Authored-By: Aparna <aparnavtce@gmail.com>
Co-Authored-By: Shivanand Tendulker <stendulker@gmail.com>
Jay Faulkner <jay@jvf.cc> authored 2016-08-09 16:09:02 -07:00; committed by Shivanand Tendulker
parent fbee0981ad
commit a9bc2e6ddf
6 changed files with 151 additions and 3 deletions

etc/ironic/ironic.conf.sample

@@ -1267,6 +1267,12 @@
# (integer value)
#check_provision_state_interval = 60

# Interval (seconds) between checks of rescue timeouts. This
# option is part of rescue feature work, which is not
# currently exposed to users. (integer value)
# Minimum value: 1
#check_rescue_state_interval = 60

# Timeout (seconds) to wait for a callback from a deploy
# ramdisk. Set to 0 to disable timeout. (integer value)
#deploy_callback_timeout = 1800
@@ -1355,6 +1361,14 @@
# disable timeout. (integer value)
#clean_callback_timeout = 1800

# Timeout (seconds) to wait for a callback from the rescue
# ramdisk. If the timeout is reached the node will be put in
# the "rescue failed" provision state. Set to 0 to disable
# timeout. This option is part of rescue feature work, which
# is not currently exposed to users. (integer value)
# Minimum value: 0
#rescue_callback_timeout = 1800

# Timeout (in seconds) of soft reboot and soft power off
# operation. This value always has to be positive. (integer
# value)

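For context on how these sample options are consumed, here is a minimal sketch of overriding them programmatically with oslo.config, in the same style the unit tests below use. The values are illustrative only, not recommendations, and this assumes Ironic's conductor option group is already registered (as it is inside the Ironic services and tests):

from oslo_config import cfg

CONF = cfg.CONF

# Check for rescue timeouts more often than the default 60 seconds.
CONF.set_override('check_rescue_state_interval', 30, group='conductor')
# Fail a node that has waited in RESCUEWAIT for more than 10 minutes.
CONF.set_override('rescue_callback_timeout', 600, group='conductor')
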
ironic/conductor/manager.py

@@ -1727,6 +1727,27 @@ class ConductorManager(base_manager.BaseConductorManager):
                               keep_target_state=True,
                               callback_method=utils.cleanup_cleanwait_timeout)

    @METRICS.timer('ConductorManager._check_rescuewait_timeouts')
    @periodics.periodic(spacing=CONF.conductor.check_rescue_state_interval,
                        enabled=bool(CONF.conductor.rescue_callback_timeout))
    def _check_rescuewait_timeouts(self, context):
        """Periodically checks if rescue has timed out waiting for heartbeat.

        If a rescue call has timed out, fail the rescue and clean up.

        :param context: request context.
        """
        callback_timeout = CONF.conductor.rescue_callback_timeout
        filters = {'reserved': False,
                   'provision_state': states.RESCUEWAIT,
                   'maintenance': False,
                   'provisioned_before': callback_timeout}
        self._fail_if_in_state(context, filters, states.RESCUEWAIT,
                               'provision_updated_at',
                               keep_target_state=True,
                               callback_method=utils.cleanup_rescuewait_timeout
                               )

    @METRICS.timer('ConductorManager._sync_local_state')
    @periodics.periodic(spacing=CONF.conductor.sync_local_state_interval)
    def _sync_local_state(self, context):

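For clarity, the 'provisioned_before' filter is the usual Ironic idiom for "provision_updated_at is older than N seconds ago"; the periodic then hands each matching RESCUEWAIT node to _fail_if_in_state, which moves it to the failure state and runs the cleanup callback. A rough sketch of the time comparison involved (the helper name is illustrative, not part of Ironic):

import datetime

def _rescuewait_timed_out(provision_updated_at, timeout_secs):
    # Matches the intent of the 'provisioned_before' filter: the node's last
    # provision-state update happened more than timeout_secs seconds ago.
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(seconds=timeout_secs)
    return provision_updated_at <= cutoff
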
ironic/conductor/utils.py

@@ -398,14 +398,48 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
    task.process_event('fail', target_state=target_state)


@task_manager.require_exclusive_lock
def cleanup_rescuewait_timeout(task):
    """Cleanup rescue task after timeout.

    :param task: a TaskManager instance.
    """
    node = task.node
    msg = _('Timeout reached while waiting for rescue ramdisk callback '
            'for node')
    errmsg = msg + ' %(node)s'
    LOG.error(errmsg, {'node': node.uuid})
    try:
        node_power_action(task, states.POWER_OFF)
        task.driver.rescue.clean_up(task)
        node.last_error = msg
        node.save()
    except Exception as e:
        if isinstance(e, exception.IronicException):
            error_msg = _('Cleanup failed for %(node_info)s after rescue '
                          'timeout: %(error)s')
            node_info = ('node')
            node.last_error = error_msg % {'node_info': node_info, 'error': e}
            node_info = ('node %s') % node.uuid
            LOG.error(error_msg, {'node_info': node_info, 'error': e})
        else:
            node.last_error = _('Rescue timed out, but an unhandled '
                                'exception was encountered while aborting. '
                                'More info may be found in the log file.')
            LOG.exception('Rescue timed out for node %(node)s, an exception '
                          'was encountered while aborting. Error: %(err)s',
                          {'node': node.uuid, 'err': e})
        node.save()


def _spawn_error_handler(e, node, state):
    """Handle spawning error for node."""
    if isinstance(e, exception.NoFreeConductorWorker):
        node.last_error = (_("No free conductor workers available"))
        node.save()
        LOG.warning("No free conductor workers available to perform "
-                    "%(state)s on node %(node)s",
-                    {'state': state, 'node': node.uuid})
+                    "%(operation)s on node %(node)s",
+                    {'operation': state, 'node': node.uuid})


def spawn_cleaning_error_handler(e, node):

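As a rough usage sketch, cleanup_rescuewait_timeout is decorated with require_exclusive_lock, so it is expected to run under an exclusive node lock, which is how _fail_if_in_state drives it. Something along these lines; the wrapper function name and purpose string are illustrative, not part of this change:

from ironic.conductor import task_manager
from ironic.conductor import utils as conductor_utils

def fail_stuck_rescuewait(context, node_uuid):
    # Acquire an exclusive (non-shared) task before invoking the cleanup,
    # mirroring how the conductor's periodic invokes the callback.
    with task_manager.acquire(context, node_uuid,
                              purpose='rescuewait timeout') as task:
        conductor_utils.cleanup_rescuewait_timeout(task)
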
ironic/conf/conductor.py

@@ -52,6 +52,12 @@ opts = [
               default=60,
               help=_('Interval between checks of provision timeouts, '
                      'in seconds.')),
    cfg.IntOpt('check_rescue_state_interval',
               default=60,
               min=1,
               help=_('Interval (seconds) between checks of rescue '
                      'timeouts. This option is part of rescue feature '
                      'work, which is not currently exposed to users.')),
    cfg.IntOpt('deploy_callback_timeout',
               default=1800,
               help=_('Timeout (seconds) to wait for a callback from '
@@ -145,6 +151,15 @@ opts = [
                      'ramdisk doing the cleaning. If the timeout is reached '
                      'the node will be put in the "clean failed" provision '
                      'state. Set to 0 to disable timeout.')),
    cfg.IntOpt('rescue_callback_timeout',
               default=1800,
               min=0,
               help=_('Timeout (seconds) to wait for a callback from the '
                      'rescue ramdisk. If the timeout is reached the node '
                      'will be put in the "rescue failed" provision state. '
                      'Set to 0 to disable timeout. This option is part of '
                      'rescue feature work, which is not currently exposed '
                      'to users.')),
    cfg.IntOpt('soft_power_off_timeout',
               default=600,
               min=1,

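Note how the two options interact: the periodic in manager.py is created with enabled=bool(CONF.conductor.rescue_callback_timeout), so setting the timeout to 0 disables the check entirely, while check_rescue_state_interval only controls how often it runs. A tiny sketch of that gating, assuming Ironic's configuration is loaded (the helper function is illustrative):

from ironic.conf import CONF

def rescue_timeout_check_enabled():
    # A rescue_callback_timeout of 0 means "never time out", which also
    # disables the _check_rescuewait_timeouts periodic task.
    return bool(CONF.conductor.rescue_callback_timeout)
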
ironic/tests/unit/conductor/test_manager.py

@@ -1610,6 +1610,32 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
    def test__check_cleanwait_timeouts_manual_clean(self):
        self._check_cleanwait_timeouts(manual=True)

    @mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up')
    @mock.patch.object(conductor_utils, 'node_power_action')
    def test_check_rescuewait_timeouts(self, node_power_mock,
                                       mock_clean_up):
        self._start_service()
        CONF.set_override('rescue_callback_timeout', 1, group='conductor')
        tgt_prov_state = states.RESCUE
        node = obj_utils.create_test_node(
            self.context, driver='fake-hardware',
            rescue_interface='fake',
            network_interface='flat',
            provision_state=states.RESCUEWAIT,
            target_provision_state=tgt_prov_state,
            provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
        self.service._check_rescuewait_timeouts(self.context)
        self._stop_service()
        node.refresh()
        self.assertEqual(states.RESCUEFAIL, node.provision_state)
        self.assertEqual(tgt_prov_state, node.target_provision_state)
        self.assertIsNotNone(node.last_error)
        self.assertIn('Timeout reached while waiting for rescue ramdisk',
                      node.last_error)
        mock_clean_up.assert_called_once_with(mock.ANY)
        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)

    def test_do_node_tear_down_invalid_state(self):
        self._start_service()
        # test node.provision_state is incorrect for tear_down

ironic/tests/unit/conductor/test_utils.py

@@ -1075,8 +1075,9 @@ class ErrorHandlersTestCase(tests_base.TestCase):
    def setUp(self):
        super(ErrorHandlersTestCase, self).setUp()
        self.task = mock.Mock(spec=task_manager.TaskManager)
-        self.task.driver = mock.Mock(spec_set=['deploy'])
+        self.task.driver = mock.Mock(spec_set=['deploy', 'network', 'rescue'])
        self.task.node = mock.Mock(spec_set=objects.Node)
        self.task.shared = False
        self.node = self.task.node
        # NOTE(mariojv) Some of the test cases that use the task below require
        # strict typing of the node power state fields and would fail if passed
@@ -1237,6 +1238,43 @@ class ErrorHandlersTestCase(tests_base.TestCase):
        self.assertFalse(self.node.save.called)
        self.assertFalse(log_mock.warning.called)

    @mock.patch.object(conductor_utils, 'LOG')
    @mock.patch.object(conductor_utils, 'node_power_action')
    def test_cleanup_rescuewait_timeout(self, node_power_mock, log_mock):
        conductor_utils.cleanup_rescuewait_timeout(self.task)
        self.assertTrue(log_mock.error.called)
        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
        self.assertIn('Timeout reached', self.node.last_error)
        self.node.save.assert_called_once_with()

    @mock.patch.object(conductor_utils, 'LOG')
    @mock.patch.object(conductor_utils, 'node_power_action')
    def test_cleanup_rescuewait_timeout_known_exc(
            self, node_power_mock, log_mock):
        clean_up_mock = self.task.driver.rescue.clean_up
        clean_up_mock.side_effect = exception.IronicException('moocow')
        conductor_utils.cleanup_rescuewait_timeout(self.task)
        self.assertEqual(2, log_mock.error.call_count)
        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
        self.assertIn('moocow', self.node.last_error)
        self.node.save.assert_called_once_with()

    @mock.patch.object(conductor_utils, 'LOG')
    @mock.patch.object(conductor_utils, 'node_power_action')
    def test_cleanup_rescuewait_timeout_unknown_exc(
            self, node_power_mock, log_mock):
        clean_up_mock = self.task.driver.rescue.clean_up
        clean_up_mock.side_effect = Exception('moocow')
        conductor_utils.cleanup_rescuewait_timeout(self.task)
        self.assertTrue(log_mock.error.called)
        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
        self.assertIn('Rescue timed out', self.node.last_error)
        self.node.save.assert_called_once_with()
        self.assertTrue(log_mock.exception.called)

class ValidatePortPhysnetTestCase(db_base.DbTestCase):