Config to stop powering off nodes on failure

This adds the config option deploy.power_off_after_deploy_failure,
which will prevent powering off nodes once a deployment fails.

Powering off nodes means loss of the log output from the agent.
Choosing to leave the machine powered on would give an operator
the opportunity to troubleshoot the issue using agent logs on the
system in case of failure, or to prevent needing to power the machine
back on for provisioning after cleaning.

As a note, Nova will still attempt to delete the node on a failed
deployment, so the option will not be very effective in non-standalone
deployments until a similar patch is put into nova.

Related-bug: 1531217
Change-Id: I7c25946de50ffda2869307cb4016e0ccfe81d192
Co-Authored-By: Josh Gachnang <josh@servercobra.com>
This commit is contained in:
Jay Faulkner 2016-03-23 16:00:37 +00:00
parent 797e6f20e0
commit 6c96257142
4 changed files with 52 additions and 12 deletions

View File

@ -919,6 +919,10 @@
# Deprecated group/name - [agent]/agent_erase_devices_iterations
#erase_devices_iterations=1
# Whether to power off a node after deploy failure. Defaults
# to True. (boolean value)
#power_off_after_deploy_failure=true
[dhcp]

View File

@ -71,6 +71,10 @@ deploy_opts = [
deprecated_group='agent',
default=1,
help=_('Number of iterations to be run for erasing devices.')),
cfg.BoolOpt('power_off_after_deploy_failure',
default=True,
help=_('Whether to power off a node after deploy failure. '
'Defaults to True.')),
]
CONF = cfg.CONF
CONF.register_opts(deploy_opts, group='deploy')
@ -492,15 +496,15 @@ def set_failed_state(task, msg):
% {'node': node.uuid, 'state': node.provision_state})
LOG.exception(msg2)
try:
manager_utils.node_power_action(task, states.POWER_OFF)
except Exception:
msg2 = (_LE('Node %s failed to power off while handling deploy '
'failure. This may be a serious condition. Node '
'should be removed from Ironic or put in maintenance '
'mode until the problem is resolved.') % node.uuid)
LOG.exception(msg2)
if CONF.deploy.power_off_after_deploy_failure:
try:
manager_utils.node_power_action(task, states.POWER_OFF)
except Exception:
msg2 = (_LE('Node %s failed to power off while handling deploy '
'failure. This may be a serious condition. Node '
'should be removed from Ironic or put in maintenance '
'mode until the problem is resolved.') % node.uuid)
LOG.exception(msg2)
# NOTE(deva): node_power_action() erases node.last_error
# so we need to set it here.
node.last_error = msg

View File

@ -1252,7 +1252,7 @@ class OtherFunctionTestCase(db_base.DbTestCase):
autospec=True)
def _test_set_failed_state(self, mock_event, mock_power, mock_log,
event_value=None, power_value=None,
log_calls=None):
log_calls=None, poweroff=True):
err_msg = 'some failure'
mock_event.side_effect = event_value
mock_power.side_effect = power_value
@ -1260,9 +1260,12 @@ class OtherFunctionTestCase(db_base.DbTestCase):
shared=False) as task:
utils.set_failed_state(task, err_msg)
mock_event.assert_called_once_with(task, 'fail')
mock_power.assert_called_once_with(task, states.POWER_OFF)
if poweroff:
mock_power.assert_called_once_with(task, states.POWER_OFF)
else:
self.assertFalse(mock_power.called)
self.assertEqual(err_msg, task.node.last_error)
if log_calls:
if (log_calls and poweroff):
mock_log.exception.assert_has_calls(log_calls)
else:
self.assertFalse(mock_log.called)
@ -1283,6 +1286,23 @@ class OtherFunctionTestCase(db_base.DbTestCase):
power_value=iter([exc_param] * len(calls)),
log_calls=calls)
def test_set_failed_state_no_poweroff(self):
cfg.CONF.deploy.power_off_after_deploy_failure = False
exc_state = exception.InvalidState('invalid state')
exc_param = exception.InvalidParameterValue('invalid parameter')
mock_call = mock.call(mock.ANY)
self._test_set_failed_state(poweroff=False)
calls = [mock_call]
self._test_set_failed_state(event_value=iter([exc_state] * len(calls)),
log_calls=calls, poweroff=False)
calls = [mock_call]
self._test_set_failed_state(power_value=iter([exc_param] * len(calls)),
log_calls=calls, poweroff=False)
calls = [mock_call, mock_call]
self._test_set_failed_state(event_value=iter([exc_state] * len(calls)),
power_value=iter([exc_param] * len(calls)),
log_calls=calls, poweroff=False)
def test_get_boot_option(self):
self.node.instance_info = {'capabilities': '{"boot_option": "local"}'}
result = utils.get_boot_option(self.node)

View File

@ -0,0 +1,12 @@
---
features:
- Operators can now set
deploy.power_off_after_deploy_failure to leave nodes
powered on when a deployment fails. This is useful
for troubleshooting deployment issues. As a note,
Nova will still attempt to delete a node after a failed
deployment, so deploy.power_off_after_deploy_failure
may not be very effective in non-standalone
deployments until a similar patch to ironic's driver in
nova is proposed.