diff --git a/ironic/drivers/modules/redfish/management.py b/ironic/drivers/modules/redfish/management.py index 4026da48cb..dff790ce56 100644 --- a/ironic/drivers/modules/redfish/management.py +++ b/ironic/drivers/modules/redfish/management.py @@ -24,6 +24,7 @@ from ironic.common import components from ironic.common import exception from ironic.common.i18n import _ from ironic.common import indicator_states +from ironic.common import utils from ironic.conductor import task_manager from ironic.drivers import base from ironic.drivers.modules.redfish import utils as redfish_utils @@ -68,6 +69,42 @@ if sushy: v: k for k, v in INDICATOR_MAP.items()} +def _set_boot_device(task, system, device, enabled=None): + """An internal routine to set the boot device. + + :param task: a task from TaskManager. + :param system: a Redfish System object. + :param device: the Redfish boot device. + :param enabled: Redfish boot device persistence value or None. + :raises: SushyError on an error from the Sushy library + """ + try: + system.set_system_boot_options(device, enabled=enabled) + except sushy.exceptions.SushyError as e: + if enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS: + # NOTE(dtantsur): continuous boot device settings have been + # removed from Redfish, and some vendors stopped supporting + # it before an alternative was provided. As a work around, + # use one-time boot and restore the boot device on every + # reboot via RedfishPower. + LOG.debug('Error %(error)s when trying to set a ' + 'persistent boot device on node %(node)s, ' + 'falling back to one-time boot settings', + {'error': e, 'node': task.node.uuid}) + system.set_system_boot_options( + device, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) + LOG.warning('Could not set persistent boot device to ' + '%(dev)s for node %(node)s, using one-time ' + 'boot device instead', + {'dev': device, 'node': task.node.uuid}) + utils.set_node_nested_field( + task.node, 'driver_internal_info', + 'redfish_boot_device', device) + task.node.save() + else: + raise + + class RedfishManagement(base.ManagementInterface): def __init__(self): @@ -107,6 +144,33 @@ class RedfishManagement(base.ManagementInterface): """ return list(BOOT_DEVICE_MAP_REV) + @task_manager.require_exclusive_lock + def restore_boot_device(self, task, system): + """Restore boot device if needed. + + Checks the redfish_boot_device internal flag and sets the one-time + boot device accordingly. A warning is issued if it fails. + + This method is supposed to be called from the Redfish power interface + and should be considered private to the Redfish hardware type. + + :param task: a task from TaskManager. + :param system: a Redfish System object. + """ + device = task.node.driver_internal_info.get('redfish_boot_device') + if not device: + return + + LOG.debug('Restoring boot device %(dev)s on node %(node)s', + {'dev': device, 'node': task.node.uuid}) + try: + _set_boot_device(task, system, device) + except sushy.exceptions.SushyError as e: + LOG.warning('Unable to recover boot device %(dev)s for node ' + '%(node)s, relying on the pre-configured boot order. ' + 'Error: %(error)s', + {'dev': device, 'node': task.node.uuid, 'error': e}) + @task_manager.require_exclusive_lock def set_boot_device(self, task, device, persistent=False): """Set the boot device for a node. @@ -124,6 +188,10 @@ class RedfishManagement(base.ManagementInterface): :raises: RedfishConnectionError when it fails to connect to Redfish :raises: RedfishError on an error from the Sushy library """ + utils.pop_node_nested_field( + task.node, 'driver_internal_info', 'redfish_boot_device') + task.node.save() + system = redfish_utils.get_system(task.node) desired_persistence = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] @@ -132,11 +200,9 @@ class RedfishManagement(base.ManagementInterface): # NOTE(etingof): this can be racy, esp if BMC is not RESTful enabled = (desired_persistence if desired_persistence != current_persistence else None) - try: - system.set_system_boot_options( - BOOT_DEVICE_MAP_REV[device], enabled=enabled) - + _set_boot_device(task, system, BOOT_DEVICE_MAP_REV[device], + enabled=enabled) except sushy.exceptions.SushyError as e: error_msg = (_('Redfish set boot device failed for node ' '%(node)s. Error: %(error)s') % diff --git a/ironic/drivers/modules/redfish/power.py b/ironic/drivers/modules/redfish/power.py index cb944a4273..0a6f3f338a 100644 --- a/ironic/drivers/modules/redfish/power.py +++ b/ironic/drivers/modules/redfish/power.py @@ -22,6 +22,7 @@ from ironic.common import states from ironic.conductor import task_manager from ironic.conductor import utils as cond_utils from ironic.drivers import base +from ironic.drivers.modules.redfish import management as redfish_mgmt from ironic.drivers.modules.redfish import utils as redfish_utils LOG = log.getLogger(__name__) @@ -123,6 +124,12 @@ class RedfishPower(base.PowerInterface): :raises: RedfishError on an error from the Sushy library """ system = redfish_utils.get_system(task.node) + + if (power_state in (states.POWER_ON, states.SOFT_REBOOT, states.REBOOT) + and isinstance(task.driver.management, + redfish_mgmt.RedfishManagement)): + task.driver.management.restore_boot_device(task, system) + try: _set_power_state(task, system, power_state, timeout=timeout) except sushy.exceptions.SushyError as e: @@ -151,6 +158,10 @@ class RedfishPower(base.PowerInterface): next_state = states.POWER_OFF _set_power_state(task, system, next_state, timeout=timeout) + if isinstance(task.driver.management, + redfish_mgmt.RedfishManagement): + task.driver.management.restore_boot_device(task, system) + next_state = states.POWER_ON _set_power_state(task, system, next_state, timeout=timeout) except sushy.exceptions.SushyError as e: diff --git a/ironic/tests/unit/drivers/modules/redfish/test_management.py b/ironic/tests/unit/drivers/modules/redfish/test_management.py index c8a37951a9..694b445e53 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_management.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_management.py @@ -99,6 +99,8 @@ class RedfishManagementTestCase(db_base.DbTestCase): fake_system.set_system_boot_options.assert_called_once_with( expected, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) # Reset mocks fake_system.set_system_boot_options.reset_mock() @@ -122,6 +124,8 @@ class RedfishManagementTestCase(db_base.DbTestCase): fake_system.set_system_boot_options.assert_called_once_with( sushy.BOOT_SOURCE_TARGET_PXE, enabled=expected) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) # Reset mocks fake_system.set_system_boot_options.reset_mock() @@ -169,6 +173,82 @@ class RedfishManagementTestCase(db_base.DbTestCase): sushy.BOOT_SOURCE_TARGET_PXE, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) + + @mock.patch.object(sushy, 'Sushy', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_persistence_fallback(self, mock_get_system, + mock_sushy): + fake_system = mock.Mock() + fake_system.set_system_boot_options.side_effect = [ + sushy.exceptions.SushyError(), + None, + ] + mock_get_system.return_value = fake_system + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.management.set_boot_device( + task, boot_devices.PXE, persistent=True) + fake_system.set_system_boot_options.assert_has_calls([ + mock.call(sushy.BOOT_SOURCE_TARGET_PXE, + enabled=sushy.BOOT_SOURCE_ENABLED_CONTINUOUS), + mock.call(sushy.BOOT_SOURCE_TARGET_PXE, + enabled=sushy.BOOT_SOURCE_ENABLED_ONCE), + ]) + mock_get_system.assert_called_once_with(task.node) + + task.node.refresh() + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_PXE, + task.node.driver_internal_info['redfish_boot_device']) + + def test_restore_boot_device(self): + fake_system = mock.Mock() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.driver_internal_info['redfish_boot_device'] = ( + sushy.BOOT_SOURCE_TARGET_HDD + ) + + task.driver.management.restore_boot_device(task, fake_system) + + fake_system.set_system_boot_options.assert_called_once_with( + sushy.BOOT_SOURCE_TARGET_HDD, enabled=None) + # The stored boot device is kept intact + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_HDD, + task.node.driver_internal_info['redfish_boot_device']) + + def test_restore_boot_device_noop(self): + fake_system = mock.Mock() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.management.restore_boot_device(task, fake_system) + + self.assertFalse(fake_system.set_system_boot_options.called) + + @mock.patch.object(redfish_mgmt.LOG, 'warning', autospec=True) + def test_restore_boot_device_failure(self, mock_log): + fake_system = mock.Mock() + fake_system.set_system_boot_options.side_effect = ( + sushy.exceptions.SushyError() + ) + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.driver_internal_info['redfish_boot_device'] = ( + sushy.BOOT_SOURCE_TARGET_HDD + ) + + task.driver.management.restore_boot_device(task, fake_system) + + fake_system.set_system_boot_options.assert_called_once_with( + sushy.BOOT_SOURCE_TARGET_HDD, enabled=None) + self.assertTrue(mock_log.called) + # The stored boot device is kept intact + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_HDD, + task.node.driver_internal_info['redfish_boot_device']) @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_get_boot_device(self, mock_get_system): diff --git a/ironic/tests/unit/drivers/modules/redfish/test_power.py b/ironic/tests/unit/drivers/modules/redfish/test_power.py index 6ebe98d5c0..99f155d1e9 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_power.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_power.py @@ -19,6 +19,7 @@ from oslo_utils import importutils from ironic.common import exception from ironic.common import states from ironic.conductor import task_manager +from ironic.drivers.modules.redfish import management as redfish_mgmt from ironic.drivers.modules.redfish import power as redfish_power from ironic.drivers.modules.redfish import utils as redfish_utils from ironic.tests.unit.db import base as db_base @@ -83,19 +84,21 @@ class RedfishPowerTestCase(db_base.DbTestCase): mock_get_system.assert_called_once_with(task.node) mock_get_system.reset_mock() + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_set_power_state(self, mock_get_system): + def test_set_power_state(self, mock_get_system, mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: expected_values = [ - (states.POWER_ON, sushy.RESET_ON), - (states.POWER_OFF, sushy.RESET_FORCE_OFF), - (states.REBOOT, sushy.RESET_FORCE_RESTART), - (states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART), - (states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN) + (states.POWER_ON, sushy.RESET_ON, True), + (states.POWER_OFF, sushy.RESET_FORCE_OFF, False), + (states.REBOOT, sushy.RESET_FORCE_RESTART, True), + (states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART, True), + (states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN, False) ] - for target, expected in expected_values: + for target, expected, restore_bootdev in expected_values: if target in (states.POWER_OFF, states.SOFT_POWER_OFF): final = sushy.SYSTEM_POWER_STATE_OFF transient = sushy.SYSTEM_POWER_STATE_ON @@ -114,9 +117,15 @@ class RedfishPowerTestCase(db_base.DbTestCase): system_result[0].reset_system.assert_called_once_with(expected) mock_get_system.assert_called_with(task.node) self.assertEqual(4, mock_get_system.call_count) + if restore_bootdev: + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) + else: + self.assertFalse(mock_restore_bootdev.called) # Reset mocks mock_get_system.reset_mock() + mock_restore_bootdev.reset_mock() @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_set_power_state_not_reached(self, mock_get_system): @@ -165,8 +174,11 @@ class RedfishPowerTestCase(db_base.DbTestCase): sushy.RESET_ON) mock_get_system.assert_called_once_with(task.node) + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_reboot_from_power_off(self, mock_get_system): + def test_reboot_from_power_off(self, mock_get_system, + mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: system_result = [ @@ -186,9 +198,13 @@ class RedfishPowerTestCase(db_base.DbTestCase): sushy.RESET_ON) mock_get_system.assert_called_with(task.node) self.assertEqual(3, mock_get_system.call_count) + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_reboot_from_power_on(self, mock_get_system): + def test_reboot_from_power_on(self, mock_get_system, mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: system_result = [ @@ -210,6 +226,8 @@ class RedfishPowerTestCase(db_base.DbTestCase): ]) mock_get_system.assert_called_with(task.node) self.assertEqual(3, mock_get_system.call_count) + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_reboot_not_reached(self, mock_get_system): diff --git a/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml b/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml new file mode 100644 index 0000000000..a8cb499beb --- /dev/null +++ b/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml @@ -0,0 +1,18 @@ +--- +fixes: + - | + Provides a workaround for hardware that does not support persistent boot + device setting with the ``redfish`` hardware type. When such situation is + detected, ironic will fall back to one-time boot device setting, restoring + it on every reboot. +issues: + - | + Some redfish-enabled hardware is known not to support persistent boot + device setting that is used by the Bare Metal service for deployed + instances. The ``redfish`` hardware type tries to work around this problem, + but rebooting such an instance in-band may cause it to boot incorrectly. + A predictable boot order should be configured in the node's boot firmware + to avoid issues and at least metadata cleaning must be enabled. + See `this mailing list thread + `_ + for technical details.