diff --git a/ironic_python_agent/config.py b/ironic_python_agent/config.py index 9251a3e37..d81e67c43 100644 --- a/ironic_python_agent/config.py +++ b/ironic_python_agent/config.py @@ -326,6 +326,11 @@ cli_opts = [ 'cleaning from inadvertently destroying a running ' 'cluster which may be visible over a storage fabric ' 'such as FibreChannel.'), + cfg.BoolOpt('config_drive_rebuild', + default=False, + help='If the agent should rebuild the configuration drive ' + 'using a local filesystem, instead of letting Ironic ' + 'determine if this action is necessary.'), ] CONF.register_cli_opts(cli_opts) diff --git a/ironic_python_agent/partition_utils.py b/ironic_python_agent/partition_utils.py index cb8225fbc..ab933e03f 100644 --- a/ironic_python_agent/partition_utils.py +++ b/ironic_python_agent/partition_utils.py @@ -465,12 +465,27 @@ def create_config_drive_partition(node_uuid, device, configdrive): {'part': config_drive_part, 'node': node_uuid}) utils.execute('test', '-e', config_drive_part, attempts=15, delay_on_retry=True) - - disk_utils.dd(confdrive_file, config_drive_part) + if not CONF.config_drive_rebuild: + disk_utils.dd(confdrive_file, config_drive_part) + if not _does_config_drive_work(config_drive_part): + # If we have reached this point, we might have an + # invalid configuration drive, OR the block device + # layer doesn't support 2K block Logical IO (iso9660) + _try_build_fat32_config_drive(config_drive_part, + confdrive_file) + else: + LOG.info('Extracting configuration drive to write copy to disk.') + _try_build_fat32_config_drive(config_drive_part, confdrive_file) LOG.info("Configdrive for node %(node)s successfully " "copied onto partition %(part)s", {'node': node_uuid, 'part': config_drive_part}) + except exception.InstanceDeployFailure: + # Since we no longer have a final action on the decorator, we need + # to catch the failure, and still perform the cleanup. + if confdrive_file: + utils.unlink_without_raise(confdrive_file) + raise except (processutils.UnknownArgumentError, processutils.ProcessExecutionError, OSError) as e: msg = ('Failed to create config drive on disk %(disk)s ' @@ -478,13 +493,90 @@ def create_config_drive_partition(node_uuid, device, configdrive): {'disk': device, 'node': node_uuid, 'error': e}) LOG.error(msg) raise exception.InstanceDeployFailure(msg) - finally: # If the configdrive was requested make sure we delete the file # after copying the content to the partition + + finally: if confdrive_file: utils.unlink_without_raise(confdrive_file) +def _does_config_drive_work(config_drive_part): + """Attempts to mount the config drive to validate it works. + + :param config_drive_part: The partition to which the configuration drive + was written. + :returns: True if we were able to mount the configuration drive partition. + """ + temp_folder = tempfile.mkdtemp() + try: + # Why: If the filesystem is ISO9660 or vfat, and the logical sector + # size which is supported is *not* something which supports 512 bytes, + # i.e. a 4k Block size, then ISO9660 just will not work. Vfat also + # will not work because the logical size needs to match the logical + # size which is usable. If the underlying driver cannot use that size, + # then the filesystem will not work and cannot be updated because + # structurally it is incompaible with the block device driver. + utils.execute('mount', '-o', 'ro', '-t', 'auto', config_drive_part, + temp_folder) + utils.execute('umount', temp_folder) + except (processutils.ProcessExecutionError, OSError) as e: + LOG.error('Encountered issue attempting to validate the ' + 'supplied configuration drive. Error: %s', e) + return False + finally: + utils.unlink_without_raise(temp_folder) + + return True + + +def _try_build_fat32_config_drive(partition, confdrive_file): + conf_drive_temp = tempfile.mkdtemp() + try: + utils.execute('mount', '-o', 'loop,ro', '-t', 'auto', + confdrive_file, conf_drive_temp) + except (processutils.ProcessExecutionError, OSError) as e: + # Config drive is invalid, at least to our point of view. + # Bailing. + LOG.warning('We were unable to examine the configuration drive, ' + 'bypassing. Error: %s', e) + return + + new_drive_temp = tempfile.mkdtemp() + try: + # While creating a config drive file from scratch or on + # a loopback will likely result in a 512 byte sector size, + # the underlying fat filesystem utilities *automatically* + # check the device block sector sizing. This *will* break + # above 4k blocks, or at least might. Officially, 4k is the + # *maximum* in the FAT standard. See: + # https://github.com/dosfstools/dosfstools/blame/c483196dd46eab22abba756cef511d36f5f42070/src/mkfs.fat.c#L1987 + utils.mkfs(fs='vfat', path=partition, label='CONFIG-2') + utils.execute('mount', '-t', 'auto', partition, new_drive_temp) + # copytree, using copy2, copies everything in the source folder + # into the destination folder, so we should be good, and metadata + # is attempted to be preserved. + shutil.copytree(conf_drive_temp, new_drive_temp, dirs_exist_ok=True) + except (processutils.ProcessExecutionError, OSError) as e: + # We failed to make the filesystem :( + # This is a fairly hard error as we could not use the + # config drive, nor could we recover the state. + LOG.error('We were unable to make a new filesystem for the ' + 'configuration drive. Error: %s', e) + msg = ('A failure occured while attempting to format, copy, and ' + 're-create the configuration drive in a structure which ' + 'is compatible with the underlying hardware and Operating ' + 'System. Due to the nature of configuration drive, it could ' + 'have been incorrectly formatted. Operator investigation is ' + 'required. Error: {}'.format(str(e))) + raise exception.InstanceDeployFailure(msg) + finally: + utils.execute('umount', conf_drive_temp) + utils.execute('umount', new_drive_temp) + utils.unlink_without_raise(new_drive_temp) + utils.unlink_without_raise(conf_drive_temp) + + def _is_disk_larger_than_max_size(device, node_uuid): """Check if total disk size exceeds 2TB msdos limit diff --git a/ironic_python_agent/tests/unit/test_partition_utils.py b/ironic_python_agent/tests/unit/test_partition_utils.py index 4923642b7..b9893f1bd 100644 --- a/ironic_python_agent/tests/unit/test_partition_utils.py +++ b/ironic_python_agent/tests/unit/test_partition_utils.py @@ -20,6 +20,7 @@ from ironic_lib import disk_utils from ironic_lib import exception from ironic_lib import utils from oslo_concurrency import processutils +from oslo_config import cfg import requests from ironic_python_agent import errors @@ -28,6 +29,9 @@ from ironic_python_agent import partition_utils from ironic_python_agent.tests.unit import base +CONF = cfg.CONF + + @mock.patch.object(shutil, 'copyfileobj', autospec=True) @mock.patch.object(requests, 'get', autospec=True) class GetConfigdriveTestCase(base.IronicAgentTest): @@ -649,7 +653,10 @@ class CreateConfigDriveTestCases(base.IronicAgentTest): self.config_part_label, self.node_uuid) self.assertFalse(mock_list_partitions.called) - self.assertFalse(mock_execute.called) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'ro', '-t', 'auto', + '/dev/fake-part1', mock.ANY), + mock.call('umount', mock.ANY)]) self.assertFalse(mock_table_type.called) mock_dd.assert_called_with(configdrive_file, configdrive_part) mock_unlink.assert_called_with(configdrive_file) @@ -706,6 +713,135 @@ class CreateConfigDriveTestCases(base.IronicAgentTest): mock_dd.assert_called_with(configdrive_file, expected_part) mock_unlink.assert_called_with(configdrive_file) + @mock.patch('oslo_utils.uuidutils.generate_uuid', lambda: 'fake-uuid') + @mock.patch.object(partition_utils, '_try_build_fat32_config_drive', + autospec=True) + @mock.patch.object(partition_utils, '_does_config_drive_work', + autospec=True) + @mock.patch.object(utils, 'execute', autospec=True) + @mock.patch.object(utils, 'unlink_without_raise', + autospec=True) + @mock.patch.object(disk_utils, 'dd', + autospec=True) + @mock.patch.object(disk_utils, 'fix_gpt_partition', + autospec=True) + @mock.patch.object(disk_utils, 'get_partition_table_type', + autospec=True) + @mock.patch.object(partition_utils, 'get_partition', + autospec=True) + @mock.patch.object(partition_utils, 'get_labelled_partition', + autospec=True) + @mock.patch.object(partition_utils, 'get_configdrive', + autospec=True) + def test_create_partition_gpt_with_fallback( + self, mock_get_configdrive, + mock_get_labelled_partition, + mock_get_partition_by_uuid, + mock_table_type, + mock_fix_gpt_partition, + mock_dd, mock_unlink, mock_execute, + mock_config_drive_work, + mock_rebuild_config_drive): + config_url = 'http://1.2.3.4/cd' + configdrive_file = '/tmp/xyz' + configdrive_mb = 10 + + mock_get_configdrive.return_value = (configdrive_mb, configdrive_file) + mock_get_labelled_partition.return_value = None + + mock_table_type.return_value = 'gpt' + expected_part = '/dev/fake4' + mock_get_partition_by_uuid.return_value = expected_part + mock_config_drive_work.return_value = False + + partition_utils.create_config_drive_partition(self.node_uuid, self.dev, + config_url) + mock_execute.assert_has_calls([ + mock.call('sgdisk', '-n', '0:-64MB:0', '-u', '0:fake-uuid', + self.dev, run_as_root=True), + mock.call('sync'), + mock.call('udevadm', 'settle'), + mock.call('partprobe', self.dev, attempts=10, run_as_root=True), + mock.call('sgdisk', '-v', self.dev, run_as_root=True), + + mock.call('udevadm', 'settle'), + mock.call('test', '-e', expected_part, attempts=15, + delay_on_retry=True) + ]) + + mock_table_type.assert_called_with(self.dev) + mock_fix_gpt_partition.assert_called_with(self.dev, self.node_uuid) + mock_dd.assert_called_with(configdrive_file, expected_part) + mock_unlink.assert_called_with(configdrive_file) + mock_config_drive_work.assert_called_once_with(expected_part) + mock_rebuild_config_drive.assert_called_once_with(expected_part, + configdrive_file) + + @mock.patch('oslo_utils.uuidutils.generate_uuid', lambda: 'fake-uuid') + @mock.patch.object(partition_utils, '_try_build_fat32_config_drive', + autospec=True) + @mock.patch.object(partition_utils, '_does_config_drive_work', + autospec=True) + @mock.patch.object(utils, 'execute', autospec=True) + @mock.patch.object(utils, 'unlink_without_raise', + autospec=True) + @mock.patch.object(disk_utils, 'dd', + autospec=True) + @mock.patch.object(disk_utils, 'fix_gpt_partition', + autospec=True) + @mock.patch.object(disk_utils, 'get_partition_table_type', + autospec=True) + @mock.patch.object(partition_utils, 'get_partition', + autospec=True) + @mock.patch.object(partition_utils, 'get_labelled_partition', + autospec=True) + @mock.patch.object(partition_utils, 'get_configdrive', + autospec=True) + def test_create_partition_gpt_use_vfat( + self, mock_get_configdrive, + mock_get_labelled_partition, + mock_get_partition_by_uuid, + mock_table_type, + mock_fix_gpt_partition, + mock_dd, mock_unlink, mock_execute, + mock_config_drive_work, + mock_rebuild_config_drive): + config_url = 'http://1.2.3.4/cd' + configdrive_file = '/tmp/xyz' + configdrive_mb = 10 + + CONF.set_override('config_drive_rebuild', True) + mock_get_configdrive.return_value = (configdrive_mb, configdrive_file) + mock_get_labelled_partition.return_value = None + + mock_table_type.return_value = 'gpt' + expected_part = '/dev/fake4' + mock_get_partition_by_uuid.return_value = expected_part + mock_config_drive_work.return_value = True + + partition_utils.create_config_drive_partition(self.node_uuid, self.dev, + config_url) + mock_execute.assert_has_calls([ + mock.call('sgdisk', '-n', '0:-64MB:0', '-u', '0:fake-uuid', + self.dev, run_as_root=True), + mock.call('sync'), + mock.call('udevadm', 'settle'), + mock.call('partprobe', self.dev, attempts=10, run_as_root=True), + mock.call('sgdisk', '-v', self.dev, run_as_root=True), + + mock.call('udevadm', 'settle'), + mock.call('test', '-e', expected_part, attempts=15, + delay_on_retry=True) + ]) + + mock_table_type.assert_called_with(self.dev) + mock_fix_gpt_partition.assert_called_with(self.dev, self.node_uuid) + mock_dd.assert_not_called() + mock_unlink.assert_called_with(configdrive_file) + mock_config_drive_work.assert_not_called() + mock_rebuild_config_drive.assert_called_once_with(expected_part, + configdrive_file) + @mock.patch.object(disk_utils, 'count_mbr_partitions', autospec=True) @mock.patch.object(utils, 'execute', autospec=True) @mock.patch.object(partition_utils.LOG, 'warning', autospec=True) @@ -1288,3 +1424,94 @@ class TestGetPartition(base.IronicAgentTest): mock.call('lsblk', '-PbioKNAME,UUID,PARTUUID,TYPE,LABEL', self.fake_dev)] mock_execute.assert_has_calls(expected) + + +@mock.patch.object(utils, 'execute', autospec=True) +class TestConfigDriveTestRecovery(base.IronicAgentTest): + + fake_dev = '/dev/fake' + configdrive_file = '/tmp/config-drive' + + def test__does_config_drive_work(self, mock_execute): + self.assertTrue(partition_utils._does_config_drive_work(self.fake_dev)) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'ro', '-t', 'auto', self.fake_dev, + mock.ANY), + mock.call('umount', mock.ANY)]) + + def test__does_config_drive_failed(self, mock_execute): + mock_execute.side_effect = processutils.ProcessExecutionError('boom') + self.assertFalse( + partition_utils._does_config_drive_work(self.fake_dev) + ) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'ro', '-t', 'auto', self.fake_dev, + mock.ANY)]) + + @mock.patch.object(shutil, 'copytree', autospec=True) + @mock.patch.object(utils, 'mkfs', autospec=True) + def test__try_build_fat32_config_drive(self, + mock_mkfs, + mock_copy, + mock_execute): + partition_utils._try_build_fat32_config_drive(self.fake_dev, + self.configdrive_file) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'loop,ro', '-t', 'auto', + self.configdrive_file, mock.ANY), + mock.call('mount', '-t', 'auto', self.fake_dev, mock.ANY), + mock.call('umount', mock.ANY), + mock.call('umount', mock.ANY), + ]) + mock_mkfs.assert_called_once_with(fs='vfat', path=self.fake_dev, + label='CONFIG-2') + # Validate we called copy as we expect, both source and destination + # are temporary folders. + mock_copy.assert_called_once_with(mock.ANY, mock.ANY, + dirs_exist_ok=True) + + @mock.patch.object(shutil, 'copytree', autospec=True) + @mock.patch.object(utils, 'mkfs', autospec=True) + def test__try_build_fat32_config_drive_graceful_fail( + self, + mock_mkfs, + mock_copy, + mock_execute): + mock_execute.side_effect = processutils.ProcessExecutionError('boom') + self.assertIsNone( + partition_utils._try_build_fat32_config_drive( + self.fake_dev, + self.configdrive_file) + ) + mock_execute.assert_called_once_with( + 'mount', '-o', 'loop,ro', '-t', 'auto', + self.configdrive_file, mock.ANY) + mock_mkfs.assert_not_called() + # Validate we called copy as we expect, both source and destination + # are temporary folders. + mock_copy.assert_not_called() + + @mock.patch.object(shutil, 'copytree', autospec=True) + @mock.patch.object(utils, 'mkfs', autospec=True) + def test__try_build_fat32_config_drive_fails_once_invalid( + self, + mock_mkfs, + mock_copy, + mock_execute): + mock_mkfs.side_effect = processutils.ProcessExecutionError('boom') + self.assertRaisesRegex( + exception.InstanceDeployFailure, + 'A failure occured while attempting to format.*', + partition_utils._try_build_fat32_config_drive, + self.fake_dev, + self.configdrive_file) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'loop,ro', '-t', 'auto', + self.configdrive_file, mock.ANY), + mock.call('umount', mock.ANY), + mock.call('umount', mock.ANY), + ]) + + mock_mkfs.assert_called_once_with(fs='vfat', path=self.fake_dev, + label='CONFIG-2') + mock_copy.assert_not_called() diff --git a/releasenotes/notes/4k-block-size-config-drives-4470828dd06d2600.yaml b/releasenotes/notes/4k-block-size-config-drives-4470828dd06d2600.yaml new file mode 100644 index 000000000..7e65348c6 --- /dev/null +++ b/releasenotes/notes/4k-block-size-config-drives-4470828dd06d2600.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + Fixes a failure case where a deployed instance may be unable to access + the configuration drive post-deployment. This can occur when block + devices only support 4KB IO interactions. When 4KB block IO sizes + are in use, the ISO9660 filesystem driver in Linux cannot be used + as it is modeled around a 2KB block. We now attempt to verify, and + rebuild the configuration drive on a FAT filesystem when we cannot + mount the supplied configuration drive. Operators can force the agent + to write configuration drives using the FAT filesystem using the + ``[DEFAULT]config_drive_rebuild`` option.