From 5a78bf286f11c4de095b936a14ca2e69896f6a78 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Tue, 18 Jul 2023 11:01:22 -0700 Subject: [PATCH] preserve/handle config drives on 4k block devices When an underlying block device (or driver) only supports 4KB IO, this can cause some issues with aspects like using an ISO9660 filesystem which can only support a maximum of 2KB IO. The agent will now attempt to mount the filesystem *before* deleting the supplied file, and should that fail it will mount the configuration drive file from the ramdisk utilizing a loopback, and then extract the contents of the ramdisk into a newly created VFAT filesystem which supports 4KB block IO. Closes-Bug: #2028002 Change-Id: I336acb8e8eb5a02dde2f5e24c258e23797d200ee (cherry picked from commit b6c263a5dc34c10b50a480e3fdcf22784f2aa77c) --- ironic_python_agent/config.py | 5 + ironic_python_agent/partition_utils.py | 98 +++++++- .../tests/unit/test_partition_utils.py | 229 +++++++++++++++++- ...k-size-config-drives-4470828dd06d2600.yaml | 12 + 4 files changed, 340 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/4k-block-size-config-drives-4470828dd06d2600.yaml diff --git a/ironic_python_agent/config.py b/ironic_python_agent/config.py index 9251a3e37..d81e67c43 100644 --- a/ironic_python_agent/config.py +++ b/ironic_python_agent/config.py @@ -326,6 +326,11 @@ cli_opts = [ 'cleaning from inadvertently destroying a running ' 'cluster which may be visible over a storage fabric ' 'such as FibreChannel.'), + cfg.BoolOpt('config_drive_rebuild', + default=False, + help='If the agent should rebuild the configuration drive ' + 'using a local filesystem, instead of letting Ironic ' + 'determine if this action is necessary.'), ] CONF.register_cli_opts(cli_opts) diff --git a/ironic_python_agent/partition_utils.py b/ironic_python_agent/partition_utils.py index cb8225fbc..ab933e03f 100644 --- a/ironic_python_agent/partition_utils.py +++ b/ironic_python_agent/partition_utils.py @@ -465,12 +465,27 @@ def create_config_drive_partition(node_uuid, device, configdrive): {'part': config_drive_part, 'node': node_uuid}) utils.execute('test', '-e', config_drive_part, attempts=15, delay_on_retry=True) - - disk_utils.dd(confdrive_file, config_drive_part) + if not CONF.config_drive_rebuild: + disk_utils.dd(confdrive_file, config_drive_part) + if not _does_config_drive_work(config_drive_part): + # If we have reached this point, we might have an + # invalid configuration drive, OR the block device + # layer doesn't support 2K block Logical IO (iso9660) + _try_build_fat32_config_drive(config_drive_part, + confdrive_file) + else: + LOG.info('Extracting configuration drive to write copy to disk.') + _try_build_fat32_config_drive(config_drive_part, confdrive_file) LOG.info("Configdrive for node %(node)s successfully " "copied onto partition %(part)s", {'node': node_uuid, 'part': config_drive_part}) + except exception.InstanceDeployFailure: + # Since we no longer have a final action on the decorator, we need + # to catch the failure, and still perform the cleanup. + if confdrive_file: + utils.unlink_without_raise(confdrive_file) + raise except (processutils.UnknownArgumentError, processutils.ProcessExecutionError, OSError) as e: msg = ('Failed to create config drive on disk %(disk)s ' @@ -478,13 +493,90 @@ def create_config_drive_partition(node_uuid, device, configdrive): {'disk': device, 'node': node_uuid, 'error': e}) LOG.error(msg) raise exception.InstanceDeployFailure(msg) - finally: # If the configdrive was requested make sure we delete the file # after copying the content to the partition + + finally: if confdrive_file: utils.unlink_without_raise(confdrive_file) +def _does_config_drive_work(config_drive_part): + """Attempts to mount the config drive to validate it works. + + :param config_drive_part: The partition to which the configuration drive + was written. + :returns: True if we were able to mount the configuration drive partition. + """ + temp_folder = tempfile.mkdtemp() + try: + # Why: If the filesystem is ISO9660 or vfat, and the logical sector + # size which is supported is *not* something which supports 512 bytes, + # i.e. a 4k Block size, then ISO9660 just will not work. Vfat also + # will not work because the logical size needs to match the logical + # size which is usable. If the underlying driver cannot use that size, + # then the filesystem will not work and cannot be updated because + # structurally it is incompaible with the block device driver. + utils.execute('mount', '-o', 'ro', '-t', 'auto', config_drive_part, + temp_folder) + utils.execute('umount', temp_folder) + except (processutils.ProcessExecutionError, OSError) as e: + LOG.error('Encountered issue attempting to validate the ' + 'supplied configuration drive. Error: %s', e) + return False + finally: + utils.unlink_without_raise(temp_folder) + + return True + + +def _try_build_fat32_config_drive(partition, confdrive_file): + conf_drive_temp = tempfile.mkdtemp() + try: + utils.execute('mount', '-o', 'loop,ro', '-t', 'auto', + confdrive_file, conf_drive_temp) + except (processutils.ProcessExecutionError, OSError) as e: + # Config drive is invalid, at least to our point of view. + # Bailing. + LOG.warning('We were unable to examine the configuration drive, ' + 'bypassing. Error: %s', e) + return + + new_drive_temp = tempfile.mkdtemp() + try: + # While creating a config drive file from scratch or on + # a loopback will likely result in a 512 byte sector size, + # the underlying fat filesystem utilities *automatically* + # check the device block sector sizing. This *will* break + # above 4k blocks, or at least might. Officially, 4k is the + # *maximum* in the FAT standard. See: + # https://github.com/dosfstools/dosfstools/blame/c483196dd46eab22abba756cef511d36f5f42070/src/mkfs.fat.c#L1987 + utils.mkfs(fs='vfat', path=partition, label='CONFIG-2') + utils.execute('mount', '-t', 'auto', partition, new_drive_temp) + # copytree, using copy2, copies everything in the source folder + # into the destination folder, so we should be good, and metadata + # is attempted to be preserved. + shutil.copytree(conf_drive_temp, new_drive_temp, dirs_exist_ok=True) + except (processutils.ProcessExecutionError, OSError) as e: + # We failed to make the filesystem :( + # This is a fairly hard error as we could not use the + # config drive, nor could we recover the state. + LOG.error('We were unable to make a new filesystem for the ' + 'configuration drive. Error: %s', e) + msg = ('A failure occured while attempting to format, copy, and ' + 're-create the configuration drive in a structure which ' + 'is compatible with the underlying hardware and Operating ' + 'System. Due to the nature of configuration drive, it could ' + 'have been incorrectly formatted. Operator investigation is ' + 'required. Error: {}'.format(str(e))) + raise exception.InstanceDeployFailure(msg) + finally: + utils.execute('umount', conf_drive_temp) + utils.execute('umount', new_drive_temp) + utils.unlink_without_raise(new_drive_temp) + utils.unlink_without_raise(conf_drive_temp) + + def _is_disk_larger_than_max_size(device, node_uuid): """Check if total disk size exceeds 2TB msdos limit diff --git a/ironic_python_agent/tests/unit/test_partition_utils.py b/ironic_python_agent/tests/unit/test_partition_utils.py index 4923642b7..b9893f1bd 100644 --- a/ironic_python_agent/tests/unit/test_partition_utils.py +++ b/ironic_python_agent/tests/unit/test_partition_utils.py @@ -20,6 +20,7 @@ from ironic_lib import disk_utils from ironic_lib import exception from ironic_lib import utils from oslo_concurrency import processutils +from oslo_config import cfg import requests from ironic_python_agent import errors @@ -28,6 +29,9 @@ from ironic_python_agent import partition_utils from ironic_python_agent.tests.unit import base +CONF = cfg.CONF + + @mock.patch.object(shutil, 'copyfileobj', autospec=True) @mock.patch.object(requests, 'get', autospec=True) class GetConfigdriveTestCase(base.IronicAgentTest): @@ -649,7 +653,10 @@ class CreateConfigDriveTestCases(base.IronicAgentTest): self.config_part_label, self.node_uuid) self.assertFalse(mock_list_partitions.called) - self.assertFalse(mock_execute.called) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'ro', '-t', 'auto', + '/dev/fake-part1', mock.ANY), + mock.call('umount', mock.ANY)]) self.assertFalse(mock_table_type.called) mock_dd.assert_called_with(configdrive_file, configdrive_part) mock_unlink.assert_called_with(configdrive_file) @@ -706,6 +713,135 @@ class CreateConfigDriveTestCases(base.IronicAgentTest): mock_dd.assert_called_with(configdrive_file, expected_part) mock_unlink.assert_called_with(configdrive_file) + @mock.patch('oslo_utils.uuidutils.generate_uuid', lambda: 'fake-uuid') + @mock.patch.object(partition_utils, '_try_build_fat32_config_drive', + autospec=True) + @mock.patch.object(partition_utils, '_does_config_drive_work', + autospec=True) + @mock.patch.object(utils, 'execute', autospec=True) + @mock.patch.object(utils, 'unlink_without_raise', + autospec=True) + @mock.patch.object(disk_utils, 'dd', + autospec=True) + @mock.patch.object(disk_utils, 'fix_gpt_partition', + autospec=True) + @mock.patch.object(disk_utils, 'get_partition_table_type', + autospec=True) + @mock.patch.object(partition_utils, 'get_partition', + autospec=True) + @mock.patch.object(partition_utils, 'get_labelled_partition', + autospec=True) + @mock.patch.object(partition_utils, 'get_configdrive', + autospec=True) + def test_create_partition_gpt_with_fallback( + self, mock_get_configdrive, + mock_get_labelled_partition, + mock_get_partition_by_uuid, + mock_table_type, + mock_fix_gpt_partition, + mock_dd, mock_unlink, mock_execute, + mock_config_drive_work, + mock_rebuild_config_drive): + config_url = 'http://1.2.3.4/cd' + configdrive_file = '/tmp/xyz' + configdrive_mb = 10 + + mock_get_configdrive.return_value = (configdrive_mb, configdrive_file) + mock_get_labelled_partition.return_value = None + + mock_table_type.return_value = 'gpt' + expected_part = '/dev/fake4' + mock_get_partition_by_uuid.return_value = expected_part + mock_config_drive_work.return_value = False + + partition_utils.create_config_drive_partition(self.node_uuid, self.dev, + config_url) + mock_execute.assert_has_calls([ + mock.call('sgdisk', '-n', '0:-64MB:0', '-u', '0:fake-uuid', + self.dev, run_as_root=True), + mock.call('sync'), + mock.call('udevadm', 'settle'), + mock.call('partprobe', self.dev, attempts=10, run_as_root=True), + mock.call('sgdisk', '-v', self.dev, run_as_root=True), + + mock.call('udevadm', 'settle'), + mock.call('test', '-e', expected_part, attempts=15, + delay_on_retry=True) + ]) + + mock_table_type.assert_called_with(self.dev) + mock_fix_gpt_partition.assert_called_with(self.dev, self.node_uuid) + mock_dd.assert_called_with(configdrive_file, expected_part) + mock_unlink.assert_called_with(configdrive_file) + mock_config_drive_work.assert_called_once_with(expected_part) + mock_rebuild_config_drive.assert_called_once_with(expected_part, + configdrive_file) + + @mock.patch('oslo_utils.uuidutils.generate_uuid', lambda: 'fake-uuid') + @mock.patch.object(partition_utils, '_try_build_fat32_config_drive', + autospec=True) + @mock.patch.object(partition_utils, '_does_config_drive_work', + autospec=True) + @mock.patch.object(utils, 'execute', autospec=True) + @mock.patch.object(utils, 'unlink_without_raise', + autospec=True) + @mock.patch.object(disk_utils, 'dd', + autospec=True) + @mock.patch.object(disk_utils, 'fix_gpt_partition', + autospec=True) + @mock.patch.object(disk_utils, 'get_partition_table_type', + autospec=True) + @mock.patch.object(partition_utils, 'get_partition', + autospec=True) + @mock.patch.object(partition_utils, 'get_labelled_partition', + autospec=True) + @mock.patch.object(partition_utils, 'get_configdrive', + autospec=True) + def test_create_partition_gpt_use_vfat( + self, mock_get_configdrive, + mock_get_labelled_partition, + mock_get_partition_by_uuid, + mock_table_type, + mock_fix_gpt_partition, + mock_dd, mock_unlink, mock_execute, + mock_config_drive_work, + mock_rebuild_config_drive): + config_url = 'http://1.2.3.4/cd' + configdrive_file = '/tmp/xyz' + configdrive_mb = 10 + + CONF.set_override('config_drive_rebuild', True) + mock_get_configdrive.return_value = (configdrive_mb, configdrive_file) + mock_get_labelled_partition.return_value = None + + mock_table_type.return_value = 'gpt' + expected_part = '/dev/fake4' + mock_get_partition_by_uuid.return_value = expected_part + mock_config_drive_work.return_value = True + + partition_utils.create_config_drive_partition(self.node_uuid, self.dev, + config_url) + mock_execute.assert_has_calls([ + mock.call('sgdisk', '-n', '0:-64MB:0', '-u', '0:fake-uuid', + self.dev, run_as_root=True), + mock.call('sync'), + mock.call('udevadm', 'settle'), + mock.call('partprobe', self.dev, attempts=10, run_as_root=True), + mock.call('sgdisk', '-v', self.dev, run_as_root=True), + + mock.call('udevadm', 'settle'), + mock.call('test', '-e', expected_part, attempts=15, + delay_on_retry=True) + ]) + + mock_table_type.assert_called_with(self.dev) + mock_fix_gpt_partition.assert_called_with(self.dev, self.node_uuid) + mock_dd.assert_not_called() + mock_unlink.assert_called_with(configdrive_file) + mock_config_drive_work.assert_not_called() + mock_rebuild_config_drive.assert_called_once_with(expected_part, + configdrive_file) + @mock.patch.object(disk_utils, 'count_mbr_partitions', autospec=True) @mock.patch.object(utils, 'execute', autospec=True) @mock.patch.object(partition_utils.LOG, 'warning', autospec=True) @@ -1288,3 +1424,94 @@ class TestGetPartition(base.IronicAgentTest): mock.call('lsblk', '-PbioKNAME,UUID,PARTUUID,TYPE,LABEL', self.fake_dev)] mock_execute.assert_has_calls(expected) + + +@mock.patch.object(utils, 'execute', autospec=True) +class TestConfigDriveTestRecovery(base.IronicAgentTest): + + fake_dev = '/dev/fake' + configdrive_file = '/tmp/config-drive' + + def test__does_config_drive_work(self, mock_execute): + self.assertTrue(partition_utils._does_config_drive_work(self.fake_dev)) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'ro', '-t', 'auto', self.fake_dev, + mock.ANY), + mock.call('umount', mock.ANY)]) + + def test__does_config_drive_failed(self, mock_execute): + mock_execute.side_effect = processutils.ProcessExecutionError('boom') + self.assertFalse( + partition_utils._does_config_drive_work(self.fake_dev) + ) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'ro', '-t', 'auto', self.fake_dev, + mock.ANY)]) + + @mock.patch.object(shutil, 'copytree', autospec=True) + @mock.patch.object(utils, 'mkfs', autospec=True) + def test__try_build_fat32_config_drive(self, + mock_mkfs, + mock_copy, + mock_execute): + partition_utils._try_build_fat32_config_drive(self.fake_dev, + self.configdrive_file) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'loop,ro', '-t', 'auto', + self.configdrive_file, mock.ANY), + mock.call('mount', '-t', 'auto', self.fake_dev, mock.ANY), + mock.call('umount', mock.ANY), + mock.call('umount', mock.ANY), + ]) + mock_mkfs.assert_called_once_with(fs='vfat', path=self.fake_dev, + label='CONFIG-2') + # Validate we called copy as we expect, both source and destination + # are temporary folders. + mock_copy.assert_called_once_with(mock.ANY, mock.ANY, + dirs_exist_ok=True) + + @mock.patch.object(shutil, 'copytree', autospec=True) + @mock.patch.object(utils, 'mkfs', autospec=True) + def test__try_build_fat32_config_drive_graceful_fail( + self, + mock_mkfs, + mock_copy, + mock_execute): + mock_execute.side_effect = processutils.ProcessExecutionError('boom') + self.assertIsNone( + partition_utils._try_build_fat32_config_drive( + self.fake_dev, + self.configdrive_file) + ) + mock_execute.assert_called_once_with( + 'mount', '-o', 'loop,ro', '-t', 'auto', + self.configdrive_file, mock.ANY) + mock_mkfs.assert_not_called() + # Validate we called copy as we expect, both source and destination + # are temporary folders. + mock_copy.assert_not_called() + + @mock.patch.object(shutil, 'copytree', autospec=True) + @mock.patch.object(utils, 'mkfs', autospec=True) + def test__try_build_fat32_config_drive_fails_once_invalid( + self, + mock_mkfs, + mock_copy, + mock_execute): + mock_mkfs.side_effect = processutils.ProcessExecutionError('boom') + self.assertRaisesRegex( + exception.InstanceDeployFailure, + 'A failure occured while attempting to format.*', + partition_utils._try_build_fat32_config_drive, + self.fake_dev, + self.configdrive_file) + mock_execute.assert_has_calls([ + mock.call('mount', '-o', 'loop,ro', '-t', 'auto', + self.configdrive_file, mock.ANY), + mock.call('umount', mock.ANY), + mock.call('umount', mock.ANY), + ]) + + mock_mkfs.assert_called_once_with(fs='vfat', path=self.fake_dev, + label='CONFIG-2') + mock_copy.assert_not_called() diff --git a/releasenotes/notes/4k-block-size-config-drives-4470828dd06d2600.yaml b/releasenotes/notes/4k-block-size-config-drives-4470828dd06d2600.yaml new file mode 100644 index 000000000..7e65348c6 --- /dev/null +++ b/releasenotes/notes/4k-block-size-config-drives-4470828dd06d2600.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + Fixes a failure case where a deployed instance may be unable to access + the configuration drive post-deployment. This can occur when block + devices only support 4KB IO interactions. When 4KB block IO sizes + are in use, the ISO9660 filesystem driver in Linux cannot be used + as it is modeled around a 2KB block. We now attempt to verify, and + rebuild the configuration drive on a FAT filesystem when we cannot + mount the supplied configuration drive. Operators can force the agent + to write configuration drives using the FAT filesystem using the + ``[DEFAULT]config_drive_rebuild`` option.