From 9d5935d007e5a50a4fd61ad033cd0135fdcaa2eb Mon Sep 17 00:00:00 2001 From: Amit Uniyal Date: Thu, 4 May 2023 12:41:00 +0000 Subject: [PATCH] Delete dangling bdms On reboot, check the instance volume status on the cinder side. Verify if volume exists and cinder has an attachment ID, else delete its BDMS data from nova DB and vice versa. Updated existing test cases to use CinderFixture while rebooting as reboot calls get_all_attachments Implements: blueprint https://blueprints.launchpad.net/nova/+spec/cleanup-dangling-volume-attachments Closes-Bug: 2019078 Change-Id: Ieb619d4bfe0a6472aefb118b58283d7ad8d24c29 --- doc/source/admin/manage-volumes.rst | 12 ++ nova/compute/manager.py | 55 ++++++- .../compute/test_attached_volumes.py | 15 +- nova/tests/functional/integrated_helpers.py | 1 + .../regressions/test_bug_1835822.py | 1 + .../tests/functional/test_instance_actions.py | 1 + nova/tests/functional/test_server_faults.py | 1 + nova/tests/unit/compute/test_compute.py | 138 +++++++++++++++++- ...ete-dangling-volumes-2615100187fe29fb.yaml | 24 +++ 9 files changed, 233 insertions(+), 15 deletions(-) create mode 100644 releasenotes/notes/delete-dangling-volumes-2615100187fe29fb.yaml diff --git a/doc/source/admin/manage-volumes.rst b/doc/source/admin/manage-volumes.rst index ef45d2c7aa88..d274e8db7c05 100644 --- a/doc/source/admin/manage-volumes.rst +++ b/doc/source/admin/manage-volumes.rst @@ -81,6 +81,18 @@ hosting the instance. This could even include refreshing different elements of the attachment to ensure the latest configuration changes within the environment have been applied. +.. note:: + + If you encounter any dangling volume attachments in either the Nova or + Cinder databases, a ``hard reboot`` of the affected instance can help + resolve the issue. During the instance reboot process, Nova performs + a synchronization mechanism that verifies the availability of volume + attachments in the Cinder database. Any missing or dangling/stale + attachments are detected and deleted from both Nova and Cinder during + ``hard reboot`` process. + + + Checking an existing attachment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/nova/compute/manager.py b/nova/compute/manager.py index e6ad4ac15471..b87a6955363c 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -4174,6 +4174,56 @@ class ComputeManager(manager.Manager): accel_info = [] return accel_info + def _delete_dangling_bdms(self, context, instance, bdms): + """Deletes dangling or stale attachments for volume from + Nova and Cinder DB so both service DBs can be in sync. + + Retrieves volume attachments from the Nova block_device_mapping + table and verifies them with the Cinder volume_attachment table. + If attachment is not present in any one of the DBs, delete + attachments from the other DB. + + :param context: The nova request context. + :param instance: instance object. + :param instance: BlockDeviceMappingList list object. + """ + + # attachments present in nova DB, ones nova knows about + nova_attachments = [] + bdms_to_delete = [] + for bdm in bdms.objects: + if bdm.volume_id and bdm.source_type == 'volume' and \ + bdm.destination_type == 'volume': + try: + self.volume_api.attachment_get(context, bdm.attachment_id) + except exception.VolumeAttachmentNotFound: + LOG.info( + f"Removing stale volume attachment " + f"'{bdm.attachment_id}' from instance for " + f"volume '{bdm.volume_id}'.", instance=instance) + bdm.destroy() + bdms_to_delete.append(bdm) + else: + nova_attachments.append(bdm.attachment_id) + + cinder_attachments = self.volume_api.attachment_get_all( + context, instance.uuid) + cinder_attachments = [each['id'] for each in cinder_attachments] + + if len(set(cinder_attachments) - set(nova_attachments)): + LOG.info( + "Removing stale volume attachments of instance from " + "Cinder", instance=instance) + for each_attach in set(cinder_attachments) - set(nova_attachments): + # delete only cinder known attachments, from cinder DB. + LOG.debug( + f"Removing attachment '{each_attach}'", instance=instance) + self.volume_api.attachment_delete(context, each_attach) + + # refresh bdms object + for bdm in bdms_to_delete: + bdms.objects.remove(bdm) + @wrap_exception() @reverts_task_state @wrap_instance_event(prefix='compute') @@ -4203,6 +4253,9 @@ class ComputeManager(manager.Manager): bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) + + self._delete_dangling_bdms(context, instance, bdms) + block_device_info = self._get_instance_block_device_info( context, instance, bdms=bdms) @@ -5636,7 +5689,7 @@ class ComputeManager(manager.Manager): # in case _prep_resize fails. instance_state = instance.vm_state with self._error_out_instance_on_exception( - context, instance, instance_state=instance_state),\ + context, instance, instance_state=instance_state), \ errors_out_migration_ctxt(migration): self._send_prep_resize_notifications( diff --git a/nova/tests/functional/compute/test_attached_volumes.py b/nova/tests/functional/compute/test_attached_volumes.py index 7c064004d5f6..4304e3c812db 100644 --- a/nova/tests/functional/compute/test_attached_volumes.py +++ b/nova/tests/functional/compute/test_attached_volumes.py @@ -66,10 +66,8 @@ class TestAttachedVolumes( # verify if volume attachment is present at nova bdm_list = self._get_bdm_list(server) - # TODO(auniyal): Reboot should remove stale bdms # after fix bdm should not have any volume - self.assertEqual(1, len(bdm_list)) - self.assertEqual(volume_id, bdm_list[0][0]) + self.assertEqual(0, len(bdm_list)) def test_delete_multiple_stale_attachment_from_nova(self): volumes = [ @@ -109,13 +107,12 @@ class TestAttachedVolumes( bdm_list_2 = self._get_bdm_list(server) bdm_attc_vols = [bdm[0] for bdm in bdm_list_2] - # after fix only 2 volumes should be attached instead 4 - self.assertEqual(4, len(bdm_list_2)) + self.assertEqual(2, len(bdm_list_2)) self.assertIn(bdm_list[0][0], bdm_attc_vols) - self.assertIn(bdm_list[1][0], bdm_attc_vols) + self.assertNotIn(bdm_list[1][0], bdm_attc_vols) self.assertIn(bdm_list[2][0], bdm_attc_vols) - self.assertIn(bdm_list[3][0], bdm_attc_vols) + self.assertNotIn(bdm_list[3][0], bdm_attc_vols) def test_delete_multiple_stale_attachment_from_cinder(self): volume_id = 'aeb9b5f4-1fe9-4964-ab65-5e168be4de8e' @@ -149,9 +146,7 @@ class TestAttachedVolumes( # rebooting server should remove stale attachments server = self._reboot_server(server, hard=True) - # TODO(auniyal): Reboot should remove only stale attachments - # from cinder too # verify if how many volume attachments are present at cinder attached_volume_ids = self.cinder.attachment_ids_for_instance( server['id']) - self.assertEqual(attachments + 1, len(attached_volume_ids)) + self.assertEqual(1, len(attached_volume_ids)) diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index 847409324926..6d00355d8912 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -1326,6 +1326,7 @@ class ProviderUsageBaseTestCase(test.TestCase, PlacementInstanceHelperMixin): self.policy = self.useFixture(nova_fixtures.RealPolicyFixture()) self.neutron = self.useFixture(nova_fixtures.NeutronFixture(self)) self.glance = self.useFixture(nova_fixtures.GlanceFixture(self)) + self.useFixture(nova_fixtures.CinderFixture(self)) self.placement = self.useFixture(func_fixtures.PlacementFixture()).api self.useFixture(nova_fixtures.AllServicesCurrent()) diff --git a/nova/tests/functional/regressions/test_bug_1835822.py b/nova/tests/functional/regressions/test_bug_1835822.py index 5736e9ea3dae..92c1fffe3c50 100644 --- a/nova/tests/functional/regressions/test_bug_1835822.py +++ b/nova/tests/functional/regressions/test_bug_1835822.py @@ -27,6 +27,7 @@ class RegressionTest1835822( self.useFixture(nova_fixtures.GlanceFixture(self)) self.useFixture(nova_fixtures.NeutronFixture(self)) self.useFixture(func_fixtures.PlacementFixture()) + self.useFixture(nova_fixtures.CinderFixture(self)) self.api = self.useFixture(nova_fixtures.OSAPIFixture( api_version='v2.1')).api diff --git a/nova/tests/functional/test_instance_actions.py b/nova/tests/functional/test_instance_actions.py index 060133ce93ec..d13b6ce4e2e4 100644 --- a/nova/tests/functional/test_instance_actions.py +++ b/nova/tests/functional/test_instance_actions.py @@ -85,6 +85,7 @@ class InstanceActionEventFaultsTestCase( self.useFixture(nova_fixtures.NeutronFixture(self)) self.useFixture(func_fixtures.PlacementFixture()) self.useFixture(nova_fixtures.RealPolicyFixture()) + self.useFixture(nova_fixtures.CinderFixture(self)) # Start the compute services. self.start_service('conductor') diff --git a/nova/tests/functional/test_server_faults.py b/nova/tests/functional/test_server_faults.py index edc3c3b377f9..632bb21f0ae7 100644 --- a/nova/tests/functional/test_server_faults.py +++ b/nova/tests/functional/test_server_faults.py @@ -33,6 +33,7 @@ class ServerFaultTestCase(test.TestCase, self.useFixture(func_fixtures.PlacementFixture()) self.useFixture(nova_fixtures.GlanceFixture(self)) self.useFixture(nova_fixtures.RealPolicyFixture()) + self.useFixture(nova_fixtures.CinderFixture(self)) # Start the compute services. self.start_service('conductor') diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 2004bcc8fb5c..447ec3199a1e 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -1471,6 +1471,132 @@ class ComputeVolumeTestCase(BaseTestCase): self.assertEqual(1, attach_block_devices.call_count) get_swap.assert_called_once_with([]) + def _test__delete_dangling_bdms( + self, instance, nova_bdms, cinder_attachments, valid=False): + with test.nested( + mock.patch.object(objects.BlockDeviceMappingList, + 'get_by_instance_uuid'), + mock.patch.object(objects.BlockDeviceMapping, 'destroy'), + mock.patch.object(cinder.API, 'attachment_get'), + mock.patch.object(cinder.API, 'attachment_get_all'), + mock.patch.object(cinder.API, 'attachment_delete') + ) as (mock_get_bdms, mock_destroy, mock_attach_get, + mock_all_attachments, mock_attachment_delete): + mock_get_bdms.return_value = nova_bdms + mock_all_attachments.return_value = cinder_attachments + + if not valid: + err = exception.VolumeAttachmentNotFound( + attachment_id=None) + mock_attach_get.side_effect = err + + self.compute._delete_dangling_bdms( + self.context, instance, nova_bdms) + + return mock_destroy, mock_attachment_delete + + def test_dangling_bdms_nothing_to_delete(self): + """no bdm, no attachments""" + instance = self._create_fake_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[]) + mock_dstr, mock_atach_del = self._test__delete_dangling_bdms( + instance, bdms, []) + self.assertTrue(mock_dstr.assert_not_called) + self.assertTrue(mock_atach_del.assert_not_called) + + def test_dangling_bdms_delete_from_bdm(self): + """valid source type: + able to retrieve from valid target attachmnet from cinder + bdm should not get deleted. + there is a 'valid' flag passed while + calling _test__delete_dangling_bdms + + invalid source type: + attachment is of image type, bdm should not get deleted + """ + instance = self._create_fake_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[ + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol1, + 'attachment_id': uuids.fake_attachment_1, + 'source_type': 'volume', + 'destination_type': 'volume'})), + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol2, + 'attachment_id': uuids.fake_attachment_2, + 'source_type': 'image', + 'destination_type': 'volume'})) + ]) + + mock_destroy, _ = self._test__delete_dangling_bdms( + instance, bdms, [], True) + # bdm.destroy never gets called + self.assertFalse(mock_destroy.called) + self.assertEqual(mock_destroy.call_count, 0) + # u_bdms are valid bdms, both image and volume are valid bdm + self.assertEqual(len(bdms), 2) + + def test_dangling_bdms_delete_from_multi_bdm(self): + """nova has bdms but they are not present at cinder side + both bdms should be deleted + """ + instance = self._create_fake_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[ + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol1, + 'attachment_id': uuids.fake_attachment_1, + 'source_type': 'volume', + 'destination_type': 'volume'})), + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol2, + 'attachment_id': uuids.fake_attachment_2, + 'source_type': 'volume', + 'destination_type': 'volume'})) + ]) + + mock_destroy, _ = self._test__delete_dangling_bdms( + instance, bdms, []) + self.assertTrue(mock_destroy.called) + self.assertEqual(mock_destroy.call_count, 2) + self.assertEqual(len(bdms), 0) + + def test_dangling_bdms_delete_cinder_attachments(self): + """out of 2 one cinder attachment is present in nova side""" + instance = self._create_fake_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[ + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol1, + 'attachment_id': uuids.fake_attachment_1, + 'source_type': 'volume', + 'destination_type': 'volume'})), + ]) + + cinder_attachments = [ + {'id': uuids.fake_attachment_1}, + {'id': 2}, + ] + _, mock_attachment_delete = self._test__delete_dangling_bdms( + instance, bdms, cinder_attachments, True) + + self.assertTrue(mock_attachment_delete.call_count, 1) + self.assertEqual(mock_attachment_delete.call_args_list[0][0][1], 2) + self.assertEqual(len(bdms), 1) + class ComputeTestCase(BaseTestCase, test_diagnostics.DiagnosticsComparisonMixin, @@ -2897,6 +3023,8 @@ class ComputeTestCase(BaseTestCase, reimage_boot_volume=False, target_state=None) self.compute.terminate_instance(self.context, instance, []) + @mock.patch.object(compute_manager.ComputeManager, + '_delete_dangling_bdms') @mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid') @mock.patch.object(compute_manager.ComputeManager, '_get_instance_block_device_info') @@ -2908,9 +3036,9 @@ class ComputeTestCase(BaseTestCase, @mock.patch('nova.compute.utils.notify_about_instance_action') def _test_reboot(self, soft, mock_notify_action, mock_get_power, mock_get_orig, mock_update, mock_notify_usage, - mock_get_blk, mock_get_bdms, test_delete=False, - test_unrescue=False, fail_reboot=False, - fail_running=False): + mock_get_blk, mock_get_bdms, mock_del_stale_bdms, + test_delete=False, test_unrescue=False, + fail_reboot=False, fail_running=False): reboot_type = soft and 'SOFT' or 'HARD' task_pending = (soft and task_states.REBOOT_PENDING or task_states.REBOOT_PENDING_HARD) @@ -3127,6 +3255,8 @@ class ComputeTestCase(BaseTestCase, def test_reboot_hard_and_delete_and_rescued(self): self._test_reboot(False, test_delete=True, test_unrescue=True) + @mock.patch.object(compute_manager.ComputeManager, + '_delete_dangling_bdms') @mock.patch('nova.virt.fake.FakeDriver.reboot') @mock.patch('nova.objects.instance.Instance.save') @mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid') @@ -3141,7 +3271,7 @@ class ComputeTestCase(BaseTestCase, def _test_reboot_with_accels(self, mock_notify_action, mock_get_power, mock_get_orig, mock_update, mock_notify_usage, mock_get_blk, mock_get_bdms, mock_inst_save, mock_reboot, - extra_specs=None, accel_info=None): + mock_del_stale_bdms, extra_specs=None, accel_info=None): self.compute.network_api.get_instance_nw_info = mock.Mock() diff --git a/releasenotes/notes/delete-dangling-volumes-2615100187fe29fb.yaml b/releasenotes/notes/delete-dangling-volumes-2615100187fe29fb.yaml new file mode 100644 index 000000000000..e758fb2e7f5f --- /dev/null +++ b/releasenotes/notes/delete-dangling-volumes-2615100187fe29fb.yaml @@ -0,0 +1,24 @@ +--- +features: + - | + This change ensures the synchronization of volume attachments + between Nova and Cinder, by deleting any dangling volume attachments + and maintaining consistency between two databases. + + Block device mapping (BDM) table in the Nova database, stores + information about volume attachments, image attachments + and swap attachments. Similarly, each volume attachment had a + corresponding entry in the Cinder database volume attachment table. + + With this change, on instance reboot, Nova will checks for all volume + attachments associated with the instance and verifies their availability + in the Cinder database. If attachments are not found they will get deleted + from Nova database too. + + After Nova database cleanup, similarly Cinder database is checked for + attachments related to instance. If attachments found in Cinder DB that + are not present in Nova DB, they will get deleted from Cinder databse. + + See `spec`__ for more details. + + __ https://specs.openstack.org/openstack/nova-specs/specs/2023.2/approved/cleanup-dangling-volume-attachments.html