Delete dangling bdms

On reboot, check the instance volume status on the cinder side.
Verify if volume exists and cinder has an attachment ID, else
delete its BDMS data from nova DB and vice versa.

Updated existing test cases to use CinderFixture while rebooting as
reboot calls get_all_attachments

Implements: blueprint https://blueprints.launchpad.net/nova/+spec/cleanup-dangling-volume-attachments
Closes-Bug: 2019078

Change-Id: Ieb619d4bfe0a6472aefb118b58283d7ad8d24c29
This commit is contained in:
Amit Uniyal 2023-05-04 12:41:00 +00:00
parent c33a9ccf4c
commit 9d5935d007
9 changed files with 233 additions and 15 deletions

View File

@ -81,6 +81,18 @@ hosting the instance. This could even include refreshing different elements of
the attachment to ensure the latest configuration changes within the
environment have been applied.
.. note::
If you encounter any dangling volume attachments in either the Nova or
Cinder databases, a ``hard reboot`` of the affected instance can help
resolve the issue. During the instance reboot process, Nova performs
a synchronization mechanism that verifies the availability of volume
attachments in the Cinder database. Any missing or dangling/stale
attachments are detected and deleted from both Nova and Cinder during
``hard reboot`` process.
Checking an existing attachment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -4174,6 +4174,56 @@ class ComputeManager(manager.Manager):
accel_info = []
return accel_info
def _delete_dangling_bdms(self, context, instance, bdms):
"""Deletes dangling or stale attachments for volume from
Nova and Cinder DB so both service DBs can be in sync.
Retrieves volume attachments from the Nova block_device_mapping
table and verifies them with the Cinder volume_attachment table.
If attachment is not present in any one of the DBs, delete
attachments from the other DB.
:param context: The nova request context.
:param instance: instance object.
:param instance: BlockDeviceMappingList list object.
"""
# attachments present in nova DB, ones nova knows about
nova_attachments = []
bdms_to_delete = []
for bdm in bdms.objects:
if bdm.volume_id and bdm.source_type == 'volume' and \
bdm.destination_type == 'volume':
try:
self.volume_api.attachment_get(context, bdm.attachment_id)
except exception.VolumeAttachmentNotFound:
LOG.info(
f"Removing stale volume attachment "
f"'{bdm.attachment_id}' from instance for "
f"volume '{bdm.volume_id}'.", instance=instance)
bdm.destroy()
bdms_to_delete.append(bdm)
else:
nova_attachments.append(bdm.attachment_id)
cinder_attachments = self.volume_api.attachment_get_all(
context, instance.uuid)
cinder_attachments = [each['id'] for each in cinder_attachments]
if len(set(cinder_attachments) - set(nova_attachments)):
LOG.info(
"Removing stale volume attachments of instance from "
"Cinder", instance=instance)
for each_attach in set(cinder_attachments) - set(nova_attachments):
# delete only cinder known attachments, from cinder DB.
LOG.debug(
f"Removing attachment '{each_attach}'", instance=instance)
self.volume_api.attachment_delete(context, each_attach)
# refresh bdms object
for bdm in bdms_to_delete:
bdms.objects.remove(bdm)
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@ -4203,6 +4253,9 @@ class ComputeManager(manager.Manager):
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
self._delete_dangling_bdms(context, instance, bdms)
block_device_info = self._get_instance_block_device_info(
context, instance, bdms=bdms)
@ -5636,7 +5689,7 @@ class ComputeManager(manager.Manager):
# in case _prep_resize fails.
instance_state = instance.vm_state
with self._error_out_instance_on_exception(
context, instance, instance_state=instance_state),\
context, instance, instance_state=instance_state), \
errors_out_migration_ctxt(migration):
self._send_prep_resize_notifications(

View File

@ -66,10 +66,8 @@ class TestAttachedVolumes(
# verify if volume attachment is present at nova
bdm_list = self._get_bdm_list(server)
# TODO(auniyal): Reboot should remove stale bdms
# after fix bdm should not have any volume
self.assertEqual(1, len(bdm_list))
self.assertEqual(volume_id, bdm_list[0][0])
self.assertEqual(0, len(bdm_list))
def test_delete_multiple_stale_attachment_from_nova(self):
volumes = [
@ -109,13 +107,12 @@ class TestAttachedVolumes(
bdm_list_2 = self._get_bdm_list(server)
bdm_attc_vols = [bdm[0] for bdm in bdm_list_2]
# after fix only 2 volumes should be attached instead 4
self.assertEqual(4, len(bdm_list_2))
self.assertEqual(2, len(bdm_list_2))
self.assertIn(bdm_list[0][0], bdm_attc_vols)
self.assertIn(bdm_list[1][0], bdm_attc_vols)
self.assertNotIn(bdm_list[1][0], bdm_attc_vols)
self.assertIn(bdm_list[2][0], bdm_attc_vols)
self.assertIn(bdm_list[3][0], bdm_attc_vols)
self.assertNotIn(bdm_list[3][0], bdm_attc_vols)
def test_delete_multiple_stale_attachment_from_cinder(self):
volume_id = 'aeb9b5f4-1fe9-4964-ab65-5e168be4de8e'
@ -149,9 +146,7 @@ class TestAttachedVolumes(
# rebooting server should remove stale attachments
server = self._reboot_server(server, hard=True)
# TODO(auniyal): Reboot should remove only stale attachments
# from cinder too
# verify if how many volume attachments are present at cinder
attached_volume_ids = self.cinder.attachment_ids_for_instance(
server['id'])
self.assertEqual(attachments + 1, len(attached_volume_ids))
self.assertEqual(1, len(attached_volume_ids))

View File

@ -1326,6 +1326,7 @@ class ProviderUsageBaseTestCase(test.TestCase, PlacementInstanceHelperMixin):
self.policy = self.useFixture(nova_fixtures.RealPolicyFixture())
self.neutron = self.useFixture(nova_fixtures.NeutronFixture(self))
self.glance = self.useFixture(nova_fixtures.GlanceFixture(self))
self.useFixture(nova_fixtures.CinderFixture(self))
self.placement = self.useFixture(func_fixtures.PlacementFixture()).api
self.useFixture(nova_fixtures.AllServicesCurrent())

View File

@ -27,6 +27,7 @@ class RegressionTest1835822(
self.useFixture(nova_fixtures.GlanceFixture(self))
self.useFixture(nova_fixtures.NeutronFixture(self))
self.useFixture(func_fixtures.PlacementFixture())
self.useFixture(nova_fixtures.CinderFixture(self))
self.api = self.useFixture(nova_fixtures.OSAPIFixture(
api_version='v2.1')).api

View File

@ -85,6 +85,7 @@ class InstanceActionEventFaultsTestCase(
self.useFixture(nova_fixtures.NeutronFixture(self))
self.useFixture(func_fixtures.PlacementFixture())
self.useFixture(nova_fixtures.RealPolicyFixture())
self.useFixture(nova_fixtures.CinderFixture(self))
# Start the compute services.
self.start_service('conductor')

View File

@ -33,6 +33,7 @@ class ServerFaultTestCase(test.TestCase,
self.useFixture(func_fixtures.PlacementFixture())
self.useFixture(nova_fixtures.GlanceFixture(self))
self.useFixture(nova_fixtures.RealPolicyFixture())
self.useFixture(nova_fixtures.CinderFixture(self))
# Start the compute services.
self.start_service('conductor')

View File

@ -1471,6 +1471,132 @@ class ComputeVolumeTestCase(BaseTestCase):
self.assertEqual(1, attach_block_devices.call_count)
get_swap.assert_called_once_with([])
def _test__delete_dangling_bdms(
self, instance, nova_bdms, cinder_attachments, valid=False):
with test.nested(
mock.patch.object(objects.BlockDeviceMappingList,
'get_by_instance_uuid'),
mock.patch.object(objects.BlockDeviceMapping, 'destroy'),
mock.patch.object(cinder.API, 'attachment_get'),
mock.patch.object(cinder.API, 'attachment_get_all'),
mock.patch.object(cinder.API, 'attachment_delete')
) as (mock_get_bdms, mock_destroy, mock_attach_get,
mock_all_attachments, mock_attachment_delete):
mock_get_bdms.return_value = nova_bdms
mock_all_attachments.return_value = cinder_attachments
if not valid:
err = exception.VolumeAttachmentNotFound(
attachment_id=None)
mock_attach_get.side_effect = err
self.compute._delete_dangling_bdms(
self.context, instance, nova_bdms)
return mock_destroy, mock_attachment_delete
def test_dangling_bdms_nothing_to_delete(self):
"""no bdm, no attachments"""
instance = self._create_fake_instance_obj()
bdms = objects.BlockDeviceMappingList(objects=[])
mock_dstr, mock_atach_del = self._test__delete_dangling_bdms(
instance, bdms, [])
self.assertTrue(mock_dstr.assert_not_called)
self.assertTrue(mock_atach_del.assert_not_called)
def test_dangling_bdms_delete_from_bdm(self):
"""valid source type:
able to retrieve from valid target attachmnet from cinder
bdm should not get deleted.
there is a 'valid' flag passed while
calling _test__delete_dangling_bdms
invalid source type:
attachment is of image type, bdm should not get deleted
"""
instance = self._create_fake_instance_obj()
bdms = objects.BlockDeviceMappingList(objects=[
objects.BlockDeviceMapping(
**fake_block_device.AnonFakeDbBlockDeviceDict(
{
'instance_uuid': instance.uuid,
'volume_id': uuids.fake_vol1,
'attachment_id': uuids.fake_attachment_1,
'source_type': 'volume',
'destination_type': 'volume'})),
objects.BlockDeviceMapping(
**fake_block_device.AnonFakeDbBlockDeviceDict(
{
'instance_uuid': instance.uuid,
'volume_id': uuids.fake_vol2,
'attachment_id': uuids.fake_attachment_2,
'source_type': 'image',
'destination_type': 'volume'}))
])
mock_destroy, _ = self._test__delete_dangling_bdms(
instance, bdms, [], True)
# bdm.destroy never gets called
self.assertFalse(mock_destroy.called)
self.assertEqual(mock_destroy.call_count, 0)
# u_bdms are valid bdms, both image and volume are valid bdm
self.assertEqual(len(bdms), 2)
def test_dangling_bdms_delete_from_multi_bdm(self):
"""nova has bdms but they are not present at cinder side
both bdms should be deleted
"""
instance = self._create_fake_instance_obj()
bdms = objects.BlockDeviceMappingList(objects=[
objects.BlockDeviceMapping(
**fake_block_device.AnonFakeDbBlockDeviceDict(
{
'instance_uuid': instance.uuid,
'volume_id': uuids.fake_vol1,
'attachment_id': uuids.fake_attachment_1,
'source_type': 'volume',
'destination_type': 'volume'})),
objects.BlockDeviceMapping(
**fake_block_device.AnonFakeDbBlockDeviceDict(
{
'instance_uuid': instance.uuid,
'volume_id': uuids.fake_vol2,
'attachment_id': uuids.fake_attachment_2,
'source_type': 'volume',
'destination_type': 'volume'}))
])
mock_destroy, _ = self._test__delete_dangling_bdms(
instance, bdms, [])
self.assertTrue(mock_destroy.called)
self.assertEqual(mock_destroy.call_count, 2)
self.assertEqual(len(bdms), 0)
def test_dangling_bdms_delete_cinder_attachments(self):
"""out of 2 one cinder attachment is present in nova side"""
instance = self._create_fake_instance_obj()
bdms = objects.BlockDeviceMappingList(objects=[
objects.BlockDeviceMapping(
**fake_block_device.AnonFakeDbBlockDeviceDict(
{
'instance_uuid': instance.uuid,
'volume_id': uuids.fake_vol1,
'attachment_id': uuids.fake_attachment_1,
'source_type': 'volume',
'destination_type': 'volume'})),
])
cinder_attachments = [
{'id': uuids.fake_attachment_1},
{'id': 2},
]
_, mock_attachment_delete = self._test__delete_dangling_bdms(
instance, bdms, cinder_attachments, True)
self.assertTrue(mock_attachment_delete.call_count, 1)
self.assertEqual(mock_attachment_delete.call_args_list[0][0][1], 2)
self.assertEqual(len(bdms), 1)
class ComputeTestCase(BaseTestCase,
test_diagnostics.DiagnosticsComparisonMixin,
@ -2897,6 +3023,8 @@ class ComputeTestCase(BaseTestCase,
reimage_boot_volume=False, target_state=None)
self.compute.terminate_instance(self.context, instance, [])
@mock.patch.object(compute_manager.ComputeManager,
'_delete_dangling_bdms')
@mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid')
@mock.patch.object(compute_manager.ComputeManager,
'_get_instance_block_device_info')
@ -2908,9 +3036,9 @@ class ComputeTestCase(BaseTestCase,
@mock.patch('nova.compute.utils.notify_about_instance_action')
def _test_reboot(self, soft, mock_notify_action, mock_get_power,
mock_get_orig, mock_update, mock_notify_usage,
mock_get_blk, mock_get_bdms, test_delete=False,
test_unrescue=False, fail_reboot=False,
fail_running=False):
mock_get_blk, mock_get_bdms, mock_del_stale_bdms,
test_delete=False, test_unrescue=False,
fail_reboot=False, fail_running=False):
reboot_type = soft and 'SOFT' or 'HARD'
task_pending = (soft and task_states.REBOOT_PENDING or
task_states.REBOOT_PENDING_HARD)
@ -3127,6 +3255,8 @@ class ComputeTestCase(BaseTestCase,
def test_reboot_hard_and_delete_and_rescued(self):
self._test_reboot(False, test_delete=True, test_unrescue=True)
@mock.patch.object(compute_manager.ComputeManager,
'_delete_dangling_bdms')
@mock.patch('nova.virt.fake.FakeDriver.reboot')
@mock.patch('nova.objects.instance.Instance.save')
@mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid')
@ -3141,7 +3271,7 @@ class ComputeTestCase(BaseTestCase,
def _test_reboot_with_accels(self, mock_notify_action, mock_get_power,
mock_get_orig, mock_update, mock_notify_usage,
mock_get_blk, mock_get_bdms, mock_inst_save, mock_reboot,
extra_specs=None, accel_info=None):
mock_del_stale_bdms, extra_specs=None, accel_info=None):
self.compute.network_api.get_instance_nw_info = mock.Mock()

View File

@ -0,0 +1,24 @@
---
features:
- |
This change ensures the synchronization of volume attachments
between Nova and Cinder, by deleting any dangling volume attachments
and maintaining consistency between two databases.
Block device mapping (BDM) table in the Nova database, stores
information about volume attachments, image attachments
and swap attachments. Similarly, each volume attachment had a
corresponding entry in the Cinder database volume attachment table.
With this change, on instance reboot, Nova will checks for all volume
attachments associated with the instance and verifies their availability
in the Cinder database. If attachments are not found they will get deleted
from Nova database too.
After Nova database cleanup, similarly Cinder database is checked for
attachments related to instance. If attachments found in Cinder DB that
are not present in Nova DB, they will get deleted from Cinder databse.
See `spec`__ for more details.
__ https://specs.openstack.org/openstack/nova-specs/specs/2023.2/approved/cleanup-dangling-volume-attachments.html