From 3a0f26c822912dbaad7c8a1d6a3c599025afab37 Mon Sep 17 00:00:00 2001
From: Lee Yarwood
Date: Mon, 30 Jul 2018 13:41:35 +0100
Subject: [PATCH] conductor: Recreate volume attachments during a reschedule

When an instance with attached volumes fails to spawn, cleanup code
within the compute manager (_shutdown_instance called from
_build_resources) will delete the volume attachments referenced by the
bdms in Cinder. As a result we should check for, and if necessary
recreate, these volume attachments when rescheduling an instance.

Note that there are a few different ways to fix this bug by making
changes to the compute manager code, either by not deleting the volume
attachment on failure before rescheduling [1] or by performing the
get/create check during each build after the reschedule [2].

The problem with *not* cleaning up the attachments is that if we don't
reschedule, we've left orphaned "reserved" volumes in Cinder (or we
have to add special logic to tell the compute when to clean up
attachments).

The problem with checking the existence of the attachment on every new
host we build on is that we'd be needlessly performing that check for
initial creates even when we never need to reschedule, unless again we
add special logic for it (like checking whether we've rescheduled at
all). Also, either option involves changes to the compute manager,
which means that older computes might not have the fix.

So ultimately it seems that the best way to handle this is:

1. Only deal with this on reschedules.
2. Let the cell conductor orchestrate it since it's already dealing
   with the reschedule. Then the compute logic doesn't need to change.

[1] https://review.openstack.org/#/c/587071/3/nova/compute/manager.py@1631
[2] https://review.openstack.org/#/c/587071/4/nova/compute/manager.py@1667

Conflicts:
	nova/tests/unit/conductor/test_conductor.py

NOTE(mriedem): There was a minor conflict due to not having change
I56fb1fd984f06a58c3a7e8c2596471991950433a in Queens.

Change-Id: I739c06bd02336bf720cddacb21f48e7857378487
Closes-bug: #1784353
(cherry picked from commit 41452a5c6adb8cae54eef24803f4adc468131b34)
(cherry picked from commit d3397788fe2d9267c34698d9459b0abe3f215046)
---
 nova/conductor/manager.py                   | 25 ++++++
 .../regressions/test_bug_1784353.py         | 14 ++--
 nova/tests/unit/conductor/test_conductor.py | 84 +++++++++++++++++++
 3 files changed, 117 insertions(+), 6 deletions(-)

diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index b6ee6aad44db..ebff9a22b94f 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -52,6 +52,7 @@ from nova.scheduler import client as scheduler_client
 from nova.scheduler import utils as scheduler_utils
 from nova import servicegroup
 from nova import utils
+from nova.volume import cinder
 
 LOG = logging.getLogger(__name__)
 CONF = cfg.CONF
@@ -225,6 +226,7 @@ class ComputeTaskManager(base.Base):
     def __init__(self):
         super(ComputeTaskManager, self).__init__()
         self.compute_rpcapi = compute_rpcapi.ComputeAPI()
+        self.volume_api = cinder.API()
         self.image_api = image.API()
         self.network_api = network.API()
         self.servicegroup_api = servicegroup.API()
@@ -514,6 +516,24 @@ class ComputeTaskManager(base.Base):
             inst_mapping.save()
         return inst_mapping
 
+    def _validate_existing_attachment_ids(self, context, instance, bdms):
+        """Ensure any attachment ids referenced by the bdms exist.
+
+        New attachments will only be created if the attachment ids referenced
+        by the bdms no longer exist. This can happen when an instance is
+        rescheduled after a failure to spawn as cleanup code on the previous
+        host will delete attachments before rescheduling.
+        """
+        for bdm in bdms:
+            if bdm.is_volume and bdm.attachment_id:
+                try:
+                    self.volume_api.attachment_get(context, bdm.attachment_id)
+                except exception.VolumeAttachmentNotFound:
+                    attachment = self.volume_api.attachment_create(
+                        context, bdm.volume_id, instance.uuid)
+                    bdm.attachment_id = attachment['id']
+                    bdm.save()
+
     # NOTE(danms): This is never cell-targeted because it is only used for
     # cellsv1 (which does not target cells directly) and n-cpu reschedules
     # (which go to the cell conductor and thus are always cell-specific).
@@ -693,6 +713,11 @@ class ComputeTaskManager(base.Base):
                 if inst_mapping:
                     inst_mapping.destroy()
                 return
+            else:
+                # NOTE(lyarwood): If this is a reschedule then recreate any
+                # attachments that were previously removed when cleaning up
+                # after failures to spawn etc.
+                self._validate_existing_attachment_ids(context, instance, bdms)
 
             alts = [(alt.service_host, alt.nodename) for alt in host_list]
             LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s",
diff --git a/nova/tests/functional/regressions/test_bug_1784353.py b/nova/tests/functional/regressions/test_bug_1784353.py
index 16c4ce444e5d..837c31e33dfd 100644
--- a/nova/tests/functional/regressions/test_bug_1784353.py
+++ b/nova/tests/functional/regressions/test_bug_1784353.py
@@ -80,9 +80,11 @@ class TestRescheduleWithVolumesAttached(
         server_response = self.api.post_server({'server': server_request})
         server_id = server_response['id']
 
-        # FIXME(lyarwood): Assert that the instance ends up in an ERROR state
-        # with no attachments present in the fixture. The instance should be
-        # ACTIVE with a single attachment associated to the volume.
-        self._wait_for_state_change(self.api, server_response, 'ERROR')
-        self.assertNotIn(volume_id,
-                         self.cinder.volume_ids_for_instance(server_id))
+        self._wait_for_state_change(self.api, server_response, 'ACTIVE')
+        attached_volume_ids = self.cinder.volume_ids_for_instance(server_id)
+        self.assertIn(volume_id, attached_volume_ids)
+        self.assertEqual(1, len(self.cinder.volume_to_attachment))
+        # There should only be one attachment record for the volume and
+        # instance because the original would have been deleted before
+        # rescheduling off the first host.
+        self.assertEqual(1, len(self.cinder.volume_to_attachment[volume_id]))
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index a720223e40d4..4a4d11f8ce45 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -44,6 +44,7 @@ from nova import exception as exc
 from nova.image import api as image_api
 from nova import objects
 from nova.objects import base as obj_base
+from nova.objects import block_device as block_device_obj
 from nova.objects import fields
 from nova import rpc
 from nova.scheduler import client as scheduler_client
@@ -61,6 +62,7 @@ from nova.tests.unit import fake_server_actions
 from nova.tests.unit import utils as test_utils
 from nova.tests import uuidsentinel as uuids
 from nova import utils
+from nova.volume import cinder
 
 CONF = conf.CONF
 
@@ -961,6 +963,88 @@ class _BaseTaskTestCase(object):
 
         do_test()
 
+    @mock.patch.object(cinder.API, 'attachment_get')
+    @mock.patch.object(cinder.API, 'attachment_create')
+    @mock.patch.object(block_device_obj.BlockDeviceMapping, 'save')
+    def test_validate_existing_attachment_ids_with_missing_attachments(self,
+            mock_bdm_save, mock_attachment_create, mock_attachment_get):
+        instance = self._create_fake_instance_obj()
+        bdms = [
+            block_device.BlockDeviceDict({
+                'boot_index': 0,
+                'guest_format': None,
+                'connection_info': None,
+                'device_type': u'disk',
+                'source_type': 'image',
+                'destination_type': 'volume',
+                'volume_size': 1,
+                'image_id': 1,
+                'device_name': '/dev/vdb',
+                'attachment_id': uuids.attachment,
+                'volume_id': uuids.volume
+            })]
+        bdms = block_device_obj.block_device_make_list_from_dicts(
+            self.context, bdms)
+        mock_attachment_get.side_effect = exc.VolumeAttachmentNotFound(
+            attachment_id=uuids.attachment)
+        mock_attachment_create.return_value = {'id': uuids.new_attachment}
+
+        self.assertEqual(uuids.attachment, bdms[0].attachment_id)
+        self.conductor_manager._validate_existing_attachment_ids(self.context,
+                                                                 instance,
+                                                                 bdms)
+        mock_attachment_get.assert_called_once_with(self.context,
+                                                    uuids.attachment)
+        mock_attachment_create.assert_called_once_with(self.context,
+                                                       uuids.volume,
+                                                       instance.uuid)
+        mock_bdm_save.assert_called_once()
+        self.assertEqual(uuids.new_attachment, bdms[0].attachment_id)
+
+    @mock.patch.object(cinder.API, 'attachment_get')
+    @mock.patch.object(cinder.API, 'attachment_create')
+    @mock.patch.object(block_device_obj.BlockDeviceMapping, 'save')
+    def test_validate_existing_attachment_ids_with_attachments_present(self,
+            mock_bdm_save, mock_attachment_create, mock_attachment_get):
+        instance = self._create_fake_instance_obj()
+        bdms = [
+            block_device.BlockDeviceDict({
+                'boot_index': 0,
+                'guest_format': None,
+                'connection_info': None,
+                'device_type': u'disk',
+                'source_type': 'image',
+                'destination_type': 'volume',
+                'volume_size': 1,
+                'image_id': 1,
+                'device_name': '/dev/vdb',
+                'attachment_id': uuids.attachment,
+                'volume_id': uuids.volume
+            })]
+        bdms = block_device_obj.block_device_make_list_from_dicts(
+            self.context, bdms)
+        mock_attachment_get.return_value = {
+            "attachment": {
+                "status": "attaching",
+                "detached_at": "2015-09-16T09:28:52.000000",
+                "connection_info": {},
+                "attached_at": "2015-09-16T09:28:52.000000",
+                "attach_mode": "ro",
+                "instance": instance.uuid,
+                "volume_id": uuids.volume,
+                "id": uuids.attachment
+            }}
+
+        self.assertEqual(uuids.attachment, bdms[0].attachment_id)
+        self.conductor_manager._validate_existing_attachment_ids(self.context,
+                                                                 instance,
+                                                                 bdms)
+        mock_attachment_get.assert_called_once_with(self.context,
+                                                    uuids.attachment)
+        mock_attachment_create.assert_not_called()
+        mock_bdm_save.assert_not_called()
+        self.assertEqual(uuids.attachment, bdms[0].attachment_id)
+
     def test_unshelve_instance_on_host(self):
         instance = self._create_fake_instance_obj()
         instance.vm_state = vm_states.SHELVED
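
For reviewers, here is a minimal, self-contained sketch of the get-or-create
pattern the conductor change above implements. It is illustrative only and not
part of the patch: FakeVolumeAPI, FakeBDM and the local
VolumeAttachmentNotFound class are stand-ins for nova.volume.cinder.API,
BlockDeviceMapping and the real nova exception.

import uuid


class VolumeAttachmentNotFound(Exception):
    """Stand-in for nova.exception.VolumeAttachmentNotFound."""


class FakeVolumeAPI(object):
    """In-memory stand-in for the Cinder attachment calls used above."""

    def __init__(self):
        self.attachments = {}

    def attachment_create(self, context, volume_id, instance_uuid):
        # Record a new attachment and return it, mimicking the shape of the
        # dict the conductor reads the 'id' key from.
        attachment = {'id': str(uuid.uuid4()), 'volume_id': volume_id,
                      'instance': instance_uuid}
        self.attachments[attachment['id']] = attachment
        return attachment

    def attachment_get(self, context, attachment_id):
        try:
            return self.attachments[attachment_id]
        except KeyError:
            raise VolumeAttachmentNotFound(attachment_id)


class FakeBDM(object):
    """Stand-in for a volume BDM with only the fields the check touches."""

    def __init__(self, volume_id, attachment_id):
        self.is_volume = True
        self.volume_id = volume_id
        self.attachment_id = attachment_id

    def save(self):
        # A real BlockDeviceMapping would persist the new attachment id here.
        pass


def validate_existing_attachment_ids(volume_api, context, instance_uuid,
                                     bdms):
    """Recreate any attachments whose ids no longer exist in Cinder."""
    for bdm in bdms:
        if bdm.is_volume and bdm.attachment_id:
            try:
                volume_api.attachment_get(context, bdm.attachment_id)
            except VolumeAttachmentNotFound:
                attachment = volume_api.attachment_create(
                    context, bdm.volume_id, instance_uuid)
                bdm.attachment_id = attachment['id']
                bdm.save()


if __name__ == '__main__':
    api = FakeVolumeAPI()
    # The bdm references an attachment id that was deleted during cleanup
    # on the previous host, as happens before a reschedule.
    bdm = FakeBDM(volume_id='vol-1', attachment_id='stale-attachment')
    validate_existing_attachment_ids(api, None, 'instance-1', [bdm])
    assert bdm.attachment_id in api.attachments
    print('recreated attachment: %s' % bdm.attachment_id)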