conductor: Recreate volume attachments during a reschedule

When an instance with attached volumes fails to spawn, cleanup code
within the compute manager (_shutdown_instance called from
_build_resources) will delete the volume attachments referenced by
the bdms in Cinder. As a result we should check and if necessary
recreate these volume attachments when rescheduling an instance.

Note that there are a few different ways to fix this bug by
making changes to the compute manager code, either by not deleting
the volume attachment on failure before rescheduling [1] or by
performing the get/create check during each build after the
reschedule [2].

The problem with *not* cleaning up the attachments is if we don't
reschedule, then we've left orphaned "reserved" volumes in Cinder
(or we have to add special logic to tell compute when to cleanup
attachments).

The problem with checking the existence of the attachment on every
new host we build on is that we'd be needlessly checking that for
initial creates even if we don't ever need to reschedule, unless
again we have special logic against that (like checking to see if
we've rescheduled at all).

Also, in either case that involves changes to the compute means that
older computes might not have the fix.

So ultimately it seems that the best way to handle this is:

1. Only deal with this on reschedules.
2. Let the cell conductor orchestrate it since it's already dealing
   with the reschedule. Then the compute logic doesn't need to change.

[1] https://review.openstack.org/#/c/587071/3/nova/compute/manager.py@1631
[2] https://review.openstack.org/#/c/587071/4/nova/compute/manager.py@1667

Conflicts:

  nova/tests/unit/conductor/test_conductor.py

NOTE(mriedem): There was a minor conflict due to not having change
I56fb1fd984f06a58c3a7e8c2596471991950433a in Queens.

Change-Id: I739c06bd02336bf720cddacb21f48e7857378487
Closes-bug: #1784353
(cherry picked from commit 41452a5c6a)
(cherry picked from commit d3397788fe)
This commit is contained in:
Lee Yarwood 2018-07-30 13:41:35 +01:00 committed by Matt Riedemann
parent 51345d57a1
commit 3a0f26c822
3 changed files with 117 additions and 6 deletions

View File

@ -52,6 +52,7 @@ from nova.scheduler import client as scheduler_client
from nova.scheduler import utils as scheduler_utils
from nova import servicegroup
from nova import utils
from nova.volume import cinder
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
@ -225,6 +226,7 @@ class ComputeTaskManager(base.Base):
def __init__(self):
super(ComputeTaskManager, self).__init__()
self.compute_rpcapi = compute_rpcapi.ComputeAPI()
self.volume_api = cinder.API()
self.image_api = image.API()
self.network_api = network.API()
self.servicegroup_api = servicegroup.API()
@ -514,6 +516,24 @@ class ComputeTaskManager(base.Base):
inst_mapping.save()
return inst_mapping
def _validate_existing_attachment_ids(self, context, instance, bdms):
"""Ensure any attachment ids referenced by the bdms exist.
New attachments will only be created if the attachment ids referenced
by the bdms no longer exist. This can happen when an instance is
rescheduled after a failure to spawn as cleanup code on the previous
host will delete attachments before rescheduling.
"""
for bdm in bdms:
if bdm.is_volume and bdm.attachment_id:
try:
self.volume_api.attachment_get(context, bdm.attachment_id)
except exception.VolumeAttachmentNotFound:
attachment = self.volume_api.attachment_create(
context, bdm.volume_id, instance.uuid)
bdm.attachment_id = attachment['id']
bdm.save()
# NOTE(danms): This is never cell-targeted because it is only used for
# cellsv1 (which does not target cells directly) and n-cpu reschedules
# (which go to the cell conductor and thus are always cell-specific).
@ -693,6 +713,11 @@ class ComputeTaskManager(base.Base):
if inst_mapping:
inst_mapping.destroy()
return
else:
# NOTE(lyarwood): If this is a reschedule then recreate any
# attachments that were previously removed when cleaning up
# after failures to spawn etc.
self._validate_existing_attachment_ids(context, instance, bdms)
alts = [(alt.service_host, alt.nodename) for alt in host_list]
LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s",

View File

@ -80,9 +80,11 @@ class TestRescheduleWithVolumesAttached(
server_response = self.api.post_server({'server': server_request})
server_id = server_response['id']
# FIXME(lyarwood): Assert that the instance ends up in an ERROR state
# with no attachments present in the fixture. The instance should be
# ACTIVE with a single attachment associated to the volume.
self._wait_for_state_change(self.api, server_response, 'ERROR')
self.assertNotIn(volume_id,
self.cinder.volume_ids_for_instance(server_id))
self._wait_for_state_change(self.api, server_response, 'ACTIVE')
attached_volume_ids = self.cinder.volume_ids_for_instance(server_id)
self.assertIn(volume_id, attached_volume_ids)
self.assertEqual(1, len(self.cinder.volume_to_attachment))
# There should only be one attachment record for the volume and
# instance because the original would have been deleted before
# rescheduling off the first host.
self.assertEqual(1, len(self.cinder.volume_to_attachment[volume_id]))

View File

@ -44,6 +44,7 @@ from nova import exception as exc
from nova.image import api as image_api
from nova import objects
from nova.objects import base as obj_base
from nova.objects import block_device as block_device_obj
from nova.objects import fields
from nova import rpc
from nova.scheduler import client as scheduler_client
@ -61,6 +62,7 @@ from nova.tests.unit import fake_server_actions
from nova.tests.unit import utils as test_utils
from nova.tests import uuidsentinel as uuids
from nova import utils
from nova.volume import cinder
CONF = conf.CONF
@ -961,6 +963,88 @@ class _BaseTaskTestCase(object):
do_test()
@mock.patch.object(cinder.API, 'attachment_get')
@mock.patch.object(cinder.API, 'attachment_create')
@mock.patch.object(block_device_obj.BlockDeviceMapping, 'save')
def test_validate_existing_attachment_ids_with_missing_attachments(self,
mock_bdm_save, mock_attachment_create, mock_attachment_get):
instance = self._create_fake_instance_obj()
bdms = [
block_device.BlockDeviceDict({
'boot_index': 0,
'guest_format': None,
'connection_info': None,
'device_type': u'disk',
'source_type': 'image',
'destination_type': 'volume',
'volume_size': 1,
'image_id': 1,
'device_name': '/dev/vdb',
'attachment_id': uuids.attachment,
'volume_id': uuids.volume
})]
bdms = block_device_obj.block_device_make_list_from_dicts(
self.context, bdms)
mock_attachment_get.side_effect = exc.VolumeAttachmentNotFound(
attachment_id=uuids.attachment)
mock_attachment_create.return_value = {'id': uuids.new_attachment}
self.assertEqual(uuids.attachment, bdms[0].attachment_id)
self.conductor_manager._validate_existing_attachment_ids(self.context,
instance,
bdms)
mock_attachment_get.assert_called_once_with(self.context,
uuids.attachment)
mock_attachment_create.assert_called_once_with(self.context,
uuids.volume,
instance.uuid)
mock_bdm_save.assert_called_once()
self.assertEqual(uuids.new_attachment, bdms[0].attachment_id)
@mock.patch.object(cinder.API, 'attachment_get')
@mock.patch.object(cinder.API, 'attachment_create')
@mock.patch.object(block_device_obj.BlockDeviceMapping, 'save')
def test_validate_existing_attachment_ids_with_attachments_present(self,
mock_bdm_save, mock_attachment_create, mock_attachment_get):
instance = self._create_fake_instance_obj()
bdms = [
block_device.BlockDeviceDict({
'boot_index': 0,
'guest_format': None,
'connection_info': None,
'device_type': u'disk',
'source_type': 'image',
'destination_type': 'volume',
'volume_size': 1,
'image_id': 1,
'device_name': '/dev/vdb',
'attachment_id': uuids.attachment,
'volume_id': uuids.volume
})]
bdms = block_device_obj.block_device_make_list_from_dicts(
self.context, bdms)
mock_attachment_get.return_value = {
"attachment": {
"status": "attaching",
"detached_at": "2015-09-16T09:28:52.000000",
"connection_info": {},
"attached_at": "2015-09-16T09:28:52.000000",
"attach_mode": "ro",
"instance": instance.uuid,
"volume_id": uuids.volume,
"id": uuids.attachment
}}
self.assertEqual(uuids.attachment, bdms[0].attachment_id)
self.conductor_manager._validate_existing_attachment_ids(self.context,
instance,
bdms)
mock_attachment_get.assert_called_once_with(self.context,
uuids.attachment)
mock_attachment_create.assert_not_called()
mock_bdm_save.assert_not_called()
self.assertEqual(uuids.attachment, bdms[0].attachment_id)
def test_unshelve_instance_on_host(self):
instance = self._create_fake_instance_obj()
instance.vm_state = vm_states.SHELVED