diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 246cc92dd588..021c6f9d440c 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -31,6 +31,7 @@ import contextlib import copy import functools import inspect +import math import sys import time import traceback @@ -615,7 +616,7 @@ class ComputeVirtAPI(virtapi.VirtAPI): class ComputeManager(manager.Manager): """Manages the running instances from creation to destruction.""" - target = messaging.Target(version='6.0') + target = messaging.Target(version='6.1') def __init__(self, compute_driver=None, *args, **kwargs): """Load configuration options and connect to the hypervisor.""" @@ -3398,18 +3399,124 @@ class ComputeManager(manager.Manager): migration.status = status migration.save() + @staticmethod + def _reimage_failed_callback(event_name, instance): + msg = ('Cinder reported failure during reimaging ' + 'with %(event)s for instance %(uuid)s') + msg_args = {'event': event_name, 'uuid': instance.uuid} + LOG.error(msg, msg_args) + raise exception.ReimageException(msg % msg_args) + + def _detach_root_volume(self, context, instance, root_bdm): + volume_id = root_bdm.volume_id + mp = root_bdm.device_name + old_connection_info = jsonutils.loads(root_bdm.connection_info) + try: + self.driver.detach_volume(context, old_connection_info, + instance, root_bdm.device_name) + except exception.DiskNotFound as err: + LOG.warning('Ignoring DiskNotFound exception while ' + 'detaching volume %(volume_id)s from ' + '%(mp)s : %(err)s', + {'volume_id': volume_id, 'mp': mp, + 'err': err}, instance=instance) + except exception.DeviceDetachFailed: + with excutils.save_and_reraise_exception(): + LOG.warning('Guest refused to detach volume %(vol)s', + {'vol': volume_id}, instance=instance) + self.volume_api.roll_detaching(context, volume_id) + except Exception: + with excutils.save_and_reraise_exception(): + LOG.exception('Failed to detach volume ' + '%(volume_id)s from %(mp)s', + {'volume_id': volume_id, 'mp': mp}, + instance=instance) + self.volume_api.roll_detaching(context, volume_id) + + def _rebuild_volume_backed_instance(self, context, instance, bdms, + image_id): + # Get root bdm and attachment ID associated to it + root_bdm = compute_utils.get_root_bdm(context, instance, bdms) + old_attachment_id = root_bdm.attachment_id + + # Create a new attachment and delete the previous attachment + # We create a new attachment first to keep the volume in + # reserved state after old attachment is deleted and avoid any + # races in between the attachment create and delete. + attachment_id = None + try: + attachment_id = self.volume_api.attachment_create( + context, root_bdm.volume_id, instance.uuid)['id'] + self._detach_root_volume(context, instance, root_bdm) + root_bdm.attachment_id = attachment_id + root_bdm.save() + self.volume_api.attachment_delete(context, + old_attachment_id) + except exception.InstanceNotFound: + # This means we failed to save the new attachment because + # the instance is deleted, so (try to) delete it and abort. + try: + self.volume_api.attachment_delete(context, + attachment_id) + except cinder_exception.ClientException: + LOG.error('Failed to delete new attachment %s', + attachment_id) + msg = _('Failed to rebuild volume backed instance.') + raise exception.BuildAbortException( + instance_uuid=instance.uuid, reason=msg) + except cinder_exception.ClientException: + if attachment_id: + LOG.error('Failed to delete old attachment %s', + old_attachment_id) + else: + LOG.error('Failed to create new attachment') + msg = _('Failed to rebuild volume backed instance.') + raise exception.BuildAbortException( + instance_uuid=instance.uuid, reason=msg) + events = [('volume-reimaged', root_bdm.volume_id)] + + # Get the image requested for rebuild + try: + image = self.image_api.get(context, image_id) + except exception.ImageNotFound: + msg = _('Image %s not found.') % image_id + LOG.error(msg) + raise exception.BuildAbortException( + instance_uuid=instance.uuid, reason=msg) + image_size = int(math.ceil(float(image.get('size')) / units.Gi)) + deadline = CONF.reimage_timeout_per_gb * image_size + error_cb = self._reimage_failed_callback + + # Call cinder to perform reimage operation and wait until an + # external event is triggered. + try: + with self.virtapi.wait_for_instance_event(instance, events, + deadline=deadline, + error_callback=error_cb): + self.volume_api.reimage_volume( + context, root_bdm.volume_id, image_id, + reimage_reserved=True) + + except Exception as ex: + LOG.error('Failed to rebuild volume backed instance: %s', + str(ex), instance=instance) + msg = _('Failed to rebuild volume backed instance.') + raise exception.BuildAbortException( + instance_uuid=instance.uuid, reason=msg) + def _rebuild_default_impl( self, context, instance, image_meta, injected_files, admin_password, allocations, bdms, detach_block_devices, attach_block_devices, network_info=None, evacuate=False, block_device_info=None, preserve_ephemeral=False, - accel_uuids=None): + accel_uuids=None, reimage_boot_volume=False): if preserve_ephemeral: # The default code path does not support preserving ephemeral # partitions. raise exception.PreserveEphemeralNotSupported() accel_info = [] + detach_root_bdm = not reimage_boot_volume if evacuate: if instance.flavor.extra_specs.get('accel:device_profile'): try: @@ -3421,13 +3528,36 @@ class ComputeManager(manager.Manager): msg = _('Failure getting accelerator resources.') raise exception.BuildAbortException( instance_uuid=instance.uuid, reason=msg) - detach_block_devices(context, bdms) + detach_block_devices(context, bdms, + detach_root_bdm=detach_root_bdm) else: self._power_off_instance(instance, clean_shutdown=True) - detach_block_devices(context, bdms) - self.driver.destroy(context, instance, - network_info=network_info, - block_device_info=block_device_info) + detach_block_devices(context, bdms, + detach_root_bdm=detach_root_bdm) + if reimage_boot_volume: + # Previously, the calls reaching here were for image + # backed instance rebuild and didn't have a root bdm + # so now we need to handle the case for root bdm. + # For the root BDM, we are doing attach/detach operations + # manually as we want to maintain a 'reserved' state + # throughout the reimage process from the cinder side so + # we are excluding the root BDM from certain operations + # here i.e. deleteing it's mapping before the destroy call. + block_device_info_copy = copy.deepcopy(block_device_info) + root_bdm = compute_utils.get_root_bdm(context, instance, bdms) + mapping = block_device_info_copy["block_device_mapping"] + # drop root bdm from the mapping + mapping = [ + bdm for bdm in mapping + if bdm["volume_id"] != root_bdm.volume_id + ] + self.driver.destroy(context, instance, + network_info=network_info, + block_device_info=block_device_info_copy) + else: + self.driver.destroy(context, instance, + network_info=network_info, + block_device_info=block_device_info) try: accel_info = self._get_accel_info(context, instance) except Exception as exc: @@ -3436,6 +3566,12 @@ class ComputeManager(manager.Manager): msg = _('Failure getting accelerator resources.') raise exception.BuildAbortException( instance_uuid=instance.uuid, reason=msg) + if reimage_boot_volume: + is_volume_backed = compute_utils.is_volume_backed_instance( + context, instance, bdms) + if is_volume_backed: + self._rebuild_volume_backed_instance( + context, instance, bdms, image_meta.id) instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING instance.save(expected_task_state=[task_states.REBUILDING]) @@ -3470,7 +3606,8 @@ class ComputeManager(manager.Manager): injected_files, new_pass, orig_sys_metadata, bdms, recreate, on_shared_storage, preserve_ephemeral, migration, - scheduled_node, limits, request_spec, accel_uuids): + scheduled_node, limits, request_spec, accel_uuids, + reimage_boot_volume): """Destroy and re-make this instance. A 'rebuild' effectively purges all existing data from the system and @@ -3502,6 +3639,9 @@ class ComputeManager(manager.Manager): specified by the user, this will be None :param request_spec: a RequestSpec object used to schedule the instance :param accel_uuids: a list of cyborg ARQ uuids + :param reimage_boot_volume: Boolean to specify whether the user has + explicitly requested to rebuild a boot + volume """ # recreate=True means the instance is being evacuated from a failed @@ -3566,7 +3706,7 @@ class ComputeManager(manager.Manager): image_meta, injected_files, new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage, preserve_ephemeral, migration, request_spec, allocs, rebuild_claim, - scheduled_node, limits, accel_uuids) + scheduled_node, limits, accel_uuids, reimage_boot_volume) except (exception.ComputeResourcesUnavailable, exception.RescheduledException) as e: if isinstance(e, exception.ComputeResourcesUnavailable): @@ -3625,7 +3765,8 @@ class ComputeManager(manager.Manager): self, context, instance, orig_image_ref, image_meta, injected_files, new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage, preserve_ephemeral, migration, request_spec, - allocations, rebuild_claim, scheduled_node, limits, accel_uuids): + allocations, rebuild_claim, scheduled_node, limits, accel_uuids, + reimage_boot_volume): """Helper to avoid deep nesting in the top-level method.""" provider_mapping = None @@ -3647,7 +3788,7 @@ class ComputeManager(manager.Manager): context, instance, orig_image_ref, image_meta, injected_files, new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage, preserve_ephemeral, migration, request_spec, allocations, - provider_mapping, accel_uuids) + provider_mapping, accel_uuids, reimage_boot_volume) @staticmethod def _get_image_name(image_meta): @@ -3661,7 +3802,7 @@ class ComputeManager(manager.Manager): injected_files, new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage, preserve_ephemeral, migration, request_spec, allocations, request_group_resource_providers_mapping, - accel_uuids): + accel_uuids, reimage_boot_volume): orig_vm_state = instance.vm_state if evacuate: @@ -3766,8 +3907,23 @@ class ComputeManager(manager.Manager): self._get_instance_block_device_info( context, instance, bdms=bdms) - def detach_block_devices(context, bdms): + def detach_block_devices(context, bdms, detach_root_bdm=True): for bdm in bdms: + # Previously, the calls made to this method by rebuild + # instance operation were for image backed instances which + # assumed we only had attached volumes and no root BDM. + # Now we need to handle case for root BDM which we are + # doing manually so skipping the attachment create/delete + # calls from here. + # The detach_root_bdm parameter is only passed while + # rebuilding the volume backed instance so we don't have + # to worry about other callers as they won't satisfy this + # condition. + # For evacuate case, we have detach_root_bdm always True + # since we don't have reimage_boot_volume parameter in + # this case so this will not be executed. + if not detach_root_bdm and bdm.is_root: + continue if bdm.is_volume: # NOTE (ildikov): Having the attachment_id set in the BDM # means that it's the new Cinder attach/detach flow @@ -3803,7 +3959,8 @@ class ComputeManager(manager.Manager): network_info=network_info, preserve_ephemeral=preserve_ephemeral, evacuate=evacuate, - accel_uuids=accel_uuids) + accel_uuids=accel_uuids, + reimage_boot_volume=reimage_boot_volume) try: with instance.mutated_migration_context(): self.driver.rebuild(**kwargs) @@ -11082,7 +11239,7 @@ class _ComputeV5Proxy(object): bdms, recreate, on_shared_storage, preserve_ephemeral, migration, scheduled_node, limits, request_spec, - accel_uuids) + accel_uuids, False) # 5.13 support for optional accel_uuids argument def shelve_instance(self, context, instance, image_id, diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index fa5b0ee8d99c..b370919e8361 100644 --- a/nova/compute/rpcapi.py +++ b/nova/compute/rpcapi.py @@ -402,6 +402,7 @@ class ComputeAPI(object): * ... - Rename the instance_type argument of prep_resize() to flavor * ... - Rename the instance_type argument of resize_instance() to flavor + * 6.1 - Add reimage_boot_volume parameter to rebuild_instance() ''' VERSION_ALIASES = { @@ -1080,7 +1081,8 @@ class ComputeAPI(object): self, ctxt, instance, new_pass, injected_files, image_ref, orig_image_ref, orig_sys_metadata, bdms, recreate, on_shared_storage, host, node, - preserve_ephemeral, migration, limits, request_spec, accel_uuids): + preserve_ephemeral, migration, limits, request_spec, accel_uuids, + reimage_boot_volume): # NOTE(edleafe): compute nodes can only use the dict form of limits. if isinstance(limits, objects.SchedulerLimits): @@ -1092,10 +1094,20 @@ class ComputeAPI(object): 'scheduled_node': node, 'limits': limits, 'request_spec': request_spec, - 'accel_uuids': accel_uuids + 'accel_uuids': accel_uuids, + 'reimage_boot_volume': reimage_boot_volume } - version = self._ver(ctxt, '5.12') + + version = '6.1' client = self.router.client(ctxt) + if not client.can_send_version(version): + if msg_args['reimage_boot_volume']: + raise exception.NovaException( + 'Compute RPC version does not support ' + 'reimage_boot_volume parameter.') + else: + del msg_args['reimage_boot_volume'] + version = self._ver(ctxt, '5.12') if not client.can_send_version(version): del msg_args['accel_uuids'] version = '5.0' diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index 594c3dd61cec..2d2fb2b24357 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -1146,7 +1146,7 @@ class ComputeTaskManager: injected_files, new_pass, orig_sys_metadata, bdms, recreate, on_shared_storage, preserve_ephemeral=False, host=None, - request_spec=None): + request_spec=None, reimage_boot_volume=False): # recreate=True means the instance is being evacuated from a failed # host to a new destination host. The 'recreate' variable name is # confusing, so rename it to evacuate here at the top, which is simpler @@ -1343,7 +1343,8 @@ class ComputeTaskManager: node=node, limits=limits, request_spec=request_spec, - accel_uuids=accel_uuids) + accel_uuids=accel_uuids, + reimage_boot_volume=False) def _validate_image_traits_for_rebuild(self, context, instance, image_ref): """Validates that the traits specified in the image can be satisfied diff --git a/nova/conf/compute.py b/nova/conf/compute.py index 1a139e08d55e..004dbb83b6da 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -305,6 +305,21 @@ Related options: agent disabled. When used with libvirt the instance mode should be configured as HVM. """), + cfg.IntOpt('reimage_timeout_per_gb', + default=20, + min=1, + help=""" +Timeout for reimaging a volume. + +Number of seconds to wait for volume-reimaged events to arrive before +continuing or failing. + +This is a per gigabyte time which has a default value of 20 seconds and +will be multiplied by the GB size of image. Eg: an image of 6 GB will have +a timeout of 20 * 6 = 120 seconds. +Try increasing the timeout if the image copy per GB takes more time and you +are hitting timeout failures. +"""), ] resource_tracker_opts = [ diff --git a/nova/exception.py b/nova/exception.py index fe3c12d87072..27a067640409 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2477,3 +2477,7 @@ class PlacementPciMixedTraitsException(PlacementPciException): "of 'traits' in [pci]device_spec. We got %(new_traits)s for " "%(new_dev)s and %(current_traits)s for %(current_devs)s." ) + + +class ReimageException(NovaException): + msg_fmt = _("Reimaging volume failed.") diff --git a/nova/objects/external_event.py b/nova/objects/external_event.py index b1acfc4aa0d6..e17008dade17 100644 --- a/nova/objects/external_event.py +++ b/nova/objects/external_event.py @@ -33,6 +33,9 @@ EVENT_NAMES = [ # Accelerator Request got bound, tag is ARQ uuid. # Sent when an ARQ for an instance has been bound or failed to bind. 'accelerator-request-bound', + + # re-image operation has completed from cinder side + 'volume-reimaged', ] EVENT_STATUSES = ['failed', 'completed', 'in-progress'] @@ -50,7 +53,8 @@ class InstanceExternalEvent(obj_base.NovaObject): # Version 1.2: adds volume-extended event # Version 1.3: adds power-update event # Version 1.4: adds accelerator-request-bound event - VERSION = '1.4' + # Version 1.5: adds volume-reimaged event + VERSION = '1.5' fields = { 'instance_uuid': fields.UUIDField(), diff --git a/nova/objects/service.py b/nova/objects/service.py index 8885120ddd01..05aeb1b53837 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__) # NOTE(danms): This is the global service version counter -SERVICE_VERSION = 63 +SERVICE_VERSION = 64 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any @@ -222,6 +222,9 @@ SERVICE_VERSION_HISTORY = ( # Version 63: Compute RPC v6.0: # Add support for VDPA hotplug live migration and suspend/resume {'compute_rpc': '6.0'}, + # Version 64: Compute RPC v6.1: + # Add reimage_boot_volume parameter to rebuild_instance() + {'compute_rpc': '6.1'}, ) # This is used to raise an error at service startup if older than N-1 computes diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index dadfa8fe25df..314c29f583dd 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -2730,7 +2730,8 @@ class ComputeTestCase(BaseTestCase, new_pass="new_password", orig_sys_metadata=sys_metadata, bdms=[], recreate=False, on_shared_storage=False, preserve_ephemeral=False, migration=None, scheduled_node=None, - limits={}, request_spec=None, accel_uuids=[]) + limits={}, request_spec=None, accel_uuids=[], + reimage_boot_volume=False) self.compute.terminate_instance(self.context, instance, []) def test_rebuild_driver(self): @@ -2760,7 +2761,8 @@ class ComputeTestCase(BaseTestCase, new_pass="new_password", orig_sys_metadata=sys_metadata, bdms=[], recreate=False, on_shared_storage=False, preserve_ephemeral=False, migration=None, scheduled_node=None, - limits={}, request_spec=None, accel_uuids=[]) + limits={}, request_spec=None, accel_uuids=[], + reimage_boot_volume=False) self.assertTrue(called['rebuild']) self.compute.terminate_instance(self.context, instance, []) @@ -2812,7 +2814,8 @@ class ComputeTestCase(BaseTestCase, new_pass="new_password", orig_sys_metadata=sys_metadata, bdms=bdms, recreate=False, preserve_ephemeral=False, migration=None, scheduled_node=None, limits={}, - on_shared_storage=False, request_spec=None, accel_uuids=[]) + on_shared_storage=False, request_spec=None, accel_uuids=[], + reimage_boot_volume=False) self.assertTrue(called['rebuild']) self.compute.terminate_instance(self.context, instance, []) @@ -2831,7 +2834,7 @@ class ComputeTestCase(BaseTestCase, new_pass="new_password", orig_sys_metadata=sys_metadata, bdms=[], recreate=False, on_shared_storage=False, preserve_ephemeral=False, migration=None, scheduled_node=None, limits=None, - request_spec=None, accel_uuids=[]) + request_spec=None, accel_uuids=[], reimage_boot_volume=False) self.compute.terminate_instance(self.context, instance, []) def test_rebuild_launched_at_time(self): @@ -2852,7 +2855,7 @@ class ComputeTestCase(BaseTestCase, new_pass="new_password", orig_sys_metadata={}, bdms=[], recreate=False, on_shared_storage=False, preserve_ephemeral=False, migration=None, scheduled_node=None, limits={}, request_spec=None, - accel_uuids=[]) + accel_uuids=[], reimage_boot_volume=False) instance.refresh() self.assertEqual(cur_time, instance['launched_at'].replace(tzinfo=None)) @@ -2885,7 +2888,8 @@ class ComputeTestCase(BaseTestCase, injected_files=injected_files, new_pass="new_password", orig_sys_metadata=sys_metadata, bdms=[], recreate=False, on_shared_storage=False, preserve_ephemeral=False, migration=None, - scheduled_node=None, limits={}, request_spec=None, accel_uuids=[]) + scheduled_node=None, limits={}, request_spec=None, accel_uuids=[], + reimage_boot_volume=False) self.compute.terminate_instance(self.context, instance, []) @mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid') @@ -4612,7 +4616,8 @@ class ComputeTestCase(BaseTestCase, 'limits': {}, 'request_spec': None, 'on_shared_storage': False, - 'accel_uuids': ()}), + 'accel_uuids': (), + 'reimage_boot_volume': False}), ("set_admin_password", task_states.UPDATING_PASSWORD, {'new_pass': None}), ("rescue_instance", task_states.RESCUING, @@ -5130,7 +5135,8 @@ class ComputeTestCase(BaseTestCase, injected_files=[], new_pass=password, orig_sys_metadata=orig_sys_metadata, bdms=[], recreate=False, on_shared_storage=False, preserve_ephemeral=False, migration=None, - scheduled_node=None, limits={}, request_spec=None, accel_uuids=[]) + scheduled_node=None, limits={}, request_spec=None, accel_uuids=[], + reimage_boot_volume=False) inst_ref.refresh() @@ -13534,7 +13540,7 @@ class EvacuateHostTestCase(BaseTestCase): image_ref, injected_files, 'newpass', {}, bdms, recreate=True, on_shared_storage=on_shared_storage, migration=migration, preserve_ephemeral=False, scheduled_node=node, limits=limits, - request_spec=None, accel_uuids=[]) + request_spec=None, accel_uuids=[], reimage_boot_volume=False) if vm_states_is_stopped: mock_notify_rebuild.assert_has_calls([ mock.call(ctxt, self.inst, self.inst.host, phase='start', diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index b9cacfa82e76..2aadcd106c19 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -5305,7 +5305,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, None, - recreate, False, False, None, scheduled_node, {}, None, []) + recreate, False, False, None, scheduled_node, {}, None, [], False) mock_set.assert_called_once_with(None, 'failed') mock_notify_about_instance_usage.assert_called_once_with( mock.ANY, instance, 'rebuild.error', fault=mock_rebuild.side_effect @@ -5416,7 +5416,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, None, recreate=True, on_shared_storage=None, preserve_ephemeral=False, migration=None, scheduled_node='fake-node', - limits={}, request_spec=request_spec, accel_uuids=[]) + limits={}, request_spec=request_spec, accel_uuids=[], + reimage_boot_volume=False) mock_validate_policy.assert_called_once_with( elevated_context, instance, {'group': [uuids.group]}) @@ -5455,7 +5456,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.context, instance, None, None, None, None, None, None, recreate=True, on_shared_storage=None, preserve_ephemeral=False, migration=None, scheduled_node='fake-node', limits={}, - request_spec=request_spec, accel_uuids=[]) + request_spec=request_spec, accel_uuids=[], + reimage_boot_volume=False) mock_validate_policy.assert_called_once_with( elevated_context, instance, {'group': [uuids.group]}) @@ -5481,7 +5483,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, None, False, - False, False, migration, None, {}, None, []) + False, False, migration, None, {}, None, [], False) self.assertFalse(mock_get.called) self.assertEqual(node, instance.node) self.assertEqual('done', migration.status) @@ -5503,7 +5505,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, None, True, False, False, mock.sentinel.migration, None, {}, - None, []) + None, [], False) mock_get.assert_called_once_with(mock.ANY, self.compute.host) mock_rt.finish_evacuation.assert_called_once_with( instance, 'new-node', mock.sentinel.migration) @@ -5585,7 +5587,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, recreate, on_shared_storage, preserve_ephemeral, {}, {}, self.allocations, - mock.sentinel.mapping, []) + mock.sentinel.mapping, [], + False) mock_notify_usage.assert_has_calls( [mock.call(self.context, instance, "rebuild.start", @@ -5603,8 +5606,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, provider_mappings=mock.sentinel.mapping) mock_get_nw_info.assert_called_once_with(self.context, instance) - def test_rebuild_default_impl(self): - def _detach(context, bdms): + @ddt.data((False, False), (False, True), (True, False), (True, True)) + @ddt.unpack + def test_rebuild_default_impl(self, is_vol_backed, reimage_boot_vol): + fake_image_meta = mock.MagicMock(id='fake_id') + + def _detach(context, bdms, detach_root_bdm=True): # NOTE(rpodolyaka): check that instance has been powered off by # the time we detach block devices, exact calls arguments will be # checked below @@ -5630,13 +5637,20 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, mock.patch.object(self.compute, '_power_off_instance', return_value=None), mock.patch.object(self.compute, '_get_accel_info', - return_value=[]) + return_value=[]), + mock.patch.object(compute_utils, 'is_volume_backed_instance', + return_value=is_vol_backed), + mock.patch.object(self.compute, '_rebuild_volume_backed_instance'), + mock.patch.object(compute_utils, 'get_root_bdm') ) as( mock_destroy, mock_spawn, mock_save, mock_power_off, - mock_accel_info + mock_accel_info, + mock_is_volume_backed, + mock_rebuild_vol_backed_inst, + mock_get_root, ): instance = fake_instance.fake_instance_obj(self.context) instance.migration_context = None @@ -5646,9 +5660,19 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, instance.device_metadata = None instance.task_state = task_states.REBUILDING instance.save(expected_task_state=[task_states.REBUILDING]) + fake_block_device_info = { + 'block_device_mapping': [ + {'attachment_id': '341a8917-f74d-4473-8ee7-4ca05e5e0ab3', + 'volume_id': 'b7c93bb9-dfe4-41af-aa56-e6b28342fd8f', + 'connection_info': {'driver_volume_type': 'iscsi', + 'data': {'target_discovered': False, + 'target_portal': '127.0.0.1:3260', + 'target_iqn': 'iqn.2010-10.org.openstack:volume-' + 'b7c93bb9-dfe4-41af-aa56-e6b28342fd8f', + 'target_lun': 0}}}]} self.compute._rebuild_default_impl(self.context, instance, - None, + fake_image_meta, [], admin_password='new_pass', bdms=[], @@ -5657,16 +5681,151 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, attach_block_devices=_attach, network_info=None, evacuate=False, - block_device_info=None, - preserve_ephemeral=False) + block_device_info= + fake_block_device_info, + preserve_ephemeral=False, + reimage_boot_volume= + reimage_boot_vol) self.assertTrue(mock_save.called) self.assertTrue(mock_spawn.called) mock_destroy.assert_called_once_with( self.context, instance, - network_info=None, block_device_info=None) + network_info=None, block_device_info=fake_block_device_info) mock_power_off.assert_called_once_with( instance, clean_shutdown=True) + if is_vol_backed and reimage_boot_vol: + mock_rebuild_vol_backed_inst.assert_called_once_with( + self.context, instance, [], fake_image_meta.id) + else: + mock_rebuild_vol_backed_inst.assert_not_called() + + @mock.patch('nova.volume.cinder.API.attachment_delete') + @mock.patch('nova.volume.cinder.API.attachment_create', + return_value={'id': uuids.new_attachment_id}) + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'wait_for_instance_event') + def test__rebuild_volume_backed_instance( + self, wait_inst_event, attach_create, attach_delete): + fake_conn_info = '{}' + fake_device = 'fake_vda' + root_bdm = mock.MagicMock( + volume_id=uuids.volume_id, connection_info=fake_conn_info, + device_name=fake_device, attachment_id=uuids.old_attachment_id, + save=mock.MagicMock()) + bdms = [root_bdm] + events = [('volume-reimaged', root_bdm.volume_id)] + image_size_gb = 1 + deadline = CONF.reimage_timeout_per_gb * image_size_gb + + with test.nested( + mock.patch.object(objects.Instance, 'save', + return_value=None), + mock.patch.object(compute_utils, 'get_root_bdm', + return_value=root_bdm), + mock.patch.object(self.compute, 'volume_api'), + mock.patch.object(self.compute.image_api, 'get'), + ) as ( + mock_save, + mock_get_root_bdm, + mock_vol_api, + mock_get_img + ): + instance = fake_instance.fake_instance_obj(self.context) + instance.task_state = task_states.REBUILDING + # 1024 ** 3 = 1073741824 + mock_get_img.return_value = {'size': 1073741824} + self.compute._rebuild_volume_backed_instance( + self.context, instance, bdms, uuids.image_id) + mock_vol_api.attachment_create.assert_called_once_with( + self.context, uuids.volume_id, instance.uuid) + mock_vol_api.attachment_delete.assert_called_once_with( + self.context, uuids.old_attachment_id) + mock_vol_api.reimage_volume.assert_called_once_with( + self.context, uuids.volume_id, uuids.image_id, + reimage_reserved=True) + mock_get_img.assert_called_once_with( + self.context, uuids.image_id) + mock_get_root_bdm.assert_called_once_with( + self.context, instance, bdms) + wait_inst_event.assert_called_once_with( + instance, events, deadline=deadline, + error_callback=self.compute._reimage_failed_callback) + + @mock.patch('nova.volume.cinder.API.attachment_delete') + @mock.patch('nova.volume.cinder.API.attachment_create', + return_value={'id': uuids.new_attachment_id}) + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'wait_for_instance_event') + def test__rebuild_volume_backed_instance_image_not_found( + self, wait_inst_event, attach_create, attach_delete): + fake_conn_info = '{}' + fake_device = 'fake_vda' + root_bdm = mock.MagicMock( + volume_id=uuids.volume_id, connection_info=fake_conn_info, + device_name=fake_device, attachment_id=uuids.old_attachment_id, + save=mock.MagicMock()) + bdms = [root_bdm] + + with test.nested( + mock.patch.object(objects.Instance, 'save', + return_value=None), + mock.patch.object(compute_utils, 'get_root_bdm', + return_value=root_bdm), + mock.patch.object(self.compute, 'volume_api'), + mock.patch.object(self.compute.image_api, 'get'), + ) as( + mock_save, + mock_get_root_bdm, + mock_vol_api, + mock_get_img + ): + mock_get_img.side_effect = exception.ImageNotFound( + image_id=uuids.image_id) + instance = fake_instance.fake_instance_obj(self.context) + instance.task_state = task_states.REBUILDING + instance.save(expected_task_state=[task_states.REBUILDING]) + mock_get_img.return_value = {'size': 1} + self.assertRaises( + exception.BuildAbortException, + self.compute._rebuild_volume_backed_instance, + self.context, instance, bdms, uuids.image_id) + mock_vol_api.attachment_create.assert_called_once_with( + self.context, uuids.volume_id, instance.uuid) + mock_vol_api.attachment_delete.assert_called_once_with( + self.context, uuids.old_attachment_id) + mock_get_img.assert_called_once_with( + self.context, uuids.image_id) + + @mock.patch.object(objects.Instance, 'save', return_value=None) + @mock.patch.object(fake_driver.SmallFakeDriver, 'detach_volume') + @mock.patch.object(cinder.API, 'roll_detaching') + def test__detach_root_volume(self, mock_roll_detach, mock_detach, + mock_save): + exception_list = [ + '', + exception.DiskNotFound(location="not\\here"), + exception.DeviceDetachFailed(device="fake_dev", reason="unknown"), + ] + mock_detach.side_effect = exception_list + fake_conn_info = '{}' + fake_device = 'fake_vda' + root_bdm = mock.MagicMock( + volume_id=uuids.volume_id, connection_info=fake_conn_info, + device_name=fake_device, attachment_id=uuids.old_attachment_id, + save=mock.MagicMock()) + instance = fake_instance.fake_instance_obj(self.context) + instance.task_state = task_states.REBUILDING + instance.save(expected_task_state=[task_states.REBUILDING]) + self.compute._detach_root_volume(self.context, instance, root_bdm) + self.compute._detach_root_volume(self.context, instance, root_bdm) + self.assertRaises(exception.DeviceDetachFailed, + self.compute._detach_root_volume, + self.context, instance, root_bdm) + mock_roll_detach.assert_called_with(self.context, uuids.volume_id) + self.assertRaises(Exception, self.compute._detach_root_volume, # noqa + self.context, instance, root_bdm) + mock_roll_detach.assert_called_with(self.context, uuids.volume_id) def test_do_rebuild_instance_check_trusted_certs(self): """Tests the scenario that we're rebuilding an instance with @@ -5688,7 +5847,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, request_spec=objects.RequestSpec(), allocations=self.allocations, request_group_resource_providers_mapping=mock.sentinel.mapping, - accel_uuids=[]) + accel_uuids=[], reimage_boot_volume=False) self.assertIn('Trusted image certificates provided on host', str(ex)) def test_reverts_task_state_instance_not_found(self): diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index 541cc1012eea..55d0fc53e855 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -835,7 +835,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): bdms=[], instance=self.fake_instance_obj, host='new_host', orig_sys_metadata=None, recreate=True, on_shared_storage=True, preserve_ephemeral=True, migration=None, node=None, - limits=None, request_spec=None, accel_uuids=[], version='6.0') + limits=None, request_spec=None, accel_uuids=[], + reimage_boot_volume=False, version='6.1') def test_rebuild_instance_old_rpcapi(self): # With rpcapi < 5.12, accel_uuids must be dropped in the client call. @@ -862,20 +863,58 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): 'migration': None, 'limits': None } + # Pass reimage_boot_volume to the client call... compute_api.rebuild_instance( ctxt, instance=self.fake_instance_obj, accel_uuids=['938af7f9-f136-4e5a-bdbe-3b6feab54311'], - node=None, host=None, **rebuild_args) + node=None, host=None, reimage_boot_volume=False, + **rebuild_args) mock_client.can_send_version.assert_has_calls([mock.call('6.0'), mock.call('5.12')]) mock_client.prepare.assert_called_with( server=self.fake_instance_obj.host, version='5.0') + # ...and assert that it does not show up on the wire before 6.1 mock_cctx.cast.assert_called_with( # No accel_uuids ctxt, 'rebuild_instance', instance=self.fake_instance_obj, scheduled_node=None, **rebuild_args) + def test_rebuild_instance_vol_backed_old_rpcapi(self): + # With rpcapi < 6.1, if reimage_boot_volume is True then we + # should raise error. + ctxt = context.RequestContext('fake_user', 'fake_project') + compute_api = compute_rpcapi.ComputeAPI() + compute_api.router.client = mock.Mock() + mock_client = mock.MagicMock() + compute_api.router.client.return_value = mock_client + # Force can_send_version to [False, True, True], so that 6.0 + # version is used. + mock_client.can_send_version.side_effect = [False, True, True] + mock_cctx = mock.MagicMock() + mock_client.prepare.return_value = mock_cctx + rebuild_args = { + 'new_pass': 'admin_password', + 'injected_files': 'files_to_inject', + 'image_ref': uuids.image_ref, + 'orig_image_ref': uuids.orig_image_ref, + 'orig_sys_metadata': 'orig_sys_meta', + 'bdms': {}, + 'recreate': False, + 'on_shared_storage': False, + 'preserve_ephemeral': False, + 'request_spec': None, + 'migration': None, + 'limits': None, + 'accel_uuids': [], + 'reimage_boot_volume': True, + } + self.assertRaises( + exception.NovaException, compute_api.rebuild_instance, + ctxt, instance=self.fake_instance_obj, + node=None, host=None, **rebuild_args) + mock_client.can_send_version.assert_has_calls([mock.call('6.1')]) + def test_reserve_block_device_name(self): self.flags(long_rpc_timeout=1234) self._test_compute_api('reserve_block_device_name', 'call', diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index 4950bf7f4b40..e6fb2ca29939 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -388,7 +388,8 @@ class _BaseTaskTestCase(object): 'on_shared_storage': False, 'preserve_ephemeral': False, 'host': 'compute-host', - 'request_spec': None} + 'request_spec': None, + 'reimage_boot_volume': False} if update_args: rebuild_args.update(update_args) compute_rebuild_args = copy.deepcopy(rebuild_args) @@ -1927,6 +1928,8 @@ class _BaseTaskTestCase(object): migration_type='evacuation') migration.create() + # TODO(whoami-rajat): Remove this compatibility code + del rebuild_args['reimage_boot_volume'] self.assertRaises(exc.UnsupportedPolicyException, self.conductor.rebuild_instance, self.context, diff --git a/nova/tests/unit/objects/test_objects.py b/nova/tests/unit/objects/test_objects.py index f42b74e13593..9e49b549f36a 100644 --- a/nova/tests/unit/objects/test_objects.py +++ b/nova/tests/unit/objects/test_objects.py @@ -1079,7 +1079,7 @@ object_data = { 'InstanceActionEventList': '1.1-13d92fb953030cdbfee56481756e02be', 'InstanceActionList': '1.1-a2b2fb6006b47c27076d3a1d48baa759', 'InstanceDeviceMetadata': '1.0-74d78dd36aa32d26d2769a1b57caf186', - 'InstanceExternalEvent': '1.4-06c2dfcf2d2813c24cd37ee728524f1a', + 'InstanceExternalEvent': '1.5-1ec57351a9851c1eb43ccd90662d6dd0', 'InstanceFault': '1.2-7ef01f16f1084ad1304a513d6d410a38', 'InstanceFaultList': '1.2-6bb72de2872fe49ded5eb937a93f2451', 'InstanceGroup': '1.11-852ac511d30913ee88f3c3a869a8f30a', diff --git a/nova/virt/driver.py b/nova/virt/driver.py index cb6cd6d7d7d4..532ed1fa5091 100644 --- a/nova/virt/driver.py +++ b/nova/virt/driver.py @@ -333,7 +333,8 @@ class ComputeDriver(object): admin_password, allocations, bdms, detach_block_devices, attach_block_devices, network_info=None, evacuate=False, block_device_info=None, - preserve_ephemeral=False, accel_uuids=None): + preserve_ephemeral=False, accel_uuids=None, + reimage_boot_volume=False): """Destroy and re-make this instance. A 'rebuild' effectively purges all existing data from the system and @@ -371,6 +372,7 @@ class ComputeDriver(object): :param preserve_ephemeral: True if the default ephemeral storage partition must be preserved on rebuild :param accel_uuids: Accelerator UUIDs. + :param reimage_boot_volume: Re-image the volume backed instance. """ raise NotImplementedError() diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 04b1c68bb1a9..7496db5a7cd0 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -1630,7 +1630,8 @@ class IronicDriver(virt_driver.ComputeDriver): admin_password, allocations, bdms, detach_block_devices, attach_block_devices, network_info=None, evacuate=False, block_device_info=None, - preserve_ephemeral=False, accel_uuids=None): + preserve_ephemeral=False, accel_uuids=None, + reimage_boot_volume=False): """Rebuild/redeploy an instance. This version of rebuild() allows for supporting the option to @@ -1671,7 +1672,13 @@ class IronicDriver(virt_driver.ComputeDriver): :param preserve_ephemeral: Boolean value; if True the ephemeral must be preserved on rebuild. :param accel_uuids: Accelerator UUIDs. Ignored by this driver. + :param reimage_boot_volume: Re-image the volume backed instance. """ + if reimage_boot_volume: + raise exception.NovaException( + _("Ironic doesn't support rebuilding volume backed " + "instances.")) + LOG.debug('Rebuild called for instance', instance=instance) instance.task_state = task_states.REBUILD_SPAWNING