From 1e59ed99c794e6554b9a3e1e2ed197de75e14189 Mon Sep 17 00:00:00 2001 From: ankitagrawal Date: Wed, 23 Sep 2015 03:58:19 -0700 Subject: [PATCH] Clean up ports and volumes when deleting ERROR instance Usually, when instance.host = None, it means the instance was never scheduled. However, the exception handling routine in compute manager [1] will set instance.host = None and set instance.vm_state = ERROR if the instance fails to build on the compute host. If that happens, we end up with an instance with host = None and vm_state = ERROR which may have ports and volumes still allocated. This adds some logic around deleting the instance when it may have ports or volumes allocated. 1. If the instance is not in ERROR or SHELVED_OFFLOADED state, we expect instance.host to be set to a compute host. So, if we find instance.host = None in states other than ERROR or SHELVED_OFFLOADED, we consider the instance to have failed scheduling and not require ports or volumes to be freed, and we simply destroy the instance database record and return. This is the "delete while booting" scenario. 2. If the instance is in ERROR because of a failed build or is SHELVED_OFFLOADED, we expect instance.host to be None even though there could be ports or volumes allocated. In this case, run the _local_delete routine to clean up ports and volumes and delete the instance database record. 
Co-Authored-By: Ankit Agrawal Co-Authored-By: Samuel Matzek Co-Authored-By: melanie witt Closes-Bug: 1404867 Closes-Bug: 1408527 Conflicts: nova/tests/unit/compute/test_compute_api.py [1] https://github.com/openstack/nova/blob/55ea961/nova/compute/manager.py#L1927-L1929 Change-Id: I4dc6c8bd3bb6c135f8a698af41f5d0e026c39117 (cherry picked from commit b3f39244a3eacd6fb141de61850cbd84fecdb544) --- nova/compute/api.py | 46 +++-- .../regressions/test_bug_1404867.py | 7 +- .../regressions/test_bug_1670627.py | 1 + .../regressions/test_bug_1689692.py | 1 + nova/tests/unit/compute/test_compute_api.py | 173 +++++++++++++++++- 5 files changed, 207 insertions(+), 21 deletions(-) diff --git a/nova/compute/api.py b/nova/compute/api.py index 8078add299ff..ecc04723d6bd 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -1788,12 +1788,12 @@ class API(base.Base): return cell = None - # If there is an instance.host (or the instance is shelved-offloaded), - # the instance has been scheduled and sent to a cell/compute which - # means it was pulled from the cell db. + # If there is an instance.host (or the instance is shelved-offloaded or + # in error state), the instance has been scheduled and sent to a + # cell/compute which means it was pulled from the cell db. # Normal delete should be attempted. - if not (instance.host or - instance.vm_state == vm_states.SHELVED_OFFLOADED): + may_have_ports_or_volumes = self._may_have_ports_or_volumes(instance) + if not instance.host and not may_have_ports_or_volumes: try: if self._delete_while_booting(context, instance): return @@ -1872,9 +1872,7 @@ class API(base.Base): # which will cause a cast to the child cell. 
cb(context, instance, bdms) return - shelved_offloaded = (instance.vm_state - == vm_states.SHELVED_OFFLOADED) - if not instance.host and not shelved_offloaded: + if not instance.host and not may_have_ports_or_volumes: try: compute_utils.notify_about_instance_usage( self.notifier, context, instance, @@ -1889,7 +1887,12 @@ class API(base.Base): {'state': instance.vm_state}, instance=instance) return - except exception.ObjectActionError: + except exception.ObjectActionError as ex: + # The instance's host likely changed under us as + # this instance could be building and has since been + # scheduled. Continue with attempts to delete it. + LOG.debug('Refreshing instance because: %s', ex, + instance=instance) instance.refresh() if instance.vm_state == vm_states.RESIZED: @@ -1897,7 +1900,8 @@ class API(base.Base): is_local_delete = True try: - if not shelved_offloaded: + # instance.host must be set in order to look up the service. + if instance.host is not None: service = objects.Service.get_by_compute_host( context.elevated(), instance.host) is_local_delete = not self.servicegroup_api.service_is_up( @@ -1914,7 +1918,9 @@ class API(base.Base): cb(context, instance, bdms) except exception.ComputeHostNotFound: - pass + LOG.debug('Compute host %s not found during service up check, ' + 'going to local delete instance', instance.host, + instance=instance) if is_local_delete: # If instance is in shelved_offloaded state or compute node @@ -1941,6 +1947,16 @@ class API(base.Base): # NOTE(comstud): Race condition. Instance already gone. pass + def _may_have_ports_or_volumes(self, instance): + # NOTE(melwitt): When an instance build fails in the compute manager, + # the instance host and node are set to None and the vm_state is set + # to ERROR. In this case, the instance with host = None has actually + # been scheduled and may have ports and/or volumes allocated on the + # compute node. 
+ if instance.vm_state in (vm_states.SHELVED_OFFLOADED, vm_states.ERROR): + return True + return False + def _confirm_resize_on_deleting(self, context, instance): # If in the middle of a resize, use confirm_resize to # ensure the original instance is cleaned up too @@ -1996,6 +2012,14 @@ class API(base.Base): 'the instance host %(instance_host)s.', {'connector_host': connector.get('host'), 'instance_host': instance.host}, instance=instance) + if (instance.host is None and + self._may_have_ports_or_volumes(instance)): + LOG.debug('Allowing use of stashed volume connector with ' + 'instance host None because instance with ' + 'vm_state %(vm_state)s has been scheduled in ' + 'the past.', {'vm_state': instance.vm_state}, + instance=instance) + return connector def _local_cleanup_bdm_volumes(self, bdms, instance, context): """The method deletes the bdm records and, if a bdm is a volume, call diff --git a/nova/tests/functional/regressions/test_bug_1404867.py b/nova/tests/functional/regressions/test_bug_1404867.py index 89fca7782375..4fcaf9cbb995 100644 --- a/nova/tests/functional/regressions/test_bug_1404867.py +++ b/nova/tests/functional/regressions/test_bug_1404867.py @@ -80,9 +80,4 @@ class DeleteWithReservedVolumes(integrated_helpers._IntegratedTestBase, # The volume should no longer be reserved as the deletion of the # server should have released all the resources. - # TODO(mnaser): Uncomment this in patch resolving the issue - # self.assertNotIn(volume_id, self.cinder.reserved_volumes) - - # The volume is still reserved at this point, which it should not be. 
- # TODO(mnaser): Remove this in patch resolving the issue - self.assertIn(volume_id, self.cinder.reserved_volumes) + self.assertNotIn(volume_id, self.cinder.reserved_volumes) diff --git a/nova/tests/functional/regressions/test_bug_1670627.py b/nova/tests/functional/regressions/test_bug_1670627.py index 4c01c01ac324..de970628c74d 100644 --- a/nova/tests/functional/regressions/test_bug_1670627.py +++ b/nova/tests/functional/regressions/test_bug_1670627.py @@ -59,6 +59,7 @@ class TestDeleteFromCell0CheckQuota(test.TestCase): self.start_service('conductor') self.start_service('scheduler') + self.start_service('consoleauth') # We don't actually start a compute service; this way we don't have any # compute hosts to schedule the instance to and will go into error and diff --git a/nova/tests/functional/regressions/test_bug_1689692.py b/nova/tests/functional/regressions/test_bug_1689692.py index 820d2837d11d..5b3602f221e7 100644 --- a/nova/tests/functional/regressions/test_bug_1689692.py +++ b/nova/tests/functional/regressions/test_bug_1689692.py @@ -54,6 +54,7 @@ class ServerListLimitMarkerCell0Test(test.TestCase, self.start_service('conductor') self.flags(driver='chance_scheduler', group='scheduler') self.start_service('scheduler') + self.start_service('consoleauth') # We don't start the compute service because we want NoValidHost so # all of the instances go into ERROR state and get put into cell0. 
self.useFixture(cast_as_call.CastAsCall(self)) diff --git a/nova/tests/unit/compute/test_compute_api.py b/nova/tests/unit/compute/test_compute_api.py index f403d082be67..4c04aac720ab 100644 --- a/nova/tests/unit/compute/test_compute_api.py +++ b/nova/tests/unit/compute/test_compute_api.py @@ -952,7 +952,7 @@ class _ComputeAPIUnitTestMixIn(object): if self.cell_type != 'api': if inst.vm_state == vm_states.RESIZED: self._test_delete_resized_part(inst) - if inst.vm_state != vm_states.SHELVED_OFFLOADED: + if inst.host is not None: self.context.elevated().AndReturn(self.context) objects.Service.get_by_compute_host(self.context, inst.host).AndReturn(objects.Service()) @@ -960,9 +960,7 @@ class _ComputeAPIUnitTestMixIn(object): mox.IsA(objects.Service)).AndReturn( inst.host != 'down-host') - if (inst.host == 'down-host' or - inst.vm_state == vm_states.SHELVED_OFFLOADED): - + if inst.host == 'down-host' or inst.host is None: self._test_downed_host_part(inst, updates, delete_time, delete_type) cast = False @@ -1052,6 +1050,76 @@ class _ComputeAPIUnitTestMixIn(object): system_metadata=fake_sys_meta) self._test_delete('force_delete', vm_state=vm_state) + @mock.patch('nova.compute.api.API._delete_while_booting', + return_value=False) + @mock.patch('nova.compute.api.API._lookup_instance') + @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.utils.notify_about_instance_usage') + @mock.patch('nova.objects.Service.get_by_compute_host') + @mock.patch('nova.compute.api.API._local_delete') + def test_delete_error_state_with_no_host( + self, mock_local_delete, mock_service_get, _mock_notify, + _mock_save, mock_bdm_get, mock_lookup, _mock_del_booting): + # Instance in error state with no host should be a local delete + # for non API cells + inst = self._create_instance_obj(params=dict(vm_state=vm_states.ERROR, + host=None)) + mock_lookup.return_value = None, inst + with 
mock.patch.object(self.compute_api.compute_rpcapi, + 'terminate_instance') as mock_terminate: + self.compute_api.delete(self.context, inst) + if self.cell_type == 'api': + mock_terminate.assert_called_once_with( + self.context, inst, mock_bdm_get.return_value, + delete_type='delete') + mock_local_delete.assert_not_called() + else: + mock_local_delete.assert_called_once_with( + self.context, inst, mock_bdm_get.return_value, + 'delete', self.compute_api._do_delete) + mock_terminate.assert_not_called() + mock_service_get.assert_not_called() + + @mock.patch('nova.compute.api.API._delete_while_booting', + return_value=False) + @mock.patch('nova.compute.api.API._lookup_instance') + @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.utils.notify_about_instance_usage') + @mock.patch('nova.objects.Service.get_by_compute_host') + @mock.patch('nova.context.RequestContext.elevated') + @mock.patch('nova.servicegroup.api.API.service_is_up', return_value=True) + @mock.patch('nova.compute.api.API._record_action_start') + @mock.patch('nova.compute.api.API._local_delete') + def test_delete_error_state_with_host_set( + self, mock_local_delete, _mock_record, mock_service_up, + mock_elevated, mock_service_get, _mock_notify, _mock_save, + mock_bdm_get, mock_lookup, _mock_del_booting): + # Instance in error state with host set should be a non-local delete + # for non API cells if the service is up + inst = self._create_instance_obj(params=dict(vm_state=vm_states.ERROR, + host='fake-host')) + mock_lookup.return_value = inst + with mock.patch.object(self.compute_api.compute_rpcapi, + 'terminate_instance') as mock_terminate: + self.compute_api.delete(self.context, inst) + if self.cell_type == 'api': + mock_terminate.assert_called_once_with( + self.context, inst, mock_bdm_get.return_value, + delete_type='delete') + mock_local_delete.assert_not_called() + mock_service_get.assert_not_called() + else: + 
mock_service_get.assert_called_once_with( + mock_elevated.return_value, 'fake-host') + mock_service_up.assert_called_once_with( + mock_service_get.return_value) + mock_terminate.assert_called_once_with( + self.context, inst, mock_bdm_get.return_value, + delete_type='delete') + mock_local_delete.assert_not_called() + def test_delete_fast_if_host_not_set(self): self.useFixture(fixtures.AllServicesCurrent()) inst = self._create_instance_obj() @@ -1257,6 +1325,52 @@ class _ComputeAPIUnitTestMixIn(object): self.assertIsNone( self.compute_api._get_stashed_volume_connector(bdm, inst)) + @mock.patch.object(objects.BlockDeviceMapping, 'destroy') + def test_local_cleanup_bdm_volumes_stashed_connector_host_none( + self, mock_destroy): + """Tests that we call volume_api.terminate_connection when we found + a stashed connector in the bdm.connection_info dict. + + This tests the case where: + + 1) the instance host is None + 2) the instance vm_state is one where we expect host to be None + + We allow a mismatch of the host in this situation if the instance is + in a state where we expect its host to have been set to None, such + as ERROR or SHELVED_OFFLOADED. 
+ """ + params = dict(host=None, vm_state=vm_states.ERROR) + inst = self._create_instance_obj(params=params) + conn_info = {'connector': {'host': 'orig-host'}} + vol_bdm = objects.BlockDeviceMapping(self.context, id=1, + instance_uuid=inst.uuid, + volume_id=uuids.volume_id, + source_type='volume', + destination_type='volume', + delete_on_termination=True, + connection_info=jsonutils.dumps( + conn_info), + attachment_id=None) + bdms = objects.BlockDeviceMappingList(objects=[vol_bdm]) + + @mock.patch.object(self.compute_api.volume_api, 'terminate_connection') + @mock.patch.object(self.compute_api.volume_api, 'detach') + @mock.patch.object(self.compute_api.volume_api, 'delete') + @mock.patch.object(self.context, 'elevated', return_value=self.context) + def do_test(self, mock_elevated, mock_delete, + mock_detach, mock_terminate): + self.compute_api._local_cleanup_bdm_volumes( + bdms, inst, self.context) + mock_terminate.assert_called_once_with( + self.context, uuids.volume_id, conn_info['connector']) + mock_detach.assert_called_once_with( + self.context, uuids.volume_id, inst.uuid) + mock_delete.assert_called_once_with(self.context, uuids.volume_id) + mock_destroy.assert_called_once_with() + + do_test(self) + def test_local_delete_without_info_cache(self): inst = self._create_instance_obj() @@ -5242,6 +5356,57 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase): network_id=None, port_id=None, requested_ip=None, tag='foo') + @mock.patch('nova.compute.api.API._delete_while_booting', + return_value=False) + @mock.patch('nova.compute.api.API._lookup_instance') + @mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid') + @mock.patch('nova.context.RequestContext.elevated') + @mock.patch.object(objects.Instance, 'save') + @mock.patch.object(compute_utils, 'notify_about_instance_usage') + @mock.patch.object(objects.BlockDeviceMapping, 'destroy') + @mock.patch.object(objects.Instance, 'destroy') + def 
_test_delete_volume_backed_instance( + self, vm_state, mock_instance_destroy, bdm_destroy, + notify_about_instance_usage, mock_save, mock_elevated, + bdm_get_by_instance_uuid, mock_lookup, _mock_del_booting): + volume_id = uuidutils.generate_uuid() + conn_info = {'connector': {'host': 'orig-host'}} + bdms = [objects.BlockDeviceMapping( + **fake_block_device.FakeDbBlockDeviceDict( + {'id': 42, 'volume_id': volume_id, + 'source_type': 'volume', 'destination_type': 'volume', + 'delete_on_termination': False, + 'connection_info': jsonutils.dumps(conn_info)}))] + + bdm_get_by_instance_uuid.return_value = bdms + mock_elevated.return_value = self.context + + params = {'host': None, 'vm_state': vm_state} + inst = self._create_instance_obj(params=params) + mock_lookup.return_value = None, inst + connector = conn_info['connector'] + + with mock.patch.object(self.compute_api.network_api, + 'deallocate_for_instance') as mock_deallocate, \ + mock.patch.object(self.compute_api.volume_api, + 'terminate_connection') as mock_terminate_conn, \ + mock.patch.object(self.compute_api.volume_api, + 'detach') as mock_detach: + self.compute_api.delete(self.context, inst) + + mock_deallocate.assert_called_once_with(self.context, inst) + mock_detach.assert_called_once_with(self.context, volume_id, + inst.uuid) + mock_terminate_conn.assert_called_once_with(self.context, + volume_id, connector) + bdm_destroy.assert_called_once_with() + + def test_delete_volume_backed_instance_in_error(self): + self._test_delete_volume_backed_instance(vm_states.ERROR) + + def test_delete_volume_backed_instance_in_shelved_offloaded(self): + self._test_delete_volume_backed_instance(vm_states.SHELVED_OFFLOADED) + class ComputeAPIAPICellUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):