Clean up ports and volumes when deleting ERROR instance

Usually, when instance.host = None, it means the instance was never
scheduled. However, the exception handling routine in compute manager
[1] will set instance.host = None and set instance.vm_state = ERROR
if the instance fails to build on the compute host. If that happens, we
end up with an instance with host = None and vm_state = ERROR which may
have ports and volumes still allocated.

This adds some logic around deleting the instance when it may have
ports or volumes allocated.

  1. If the instance is not in ERROR or SHELVED_OFFLOADED state, we
     expect instance.host to be set to a compute host. So, if we find
     instance.host = None in states other than ERROR or
     SHELVED_OFFLOADED, we consider the instance to have failed
     scheduling and not require ports or volumes to be freed, and we
     simply destroy the instance database record and return. This is
     the "delete while booting" scenario.

  2. If the instance is in ERROR because of a failed build or is
     SHELVED_OFFLOADED, we expect instance.host to be None even though
     there could be ports or volumes allocated. In this case, run the
     _local_delete routine to clean up ports and volumes and delete the
     instance database record.

Co-Authored-By: Ankit Agrawal <ankit11.agrawal@nttdata.com>
Co-Authored-By: Samuel Matzek <smatzek@us.ibm.com>
Co-Authored-By: melanie witt <melwittt@gmail.com>

Closes-Bug: 1404867
Closes-Bug: 1408527

Conflicts:
      nova/tests/unit/compute/test_compute_api.py

[1] https://github.com/openstack/nova/blob/55ea961/nova/compute/manager.py#L1927-L1929

Change-Id: I4dc6c8bd3bb6c135f8a698af41f5d0e026c39117
(cherry picked from commit b3f39244a3)
This commit is contained in:
ankitagrawal 2015-09-23 03:58:19 -07:00 committed by Mohammed Naser
parent 72dcdecdb2
commit 1e59ed99c7
5 changed files with 207 additions and 21 deletions

View File

@ -1788,12 +1788,12 @@ class API(base.Base):
return
cell = None
# If there is an instance.host (or the instance is shelved-offloaded),
# the instance has been scheduled and sent to a cell/compute which
# means it was pulled from the cell db.
# If there is an instance.host (or the instance is shelved-offloaded or
# in error state), the instance has been scheduled and sent to a
# cell/compute which means it was pulled from the cell db.
# Normal delete should be attempted.
if not (instance.host or
instance.vm_state == vm_states.SHELVED_OFFLOADED):
may_have_ports_or_volumes = self._may_have_ports_or_volumes(instance)
if not instance.host and not may_have_ports_or_volumes:
try:
if self._delete_while_booting(context, instance):
return
@ -1872,9 +1872,7 @@ class API(base.Base):
# which will cause a cast to the child cell.
cb(context, instance, bdms)
return
shelved_offloaded = (instance.vm_state
== vm_states.SHELVED_OFFLOADED)
if not instance.host and not shelved_offloaded:
if not instance.host and not may_have_ports_or_volumes:
try:
compute_utils.notify_about_instance_usage(
self.notifier, context, instance,
@ -1889,7 +1887,12 @@ class API(base.Base):
{'state': instance.vm_state},
instance=instance)
return
except exception.ObjectActionError:
except exception.ObjectActionError as ex:
# The instance's host likely changed under us as
# this instance could be building and has since been
# scheduled. Continue with attempts to delete it.
LOG.debug('Refreshing instance because: %s', ex,
instance=instance)
instance.refresh()
if instance.vm_state == vm_states.RESIZED:
@ -1897,7 +1900,8 @@ class API(base.Base):
is_local_delete = True
try:
if not shelved_offloaded:
# instance.host must be set in order to look up the service.
if instance.host is not None:
service = objects.Service.get_by_compute_host(
context.elevated(), instance.host)
is_local_delete = not self.servicegroup_api.service_is_up(
@ -1914,7 +1918,9 @@ class API(base.Base):
cb(context, instance, bdms)
except exception.ComputeHostNotFound:
pass
LOG.debug('Compute host %s not found during service up check, '
'going to local delete instance', instance.host,
instance=instance)
if is_local_delete:
# If instance is in shelved_offloaded state or compute node
@ -1941,6 +1947,16 @@ class API(base.Base):
# NOTE(comstud): Race condition. Instance already gone.
pass
def _may_have_ports_or_volumes(self, instance):
    """Return True if the instance may have ports or volumes allocated.

    :param instance: Instance object being deleted.
    :returns: True if the instance's vm_state is ERROR or
        SHELVED_OFFLOADED, meaning it may have been scheduled in the
        past and could still hold port/volume allocations even though
        instance.host is None; False otherwise.
    """
    # NOTE(melwitt): When an instance build fails in the compute manager,
    # the instance host and node are set to None and the vm_state is set
    # to ERROR. In this case, the instance with host = None has actually
    # been scheduled and may have ports and/or volumes allocated on the
    # compute node.
    return instance.vm_state in (vm_states.SHELVED_OFFLOADED,
                                 vm_states.ERROR)
def _confirm_resize_on_deleting(self, context, instance):
# If in the middle of a resize, use confirm_resize to
# ensure the original instance is cleaned up too
@ -1996,6 +2012,14 @@ class API(base.Base):
'the instance host %(instance_host)s.',
{'connector_host': connector.get('host'),
'instance_host': instance.host}, instance=instance)
if (instance.host is None and
self._may_have_ports_or_volumes(instance)):
LOG.debug('Allowing use of stashed volume connector with '
'instance host None because instance with '
'vm_state %(vm_state)s has been scheduled in '
'the past.', {'vm_state': instance.vm_state},
instance=instance)
return connector
def _local_cleanup_bdm_volumes(self, bdms, instance, context):
"""The method deletes the bdm records and, if a bdm is a volume, call

View File

@ -80,9 +80,4 @@ class DeleteWithReservedVolumes(integrated_helpers._IntegratedTestBase,
# The volume should no longer be reserved as the deletion of the
# server should have released all the resources.
# TODO(mnaser): Uncomment this in patch resolving the issue
# self.assertNotIn(volume_id, self.cinder.reserved_volumes)
# The volume is still reserved at this point, which it should not be.
# TODO(mnaser): Remove this in patch resolving the issue
self.assertIn(volume_id, self.cinder.reserved_volumes)
self.assertNotIn(volume_id, self.cinder.reserved_volumes)

View File

@ -59,6 +59,7 @@ class TestDeleteFromCell0CheckQuota(test.TestCase):
self.start_service('conductor')
self.start_service('scheduler')
self.start_service('consoleauth')
# We don't actually start a compute service; this way we don't have any
# compute hosts to schedule the instance to and will go into error and

View File

@ -54,6 +54,7 @@ class ServerListLimitMarkerCell0Test(test.TestCase,
self.start_service('conductor')
self.flags(driver='chance_scheduler', group='scheduler')
self.start_service('scheduler')
self.start_service('consoleauth')
# We don't start the compute service because we want NoValidHost so
# all of the instances go into ERROR state and get put into cell0.
self.useFixture(cast_as_call.CastAsCall(self))

View File

@ -952,7 +952,7 @@ class _ComputeAPIUnitTestMixIn(object):
if self.cell_type != 'api':
if inst.vm_state == vm_states.RESIZED:
self._test_delete_resized_part(inst)
if inst.vm_state != vm_states.SHELVED_OFFLOADED:
if inst.host is not None:
self.context.elevated().AndReturn(self.context)
objects.Service.get_by_compute_host(self.context,
inst.host).AndReturn(objects.Service())
@ -960,9 +960,7 @@ class _ComputeAPIUnitTestMixIn(object):
mox.IsA(objects.Service)).AndReturn(
inst.host != 'down-host')
if (inst.host == 'down-host' or
inst.vm_state == vm_states.SHELVED_OFFLOADED):
if inst.host == 'down-host' or inst.host is None:
self._test_downed_host_part(inst, updates, delete_time,
delete_type)
cast = False
@ -1052,6 +1050,76 @@ class _ComputeAPIUnitTestMixIn(object):
system_metadata=fake_sys_meta)
self._test_delete('force_delete', vm_state=vm_state)
@mock.patch('nova.compute.api.API._delete_while_booting',
            return_value=False)
@mock.patch('nova.compute.api.API._lookup_instance')
@mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
@mock.patch('nova.objects.Instance.save')
@mock.patch('nova.compute.utils.notify_about_instance_usage')
@mock.patch('nova.objects.Service.get_by_compute_host')
@mock.patch('nova.compute.api.API._local_delete')
def test_delete_error_state_with_no_host(
        self, mock_local_delete, mock_service_get, _mock_notify,
        _mock_save, mock_bdm_get, mock_lookup, _mock_del_booting):
    """Delete of an ERROR instance with host=None is a local delete.

    An instance that failed to build has vm_state=ERROR and host=None
    but may still have ports/volumes allocated, so the non-API-cell
    path must run _local_delete to clean them up rather than taking
    the fast "delete while booting" path.
    """
    # Instance in error state with no host should be a local delete
    # for non API cells
    inst = self._create_instance_obj(params=dict(vm_state=vm_states.ERROR,
                                                 host=None))
    # _lookup_instance is mocked to return a (cell, instance) tuple.
    mock_lookup.return_value = None, inst
    with mock.patch.object(self.compute_api.compute_rpcapi,
                           'terminate_instance') as mock_terminate:
        self.compute_api.delete(self.context, inst)
    if self.cell_type == 'api':
        # API cells cast the delete down to the child cell; no local
        # cleanup happens at this layer.
        mock_terminate.assert_called_once_with(
            self.context, inst, mock_bdm_get.return_value,
            delete_type='delete')
        mock_local_delete.assert_not_called()
    else:
        # ERROR + host=None means ports/volumes may exist, so the
        # local delete routine must be invoked to free them.
        mock_local_delete.assert_called_once_with(
            self.context, inst, mock_bdm_get.return_value,
            'delete', self.compute_api._do_delete)
        mock_terminate.assert_not_called()
    # With no host there is no compute service to check for liveness.
    mock_service_get.assert_not_called()
@mock.patch('nova.compute.api.API._delete_while_booting',
            return_value=False)
@mock.patch('nova.compute.api.API._lookup_instance')
@mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
@mock.patch('nova.objects.Instance.save')
@mock.patch('nova.compute.utils.notify_about_instance_usage')
@mock.patch('nova.objects.Service.get_by_compute_host')
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch('nova.servicegroup.api.API.service_is_up', return_value=True)
@mock.patch('nova.compute.api.API._record_action_start')
@mock.patch('nova.compute.api.API._local_delete')
def test_delete_error_state_with_host_set(
        self, mock_local_delete, _mock_record, mock_service_up,
        mock_elevated, mock_service_get, _mock_notify, _mock_save,
        mock_bdm_get, mock_lookup, _mock_del_booting):
    """Delete of an ERROR instance with a host is a normal delete.

    When the instance has a host and its compute service is up
    (service_is_up is mocked to True), deletion goes over RPC via
    terminate_instance instead of the local-delete cleanup path.
    """
    # Instance in error state with host set should be a non-local delete
    # for non API cells if the service is up
    inst = self._create_instance_obj(params=dict(vm_state=vm_states.ERROR,
                                                 host='fake-host'))
    # NOTE(review): the sibling no-host test sets this to a
    # (cell, instance) tuple; a bare instance works here presumably
    # because the host is set and the lookup result is never unpacked
    # on this path — confirm against API.delete.
    mock_lookup.return_value = inst
    with mock.patch.object(self.compute_api.compute_rpcapi,
                           'terminate_instance') as mock_terminate:
        self.compute_api.delete(self.context, inst)
    if self.cell_type == 'api':
        # API cells cast down to the child cell without checking the
        # compute service themselves.
        mock_terminate.assert_called_once_with(
            self.context, inst, mock_bdm_get.return_value,
            delete_type='delete')
        mock_local_delete.assert_not_called()
        mock_service_get.assert_not_called()
    else:
        # The compute service for the instance's host is looked up and
        # found to be up, so a normal (non-local) delete is performed.
        mock_service_get.assert_called_once_with(
            mock_elevated.return_value, 'fake-host')
        mock_service_up.assert_called_once_with(
            mock_service_get.return_value)
        mock_terminate.assert_called_once_with(
            self.context, inst, mock_bdm_get.return_value,
            delete_type='delete')
        mock_local_delete.assert_not_called()
def test_delete_fast_if_host_not_set(self):
self.useFixture(fixtures.AllServicesCurrent())
inst = self._create_instance_obj()
@ -1257,6 +1325,52 @@ class _ComputeAPIUnitTestMixIn(object):
self.assertIsNone(
self.compute_api._get_stashed_volume_connector(bdm, inst))
@mock.patch.object(objects.BlockDeviceMapping, 'destroy')
def test_local_cleanup_bdm_volumes_stashed_connector_host_none(
        self, mock_destroy):
    """Tests that we call volume_api.terminate_connection when we found
    a stashed connector in the bdm.connection_info dict.

    This tests the case where:

    1) the instance host is None
    2) the instance vm_state is one where we expect host to be None

    We allow a mismatch of the host in this situation if the instance is
    in a state where we expect its host to have been set to None, such
    as ERROR or SHELVED_OFFLOADED.
    """
    # ERROR with host=None: the state where a stashed connector whose
    # host differs from instance.host must still be honored.
    params = dict(host=None, vm_state=vm_states.ERROR)
    inst = self._create_instance_obj(params=params)
    # The connector host intentionally differs from inst.host (None).
    conn_info = {'connector': {'host': 'orig-host'}}
    vol_bdm = objects.BlockDeviceMapping(self.context, id=1,
                                         instance_uuid=inst.uuid,
                                         volume_id=uuids.volume_id,
                                         source_type='volume',
                                         destination_type='volume',
                                         delete_on_termination=True,
                                         connection_info=jsonutils.dumps(
                                             conn_info),
                                         attachment_id=None)
    bdms = objects.BlockDeviceMappingList(objects=[vol_bdm])

    @mock.patch.object(self.compute_api.volume_api, 'terminate_connection')
    @mock.patch.object(self.compute_api.volume_api, 'detach')
    @mock.patch.object(self.compute_api.volume_api, 'delete')
    @mock.patch.object(self.context, 'elevated', return_value=self.context)
    def do_test(self, mock_elevated, mock_delete,
                mock_detach, mock_terminate):
        self.compute_api._local_cleanup_bdm_volumes(
            bdms, inst, self.context)
        # The stashed connector must be used despite the host mismatch.
        mock_terminate.assert_called_once_with(
            self.context, uuids.volume_id, conn_info['connector'])
        mock_detach.assert_called_once_with(
            self.context, uuids.volume_id, inst.uuid)
        # delete_on_termination=True, so the volume itself is deleted.
        mock_delete.assert_called_once_with(self.context, uuids.volume_id)
        mock_destroy.assert_called_once_with()
    do_test(self)
def test_local_delete_without_info_cache(self):
inst = self._create_instance_obj()
@ -5242,6 +5356,57 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
network_id=None, port_id=None,
requested_ip=None, tag='foo')
@mock.patch('nova.compute.api.API._delete_while_booting',
            return_value=False)
@mock.patch('nova.compute.api.API._lookup_instance')
@mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid')
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(compute_utils, 'notify_about_instance_usage')
@mock.patch.object(objects.BlockDeviceMapping, 'destroy')
@mock.patch.object(objects.Instance, 'destroy')
def _test_delete_volume_backed_instance(
        self, vm_state, mock_instance_destroy, bdm_destroy,
        notify_about_instance_usage, mock_save, mock_elevated,
        bdm_get_by_instance_uuid, mock_lookup, _mock_del_booting):
    """Helper: delete a volume-backed instance with host=None.

    Verifies that for an instance in the given vm_state (one where
    host=None is expected, e.g. ERROR or SHELVED_OFFLOADED), deleting
    it deallocates networking and terminates/detaches its volume
    using the connector stashed in the BDM connection_info.
    """
    volume_id = uuidutils.generate_uuid()
    conn_info = {'connector': {'host': 'orig-host'}}
    # Volume BDM with delete_on_termination=False: the volume must be
    # detached but not deleted.
    bdms = [objects.BlockDeviceMapping(
            **fake_block_device.FakeDbBlockDeviceDict(
            {'id': 42, 'volume_id': volume_id,
             'source_type': 'volume', 'destination_type': 'volume',
             'delete_on_termination': False,
             'connection_info': jsonutils.dumps(conn_info)}))]

    bdm_get_by_instance_uuid.return_value = bdms
    mock_elevated.return_value = self.context

    params = {'host': None, 'vm_state': vm_state}
    inst = self._create_instance_obj(params=params)
    # _lookup_instance is mocked to return a (cell, instance) tuple.
    mock_lookup.return_value = None, inst
    connector = conn_info['connector']

    with mock.patch.object(self.compute_api.network_api,
                           'deallocate_for_instance') as mock_deallocate, \
        mock.patch.object(self.compute_api.volume_api,
                          'terminate_connection') as mock_terminate_conn, \
        mock.patch.object(self.compute_api.volume_api,
                          'detach') as mock_detach:
        self.compute_api.delete(self.context, inst)

    # Ports and volumes must be cleaned up even though host is None.
    mock_deallocate.assert_called_once_with(self.context, inst)
    mock_detach.assert_called_once_with(self.context, volume_id,
                                        inst.uuid)
    mock_terminate_conn.assert_called_once_with(self.context,
                                                volume_id, connector)
    bdm_destroy.assert_called_once_with()
def test_delete_volume_backed_instance_in_error(self):
    # ERROR is one of the states where host=None is expected but
    # ports/volumes may still be allocated.
    self._test_delete_volume_backed_instance(vm_states.ERROR)
def test_delete_volume_backed_instance_in_shelved_offloaded(self):
    # SHELVED_OFFLOADED instances also have host=None while still
    # holding port/volume allocations.
    self._test_delete_volume_backed_instance(vm_states.SHELVED_OFFLOADED)
class ComputeAPIAPICellUnitTestCase(_ComputeAPIUnitTestMixIn,
test.NoDBTestCase):