Detach volume after deleting instance with no host

If an instance is booted from a volume, shelved, and then goes into an
error state for some reason, the volume from which the instance was
booted remains even after the instance is deleted, because the instance
has no host associated with it.

Call _local_delete() to detach the volume and destroy the BDM if the
instance is in the shelved_offloaded state or has no host associated
with it. This cleans up both the volumes and the networks.
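
In outline, the per-BDM volume cleanup performed during a local delete
looks as follows. This is a simplified sketch assembled from the
api.py hunks below, with logging and error handling omitted; the fake
connector is the one Nova substitutes when no host is available to
build a real one:

    # Sketch only: per-BDM teardown during _local_delete.
    connector = {'ip': '127.0.0.1', 'initiator': 'iqn.fake'}
    for bdm in bdms:
        if bdm.is_volume:
            self.volume_api.terminate_connection(context, bdm.volume_id,
                                                 connector)
            self.volume_api.detach(context.elevated(), bdm.volume_id,
                                   instance.uuid)
            if bdm.delete_on_termination:
                # Volumes stranded in the 'available' state by a failed
                # boot are deleted here to avoid a storage leak.
                self.volume_api.delete(context, bdm.volume_id)
        bdm.destroy()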

Note:
Ankit had submitted the same patch [1] earlier, which was reverted [2]
due to a race condition on Jenkins when an instance is deleted while it
is in the building state. The patch was then resubmitted [3], fixing
the race condition failure by reverting the ObjectActionError
exception handling in _delete. That patch was later re-reverted [4]
due to continued Jenkins race conditions.

The current patch avoids the Jenkins race condition by leaving the flow
for instances in the BUILDING state unchanged and only calling
_local_delete() on instances in the shelved_offloaded or error states
when the instance has no host associated with it.  This addresses the
concerns of the referenced bugs.
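
Condensed, the new decision logic reads roughly as follows (names
match the _is_local_delete helper added in the diff below):

    def _is_local_delete(self, context, instance):
        # Shelved-offloaded instances have no host; delete locally.
        if instance.vm_state == vm_states.SHELVED_OFFLOADED:
            return True
        if not instance.host:
            # No host yet: only ERROR instances are deleted locally,
            # which leaves the BUILDING flow unchanged.
            return instance.vm_state == vm_states.ERROR
        try:
            # Host assigned: delete locally only if its compute
            # service is down or its host record cannot be found.
            service = objects.Service.get_by_compute_host(
                context.elevated(), instance.host)
            return not self.servicegroup_api.service_is_up(service)
        except exception.ComputeHostNotFound:
            return True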

[1] Ic630ae7d026a9697afec46ac9ea40aea0f5b5ffb
[2] Id4e405e7579530ed1c1f22ccc972d45b6d185f41
[3] Ic107d8edc7ee7a4ebb04eac58ef0cdbf506d6173
[4] Ibcbe35b5d329b183c4d0e8233e8ada26ebc512c2

Co-Authored-By: Ankit Agrawal <ankit11.agrawal@nttdata.com>

Closes-Bug: 1404867
Closes-Bug: 1408527

Change-Id: I928a397c75b857e94bf5c002e50ec43a2bed9848

@@ -1581,9 +1581,8 @@ class API(base.Base):
cb(context, instance, bdms, reservations=None)
quotas.commit()
return
shelved_offloaded = (instance.vm_state
== vm_states.SHELVED_OFFLOADED)
if not instance.host and not shelved_offloaded:
local_delete = self._is_local_delete(context, instance)
if not instance.host and not local_delete:
try:
compute_utils.notify_about_instance_usage(
self.notifier, context, instance,
@@ -1596,48 +1595,46 @@ class API(base.Base):
quotas.commit()
return
except exception.ObjectActionError:
# The instance's host likely changed under us as
# this instance could be building and has since been
# scheduled. Continue with attempts to delete it.
instance.refresh()
if instance.vm_state == vm_states.RESIZED:
self._confirm_resize_on_deleting(context, instance)
is_local_delete = True
try:
if not shelved_offloaded:
service = objects.Service.get_by_compute_host(
context.elevated(), instance.host)
is_local_delete = not self.servicegroup_api.service_is_up(
service)
if not is_local_delete:
if original_task_state in (task_states.DELETING,
task_states.SOFT_DELETING):
LOG.info(_LI('Instance is already in deleting state, '
'ignoring this request'),
instance=instance)
quotas.rollback()
return
self._record_action_start(context, instance,
instance_actions.DELETE)
if local_delete:
try:
self._local_delete(context, instance, bdms, delete_type,
cb)
quotas.commit()
return
except exception.ObjectActionError:
# The instance may have been in SHELVED_OFFLOADED
# state and may have been unshelved.
# Continue with attempts to delete it.
instance.refresh()
# NOTE(snikitin): If instance's vm_state is 'soft-delete',
# we should not count reservations here, because instance
# in soft-delete vm_state have already had quotas
# decremented. More details:
# https://bugs.launchpad.net/nova/+bug/1333145
if instance.vm_state == vm_states.SOFT_DELETED:
quotas.rollback()
if original_task_state in (task_states.DELETING,
task_states.SOFT_DELETING):
LOG.info(_LI('Instance is already in deleting state, '
'ignoring this request'),
instance=instance)
quotas.rollback()
return
self._record_action_start(context, instance,
instance_actions.DELETE)
cb(context, instance, bdms,
reservations=quotas.reservations)
except exception.ComputeHostNotFound:
pass
# NOTE(snikitin): If instance's vm_state is 'soft-delete',
# we should not count reservations here, because instance
# in soft-delete vm_state have already had quotas
# decremented. More details:
# https://bugs.launchpad.net/nova/+bug/1333145
if instance.vm_state == vm_states.SOFT_DELETED:
quotas.rollback()
if is_local_delete:
# If instance is in shelved_offloaded state or compute node
# isn't up, delete instance from db and clean bdms info and
# network info
self._local_delete(context, instance, bdms, delete_type, cb)
quotas.commit()
cb(context, instance, bdms,
reservations=quotas.reservations)
except exception.InstanceNotFound:
# NOTE(comstud): Race condition. Instance already gone.
@@ -1648,6 +1645,23 @@ class API(base.Base):
if quotas:
quotas.rollback()
def _is_local_delete(self, context, instance):
if instance.vm_state == vm_states.SHELVED_OFFLOADED:
return True
if not instance.host:
if instance.vm_state == vm_states.ERROR:
return True
return False
else:
try:
# Check if compute service is up.
service = objects.Service.get_by_compute_host(
context.elevated(), instance.host)
return not self.servicegroup_api.service_is_up(service)
except exception.ComputeHostNotFound:
pass
return True
def _confirm_resize_on_deleting(self, context, instance):
# If in the middle of a resize, use confirm_resize to
# ensure the original instance is cleaned up too
@@ -1741,6 +1755,22 @@ class API(base.Base):
connector)
self.volume_api.detach(elevated, bdm.volume_id,
instance.uuid)
except Exception as exc:
err_str = _LW("Ignoring volume cleanup failure due to %s")
LOG.warn(err_str % exc, instance=instance)
# This block handles the following case:
# 1. Instance scheduled to host and fails on the host.
# 2. Compute manager's cleanup calls terminate_connection
# and detach if the spawn made it that far.
# 3. Instance fails to boot on all other reschedule attempts.
# 4. Instance is left in error state with no assigned host.
# 5. Volumes in the instance's BDMs are left in the available
# state.
# When this is the case the terminate_connection and detach
# calls above will fail. We attempt the volume delete in a
# separate try-except to clean up these volumes and avoid the
# storage leak.
try:
if bdm.delete_on_termination:
self.volume_api.delete(context, bdm.volume_id)
except Exception as exc:


@@ -3711,6 +3711,101 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
self.assertRaises(exception.CannotResizeToSameFlavor,
self._test_resize, same_flavor=True)
@mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid')
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(quota.QUOTAS, 'reserve')
@mock.patch.object(compute_utils, 'notify_about_instance_usage')
@mock.patch.object(objects.BlockDeviceMapping, 'destroy')
@mock.patch.object(objects.Instance, 'destroy')
def _test_delete_volume_backed_instance(
self, vm_state, mock_instance_destroy, bdm_destroy,
notify_about_instance_usage, mock_reserve,
mock_save, mock_elevated, bdm_get_by_instance_uuid):
volume_id = uuidutils.generate_uuid()
bdms = [objects.BlockDeviceMapping(
**fake_block_device.FakeDbBlockDeviceDict(
{'id': 42, 'volume_id': volume_id,
'source_type': 'volume', 'destination_type': 'volume',
'delete_on_termination': False}))]
reservations = ['fake-resv']
bdm_get_by_instance_uuid.return_value = bdms
mock_reserve.return_value = reservations
mock_elevated.return_value = self.context
params = {'host': None, 'vm_state': vm_state}
inst = self._create_instance_obj(params=params)
connector = {'ip': '127.0.0.1', 'initiator': 'iqn.fake'}
with mock.patch.object(self.compute_api.network_api,
'deallocate_for_instance') as mock_deallocate, \
mock.patch.object(self.compute_api.volume_api,
'terminate_connection') as mock_terminate_conn, \
mock.patch.object(self.compute_api.volume_api,
'detach') as mock_detach:
self.compute_api.delete(self.context, inst)
mock_deallocate.assert_called_once_with(self.context, inst)
mock_detach.assert_called_once_with(self.context, volume_id,
inst.uuid)
mock_terminate_conn.assert_called_once_with(self.context,
volume_id, connector)
bdm_destroy.assert_called_once_with()
def test_delete_volume_backed_instance_in_error(self):
self._test_delete_volume_backed_instance(vm_states.ERROR)
def test_delete_volume_backed_instance_in_shelved_offloaded(self):
self._test_delete_volume_backed_instance(vm_states.SHELVED_OFFLOADED)
def test_is_local_delete(self):
updates = {'host': None}
fake_inst = fake_instance.fake_instance_obj(self.context, **updates)
# Test the shelved_offloaded VM state.
fake_inst.vm_state = vm_states.SHELVED_OFFLOADED
self.assertTrue(self.compute_api._is_local_delete(self.context,
fake_inst))
# Test instance in error state with no host
fake_inst.vm_state = vm_states.ERROR
self.assertTrue(self.compute_api._is_local_delete(self.context,
fake_inst))
# Test instance in building state with no host
fake_inst.vm_state = vm_states.BUILDING
self.assertFalse(self.compute_api._is_local_delete(self.context,
fake_inst))
# Set host for tests where the instance is on a host
fake_inst.host = 'fake_host'
# Test instance with a host that is down
with mock.patch.object(self.context, 'elevated'), \
mock.patch('nova.objects.Service.get_by_compute_host'), \
mock.patch.object(self.compute_api.servicegroup_api,
'service_is_up') as mock_service_is_up:
mock_service_is_up.return_value = False
self.assertTrue(self.compute_api._is_local_delete(self.context,
fake_inst))
# Test instance with a host that is up
with mock.patch.object(self.context, 'elevated'), \
mock.patch('nova.objects.Service.get_by_compute_host'), \
mock.patch.object(self.compute_api.servicegroup_api,
'service_is_up') as mock_service_is_up:
mock_service_is_up.return_value = True
self.assertFalse(self.compute_api._is_local_delete(self.context,
fake_inst))
# Test instance with a host that is not found
with mock.patch.object(self.context, 'elevated'), \
mock.patch('nova.objects.Service.get_by_compute_host') \
as mock_get_host:
mock_get_host.side_effect = exception.ComputeHostNotFound('host')
self.assertTrue(self.compute_api._is_local_delete(self.context,
fake_inst))
class ComputeAPIAPICellUnitTestCase(_ComputeAPIUnitTestMixIn,
test.NoDBTestCase):