Detach volume after deleting instance with no host

If an instance is booted from a volume, shelved, and goes into an error
state due to some reason, the volume from which instance is booted
remains even the instance is deleted because instance has no host
associated with it.

Called _local_delete() to detach volume and destroy bdm if instance is
in shelved_offloaded state or has no host associated with it. This will
cleanup both volumes and the networks.

Note:
Ankit had submitted same patch [1] earlier which was reverted [2] due
to a race condition on jenkins if an instance is deleted when it is in
building state.  The patch was then rebumitted [3] fixing the
the failure of race condition by reverting the ObjectActionError
exception handling in _delete.  This patch was later re-reverted [4]
due to continued jenkins race conditions.

The patch [5] intended to avoid the jenkins race condition by leaving
the flow for instances in the BUILDING state unchanged and only calling
_local_delete() on instances in the shelved_offloaded or error states
when the instance has no host associated with it. It however also had
to be reverted [6] because of yet another race condition.

This version takes a more minimal approach of adding the ERROR state
to the logic for doing a local delete plus cleanup of resources on
a compute host. Comments have also been added to the existing code
to explain more about the different flows.

[1] Ic630ae7d026a9697afec46ac9ea40aea0f5b5ffb
[2] Id4e405e7579530ed1c1f22ccc972d45b6d185f41
[3] Ic107d8edc7ee7a4ebb04eac58ef0cdbf506d6173
[4] Ibcbe35b5d329b183c4d0e8233e8ada26ebc512c2
[5] I928a397c75b857e94bf5c002e50ec43a2bed9848
[6] I6b9b886e0d6f2ec86141c048fb50969bccf5cb30

Co-Authored-By: Ankit Agrawal <ankit11.agrawal@nttdata.com>
Co-Authored-By: Samuel Matzek <smatzek@us.ibm.com>
Co-Authored-By: melanie witt <melwittt@gmail.com>

Closes-Bug: 1404867
Closes-Bug: 1408527

Change-Id: I2192ef513a2cd15d21e9d5d5fe22c5a5fbae0941
This commit is contained in:
ankitagrawal 2015-09-23 03:58:19 -07:00 committed by melanie witt
parent 9464c2ac1b
commit 5ce74fa06c
2 changed files with 94 additions and 8 deletions

View File

@ -1588,9 +1588,12 @@ class API(base.Base):
cb(context, instance, bdms, reservations=None)
quotas.commit()
return
shelved_offloaded = (instance.vm_state
== vm_states.SHELVED_OFFLOADED)
if not instance.host and not shelved_offloaded:
# NOTE(melwitt): We expect instances in certain states to have no
# host set but also have resources on a compute host. If this is
# the case, we want to try to clean up their resources. Otherwse,
# we can destroy the instance here and return.
expect_no_instance_host = self._expect_no_host(instance)
if not instance.host and not expect_no_instance_host:
try:
compute_utils.notify_about_instance_usage(
self.notifier, context, instance,
@ -1602,7 +1605,17 @@ class API(base.Base):
system_metadata=instance.system_metadata)
quotas.commit()
return
except exception.ObjectActionError:
except exception.ObjectActionError as ex:
# The instance's host likely changed under us as
# this instance could be building and has since been
# scheduled. Continue with attempts to delete it.
# NOTE(melwitt): Often instance.host is None after the
# refresh because the claim was aborted on the compute
# host. The service up check will raise ComputeHostNotFound
# in this case and we will do a local delete with compute
# host cleanup
LOG.debug('Refreshing instance because: %s', ex,
instance=instance)
instance.refresh()
if instance.vm_state == vm_states.RESIZED:
@ -1610,7 +1623,7 @@ class API(base.Base):
is_local_delete = True
try:
if not shelved_offloaded:
if not expect_no_instance_host:
service = objects.Service.get_by_compute_host(
context.elevated(), instance.host)
is_local_delete = not self.servicegroup_api.service_is_up(
@ -1637,6 +1650,9 @@ class API(base.Base):
cb(context, instance, bdms,
reservations=quotas.reservations)
except exception.ComputeHostNotFound:
# NOTE(melwitt): We expect this if instance.host has been
# set to None by the compute host during a claim abort
# and we pick it up in the instance.refresh()
pass
if is_local_delete:
@ -1655,6 +1671,13 @@ class API(base.Base):
if quotas:
quotas.rollback()
def _expect_no_host(self, instance):
# NOTE(melwitt): Instances in ERROR state have their host and node
# set to None as part of exception handling.
if instance.vm_state in (vm_states.SHELVED_OFFLOADED, vm_states.ERROR):
return True
return False
def _confirm_resize_on_deleting(self, context, instance):
# If in the middle of a resize, use confirm_resize to
# ensure the original instance is cleaned up too
@ -1748,6 +1771,22 @@ class API(base.Base):
connector)
self.volume_api.detach(elevated, bdm.volume_id,
instance.uuid)
except Exception as exc:
err_str = _LW("Ignoring volume cleanup failure due to %s")
LOG.warn(err_str % exc, instance=instance)
# This block handles the following case:
# 1. Instance scheduled to host and fails on the host.
# 2. Compute manager's cleanup calls terminate_connection
# and detach if the spawn made it that far.
# 3. Instance fails to boot on all other reschedule attempts.
# 4. Instance is left in error state with no assigned host.
# 5. Volumes in the instance's BDMs are left in the available
# state.
# When this is the case the terminate_connection and detach
# calls above will fail. We attempt the volume delete in a
# separate try-except to clean up these volume and avoid the
# storage leak.
try:
if bdm.delete_on_termination:
self.volume_api.delete(context, bdm.volume_id)
except Exception as exc:

View File

@ -1039,7 +1039,7 @@ class _ComputeAPIUnitTestMixIn(object):
self._test_delete_resized_part(inst)
if inst.vm_state == vm_states.SOFT_DELETED:
soft_delete = True
if inst.vm_state != vm_states.SHELVED_OFFLOADED:
if not self.compute_api._expect_no_host(inst):
self.context.elevated().AndReturn(self.context)
objects.Service.get_by_compute_host(self.context,
inst.host).AndReturn(objects.Service())
@ -1048,8 +1048,7 @@ class _ComputeAPIUnitTestMixIn(object):
inst.host != 'down-host')
if (inst.host == 'down-host' or
inst.vm_state == vm_states.SHELVED_OFFLOADED):
self.compute_api._expect_no_host(inst)):
self._test_downed_host_part(inst, updates, delete_time,
delete_type)
cast = False
@ -3766,6 +3765,54 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
self.assertRaises(exception.CannotResizeToSameFlavor,
self._test_resize, same_flavor=True)
@mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid')
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(quota.QUOTAS, 'reserve')
@mock.patch.object(compute_utils, 'notify_about_instance_usage')
@mock.patch.object(objects.BlockDeviceMapping, 'destroy')
@mock.patch.object(objects.Instance, 'destroy')
def _test_delete_volume_backed_instance(
self, vm_state, mock_instance_destroy, bdm_destroy,
notify_about_instance_usage, mock_reserve,
mock_save, mock_elevated, bdm_get_by_instance_uuid):
volume_id = uuidutils.generate_uuid()
bdms = [objects.BlockDeviceMapping(
**fake_block_device.FakeDbBlockDeviceDict(
{'id': 42, 'volume_id': volume_id,
'source_type': 'volume', 'destination_type': 'volume',
'delete_on_termination': False}))]
reservations = ['fake-resv']
bdm_get_by_instance_uuid.return_value = bdms
mock_reserve.return_value = reservations
mock_elevated.return_value = self.context
params = {'host': None, 'vm_state': vm_state}
inst = self._create_instance_obj(params=params)
connector = {'ip': '127.0.0.1', 'initiator': 'iqn.fake'}
with mock.patch.object(self.compute_api.network_api,
'deallocate_for_instance') as mock_deallocate, \
mock.patch.object(self.compute_api.volume_api,
'terminate_connection') as mock_terminate_conn, \
mock.patch.object(self.compute_api.volume_api,
'detach') as mock_detach:
self.compute_api.delete(self.context, inst)
mock_deallocate.assert_called_once_with(self.context, inst)
mock_detach.assert_called_once_with(self.context, volume_id,
inst.uuid)
mock_terminate_conn.assert_called_once_with(self.context,
volume_id, connector)
bdm_destroy.assert_called_once_with()
def test_delete_volume_backed_instance_in_error(self):
self._test_delete_volume_backed_instance(vm_states.ERROR)
def test_delete_volume_backed_instance_in_shelved_offloaded(self):
self._test_delete_volume_backed_instance(vm_states.SHELVED_OFFLOADED)
class ComputeAPIAPICellUnitTestCase(_ComputeAPIUnitTestMixIn,
test.NoDBTestCase):