consumer gen: more tests for delete allocation cases

A user confirming a migration, as well as a successful live migration,
also triggers the delete allocation code path. This patch adds test
coverage for these code paths.
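
The new tests force the failure by mocking the placement API client so
that the allocation update is rejected with a consumer generation
conflict. A minimal sketch of that setup, abridged from the test diff
below (fake_requests.FakeResponse is a helper in nova's test tree; the
409 payload mirrors placement's placement.concurrent_update error):

import mock
from oslo_serialization import jsonutils

from nova.tests.unit import fake_requests

# Placement rejects the PUT that would remove the migration-held
# allocation with a consumer generation conflict.
rsp = fake_requests.FakeResponse(
    409,
    jsonutils.dumps(
        {'errors': [
            {'code': 'placement.concurrent_update',
             'detail': 'consumer generation conflict'}]}))

with mock.patch('keystoneauth1.adapter.Adapter.put',
                autospec=True) as mock_put:
    mock_put.return_value = rsp
    # ... POST confirmResize or os-migrateLive here; the report client
    # raises AllocationDeleteFailed and the instance ends up in ERROR.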

If the deletion of the source allocation of a confirmed migration fails,
nova puts the instance into ERROR state. In this state the instance still
has two allocations, and deleting the instance only deletes the one held
by the instance_uuid. This patch logs an ERROR describing that in this
case the allocation held by the migration_uuid is leaked. The same is true
for a live migration that fails to delete the allocation on the source host.
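
For illustration only, a sketch of how the leaked consumer could be
observed from outside nova through the placement REST API; the endpoint,
token, microversion and uuids below are placeholders, not values from
this patch:

import requests

PLACEMENT = 'http://placement.example.org'  # placeholder endpoint
HEADERS = {'X-Auth-Token': '<admin-token>',
           'OpenStack-API-Version': 'placement 1.28'}

# Both consumers hold allocations while the instance is in ERROR state.
for consumer in ('<instance uuid>', '<migration uuid>'):
    allocs = requests.get(
        '%s/allocations/%s' % (PLACEMENT, consumer),
        headers=HEADERS).json()['allocations']
    # After DELETE /servers/{id} only the migration consumer still
    # reports resources here; that allocation is the leak.
    print(consumer, allocs)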

As this would make every caller of _delete_allocation_after_move log the
same error for the AllocationDeleteFailed exception, this patch moves that
logging into _delete_allocation_after_move.
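
A runnable toy sketch of that refactoring pattern (names here are
illustrative stand-ins, not nova's real signatures): the shared helper
logs the error once and re-raises, so the callers no longer repeat the
message:

import logging

logging.basicConfig(level=logging.ERROR)
LOG = logging.getLogger(__name__)


class AllocationDeleteFailed(Exception):
    pass


def delete_allocation_for_consumer(consumer_uuid):
    # Stand-in for the placement report client call; it always fails
    # here to demonstrate the error path.
    raise AllocationDeleteFailed(consumer_uuid)


def _delete_allocation_after_move(instance_uuid, migration_uuid):
    try:
        delete_allocation_for_consumer(migration_uuid)
    except AllocationDeleteFailed:
        # Logged once here instead of in every caller.
        LOG.error('Deleting allocation in placement for migration '
                  '%(migration_uuid)s failed. The instance '
                  '%(instance_uuid)s will be put to ERROR state but '
                  'the allocation held by the migration is leaked.',
                  {'instance_uuid': instance_uuid,
                   'migration_uuid': migration_uuid})
        raise

Calling _delete_allocation_after_move('inst-uuid', 'mig-uuid') then logs
the consolidated ERROR and re-raises, matching the behavior the hunks
below add.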

Blueprint: use-nested-allocation-candidates
Change-Id: I99427a52676826990d2a2ffc82cf30ad945b939c
Balazs Gibizer 2018-09-13 10:07:42 -06:00
parent 53ca096750
commit 3a43a931d4
3 changed files with 179 additions and 18 deletions

nova/compute/manager.py

@@ -3893,7 +3893,8 @@ class ComputeManager(manager.Manager):
             rt = self._get_resource_tracker()
             rt.drop_move_claim(context, instance, migration.source_node,
                                old_instance_type, prefix='old_')
-            self._delete_allocation_after_move(context, instance, migration,
+            self._delete_allocation_after_move(context, instance,
+                                               migration,
                                                old_instance_type,
                                                migration.source_node)
             instance.drop_migration_context()
@@ -3933,18 +3934,28 @@ class ComputeManager(manager.Manager):
         if migration.source_node == nodename:
             if migration.status in ('confirmed', 'completed'):
-                # NOTE(danms): We're finishing on the source node, so try to
-                # delete the allocation based on the migration uuid
-                deleted = self.reportclient.delete_allocation_for_instance(
-                    context, migration.uuid)
-                if deleted:
-                    LOG.info(_('Source node %(node)s confirmed migration '
-                               '%(mig)s; deleted migration-based '
-                               'allocation'),
-                             {'node': nodename, 'mig': migration.uuid})
-                    # NOTE(danms): We succeeded, which means we do not
-                    # need to do the complex double allocation dance
-                return
+                try:
+                    # NOTE(danms): We're finishing on the source node, so try
+                    # to delete the allocation based on the migration uuid
+                    deleted = self.reportclient.delete_allocation_for_instance(
+                        context, migration.uuid)
+                    if deleted:
+                        LOG.info(_('Source node %(node)s confirmed migration '
+                                   '%(mig)s; deleted migration-based '
+                                   'allocation'),
+                                 {'node': nodename, 'mig': migration.uuid})
+                        # NOTE(danms): We succeeded, which means we do not
+                        # need to do the complex double allocation dance
+                    return
+                except exception.AllocationDeleteFailed:
+                    LOG.error('Deleting allocation in placement for migration '
+                              '%(migration_uuid)s failed. The instance '
+                              '%(instance_uuid)s will be put to ERROR state '
+                              'but the allocation held by the migration is '
+                              'leaked.',
+                              {'instance_uuid': instance.uuid,
+                               'migration_uuid': migration.uuid})
+                    raise
             else:
                 # We're reverting (or failed) on the source, so we
                 # need to check if our migration holds a claim and if

nova/tests/functional/test_servers.py

@@ -17,6 +17,7 @@ import datetime
 import time
 import zlib
 
+from keystoneauth1 import adapter
 import mock
 from oslo_log import log as logging
 from oslo_serialization import base64
@@ -4498,6 +4499,59 @@ class ConsumerGenerationConflictTest(
         self._delete_and_check_allocations(server)
 
+    def test_confirm_migrate_delete_alloc_on_source_fails(self):
+        source_hostname = self.compute1.host
+        dest_hostname = self.compute2.host
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
+
+        server = self._boot_and_check_allocations(self.flavor, source_hostname)
+        self._migrate_and_check_allocations(
+            server, self.flavor, source_rp_uuid, dest_rp_uuid)
+
+        rsp = fake_requests.FakeResponse(
+            409,
+            jsonutils.dumps(
+                {'errors': [
+                    {'code': 'placement.concurrent_update',
+                     'detail': 'consumer generation conflict'}]}))
+        with mock.patch('keystoneauth1.adapter.Adapter.put',
+                        autospec=True) as mock_put:
+            mock_put.return_value = rsp
+
+            post = {'confirmResize': None}
+            self.api.post_server_action(
+                server['id'], post, check_response_status=[204])
+            server = self._wait_for_state_change(self.api, server, 'ERROR')
+            self.assertIn('Failed to delete allocations',
+                          server['fault']['message'])
+            self.assertEqual(1, mock_put.call_count)
+
+        migrations = self.api.get_migrations()
+        self.assertEqual(1, len(migrations))
+        self.assertEqual('migration', migrations[0]['migration_type'])
+        self.assertEqual(server['id'], migrations[0]['instance_uuid'])
+        self.assertEqual(source_hostname, migrations[0]['source_compute'])
+        # NOTE(gibi): it might be better to mark the migration as error
+        self.assertEqual('confirmed', migrations[0]['status'])
+
+        # NOTE(gibi): Nova leaks the allocation held by the migration_uuid even
+        # after the instance is deleted. At least nova logs a fat ERROR.
+        self.assertIn('Deleting allocation in placement for migration %s '
+                      'failed. The instance %s will be put to ERROR state but '
+                      'the allocation held by the migration is leaked.' %
+                      (migrations[0]['uuid'], server['id']),
+                      self.stdlog.logger.output)
+
+        self.api.delete_server(server['id'])
+        self._wait_until_deleted(server)
+        fake_notifier.wait_for_versioned_notifications('instance.delete.end')
+
+        allocations = self._get_allocations_by_server_uuid(
+            migrations[0]['uuid'])
+        self.assertEqual(1, len(allocations))
+
     def test_revert_migrate_delete_dest_allocation_fails_due_to_conflict(self):
         source_hostname = self.compute1.host
         dest_hostname = self.compute2.host
@@ -4599,6 +4653,100 @@ class ConsumerGenerationConflictTest(
             migrations[0]['uuid'])
         self.assertEqual(1, len(allocations))
 
+    def test_live_migrate_drop_allocation_on_source_fails(self):
+        source_hostname = self.compute1.host
+        dest_hostname = self.compute2.host
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
+
+        server = self._boot_and_check_allocations(
+            self.flavor, source_hostname)
+
+        fake_notifier.stub_notifier(self)
+        self.addCleanup(fake_notifier.reset)
+
+        orig_put = adapter.Adapter.put
+
+        rsp = fake_requests.FakeResponse(
+            409,
+            jsonutils.dumps(
+                {'errors': [
+                    {'code': 'placement.concurrent_update',
+                     'detail': 'consumer generation conflict'}]}))
+
+        def fake_put(_self, url, *args, **kwargs):
+            migration_uuid = self.get_migration_uuid_for_instance(
+                server['id'])
+            if url == '/allocations/%s' % migration_uuid:
+                return rsp
+            else:
+                return orig_put(_self, url, *args, **kwargs)
+
+        with mock.patch('keystoneauth1.adapter.Adapter.put',
+                        autospec=True) as mock_put:
+            mock_put.side_effect = fake_put
+
+            post = {
+                'os-migrateLive': {
+                    'host': dest_hostname,
+                    'block_migration': True,
+                    'force': True,
+                }
+            }
+
+            self.api.post_server_action(server['id'], post)
+
+            # nova does the source host cleanup _after_ setting the migration
+            # to completed and sending end notifications so we have to wait
+            # here a bit.
+            time.sleep(1)
+
+            # Nova failed to clean up on the source host. This right now puts
+            # the instance to ERROR state and fails the migration.
+            server = self._wait_for_server_parameter(
+                self.api, server,
+                {'OS-EXT-SRV-ATTR:host': dest_hostname,
+                 'status': 'ERROR'})
+            self._wait_for_migration_status(server, ['error'])
+
+        fake_notifier.wait_for_versioned_notifications(
+            'instance.live_migration_post.end')
+
+        # 1 claim on destination, 1 normal delete on dest that fails,
+        self.assertEqual(2, mock_put.call_count)
+
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        # As the cleanup on the source host failed Nova leaks the allocation
+        # held by the migration.
+        self.assertFlavorMatchesAllocation(self.flavor, source_usages)
+
+        migration_uuid = self.get_migration_uuid_for_instance(server['id'])
+
+        allocations = self._get_allocations_by_server_uuid(migration_uuid)
+        self.assertEqual(1, len(allocations))
+        self.assertIn(source_rp_uuid, allocations)
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
+
+        dest_usages = self._get_provider_usages(dest_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor, dest_usages)
+
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        self.assertEqual(1, len(allocations))
+        self.assertNotIn(source_rp_uuid, allocations)
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor, dest_allocation)
+
+        # NOTE(gibi): Nova leaks the allocation held by the migration_uuid even
+        # after the instance is deleted. At least nova logs a fat ERROR.
+        self.assertIn('Deleting allocation in placement for migration %s '
+                      'failed. The instance %s will be put to ERROR state but '
+                      'the allocation held by the migration is leaked.' %
+                      (migration_uuid, server['id']),
+                      self.stdlog.logger.output)
+
+        self.api.delete_server(server['id'])
+        self._wait_until_deleted(server)
+        fake_notifier.wait_for_versioned_notifications('instance.delete.end')
+
+        allocations = self._get_allocations_by_server_uuid(migration_uuid)
+        self.assertEqual(1, len(allocations))
+
     def test_server_delete_fails_due_to_conflict(self):
         source_hostname = self.compute1.host


@@ -2,9 +2,11 @@
 issues:
   - |
     Nova leaks resource allocations in placement during
-    ``POST /servers/{server_id}/action (revertResize Action)`` if the
+    ``POST /servers/{server_id}/action (revertResize Action)``,
+    ``POST /servers/{server_id}/action (confirmResize Action)`` and
+    ``POST /servers/{server_id}/action (os-migrateLive Action)`` if the
     allocation held by the migration_uuid is modified in parallel with the
-    revert operation. Nova will log an ERROR and will put the server into ERROR
-    state but will not delete the migration allocation. We assume that this can
-    only happen if somebody outside of nova is actively changing the migration
-    allocation in placement. Therefore it is not considered as a bug.
+    lifecycle operation. Nova will log an ERROR and will put the server into
+    ERROR state but will not delete the migration allocation. We assume that
+    this can only happen if somebody outside of nova is actively changing the
+    migration allocation in placement. Therefore it is not considered a bug.