From 3a43a931d4e46a0aa66683da22482f3e78f26cd7 Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Thu, 13 Sep 2018 10:07:42 -0600 Subject: [PATCH] consumer gen: more tests for delete allocation cases User confirming migration as well as a successful live migration also triggers the delete allocation code path. This patch adds test coverage for these code paths. If the deletion of the source allocation of a confirmed migration fails then nova puts the instance to ERROR state. The instance still has two allocations in this state and deleting the instance only deletes the one that is held by the instance_uuid. This patch logs an ERROR describing that in this case the allocation held by the migration_uuid is leaked. The same is true for live migration failing to delete allocation on the source host. As this makes every caller of _delete_allocation_after_move log the same error for AllocationDeleteFailed exception this patch moves that logging into _delete_allocation_after_move. Blueprint: use-nested-allocation-candidates Change-Id: I99427a52676826990d2a2ffc82cf30ad945b939c --- nova/compute/manager.py | 37 +++-- nova/tests/functional/test_servers.py | 148 ++++++++++++++++++ ...cations_in_placement-bd0a6f2a30e2e3d2.yaml | 12 +- 3 files changed, 179 insertions(+), 18 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 879263da940f..931c38c56d66 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -3893,7 +3893,8 @@ class ComputeManager(manager.Manager): rt = self._get_resource_tracker() rt.drop_move_claim(context, instance, migration.source_node, old_instance_type, prefix='old_') - self._delete_allocation_after_move(context, instance, migration, + self._delete_allocation_after_move(context, instance, + migration, old_instance_type, migration.source_node) instance.drop_migration_context() @@ -3933,18 +3934,28 @@ class ComputeManager(manager.Manager): if migration.source_node == nodename: if migration.status in ('confirmed', 
'completed'): - # NOTE(danms): We're finishing on the source node, so try to - # delete the allocation based on the migration uuid - deleted = self.reportclient.delete_allocation_for_instance( - context, migration.uuid) - if deleted: - LOG.info(_('Source node %(node)s confirmed migration ' - '%(mig)s; deleted migration-based ' - 'allocation'), - {'node': nodename, 'mig': migration.uuid}) - # NOTE(danms): We succeeded, which means we do not - # need to do the complex double allocation dance - return + try: + # NOTE(danms): We're finishing on the source node, so try + # to delete the allocation based on the migration uuid + deleted = self.reportclient.delete_allocation_for_instance( + context, migration.uuid) + if deleted: + LOG.info(_('Source node %(node)s confirmed migration ' + '%(mig)s; deleted migration-based ' + 'allocation'), + {'node': nodename, 'mig': migration.uuid}) + # NOTE(danms): We succeeded, which means we do not + # need to do the complex double allocation dance + return + except exception.AllocationDeleteFailed: + LOG.error('Deleting allocation in placement for migration ' + '%(migration_uuid)s failed. 
The instance ' + '%(instance_uuid)s will be put to ERROR state ' + 'but the allocation held by the migration is ' + 'leaked.', + {'instance_uuid': instance.uuid, + 'migration_uuid': migration.uuid}) + raise else: # We're reverting (or failed) on the source, so we # need to check if our migration holds a claim and if diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index fd055bd5d9b9..0ec8c962dc38 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -17,6 +17,7 @@ import datetime import time import zlib +from keystoneauth1 import adapter import mock from oslo_log import log as logging from oslo_serialization import base64 @@ -4498,6 +4499,59 @@ class ConsumerGenerationConflictTest( self._delete_and_check_allocations(server) + def test_confirm_migrate_delete_alloc_on_source_fails(self): + source_hostname = self.compute1.host + dest_hostname = self.compute2.host + source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) + dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) + + server = self._boot_and_check_allocations(self.flavor, source_hostname) + self._migrate_and_check_allocations( + server, self.flavor, source_rp_uuid, dest_rp_uuid) + + rsp = fake_requests.FakeResponse( + 409, + jsonutils.dumps( + {'errors': [ + {'code': 'placement.concurrent_update', + 'detail': 'consumer generation conflict'}]})) + + with mock.patch('keystoneauth1.adapter.Adapter.put', + autospec=True) as mock_put: + mock_put.return_value = rsp + + post = {'confirmResize': None} + self.api.post_server_action( + server['id'], post, check_response_status=[204]) + server = self._wait_for_state_change(self.api, server, 'ERROR') + self.assertIn('Failed to delete allocations', + server['fault']['message']) + + self.assertEqual(1, mock_put.call_count) + + migrations = self.api.get_migrations() + self.assertEqual(1, len(migrations)) + self.assertEqual('migration', migrations[0]['migration_type']) + 
self.assertEqual(server['id'], migrations[0]['instance_uuid']) + self.assertEqual(source_hostname, migrations[0]['source_compute']) + # NOTE(gibi): it might be better to mark the migration as error + self.assertEqual('confirmed', migrations[0]['status']) + + # NOTE(gibi): Nova leaks the allocation held by the migration_uuid even + # after the instance is deleted. At least nova logs a fat ERROR. + self.assertIn('Deleting allocation in placement for migration %s ' + 'failed. The instance %s will be put to ERROR state but ' + 'the allocation held by the migration is leaked.' % + (migrations[0]['uuid'], server['id']), + self.stdlog.logger.output) + self.api.delete_server(server['id']) + self._wait_until_deleted(server) + fake_notifier.wait_for_versioned_notifications('instance.delete.end') + + allocations = self._get_allocations_by_server_uuid( + migrations[0]['uuid']) + self.assertEqual(1, len(allocations)) + def test_revert_migrate_delete_dest_allocation_fails_due_to_conflict(self): source_hostname = self.compute1.host dest_hostname = self.compute2.host @@ -4599,6 +4653,100 @@ class ConsumerGenerationConflictTest( migrations[0]['uuid']) self.assertEqual(1, len(allocations)) + def test_live_migrate_drop_allocation_on_source_fails(self): + source_hostname = self.compute1.host + dest_hostname = self.compute2.host + source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) + dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) + + server = self._boot_and_check_allocations( + self.flavor, source_hostname) + + fake_notifier.stub_notifier(self) + self.addCleanup(fake_notifier.reset) + + orig_put = adapter.Adapter.put + + rsp = fake_requests.FakeResponse( + 409, + jsonutils.dumps( + {'errors': [ + {'code': 'placement.concurrent_update', + 'detail': 'consumer generation conflict'}]})) + + def fake_put(_self, url, *args, **kwargs): + migration_uuid = self.get_migration_uuid_for_instance(server['id']) + if url == '/allocations/%s' % migration_uuid: + return rsp 
+ else: + return orig_put(_self, url, *args, **kwargs) + + with mock.patch('keystoneauth1.adapter.Adapter.put', + autospec=True) as mock_put: + mock_put.side_effect = fake_put + + post = { + 'os-migrateLive': { + 'host': dest_hostname, + 'block_migration': True, + 'force': True, + } + } + + self.api.post_server_action(server['id'], post) + + # nova does the source host cleanup _after_ setting the migration + # to completed and sending end notifications so we have to wait + # here a bit. + time.sleep(1) + + # Nova failed to clean up on the source host. This right now puts + # the instance to ERROR state and fails the migration. + server = self._wait_for_server_parameter(self.api, server, + {'OS-EXT-SRV-ATTR:host': dest_hostname, + 'status': 'ERROR'}) + self._wait_for_migration_status(server, ['error']) + fake_notifier.wait_for_versioned_notifications( + 'instance.live_migration_post.end') + + # 1 claim on destination, 1 normal delete on dest that fails, + self.assertEqual(2, mock_put.call_count) + + source_usages = self._get_provider_usages(source_rp_uuid) + # As the cleanup on the source host failed Nova leaks the allocation + # held by the migration. 
+ self.assertFlavorMatchesAllocation(self.flavor, source_usages) + migration_uuid = self.get_migration_uuid_for_instance(server['id']) + allocations = self._get_allocations_by_server_uuid(migration_uuid) + self.assertEqual(1, len(allocations)) + self.assertIn(source_rp_uuid, allocations) + source_allocation = allocations[source_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(self.flavor, source_allocation) + + dest_usages = self._get_provider_usages(dest_rp_uuid) + self.assertFlavorMatchesAllocation(self.flavor, dest_usages) + + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual(1, len(allocations)) + self.assertNotIn(source_rp_uuid, allocations) + dest_allocation = allocations[dest_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(self.flavor, dest_allocation) + + # NOTE(gibi): Nova leaks the allocation held by the migration_uuid even + # after the instance is deleted. At least nova logs a fat ERROR. + self.assertIn('Deleting allocation in placement for migration %s ' + 'failed. The instance %s will be put to ERROR state but ' + 'the allocation held by the migration is leaked.' 
% + (migration_uuid, server['id']), + self.stdlog.logger.output) + + self.api.delete_server(server['id']) + self._wait_until_deleted(server) + fake_notifier.wait_for_versioned_notifications('instance.delete.end') + + allocations = self._get_allocations_by_server_uuid(migration_uuid) + self.assertEqual(1, len(allocations)) + def test_server_delete_fails_due_to_conflict(self): source_hostname = self.compute1.host diff --git a/releasenotes/notes/leaking_migration_allocations_in_placement-bd0a6f2a30e2e3d2.yaml b/releasenotes/notes/leaking_migration_allocations_in_placement-bd0a6f2a30e2e3d2.yaml index 7c3af1ed0eb0..64123efd3f9b 100644 --- a/releasenotes/notes/leaking_migration_allocations_in_placement-bd0a6f2a30e2e3d2.yaml +++ b/releasenotes/notes/leaking_migration_allocations_in_placement-bd0a6f2a30e2e3d2.yaml @@ -2,9 +2,11 @@ issues: - | Nova leaks resource allocations in placement during - ``POST /servers/{server_id}/action (revertResize Action)`` if the + ``POST /servers/{server_id}/action (revertResize Action)`` and + ``POST /servers/{server_id}/action (confirmResize Action)`` and + ``POST /servers/{server_id}/action (os-migrateLive Action)`` if the allocation held by the migration_uuid is modified in parallel with the - revert operation. Nova will log an ERROR and will put the server into ERROR - state but will not delete the migration allocation. We assume that this can - only happen if somebody outside of nova is actively changing the migration - allocation in placement. Therefore it is not considered as a bug. + lifecycle operation. Nova will log an ERROR and will put the server into + ERROR state but will not delete the migration allocation. We assume that + this can only happen if somebody outside of nova is actively changing the + migration allocation in placement. Therefore it is not considered as a bug.