consumer gen: move_allocations

This patch renames the set_and_clear_allocations function in the
scheduler report client to move_allocations and adds consumer
generation conflict handling to it. The call now moves every allocation
from one consumer to another and raises AllocationMoveFailed to the
caller if the move fails due to a consumer generation conflict.

When a migration or resize fails to move the source host allocation to the
migration_uuid, the API returns HTTP 409 and the migration is aborted.

If reverting a migration, a resize, or a resize to the same host fails to move
the source host allocation back to the instance_uuid due to a consumer
generation conflict, the instance is put into ERROR state. The instance still
has two allocations in this state and deleting the instance only deletes the
one held by the instance_uuid. This patch logs an ERROR explaining that in
this case the allocation held by the migration_uuid is leaked.

Blueprint: use-nested-allocation-candidates
Change-Id: Ie991d4b53e9bb5e7ec26da99219178ab7695abf6
Balazs Gibizer 2018-08-14 19:23:32 +02:00 committed by Eric Fried
parent 1ced590571
commit 53ca096750
14 changed files with 501 additions and 247 deletions
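
For context, at the consumer generation microversion (1.28) the whole move is
expressed as a single POST /allocations request that clears the source consumer
and writes its allocations to the target, sending both consumer generations so
Placement can detect concurrent updates. A minimal sketch of that payload, with
hypothetical UUIDs and resource amounts, based on the new_allocs structure built
in the report client hunk below:

    # Hypothetical payload built by move_allocations(context, source_uuid, target_uuid)
    {
        '<source_consumer_uuid>': {
            'allocations': {},             # empty dict removes the source allocation
            'project_id': '<project_id>',
            'user_id': '<user_id>',
            'consumer_generation': 2,      # generation read back from Placement
        },
        '<target_consumer_uuid>': {
            'allocations': {
                '<rp_uuid>': {'resources': {'MEMORY_MB': 1024, 'VCPU': 1}},
            },
            'project_id': '<project_id>',
            'user_id': '<user_id>',
            'consumer_generation': None,   # None when the target holds no allocation yet
        },
    }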

View File

@ -57,7 +57,8 @@ class MigrateServerController(wsgi.Controller):
except (exception.TooManyInstances, exception.QuotaError) as e:
raise exc.HTTPForbidden(explanation=e.format_message())
except (exception.InstanceIsLocked,
exception.CannotMigrateWithTargetHost) as e:
exception.CannotMigrateWithTargetHost,
exception.AllocationMoveFailed) as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,

View File

@ -830,7 +830,8 @@ class ServersController(wsgi.Controller):
except exception.QuotaError as error:
raise exc.HTTPForbidden(
explanation=error.format_message())
except exception.InstanceIsLocked as e:
except (exception.InstanceIsLocked,
exception.AllocationMoveFailed) as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,

View File

@ -3949,6 +3949,8 @@ class ComputeManager(manager.Manager):
# We're reverting (or failed) on the source, so we
# need to check if our migration holds a claim and if
# so, avoid doing the legacy behavior below.
# NOTE(gibi): We are only hitting this in case of reverting
# a resize-on-same-host operation
mig_allocs = (
self.reportclient.get_allocations_for_consumer_by_provider(
context, cn_uuid, migration.uuid))
@ -4097,7 +4099,16 @@ class ComputeManager(manager.Manager):
instance.node = migration.source_node
instance.save()
self._revert_allocation(context, instance, migration)
try:
self._revert_allocation(context, instance, migration)
except exception.AllocationMoveFailed:
LOG.error('Reverting allocation in placement for migration '
'%(migration_uuid)s failed. The instance '
'%(instance_uuid)s will be put into ERROR state but '
'the allocation held by the migration is leaked.',
{'instance_uuid': instance.uuid,
'migration_uuid': migration.uuid})
raise
self.network_api.setup_networks_on_host(context, instance,
migration.source_compute)
@ -4190,9 +4201,6 @@ class ComputeManager(manager.Manager):
# We only have a claim against one provider, it is the source node
cn_uuid = list(orig_alloc.keys())[0]
# Get just the resources part of the one allocation we need below
orig_alloc = orig_alloc[cn_uuid].get('resources', {})
# FIXME(danms): This method is flawed in that it asssumes allocations
# against only one provider. So, this may overwite allocations against
# a shared provider, if we had one.
@ -4201,9 +4209,8 @@ class ComputeManager(manager.Manager):
{'node': cn_uuid, 'mig': migration.uuid},
instance=instance)
# TODO(cdent): Should we be doing anything with return values here?
self.reportclient.set_and_clear_allocations(
context, cn_uuid, instance.uuid, orig_alloc, instance.project_id,
instance.user_id, consumer_to_clear=migration.uuid)
self.reportclient.move_allocations(context, migration.uuid,
instance.uuid)
return True
def _prep_resize(self, context, image, instance, instance_type,

View File

@ -140,8 +140,7 @@ class LiveMigrationTask(base.TaskBase):
migrate.revert_allocation_for_migration(self.context,
self._source_cn,
self.instance,
self.migration,
self._held_allocations)
self.migration)
def _check_instance_is_active(self):
if self.instance.power_state not in (power_state.RUNNING,

View File

@ -57,9 +57,8 @@ def replace_allocation_with_migration(context, instance, migration):
# FIXME(danms): This method is flawed in that it asssumes allocations
# against only one provider. So, this may overwite allocations against
# a shared provider, if we had one.
success = reportclient.set_and_clear_allocations(
context, source_cn.uuid, migration.uuid, orig_alloc,
instance.project_id, instance.user_id, consumer_to_clear=instance.uuid)
success = reportclient.move_allocations(context, instance.uuid,
migration.uuid)
if not success:
LOG.error('Unable to replace resource claim on source '
'host %(host)s node %(node)s for instance',
@ -78,8 +77,7 @@ def replace_allocation_with_migration(context, instance, migration):
return source_cn, orig_alloc
def revert_allocation_for_migration(context, source_cn, instance, migration,
orig_alloc):
def revert_allocation_for_migration(context, source_cn, instance, migration):
"""Revert an allocation made for a migration back to the instance."""
schedclient = scheduler_client.SchedulerClient()
@ -88,10 +86,8 @@ def revert_allocation_for_migration(context, source_cn, instance, migration,
# FIXME(danms): This method is flawed in that it asssumes allocations
# against only one provider. So, this may overwite allocations against
# a shared provider, if we had one.
success = reportclient.set_and_clear_allocations(
context, source_cn.uuid, instance.uuid, orig_alloc,
instance.project_id, instance.user_id,
consumer_to_clear=migration.uuid)
success = reportclient.move_allocations(context, migration.uuid,
instance.uuid)
if not success:
LOG.error('Unable to replace resource claim on source '
'host %(host)s node %(node)s for instance',
@ -320,5 +316,4 @@ class MigrationTask(base.TaskBase):
# now.
revert_allocation_for_migration(self.context, self._source_cn,
self.instance, self._migration,
self._held_allocations)
self.instance, self._migration)

View File

@ -2322,6 +2322,12 @@ class AllocationUpdateFailed(NovaException):
'Error: %(error)s')
class AllocationMoveFailed(NovaException):
msg_fmt = _('Failed to move allocations from consumer %(source_consumer)s '
'to consumer %(target_consumer)s. '
'Error: %(error)s')
class AllocationDeleteFailed(NovaException):
msg_fmt = _('Failed to delete allocations for consumer %(consumer_uuid)s. '
'Error: %(error)s')
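
As a quick illustration (not part of the diff), the new exception is constructed
the same way as its Allocation* siblings; the report client hunk below raises it
from the failed Placement response. A hypothetical revert-path example:

    # On revert, the migration uuid is the source and the instance uuid the target.
    raise exception.AllocationMoveFailed(
        source_consumer=migration.uuid,
        target_consumer=instance.uuid,
        error=resp.text)
    # Renders as: "Failed to move allocations from consumer <migration uuid>
    # to consumer <instance uuid>. Error: <placement error text>"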

View File

@ -1922,68 +1922,81 @@ class SchedulerReportClient(object):
@safe_connect
@retries
def set_and_clear_allocations(self, context, rp_uuid, consumer_uuid,
alloc_data, project_id, user_id,
consumer_to_clear=None):
"""Create allocation records for the supplied consumer UUID while
simultaneously clearing any allocations identified by the uuid
in consumer_to_clear, for example a migration uuid when moving an
instance to another host. This is for atomically managing so-called
"doubled" migration records.
def move_allocations(self, context, source_consumer_uuid,
target_consumer_uuid):
"""Move allocations from one consumer to the other
:note Currently we only allocate against a single resource provider.
Once shared storage and things like NUMA allocations are a
reality, this will change to allocate against multiple providers.
Note that this call moves the current allocation from the source
consumer to the target consumer. If parallel update happens on either
or both consumers during this call then Placement will detect that and
this code will re-read the new state of the consumers and retry the
operation. If you want to move a known piece of allocation from source
to target then this function might not be what you want as it always
moves what source has in Placement.
:param context: The security context
:param rp_uuid: The UUID of the resource provider to allocate against.
:param consumer_uuid: The consumer UUID for which allocations are
being set.
:param alloc_data: Dict, keyed by resource class, of amounts to
consume.
:param project_id: The project_id associated with the allocations.
:param user_id: The user_id associated with the allocations.
:param consumer_to_clear: A UUID identifying allocations for a
consumer that should be cleared.
:returns: True if the allocations were created, False otherwise.
:raises: Retry if the operation should be retried due to a concurrent
update.
:param source_consumer_uuid: the UUID of the consumer from which
allocations are moving
:param target_consumer_uuid: the UUID of the target consumer for the
allocations
:returns: True if the move was successful False otherwise.
:raises AllocationMoveFailed: If the source or the target consumer has
been modified while this call tries to
move allocations.
"""
# FIXME(cdent): Fair amount of duplicate with put in here, but now
# just working things through.
payload = {
consumer_uuid: {
'allocations': {
rp_uuid: {
'resources': alloc_data
}
},
'project_id': project_id,
'user_id': user_id,
source_alloc = self.get_allocs_for_consumer(
context, source_consumer_uuid)
target_alloc = self.get_allocs_for_consumer(
context, target_consumer_uuid)
if target_alloc and target_alloc['allocations']:
LOG.warning('Overwriting current allocation %(allocation)s on '
'consumer %(consumer)s',
{'allocation': target_alloc,
'consumer': target_consumer_uuid})
new_allocs = {
source_consumer_uuid: {
# 'allocations': {} means we are removing the allocation from
# the source consumer
'allocations': {},
'project_id': source_alloc['project_id'],
'user_id': source_alloc['user_id'],
'consumer_generation': source_alloc['consumer_generation']},
target_consumer_uuid: {
'allocations': source_alloc['allocations'],
# NOTE(gibi): Is there any case when we need to keep the
# project_id and user_id of the target allocation that we are
# about to overwrite?
'project_id': source_alloc['project_id'],
'user_id': source_alloc['user_id'],
'consumer_generation': target_alloc.get('consumer_generation')
}
}
if consumer_to_clear:
payload[consumer_to_clear] = {
'allocations': {},
'project_id': project_id,
'user_id': user_id,
}
r = self.post('/allocations', payload,
version=POST_ALLOCATIONS_API_VERSION,
r = self.post('/allocations', new_allocs,
version=CONSUMER_GENERATION_VERSION,
global_request_id=context.global_id)
if r.status_code != 204:
# NOTE(jaypipes): Yes, it sucks doing string comparison like this
# but we have no error codes, only error messages.
if 'concurrently updated' in r.text:
err = r.json()['errors'][0]
if err['code'] == 'placement.concurrent_update':
# NOTE(jaypipes): Yes, it sucks doing string comparison like
# this but we have no error codes, only error messages.
# TODO(gibi): Use more granular error codes when available
if 'consumer generation conflict' in err['detail']:
raise exception.AllocationMoveFailed(
source_consumer=source_consumer_uuid,
target_consumer=target_consumer_uuid,
error=r.text)
reason = ('another process changed the resource providers '
'involved in our attempt to post allocations for '
'consumer %s' % consumer_uuid)
raise Retry('set_and_clear_allocations', reason)
'consumer %s' % target_consumer_uuid)
raise Retry('move_allocations', reason)
else:
LOG.warning(
'Unable to post allocations for instance '
'Unable to post allocations for consumer '
'%(uuid)s (%(code)i %(text)s)',
{'uuid': consumer_uuid,
{'uuid': target_consumer_uuid,
'code': r.status_code,
'text': r.text})
return r.status_code == 204
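
A condensed sketch of the caller-side contract that the compute manager and
conductor hunks above rely on (reportclient, LOG, migration and instance are
assumed from the surrounding Nova code):

    # Sketch: move the migration-held allocation back to the instance on revert.
    try:
        success = reportclient.move_allocations(
            context, migration.uuid, instance.uuid)
    except exception.AllocationMoveFailed:
        # Consumer generation conflict: the migration allocation is leaked;
        # callers log an ERROR and re-raise. The forward move path instead
        # surfaces this as HTTP 409 to the API user.
        raise
    if not success:
        # Non-conflict failure after the built-in retries; the existing
        # "Unable to replace resource claim" error handling applies.
        LOG.error('Unable to move allocations from %s to %s',
                  migration.uuid, instance.uuid)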

View File

@ -28,6 +28,7 @@ import nova.conf
from nova import context
from nova.db import api as db
import nova.image.glance
from nova import objects
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional.api import client as api_client
@ -643,3 +644,103 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
compute.manager.host)
compute.manager.update_available_resource(ctx)
LOG.info('Finished with periodics')
def _move_and_check_allocations(self, server, request, old_flavor,
new_flavor, source_rp_uuid, dest_rp_uuid):
self.api.post_server_action(server['id'], request)
self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
def _check_allocation():
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(old_flavor, source_usages)
dest_usages = self._get_provider_usages(dest_rp_uuid)
self.assertFlavorMatchesAllocation(new_flavor, dest_usages)
# The instance should own the new_flavor allocation against the
# destination host created by the scheduler
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
dest_alloc = allocations[dest_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(new_flavor, dest_alloc)
# The migration should own the old_flavor allocation against the
# source host created by conductor
migration_uuid = self.get_migration_uuid_for_instance(server['id'])
allocations = self._get_allocations_by_server_uuid(migration_uuid)
source_alloc = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(old_flavor, source_alloc)
# OK, so the move operation has run, but we have not yet confirmed or
# reverted the move operation. Before we run periodics, make sure
# that we have allocations/usages on BOTH the source and the
# destination hosts.
_check_allocation()
self._run_periodics()
_check_allocation()
# Make sure the RequestSpec.flavor matches the new_flavor.
ctxt = context.get_admin_context()
reqspec = objects.RequestSpec.get_by_instance_uuid(ctxt, server['id'])
self.assertEqual(new_flavor['id'], reqspec.flavor.flavorid)
def _migrate_and_check_allocations(self, server, flavor, source_rp_uuid,
dest_rp_uuid):
request = {
'migrate': None
}
self._move_and_check_allocations(
server, request=request, old_flavor=flavor, new_flavor=flavor,
source_rp_uuid=source_rp_uuid, dest_rp_uuid=dest_rp_uuid)
def _resize_to_same_host_and_check_allocations(self, server, old_flavor,
new_flavor, rp_uuid):
# Resize the server to the same host and check usages in VERIFY_RESIZE
# state
self.flags(allow_resize_to_same_host=True)
resize_req = {
'resize': {
'flavorRef': new_flavor['id']
}
}
self.api.post_server_action(server['id'], resize_req)
self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
usages = self._get_provider_usages(rp_uuid)
self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages)
# The instance should hold a new_flavor allocation
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(new_flavor, allocation)
# The migration should hold an old_flavor allocation
migration_uuid = self.get_migration_uuid_for_instance(server['id'])
allocations = self._get_allocations_by_server_uuid(migration_uuid)
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(old_flavor, allocation)
# We've resized to the same host and have doubled allocations for both
# the old and new flavor on the same host. Run the periodic on the
# compute to see if it tramples on what the scheduler did.
self._run_periodics()
usages = self._get_provider_usages(rp_uuid)
# In terms of usage, it's still double on the host because the instance
# and the migration each hold an allocation for the new and old
# flavors respectively.
self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages)
# The instance should hold a new_flavor allocation
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(new_flavor, allocation)
# The migration should hold an old_flavor allocation
allocations = self._get_allocations_by_server_uuid(migration_uuid)
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(old_flavor, allocation)

View File

@ -2023,53 +2023,6 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
self.compute2.manager.update_available_resource(ctx)
LOG.info('Finished with periodics')
def _migrate_and_check_allocations(self, server, flavor, source_rp_uuid,
dest_rp_uuid):
request = {
'migrate': None
}
self._move_and_check_allocations(
server, request=request, old_flavor=flavor, new_flavor=flavor,
source_rp_uuid=source_rp_uuid, dest_rp_uuid=dest_rp_uuid)
def _move_and_check_allocations(self, server, request, old_flavor,
new_flavor, source_rp_uuid, dest_rp_uuid):
self.api.post_server_action(server['id'], request)
self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
def _check_allocation():
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(old_flavor, source_usages)
dest_usages = self._get_provider_usages(dest_rp_uuid)
self.assertFlavorMatchesAllocation(new_flavor, dest_usages)
# The instance should own the new_flavor allocation against the
# destination host created by the scheduler
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
dest_alloc = allocations[dest_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(new_flavor, dest_alloc)
# The migration should own the old_flavor allocation against the
# source host created by conductor
migration_uuid = self.get_migration_uuid_for_instance(server['id'])
allocations = self._get_allocations_by_server_uuid(migration_uuid)
source_alloc = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(old_flavor, source_alloc)
# OK, so the move operation has run, but we have not yet confirmed or
# reverted the move operation. Before we run periodics, make sure
# that we have allocations/usages on BOTH the source and the
# destination hosts.
_check_allocation()
self._run_periodics()
_check_allocation()
# Make sure the RequestSpec.flavor matches the new_flavor.
ctxt = context.get_admin_context()
reqspec = objects.RequestSpec.get_by_instance_uuid(ctxt, server['id'])
self.assertEqual(new_flavor['id'], reqspec.flavor.flavorid)
def test_resize_revert(self):
self._test_resize_revert(dest_hostname='host1')
@ -2204,59 +2157,6 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
self._delete_and_check_allocations(server)
def _resize_to_same_host_and_check_allocations(self, server, old_flavor,
new_flavor, rp_uuid):
# Resize the server to the same host and check usages in VERIFY_RESIZE
# state
self.flags(allow_resize_to_same_host=True)
resize_req = {
'resize': {
'flavorRef': new_flavor['id']
}
}
self.api.post_server_action(server['id'], resize_req)
self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
usages = self._get_provider_usages(rp_uuid)
self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages)
# The instance should hold a new_flavor allocation
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(new_flavor, allocation)
# The migration should hold an old_flavor allocation
migration_uuid = self.get_migration_uuid_for_instance(server['id'])
allocations = self._get_allocations_by_server_uuid(migration_uuid)
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(old_flavor, allocation)
# We've resized to the same host and have doubled allocations for both
# the old and new flavor on the same host. Run the periodic on the
# compute to see if it tramples on what the scheduler did.
self._run_periodics()
usages = self._get_provider_usages(rp_uuid)
# In terms of usage, it's still double on the host because the instance
# and the migration each hold an allocation for the new and old
# flavors respectively.
self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages)
# The instance should hold a new_flavor allocation
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(new_flavor, allocation)
# The migration should hold an old_flavor allocation
allocations = self._get_allocations_by_server_uuid(migration_uuid)
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(old_flavor, allocation)
def test_resize_revert_same_host(self):
# make sure that the test only uses a single host
compute2_service_id = self.admin_api.get_services(
@ -4548,6 +4448,157 @@ class ConsumerGenerationConflictTest(
self.compute1 = self._start_compute('compute1')
self.compute2 = self._start_compute('compute2')
def test_migrate_move_allocation_fails_due_to_conflict(self):
source_hostname = self.compute1.host
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
server = self._boot_and_check_allocations(self.flavor, source_hostname)
rsp = fake_requests.FakeResponse(
409,
jsonutils.dumps(
{'errors': [
{'code': 'placement.concurrent_update',
'detail': 'consumer generation conflict'}]}))
with mock.patch('keystoneauth1.adapter.Adapter.post',
autospec=True) as mock_post:
mock_post.return_value = rsp
request = {'migrate': None}
exception = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'], request)
self.assertEqual(1, mock_post.call_count)
self.assertEqual(409, exception.response.status_code)
self.assertIn('Failed to move allocations', exception.response.text)
migrations = self.api.get_migrations()
self.assertEqual(1, len(migrations))
self.assertEqual('migration', migrations[0]['migration_type'])
self.assertEqual(server['id'], migrations[0]['instance_uuid'])
self.assertEqual(source_hostname, migrations[0]['source_compute'])
self.assertEqual('error', migrations[0]['status'])
# The migration is aborted so the instance is ACTIVE on the source
# host instead of being in VERIFY_RESIZE state.
server = self.api.get_server(server['id'])
self.assertEqual('ACTIVE', server['status'])
self.assertEqual(source_hostname, server['OS-EXT-SRV-ATTR:host'])
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor, source_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
alloc = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor, alloc)
self._delete_and_check_allocations(server)
def test_revert_migrate_delete_dest_allocation_fails_due_to_conflict(self):
source_hostname = self.compute1.host
dest_hostname = self.compute2.host
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
server = self._boot_and_check_allocations(self.flavor, source_hostname)
self._migrate_and_check_allocations(
server, self.flavor, source_rp_uuid, dest_rp_uuid)
rsp = fake_requests.FakeResponse(
409,
jsonutils.dumps(
{'errors': [
{'code': 'placement.concurrent_update',
'detail': 'consumer generation conflict'}]}))
with mock.patch('keystoneauth1.adapter.Adapter.post',
autospec=True) as mock_post:
mock_post.return_value = rsp
post = {'revertResize': None}
self.api.post_server_action(server['id'], post)
server = self._wait_for_state_change(self.api, server, 'ERROR',)
self.assertEqual(1, mock_post.call_count)
migrations = self.api.get_migrations()
self.assertEqual(1, len(migrations))
self.assertEqual('migration', migrations[0]['migration_type'])
self.assertEqual(server['id'], migrations[0]['instance_uuid'])
self.assertEqual(source_hostname, migrations[0]['source_compute'])
self.assertEqual('error', migrations[0]['status'])
# NOTE(gibi): Nova leaks the allocation held by the migration_uuid even
# after the instance is deleted. At least nova logs a fat ERROR.
self.assertIn('Reverting allocation in placement for migration %s '
'failed. The instance %s will be put into ERROR state '
'but the allocation held by the migration is leaked.' %
(migrations[0]['uuid'], server['id']),
self.stdlog.logger.output)
self.api.delete_server(server['id'])
self._wait_until_deleted(server)
fake_notifier.wait_for_versioned_notifications('instance.delete.end')
allocations = self._get_allocations_by_server_uuid(
migrations[0]['uuid'])
self.assertEqual(1, len(allocations))
def test_revert_resize_same_host_delete_dest_fails_due_to_conflict(self):
# make sure that the test only uses a single host
compute2_service_id = self.admin_api.get_services(
host=self.compute2.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute2_service_id, {'status': 'disabled'})
hostname = self.compute1.manager.host
rp_uuid = self._get_provider_uuid_by_host(hostname)
server = self._boot_and_check_allocations(self.flavor, hostname)
self._resize_to_same_host_and_check_allocations(
server, self.flavor, self.other_flavor, rp_uuid)
rsp = fake_requests.FakeResponse(
409,
jsonutils.dumps(
{'errors': [
{'code': 'placement.concurrent_update',
'detail': 'consumer generation conflict'}]}))
with mock.patch('keystoneauth1.adapter.Adapter.post',
autospec=True) as mock_post:
mock_post.return_value = rsp
post = {'revertResize': None}
self.api.post_server_action(server['id'], post)
server = self._wait_for_state_change(self.api, server, 'ERROR',)
self.assertEqual(1, mock_post.call_count)
migrations = self.api.get_migrations()
self.assertEqual(1, len(migrations))
self.assertEqual('resize', migrations[0]['migration_type'])
self.assertEqual(server['id'], migrations[0]['instance_uuid'])
self.assertEqual(hostname, migrations[0]['source_compute'])
self.assertEqual('error', migrations[0]['status'])
# NOTE(gibi): Nova leaks the allocation held by the migration_uuid even
# after the instance is deleted. At least nova logs a fat ERROR.
self.assertIn('Reverting allocation in placement for migration %s '
'failed. The instance %s will be put into ERROR state '
'but the allocation held by the migration is leaked.' %
(migrations[0]['uuid'], server['id']),
self.stdlog.logger.output)
self.api.delete_server(server['id'])
self._wait_until_deleted(server)
fake_notifier.wait_for_versioned_notifications('instance.delete.end')
allocations = self._get_allocations_by_server_uuid(
migrations[0]['uuid'])
self.assertEqual(1, len(allocations))
def test_server_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host

View File

@ -6210,11 +6210,8 @@ class ComputeTestCase(BaseTestCase,
instance=instance, migration=migration,
migrate_data=migrate_data)
mock_setup.assert_called_once_with(c, instance, self.compute.host)
mock_client.set_and_clear_allocations.assert_called_once_with(
c, mock.sentinel.source, instance.uuid,
mock.sentinel.allocs,
instance.project_id, instance.user_id,
consumer_to_clear=migration.uuid)
mock_client.move_allocations.assert_called_once_with(
c, migration.uuid, instance.uuid)
do_it()

View File

@ -7151,10 +7151,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
self.instance, self.migration)
self.assertTrue(r)
mock_report.set_and_clear_allocations.assert_called_once_with(
mock.sentinel.ctx, cu, self.instance.uuid, {'DISK_GB': 1},
self.instance.project_id, self.instance.user_id,
consumer_to_clear=self.migration.uuid)
mock_report.move_allocations.assert_called_once_with(
mock.sentinel.ctx, self.migration.uuid, self.instance.uuid)
doit()
@ -7170,7 +7168,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
self.instance, self.migration)
self.assertFalse(r)
self.assertFalse(mock_report.set_and_clear_allocations.called)
self.assertFalse(mock_report.move_allocations.called)
doit()
@ -7195,7 +7193,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
self.instance, self.migration)
self.assertTrue(r)
self.assertTrue(mock_report.set_and_clear_allocations.called)
self.assertTrue(mock_report.move_allocations.called)
doit()

View File

@ -227,8 +227,7 @@ class MigrationTaskTestCase(test.NoDBTestCase):
task._migration.save.assert_called_once_with()
self.assertEqual('error', task._migration.status)
mock_ra.assert_called_once_with(task.context, task._source_cn,
task.instance, task._migration,
task._held_allocations)
task.instance, task._migration)
class MigrationTaskAllocationUtils(test.NoDBTestCase):

View File

@ -1112,10 +1112,10 @@ class TestPutAllocations(SchedulerReportClientTestCase):
self.assertFalse(res)
class TestSetAndClearAllocations(SchedulerReportClientTestCase):
class TestMoveAllocations(SchedulerReportClientTestCase):
def setUp(self):
super(TestSetAndClearAllocations, self).setUp()
super(TestMoveAllocations, self).setUp()
# We want to reuse the mock throughout the class, but with
# different return values.
patcher = mock.patch(
@ -1126,86 +1126,153 @@ class TestSetAndClearAllocations(SchedulerReportClientTestCase):
self.rp_uuid = mock.sentinel.rp
self.consumer_uuid = mock.sentinel.consumer
self.data = {"MEMORY_MB": 1024}
patcher = mock.patch(
'nova.scheduler.client.report.SchedulerReportClient.get')
self.mock_get = patcher.start()
self.addCleanup(patcher.stop)
self.project_id = mock.sentinel.project_id
self.user_id = mock.sentinel.user_id
self.mock_post.return_value.status_code = 204
self.rp_uuid = mock.sentinel.rp
self.source_consumer_uuid = mock.sentinel.source_consumer
self.target_consumer_uuid = mock.sentinel.target_consumer
self.source_consumer_data = {
"allocations": {
self.rp_uuid: {
"generation": 1,
"resources": {
"MEMORY_MB": 1024
}
}
},
"consumer_generation": 2,
"project_id": self.project_id,
"user_id": self.user_id
}
self.source_rsp = mock.Mock()
self.source_rsp.json.return_value = self.source_consumer_data
self.target_consumer_data = {
"allocations": {
self.rp_uuid: {
"generation": 1,
"resources": {
"MEMORY_MB": 2048
}
}
},
"consumer_generation": 1,
"project_id": self.project_id,
"user_id": self.user_id
}
self.target_rsp = mock.Mock()
self.target_rsp.json.return_value = self.target_consumer_data
self.mock_get.side_effect = [self.source_rsp, self.target_rsp]
self.expected_url = '/allocations'
self.expected_microversion = '1.28'
def test_url_microversion(self):
expected_microversion = '1.13'
resp = self.client.set_and_clear_allocations(
self.context, self.rp_uuid, self.consumer_uuid, self.data,
self.project_id, self.user_id)
resp = self.client.move_allocations(
self.context, self.source_consumer_uuid, self.target_consumer_uuid)
self.assertTrue(resp)
self.mock_post.assert_called_once_with(
self.expected_url, mock.ANY,
version=expected_microversion,
version=self.expected_microversion,
global_request_id=self.context.global_id)
def test_payload_no_clear(self):
def test_move_to_empty_target(self):
self.target_consumer_data = {"allocations": {}}
target_rsp = mock.Mock()
target_rsp.json.return_value = self.target_consumer_data
self.mock_get.side_effect = [self.source_rsp, target_rsp]
expected_payload = {
self.consumer_uuid: {
'user_id': self.user_id,
'project_id': self.project_id,
'allocations': {
self.target_consumer_uuid: {
"allocations": {
self.rp_uuid: {
'resources': {
'MEMORY_MB': 1024
}
"resources": {
"MEMORY_MB": 1024
},
"generation": 1
}
}
}
}
resp = self.client.set_and_clear_allocations(
self.context, self.rp_uuid, self.consumer_uuid, self.data,
self.project_id, self.user_id)
self.assertTrue(resp)
args, kwargs = self.mock_post.call_args
payload = args[1]
self.assertEqual(expected_payload, payload)
def test_payload_with_clear(self):
expected_payload = {
self.consumer_uuid: {
'user_id': self.user_id,
'project_id': self.project_id,
'allocations': {
self.rp_uuid: {
'resources': {
'MEMORY_MB': 1024
}
}
}
},
"consumer_generation": None,
"project_id": self.project_id,
"user_id": self.user_id,
},
mock.sentinel.migration_uuid: {
'user_id': self.user_id,
'project_id': self.project_id,
'allocations': {}
self.source_consumer_uuid: {
"allocations": {},
"consumer_generation": 2,
"project_id": self.project_id,
"user_id": self.user_id,
}
}
resp = self.client.set_and_clear_allocations(
self.context, self.rp_uuid, self.consumer_uuid, self.data,
self.project_id, self.user_id,
consumer_to_clear=mock.sentinel.migration_uuid)
resp = self.client.move_allocations(
self.context, self.source_consumer_uuid, self.target_consumer_uuid)
self.assertTrue(resp)
args, kwargs = self.mock_post.call_args
payload = args[1]
self.assertEqual(expected_payload, payload)
self.mock_post.assert_called_once_with(
self.expected_url, expected_payload,
version=self.expected_microversion,
global_request_id=self.context.global_id)
def test_move_to_non_empty_target(self):
self.mock_get.side_effect = [self.source_rsp, self.target_rsp]
expected_payload = {
self.target_consumer_uuid: {
"allocations": {
self.rp_uuid: {
"resources": {
"MEMORY_MB": 1024
},
"generation": 1
}
},
"consumer_generation": 1,
"project_id": self.project_id,
"user_id": self.user_id,
},
self.source_consumer_uuid: {
"allocations": {},
"consumer_generation": 2,
"project_id": self.project_id,
"user_id": self.user_id,
}
}
resp = self.client.move_allocations(
self.context, self.source_consumer_uuid, self.target_consumer_uuid)
self.assertTrue(resp)
self.mock_post.assert_called_once_with(
self.expected_url, expected_payload,
version=self.expected_microversion,
global_request_id=self.context.global_id)
self.assertIn('Overwriting current allocation',
self.stdlog.logger.output)
@mock.patch('time.sleep')
def test_409_concurrent_update(self, mock_sleep):
self.mock_post.return_value.status_code = 409
self.mock_post.return_value.text = 'concurrently updated'
def test_409_concurrent_provider_update(self, mock_sleep):
# there will be 1 normal call and 3 retries
self.mock_get.side_effect = [self.source_rsp, self.target_rsp,
self.source_rsp, self.target_rsp,
self.source_rsp, self.target_rsp,
self.source_rsp, self.target_rsp]
rsp = fake_requests.FakeResponse(
409,
jsonutils.dumps(
{'errors': [
{'code': 'placement.concurrent_update',
'detail': ''}]}))
resp = self.client.set_and_clear_allocations(
self.context, self.rp_uuid, self.consumer_uuid, self.data,
self.project_id, self.user_id,
consumer_to_clear=mock.sentinel.migration_uuid)
self.mock_post.return_value = rsp
resp = self.client.move_allocations(
self.context, self.source_consumer_uuid, self.target_consumer_uuid)
self.assertFalse(resp)
# Post was attempted four times.
@ -1217,10 +1284,8 @@ class TestSetAndClearAllocations(SchedulerReportClientTestCase):
self.mock_post.return_value.status_code = 503
self.mock_post.return_value.text = error_message
resp = self.client.set_and_clear_allocations(
self.context, self.rp_uuid, self.consumer_uuid, self.data,
self.project_id, self.user_id,
consumer_to_clear=mock.sentinel.migration_uuid)
resp = self.client.move_allocations(
self.context, self.source_consumer_uuid, self.target_consumer_uuid)
self.assertFalse(resp)
args, kwargs = mock_log.call_args
@ -1229,6 +1294,17 @@ class TestSetAndClearAllocations(SchedulerReportClientTestCase):
self.assertIn('Unable to post allocations', log_message)
self.assertEqual(error_message, log_args['text'])
def test_409_concurrent_consumer_update(self):
self.mock_post.return_value = fake_requests.FakeResponse(
status_code=409,
content=jsonutils.dumps(
{'errors': [{'code': 'placement.concurrent_update',
'detail': 'consumer generation conflict'}]}))
self.assertRaises(exception.AllocationMoveFailed,
self.client.move_allocations, self.context,
self.source_consumer_uuid, self.target_consumer_uuid)
class TestProviderOperations(SchedulerReportClientTestCase):
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'

View File

@ -0,0 +1,10 @@
---
issues:
- |
Nova leaks resource allocations in placement during
``POST /servers/{server_id}/action (revertResize Action)`` if the
allocation held by the migration_uuid is modified in parallel with the
revert operation. Nova will log an ERROR and will put the server into ERROR
state but will not delete the migration allocation. We assume that this can
only happen if somebody outside of nova is actively changing the migration
allocation in placement. Therefore it is not considered a bug.