Consumer gen support for delete instance allocations

The placement API version 1.28 introduced consumer generations as a way
to make updating allocations safe even when it is done from multiple
places.

This patch changes delete_allocation_for_instance to use PUT
/allocations instead of DELETE /allocations to benefit from the consumer
generation handling.

In this patch the report client will GET the current allocation of the
instance, including the consumer generation, and then try to PUT an
empty allocation with that generation. If this fails due to a consumer
generation conflict, meaning something modified the allocation of the
instance between the GET and the PUT, then the report client will raise
an AllocationDeleteFailed exception. This will cause the instance to go
to ERROR state.
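
For illustration, a minimal sketch of that GET-then-PUT flow against the
placement API at microversion 1.28, calling the HTTP API directly with the
requests library; the endpoint, token and function name here are assumptions
for the example only, and the real implementation lives in
SchedulerReportClient.delete_allocation_for_instance below.

    import requests

    PLACEMENT = 'http://placement.example.com'  # assumed endpoint
    HEADERS = {
        'X-Auth-Token': 'admin-token',  # assumed credentials
        'OpenStack-API-Version': 'placement 1.28',
    }

    def delete_allocations(consumer_uuid):
        url = '%s/allocations/%s' % (PLACEMENT, consumer_uuid)
        # At 1.28 the GET body carries consumer_generation alongside the
        # allocations, project_id and user_id.
        body = requests.get(url, headers=HEADERS).json()
        if not body['allocations']:
            # placement returns an empty allocation for an unknown consumer
            return False
        # PUT the body back with the resources cleared but the generation we
        # just read kept; placement auto-deletes the consumer when all
        # resources are removed.
        body['allocations'] = {}
        resp = requests.put(url, json=body, headers=HEADERS)
        if resp.status_code == 409:
            # something changed the allocation between the GET and the PUT
            raise RuntimeError('consumer generation conflict for %s'
                               % consumer_uuid)
        resp.raise_for_status()
        return True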

This patch only detects a small portion of the possible cases where the
allocation is modified outside of the delete code path. The rest could
only be detected if nova cached at least the consumer generation of the
instance.

To be able to put the instance into ERROR state the instance.destroy()
call is moved to the end of the deletion call path. To keep the
instance.delete.end notification behavior consistent with this move
(e.g. the deleted_at field is filled) the notification sending needed
to be moved too.

Blueprint: use-nested-allocation-candidates
Change-Id: I77f34788dd7ab8fdf60d668a4f76452e03cf9888
Balazs Gibizer 2018-08-14 10:37:03 +02:00
parent 337b24ca41
commit 6f1a1f5e8e
9 changed files with 184 additions and 44 deletions


@@ -58,12 +58,14 @@ changed or be partially complete at this time.
* `filter allocation candidates by aggregate membership`_
* `perform granular allocation candidate requests`_
* `inventory and allocation data migration`_ (reshaping provider trees)
* `handle allocation updates in a safe way`_
.. _Nested Resource Providers: http://specs.openstack.org/openstack/nova-specs/specs/queens/approved/nested-resource-providers.html
.. _Request Traits During Scheduling: https://specs.openstack.org/openstack/nova-specs/specs/queens/approved/request-traits-in-nova.html
.. _filter allocation candidates by aggregate membership: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/alloc-candidates-member-of.html
.. _perform granular allocation candidate requests: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/granular-resource-requests.html
.. _inventory and allocation data migration: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/reshape-provider-tree.html
.. _handle allocation updates in a safe way: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/add-consumer-generation.html
Deployment
==========


@@ -865,7 +865,8 @@ class ServersController(wsgi.Controller):
raise exc.HTTPNotFound(explanation=msg)
except exception.InstanceUnknownCell as e:
raise exc.HTTPNotFound(explanation=e.format_message())
except exception.InstanceIsLocked as e:
except (exception.InstanceIsLocked,
exception.AllocationDeleteFailed) as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,


@@ -749,19 +749,18 @@ class ComputeManager(manager.Manager):
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
self._complete_deletion(context,
instance,
bdms)
instance)
self._notify_about_instance_usage(context, instance, "delete.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.END, bdms=bdms)
def _complete_deletion(self, context, instance, bdms):
def _complete_deletion(self, context, instance):
self._update_resource_tracker(context, instance)
rt = self._get_resource_tracker()
rt.reportclient.delete_allocation_for_instance(context, instance.uuid)
self._notify_about_instance_usage(context, instance, "delete.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.END, bdms=bdms)
self._clean_instance_console_tokens(context, instance)
self._delete_scheduler_instance_info(context, instance.uuid)
@@ -2624,11 +2623,17 @@ class ComputeManager(manager.Manager):
instance.power_state = power_state.NOSTATE
instance.terminated_at = timeutils.utcnow()
instance.save()
self._complete_deletion(context, instance)
# only destroy the instance in the db if _complete_deletion
# doesn't raise and therefore the allocation is successfully
# deleted in placement
instance.destroy()
self._complete_deletion(context,
instance,
bdms)
self._notify_about_instance_usage(context, instance, "delete.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.END, bdms=bdms)
@wrap_exception()
@reverts_task_state


@@ -2322,6 +2322,11 @@ class AllocationUpdateFailed(NovaException):
'Error: %(error)s')
class AllocationDeleteFailed(NovaException):
msg_fmt = _('Failed to delete allocations for consumer %(consumer_uuid)s. '
'Error: %(error)s')
class TooManyComputesForHost(NovaException):
msg_fmt = _('Unexpected number of compute node records '
'(%(num_computes)d) found for host %(host)s. There should '


@@ -2047,21 +2047,63 @@ class SchedulerReportClient(object):
@safe_connect
def delete_allocation_for_instance(self, context, uuid):
"""Delete the instance allocation from placement
:param context: The security context
:param uuid: the instance UUID which will be used as the consumer UUID
towards placement
:return: Returns True if the allocation is successfully deleted by this
call. Returns False if the allocation does not exist.
:raises AllocationDeleteFailed: If the allocation cannot be read from
placement or it is changed by another process while we tried to
delete it.
"""
url = '/allocations/%s' % uuid
r = self.delete(url, global_request_id=context.global_id)
if r:
# We read the consumer generation then try to put an empty allocation
# for that consumer. If between the GET and the PUT the consumer
# generation changes then we raise AllocationDeleteFailed.
# NOTE(gibi): This only detects a small portion of the possible cases
# where the allocation is modified outside of the delete code path. The
# rest could only be detected if nova cached at least the consumer
# generation of the instance.
# NOTE(gibi): placement does not return 404 for a non-existing consumer
# but returns an empty allocation instead. Putting an empty allocation to
# that non-existing consumer won't be a 404 or other error either.
r = self.get(url, global_request_id=context.global_id,
version=CONSUMER_GENERATION_VERSION)
if not r:
# at the moment there is no way placement returns a failure so we
# could even delete this code
LOG.warning('Unable to delete allocation for instance '
'%(uuid)s: (%(code)i %(text)s)',
{'uuid': uuid,
'code': r.status_code,
'text': r.text})
raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
error=r.text)
allocations = r.json()
if allocations['allocations'] == {}:
# the consumer did not exist in the first place
LOG.debug('Cannot delete allocation for %s consumer in placement '
'as the consumer does not exist', uuid)
return False
# removing all resources from the allocation will auto delete the
# consumer in placement
allocations['allocations'] = {}
r = self.put(url, allocations, global_request_id=context.global_id,
version=CONSUMER_GENERATION_VERSION)
if r.status_code == 204:
LOG.info('Deleted allocation for instance %s', uuid)
return True
else:
# Check for 404 since we don't need to log a warning if we tried to
# delete something which doesn't actually exist.
if r.status_code != 404:
LOG.warning('Unable to delete allocation for instance '
'%(uuid)s: (%(code)i %(text)s)',
{'uuid': uuid,
'code': r.status_code,
'text': r.text})
return False
LOG.warning('Unable to delete allocation for instance '
'%(uuid)s: (%(code)i %(text)s)',
{'uuid': uuid,
'code': r.status_code,
'text': r.text})
raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
error=r.text)
def get_allocations_for_resource_provider(self, context, rp_uuid):
"""Retrieves the allocations for a specific provider.


@@ -20,6 +20,7 @@ import zlib
import mock
from oslo_log import log as logging
from oslo_serialization import base64
from oslo_serialization import jsonutils
from oslo_utils.fixture import uuidsentinel as uuids
from oslo_utils import timeutils
import six
@@ -40,6 +41,7 @@ from nova.tests.functional import integrated_helpers
from nova.tests.unit.api.openstack import fakes
from nova.tests.unit import fake_block_device
from nova.tests.unit import fake_notifier
from nova.tests.unit import fake_requests
import nova.tests.unit.image.fake
from nova.virt import fake
from nova import volume
@@ -4529,3 +4531,87 @@ class ServerTestV256RescheduleTestCase(ServerTestV256Common):
found_server = self.api.get_server(server['id'])
# Check that rescheduling is not occurred.
self.assertNotEqual(other_host, found_server['OS-EXT-SRV-ATTR:host'])
class ConsumerGenerationConflictTest(
integrated_helpers.ProviderUsageBaseTestCase):
# we need the medium driver to be able to allocate resources for more
# than a single instance
compute_driver = 'fake.MediumFakeDriver'
def setUp(self):
super(ConsumerGenerationConflictTest, self).setUp()
flavors = self.api.get_flavors()
self.flavor = flavors[0]
self.other_flavor = flavors[1]
self.compute1 = self._start_compute('compute1')
self.compute2 = self._start_compute('compute2')
def test_server_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host
server = self._boot_and_check_allocations(self.flavor, source_hostname)
rsp = fake_requests.FakeResponse(
409, jsonutils.dumps({'text': 'consumer generation conflict'}))
with mock.patch('keystoneauth1.adapter.Adapter.put',
autospec=True) as mock_put:
mock_put.return_value = rsp
self.api.delete_server(server['id'])
server = self._wait_for_state_change(self.admin_api, server,
'ERROR')
self.assertEqual(1, mock_put.call_count)
# We still have the allocations as deletion failed
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor, source_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
source_allocation = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
# retry the delete to make sure that allocations are removed this time
self._delete_and_check_allocations(server)
def test_server_local_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host
server = self._boot_and_check_allocations(self.flavor, source_hostname)
source_compute_id = self.admin_api.get_services(
host=self.compute1.host, binary='nova-compute')[0]['id']
self.compute1.stop()
self.admin_api.put_service(
source_compute_id, {'forced_down': 'true'})
rsp = fake_requests.FakeResponse(
409, jsonutils.dumps({'text': 'consumer generation conflict'}))
with mock.patch('keystoneauth1.adapter.Adapter.put',
autospec=True) as mock_put:
mock_put.return_value = rsp
ex = self.assertRaises(client.OpenStackApiException,
self.api.delete_server, server['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Failed to delete allocations for consumer',
jsonutils.loads(ex.response.content)[
'conflictingRequest']['message'])
self.assertEqual(1, mock_put.call_count)
# We still have the allocations as deletion failed
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor, source_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
source_allocation = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
# retry the delete to make sure that allocations are removed this time
self._delete_and_check_allocations(server)


@@ -7646,6 +7646,12 @@ class ComputeTestCase(BaseTestCase,
self.stub_out('nova.objects.quotas.Quotas.reserve',
lambda *a, **k: None)
self.stub_out('nova.compute.utils.notify_about_instance_usage',
lambda *a, **k: None)
self.stub_out('nova.compute.utils.notify_about_instance_action',
lambda *a, **k: None)
self.compute._complete_partial_deletion(admin_context, instance)
self.assertNotEqual(0, instance.deleted)


@@ -232,14 +232,22 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
methods_called = [n for n, a, k in call_tracker.mock_calls]
self.assertEqual(['clear_events_for_instance',
'_notify_about_instance_usage',
'_shutdown_instance'],
'_shutdown_instance',
'_notify_about_instance_usage'],
methods_called)
mock_notify.assert_called_once_with(self.context,
mock_inst,
specd_compute.host,
action='delete',
phase='start',
bdms=mock_bdms)
mock_notify.assert_has_calls([
mock.call(self.context,
mock_inst,
specd_compute.host,
action='delete',
phase='start',
bdms=mock_bdms),
mock.call(self.context,
mock_inst,
specd_compute.host,
action='delete',
phase='end',
bdms=mock_bdms)])
def _make_compute_node(self, hyp_hostname, cn_id):
cn = mock.Mock(spec_set=['hypervisor_hostname', 'id',


@@ -3222,21 +3222,6 @@ class TestAllocations(SchedulerReportClientTestCase):
}
self.assertEqual(expected, result)
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'delete')
@mock.patch('nova.scheduler.client.report.LOG')
def test_delete_allocation_for_instance_ignore_404(self, mock_log,
mock_delete):
"""Tests that we don't log a warning on a 404 response when trying to
delete an allocation record.
"""
mock_delete.return_value = fake_requests.FakeResponse(404)
self.client.delete_allocation_for_instance(self.context, uuids.rp_uuid)
# make sure we didn't screw up the logic or the mock
mock_log.info.assert_not_called()
# make sure warning wasn't called for the 404
mock_log.warning.assert_not_called()
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete")
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."