diff --git a/doc/source/user/placement.rst b/doc/source/user/placement.rst index f22421230f90..c20833109dc8 100644 --- a/doc/source/user/placement.rst +++ b/doc/source/user/placement.rst @@ -58,12 +58,14 @@ changed or be partially complete at this time. * `filter allocation candidates by aggregate membership`_ * `perform granular allocation candidate requests`_ * `inventory and allocation data migration`_ (reshaping provider trees) +* `handle allocation updates in a safe way`_ .. _Nested Resource Providers: http://specs.openstack.org/openstack/nova-specs/specs/queens/approved/nested-resource-providers.html .. _Request Traits During Scheduling: https://specs.openstack.org/openstack/nova-specs/specs/queens/approved/request-traits-in-nova.html .. _filter allocation candidates by aggregate membership: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/alloc-candidates-member-of.html .. _perform granular allocation candidate requests: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/granular-resource-requests.html .. _inventory and allocation data migration: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/reshape-provider-tree.html +.. 
_handle allocation updates in a safe way: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/add-consumer-generation.html Deployment ========== diff --git a/nova/api/openstack/compute/servers.py b/nova/api/openstack/compute/servers.py index e46e9ed329d9..1c2a773ecf22 100644 --- a/nova/api/openstack/compute/servers.py +++ b/nova/api/openstack/compute/servers.py @@ -865,7 +865,8 @@ class ServersController(wsgi.Controller): raise exc.HTTPNotFound(explanation=msg) except exception.InstanceUnknownCell as e: raise exc.HTTPNotFound(explanation=e.format_message()) - except exception.InstanceIsLocked as e: + except (exception.InstanceIsLocked, + exception.AllocationDeleteFailed) as e: raise exc.HTTPConflict(explanation=e.format_message()) except exception.InstanceInvalidState as state_error: common.raise_http_conflict_for_instance_invalid_state(state_error, diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 4dfd6bd4a5c4..f6715dd7fbec 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -749,19 +749,18 @@ class ComputeManager(manager.Manager): bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) self._complete_deletion(context, - instance, - bdms) + instance) + self._notify_about_instance_usage(context, instance, "delete.end") + compute_utils.notify_about_instance_action(context, instance, + self.host, action=fields.NotificationAction.DELETE, + phase=fields.NotificationPhase.END, bdms=bdms) - def _complete_deletion(self, context, instance, bdms): + def _complete_deletion(self, context, instance): self._update_resource_tracker(context, instance) rt = self._get_resource_tracker() rt.reportclient.delete_allocation_for_instance(context, instance.uuid) - self._notify_about_instance_usage(context, instance, "delete.end") - compute_utils.notify_about_instance_action(context, instance, - self.host, action=fields.NotificationAction.DELETE, - phase=fields.NotificationPhase.END, bdms=bdms) 
self._clean_instance_console_tokens(context, instance) self._delete_scheduler_instance_info(context, instance.uuid) @@ -2624,11 +2623,17 @@ class ComputeManager(manager.Manager): instance.power_state = power_state.NOSTATE instance.terminated_at = timeutils.utcnow() instance.save() + + self._complete_deletion(context, instance) + # only destroy the instance in the db if the _complete_deletion + # doesn't raise and therefore allocation is successfully + # deleted in placement instance.destroy() - self._complete_deletion(context, - instance, - bdms) + self._notify_about_instance_usage(context, instance, "delete.end") + compute_utils.notify_about_instance_action(context, instance, + self.host, action=fields.NotificationAction.DELETE, + phase=fields.NotificationPhase.END, bdms=bdms) @wrap_exception() @reverts_task_state diff --git a/nova/exception.py b/nova/exception.py index 4e5a5f79486c..fb5f7785c4a5 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2322,6 +2322,11 @@ class AllocationUpdateFailed(NovaException): 'Error: %(error)s') +class AllocationDeleteFailed(NovaException): + msg_fmt = _('Failed to delete allocations for consumer %(consumer_uuid)s. ' + 'Error: %(error)s') + + class TooManyComputesForHost(NovaException): msg_fmt = _('Unexpected number of compute node records ' '(%(num_computes)d) found for host %(host)s. There should ' diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py index 380a1f721148..a6deb80fd839 100644 --- a/nova/scheduler/client/report.py +++ b/nova/scheduler/client/report.py @@ -2047,21 +2047,63 @@ class SchedulerReportClient(object): @safe_connect def delete_allocation_for_instance(self, context, uuid): + """Delete the instance allocation from placement + + :param context: The security context + :param uuid: the instance UUID which will be used as the consumer UUID + towards placement + :return: Returns True if the allocation is successfully deleted by this + call. 
Returns False if the allocation does not exist. + :raises AllocationDeleteFailed: If the allocation cannot be read from + placement or it is changed by another process while we tried to + delete it. + """ url = '/allocations/%s' % uuid - r = self.delete(url, global_request_id=context.global_id) - if r: + # We read the consumer generation then try to put an empty allocation + # for that consumer. If between the GET and the PUT the consumer + # generation changes then we raise AllocationDeleteFailed. + # NOTE(gibi): This only detects a small portion of possible cases when + # allocation is modified outside of the delete code path. The rest can + # only be detected if nova would cache at least the consumer generation + # of the instance. + # NOTE(gibi): placement does not return 404 for non-existing consumer + # but returns an empty consumer instead. Putting an empty allocation to + # that non-existing consumer won't be 404 or other error either. + r = self.get(url, global_request_id=context.global_id, + version=CONSUMER_GENERATION_VERSION) + if not r: + # at the moment there is no way placement returns a failure so we + # could even delete this code + LOG.warning('Unable to delete allocation for instance ' + '%(uuid)s: (%(code)i %(text)s)', + {'uuid': uuid, + 'code': r.status_code, + 'text': r.text}) + raise exception.AllocationDeleteFailed(consumer_uuid=uuid, + error=r.text) + allocations = r.json() + if allocations['allocations'] == {}: + # the consumer did not exist in the first place + LOG.debug('Cannot delete allocation for %s consumer in placement ' + 'as consumer does not exist', uuid) + return False + + # removing all resources from the allocation will auto delete the + # consumer in placement + allocations['allocations'] = {} + r = self.put(url, allocations, global_request_id=context.global_id, + version=CONSUMER_GENERATION_VERSION) + if r.status_code == 204: LOG.info('Deleted allocation for instance %s', uuid) return True else: - # Check for 404 since we
don't need to log a warning if we tried to - # delete something which doesn't actually exist. - if r.status_code != 404: - LOG.warning('Unable to delete allocation for instance ' - '%(uuid)s: (%(code)i %(text)s)', - {'uuid': uuid, - 'code': r.status_code, - 'text': r.text}) - return False + LOG.warning('Unable to delete allocation for instance ' + '%(uuid)s: (%(code)i %(text)s)', + {'uuid': uuid, + 'code': r.status_code, + 'text': r.text}) + raise exception.AllocationDeleteFailed(consumer_uuid=uuid, + error=r.text) def get_allocations_for_resource_provider(self, context, rp_uuid): """Retrieves the allocations for a specific provider. diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index 9962cdec9dd6..3b57deb1515c 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -20,6 +20,7 @@ import zlib import mock from oslo_log import log as logging from oslo_serialization import base64 +from oslo_serialization import jsonutils from oslo_utils.fixture import uuidsentinel as uuids from oslo_utils import timeutils import six @@ -40,6 +41,7 @@ from nova.tests.functional import integrated_helpers from nova.tests.unit.api.openstack import fakes from nova.tests.unit import fake_block_device from nova.tests.unit import fake_notifier +from nova.tests.unit import fake_requests import nova.tests.unit.image.fake from nova.virt import fake from nova import volume @@ -4529,3 +4531,87 @@ class ServerTestV256RescheduleTestCase(ServerTestV256Common): found_server = self.api.get_server(server['id']) # Check that rescheduling is not occurred. 
self.assertNotEqual(other_host, found_server['OS-EXT-SRV-ATTR:host']) + + +class ConsumerGenerationConflictTest( + integrated_helpers.ProviderUsageBaseTestCase): + + # we need the medium driver to be able to allocate resource not just for + # a single instance + compute_driver = 'fake.MediumFakeDriver' + + def setUp(self): + super(ConsumerGenerationConflictTest, self).setUp() + flavors = self.api.get_flavors() + self.flavor = flavors[0] + self.other_flavor = flavors[1] + self.compute1 = self._start_compute('compute1') + self.compute2 = self._start_compute('compute2') + + def test_server_delete_fails_due_to_conflict(self): + source_hostname = self.compute1.host + + server = self._boot_and_check_allocations(self.flavor, source_hostname) + + rsp = fake_requests.FakeResponse( + 409, jsonutils.dumps({'text': 'consumer generation conflict'})) + + with mock.patch('keystoneauth1.adapter.Adapter.put', + autospec=True) as mock_put: + mock_put.return_value = rsp + + self.api.delete_server(server['id']) + server = self._wait_for_state_change(self.admin_api, server, + 'ERROR') + self.assertEqual(1, mock_put.call_count) + + # We still have the allocations as deletion failed + source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) + source_usages = self._get_provider_usages(source_rp_uuid) + self.assertFlavorMatchesAllocation(self.flavor, source_usages) + + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual(1, len(allocations)) + source_allocation = allocations[source_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(self.flavor, source_allocation) + + # retry the delete to make sure that allocations are removed this time + self._delete_and_check_allocations(server) + + def test_server_local_delete_fails_due_to_conflict(self): + source_hostname = self.compute1.host + + server = self._boot_and_check_allocations(self.flavor, source_hostname) + source_compute_id = self.admin_api.get_services( + host=self.compute1.host, 
binary='nova-compute')[0]['id'] + self.compute1.stop() + self.admin_api.put_service( + source_compute_id, {'forced_down': 'true'}) + + rsp = fake_requests.FakeResponse( + 409, jsonutils.dumps({'text': 'consumer generation conflict'})) + + with mock.patch('keystoneauth1.adapter.Adapter.put', + autospec=True) as mock_put: + mock_put.return_value = rsp + + ex = self.assertRaises(client.OpenStackApiException, + self.api.delete_server, server['id']) + self.assertEqual(409, ex.response.status_code) + self.assertIn('Failed to delete allocations for consumer', + jsonutils.loads(ex.response.content)[ + 'conflictingRequest']['message']) + self.assertEqual(1, mock_put.call_count) + + # We still have the allocations as deletion failed + source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) + source_usages = self._get_provider_usages(source_rp_uuid) + self.assertFlavorMatchesAllocation(self.flavor, source_usages) + + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual(1, len(allocations)) + source_allocation = allocations[source_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(self.flavor, source_allocation) + + # retry the delete to make sure that allocations are removed this time + self._delete_and_check_allocations(server) diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 8484973f57be..142cb3b9083e 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -7646,6 +7646,12 @@ class ComputeTestCase(BaseTestCase, self.stub_out('nova.objects.quotas.Quotas.reserve', lambda *a, **k: None) + self.stub_out('nova.compute.utils.notify_about_instance_usage', + lambda *a, **k: None) + + self.stub_out('nova.compute.utils.notify_about_instance_action', + lambda *a, **k: None) + self.compute._complete_partial_deletion(admin_context, instance) self.assertNotEqual(0, instance.deleted) diff --git a/nova/tests/unit/compute/test_compute_mgr.py 
b/nova/tests/unit/compute/test_compute_mgr.py index afa0c33b02f0..e9e54a77e3cc 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -232,14 +232,22 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): methods_called = [n for n, a, k in call_tracker.mock_calls] self.assertEqual(['clear_events_for_instance', '_notify_about_instance_usage', - '_shutdown_instance'], + '_shutdown_instance', + '_notify_about_instance_usage'], methods_called) - mock_notify.assert_called_once_with(self.context, - mock_inst, - specd_compute.host, - action='delete', - phase='start', - bdms=mock_bdms) + mock_notify.assert_has_calls([ + mock.call(self.context, + mock_inst, + specd_compute.host, + action='delete', + phase='start', + bdms=mock_bdms), + mock.call(self.context, + mock_inst, + specd_compute.host, + action='delete', + phase='end', + bdms=mock_bdms)]) def _make_compute_node(self, hyp_hostname, cn_id): cn = mock.Mock(spec_set=['hypervisor_hostname', 'id', diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py index e0b389064776..b37b4ad0b76b 100644 --- a/nova/tests/unit/scheduler/client/test_report.py +++ b/nova/tests/unit/scheduler/client/test_report.py @@ -3222,21 +3222,6 @@ class TestAllocations(SchedulerReportClientTestCase): } self.assertEqual(expected, result) - @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' - 'delete') - @mock.patch('nova.scheduler.client.report.LOG') - def test_delete_allocation_for_instance_ignore_404(self, mock_log, - mock_delete): - """Tests that we don't log a warning on a 404 response when trying to - delete an allocation record. 
- """ - mock_delete.return_value = fake_requests.FakeResponse(404) - self.client.delete_allocation_for_instance(self.context, uuids.rp_uuid) - # make sure we didn't screw up the logic or the mock - mock_log.info.assert_not_called() - # make sure warning wasn't called for the 404 - mock_log.warning.assert_not_called() - @mock.patch("nova.scheduler.client.report.SchedulerReportClient." "delete") @mock.patch("nova.scheduler.client.report.SchedulerReportClient."