From cd50dcaf3e51722c9510d417c1724d8cdafe450b Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Wed, 11 Apr 2018 21:24:43 -0400 Subject: [PATCH] Delete allocations from API if nova-compute is down When performing a "local delete" of an instance, we need to delete the allocations that the instance has against any resource providers in Placement. It should be noted that without this change, restarting the nova-compute service will delete the allocations for its compute node (assuming the compute node UUID is the same as before the instance was deleted). That is shown in the existing functional test modified here. The more important reason for this change is that in order to fix bug 1756179, we need to make sure the resource provider allocations for a given compute node are gone by the time the compute service is deleted. This adds a new functional test and a release note for the new behavior and need to configure nova-api for talking to placement, which is idempotent if not configured thanks to the @safe_connect decorator used in SchedulerReportClient. Closes-Bug: #1679750 Related-Bug: #1756179 Conflicts: nova/compute/api.py NOTE(mriedem): The compute/api conflict is due to not having change I393118861d1f921cc2d71011ddedaf43a2e8dbdf in Pike. In addition to this, the call to delete_allocation_for_instance() does not include the context parameter which was introduced in change If38e4a6d49910f0aa5016e1bcb61aac2be416fa7 which is also not in Pike. Change-Id: If507e23f0b7e5fa417041c3870d77786498f741d (cherry picked from commit ea9d0af31395fbe1686fa681cd91226ee580796e) (cherry picked from commit cba1a3e2c1b161204a3662a0d9fbf33da38aa7d3) --- nova/compute/api.py | 10 +++ .../regressions/test_bug_1679750.py | 87 +++++++++++++++++-- nova/tests/functional/wsgi/test_services.py | 14 +-- ...l-delete-allocations-cb7bfbcb6c36b6a2.yaml | 12 +++ 4 files changed, 102 insertions(+), 21 deletions(-) create mode 100644 releasenotes/notes/bug-1679750-local-delete-allocations-cb7bfbcb6c36b6a2.yaml diff --git a/nova/compute/api.py b/nova/compute/api.py index 031587db99bc..686fe63b8f88 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -249,6 +249,13 @@ class API(base.Base): self.image_api = image_api or image.API() self.network_api = network_api or network.API() self.volume_api = volume_api or cinder.API() + # NOTE(mriedem): This looks a bit weird but we get the reportclient + # via SchedulerClient since it lazy-loads SchedulerReportClient on + # the first usage which helps to avoid a bunch of lockutils spam in + # the nova-api logs every time the service is restarted (remember + # that pretty much all of the REST API controllers construct this + # API class). + self.placementclient = scheduler_client.SchedulerClient().reportclient self.security_group_api = (security_group_api or openstack_driver.get_openstack_security_group_driver()) self.consoleauth_rpcapi = consoleauth_rpcapi.ConsoleAuthAPI() @@ -2117,6 +2124,9 @@ class API(base.Base): # cleanup volumes self._local_cleanup_bdm_volumes(bdms, instance, context) + # Cleanup allocations in Placement since we can't do it from the + # compute service. + self.placementclient.delete_allocation_for_instance(instance.uuid) cb(context, instance, bdms, local=True) sys_meta = instance.system_metadata instance.destroy() diff --git a/nova/tests/functional/regressions/test_bug_1679750.py b/nova/tests/functional/regressions/test_bug_1679750.py index b93ce5107c5c..7493c92f6994 100644 --- a/nova/tests/functional/regressions/test_bug_1679750.py +++ b/nova/tests/functional/regressions/test_bug_1679750.py @@ -10,6 +10,7 @@ # License for the specific language governing permissions and limitations # under the License. +from nova.scheduler.client import report as reportclient from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional import integrated_helpers @@ -57,12 +58,87 @@ class TestLocalDeleteAllocations(test.TestCase, resp = self.placement_api.get(fmt % {'uuid': rp_uuid}) return resp.body['usages'] - def test_local_delete_removes_allocations(self): + # NOTE(mriedem): It would be preferable to use the PlacementFixture as + # a context manager but that causes some issues when trying to delete the + # server in test_local_delete_removes_allocations_after_compute_restart. + def _stub_compute_api_to_not_configure_placement(self): + """Prior to the compute API deleting allocations in the "local delete" + case, nova.conf for nova-api might not be configured for talking to + the placement service, so we can mock that behavior by stubbing out + the placement client in the compute API to no-op as if safe_connect + failed and returned None to the caller. + """ + orig_delete_alloc = ( + reportclient.SchedulerReportClient.delete_allocation_for_instance) + self.call_count = 0 + + def fake_delete_allocation_for_instance(*args, **kwargs): + # The first call will be from the API, so ignore that one and + # return None like the @safe_connect decorator would if nova-api + # wasn't configured to talk to placement. + if self.call_count: + orig_delete_alloc(*args, **kwargs) + else: + self.call_count += 1 + + self.stub_out('nova.scheduler.client.report.SchedulerReportClient.' + 'delete_allocation_for_instance', + fake_delete_allocation_for_instance) + + def test_local_delete_removes_allocations_after_compute_restart(self): """Tests that allocations are removed after a local delete. This tests the scenario where a server is local deleted (because the compute host is down) and we want to make sure that its allocations - have been cleaned up. + have been cleaned up once the nova-compute service restarts. + """ + self._stub_compute_api_to_not_configure_placement() + # Get allocations, make sure they are 0. + resp = self.placement_api.get('/resource_providers') + rp_uuid = resp.body['resource_providers'][0]['uuid'] + usages_before = self._get_usages(rp_uuid) + for usage in usages_before.values(): + self.assertEqual(0, usage) + + # Create a server. + server = self._build_minimal_create_server_request(self.api, + 'local-delete-test', self.image_id, self.flavor_id, 'none') + server = self.admin_api.post_server({'server': server}) + server = self._wait_for_state_change(self.api, server, 'ACTIVE') + + # Assert usages are non zero now. + usages_during = self._get_usages(rp_uuid) + for usage in usages_during.values(): + self.assertNotEqual(0, usage) + + # Force-down compute to trigger local delete. + self.compute.stop() + compute_service_id = self.admin_api.get_services( + host=self.compute.host, binary='nova-compute')[0]['id'] + self.admin_api.put_service(compute_service_id, {'forced_down': True}) + + # Delete the server (will be a local delete because compute is down). + self.api.delete_server(server['id']) + + # Assert usages are still non-zero. + usages_during = self._get_usages(rp_uuid) + for usage in usages_during.values(): + self.assertNotEqual(0, usage) + + # Start the compute service again. Before it comes up, it will call the + # update_available_resource code in the ResourceTracker which is what + # "heals" the allocations for the deleted instance. + self.compute.start() + + # Get the allocations again to check against the original. + usages_after = self._get_usages(rp_uuid) + + # They should match. + self.assertEqual(usages_before, usages_after) + + def test_local_delete_removes_allocations_from_api(self): + """Tests that the compute API deletes allocations when the compute + service on which the instance was running is down. """ # Get allocations, make sure they are 0. resp = self.placement_api.get('/resource_providers') @@ -91,12 +167,7 @@ class TestLocalDeleteAllocations(test.TestCase, # Delete the server (will be a local delete because compute is down). self.api.delete_server(server['id']) - # Start the compute service again. Before it comes up, it will call the - # update_available_resource code in the ResourceTracker which is what - # "heals" the allocations for the deleted instance. - self.compute.start() - - # Get the allocations again to check against the original. + # Get the allocations again to make sure they were deleted. usages_after = self._get_usages(rp_uuid) # They should match. diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py index 26860056de28..eb3902fa468c 100644 --- a/nova/tests/functional/wsgi/test_services.py +++ b/nova/tests/functional/wsgi/test_services.py @@ -86,12 +86,7 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase): self.assertEqual(409, ex.response.status_code) # Now delete the instance and wait for it to be gone. - # Note that we can't use self._delete_and_check_allocations here - # because of bug 1679750 where allocations are not deleted when - # an instance is deleted and the compute service it's running on - # is down. - self.api.delete_server(server['id']) - self._wait_until_deleted(server) + self._delete_and_check_allocations(server) # Now we can delete the service. self.admin_api.api_delete('/os-services/%s' % service['id']) @@ -120,10 +115,3 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase): # in a 404 response. resp = self.placement_api.get('/resource_providers/%s' % rp_uuid) self.assertEqual(200, resp.status) - # Assert the allocations still exist for the server against the - # compute node resource provider. Once the bug is fixed, there should - # be no allocations for the server. - allocations = self._get_allocations_by_server_uuid(server['id']) - self.assertEqual(1, len(allocations)) - allocation = allocations[rp_uuid]['resources'] - self.assertFlavorMatchesAllocation(flavor, allocation) diff --git a/releasenotes/notes/bug-1679750-local-delete-allocations-cb7bfbcb6c36b6a2.yaml b/releasenotes/notes/bug-1679750-local-delete-allocations-cb7bfbcb6c36b6a2.yaml new file mode 100644 index 000000000000..178e810cba3a --- /dev/null +++ b/releasenotes/notes/bug-1679750-local-delete-allocations-cb7bfbcb6c36b6a2.yaml @@ -0,0 +1,12 @@ +--- +upgrade: + - | + The ``nova-api`` service now requires the ``[placement]`` section to be + configured in nova.conf if you are using a separate config file just for + that service. This is because the ``nova-api`` service now needs to talk + to the placement service in order to delete resource provider allocations + when deleting an instance and the ``nova-compute`` service on which that + instance is running is down. This change is idempotent if + ``[placement]`` is not configured in ``nova-api`` but it will result in + new warnings in the logs until configured. See bug + https://bugs.launchpad.net/nova/+bug/1679750 for more details.