Merge "Delete allocations from API if nova-compute is down" into stable/pike

Zuul 2018-07-09 19:31:26 +00:00 committed by Gerrit Code Review
commit 826489881f
4 changed files with 102 additions and 21 deletions

View File

@@ -249,6 +249,13 @@ class API(base.Base):
self.image_api = image_api or image.API()
self.network_api = network_api or network.API()
self.volume_api = volume_api or cinder.API()
# NOTE(mriedem): This looks a bit weird but we get the reportclient
# via SchedulerClient since it lazy-loads SchedulerReportClient on
# the first usage which helps to avoid a bunch of lockutils spam in
# the nova-api logs every time the service is restarted (remember
# that pretty much all of the REST API controllers construct this
# API class).
self.placementclient = scheduler_client.SchedulerClient().reportclient
self.security_group_api = (security_group_api or
openstack_driver.get_openstack_security_group_driver())
self.consoleauth_rpcapi = consoleauth_rpcapi.ConsoleAuthAPI()
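The NOTE above hinges on lazy construction: building SchedulerReportClient is relatively expensive (it pulls in config and lock setup), so it should not happen just because a REST controller instantiates compute.API(). A minimal sketch of the lazy-load idea, using a hypothetical LazyReportClientHolder rather than nova's actual helper:

import importlib

class LazyReportClientHolder(object):
    """Defer construction of a wrapped client until first attribute access."""

    def __init__(self, class_path):
        self._class_path = class_path
        self._client = None

    def __getattr__(self, name):
        # Called only for attributes not found on the holder itself, i.e.
        # anything meant for the real client.
        if self._client is None:
            module_name, _, cls_name = self._class_path.rpartition('.')
            cls = getattr(importlib.import_module(module_name), cls_name)
            self._client = cls()  # the expensive setup happens here, once
        return getattr(self._client, name)

With that pattern, instantiating the API class stays cheap; the heavy client is only built the first time something like self.placementclient.delete_allocation_for_instance(...) is actually called.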
@@ -2117,6 +2124,9 @@ class API(base.Base):
# cleanup volumes
self._local_cleanup_bdm_volumes(bdms, instance, context)
# Cleanup allocations in Placement since we can't do it from the
# compute service.
self.placementclient.delete_allocation_for_instance(instance.uuid)
cb(context, instance, bdms, local=True)
sys_meta = instance.system_metadata
instance.destroy()
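For context on what delete_allocation_for_instance ultimately does: the placement service exposes DELETE /allocations/{consumer_uuid}, where the consumer is the instance UUID. A hedged sketch at the HTTP level (nova really goes through an authenticated keystoneauth session wrapped by @safe_connect, not raw requests; the token handling here is illustrative):

import requests

def delete_allocations(placement_url, token, instance_uuid):
    """Drop every resource provider allocation held by one consumer."""
    resp = requests.delete(
        '%s/allocations/%s' % (placement_url.rstrip('/'), instance_uuid),
        headers={'X-Auth-Token': token})
    # 204 means the allocations were removed; 404 means the consumer had
    # none to begin with, which is also fine for a delete.
    return resp.status_code in (204, 404)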

View File

@@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from nova.scheduler.client import report as reportclient
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import integrated_helpers
@@ -57,12 +58,87 @@ class TestLocalDeleteAllocations(test.TestCase,
resp = self.placement_api.get(fmt % {'uuid': rp_uuid})
return resp.body['usages']
def test_local_delete_removes_allocations(self):
# NOTE(mriedem): It would be preferable to use the PlacementFixture as
# a context manager but that causes some issues when trying to delete the
# server in test_local_delete_removes_allocations_after_compute_restart.
def _stub_compute_api_to_not_configure_placement(self):
"""Prior to the compute API deleting allocations in the "local delete"
case, nova.conf for nova-api might not be configured for talking to
the placement service, so we can mock that behavior by stubbing out
the placement client in the compute API to no-op as if safe_connect
failed and returned None to the caller.
"""
orig_delete_alloc = (
reportclient.SchedulerReportClient.delete_allocation_for_instance)
self.call_count = 0
def fake_delete_allocation_for_instance(*args, **kwargs):
# The first call will be from the API, so ignore that one and
# return None like the @safe_connect decorator would if nova-api
# wasn't configured to talk to placement.
if self.call_count:
orig_delete_alloc(*args, **kwargs)
else:
self.call_count += 1
self.stub_out('nova.scheduler.client.report.SchedulerReportClient.'
'delete_allocation_for_instance',
fake_delete_allocation_for_instance)
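The stub's first-call behavior mirrors the @safe_connect decorator that wraps the report client methods: when placement is unconfigured or unreachable it returns None instead of raising. A rough sketch of that decorator (the exception list is illustrative, not exhaustive):

import functools

from keystoneauth1 import exceptions as ks_exc

def safe_connect(f):
    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        try:
            return f(self, *args, **kwargs)
        except (ks_exc.EndpointNotFound,
                ks_exc.MissingAuthPlugin,
                ks_exc.ConnectFailure):
            # The real client logs a warning here; callers simply see None.
            return None
    return wrapper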
def test_local_delete_removes_allocations_after_compute_restart(self):
"""Tests that allocations are removed after a local delete.
This tests the scenario where a server is local deleted (because the
compute host is down) and we want to make sure that its allocations
have been cleaned up.
have been cleaned up once the nova-compute service restarts.
"""
self._stub_compute_api_to_not_configure_placement()
# Get allocations, make sure they are 0.
resp = self.placement_api.get('/resource_providers')
rp_uuid = resp.body['resource_providers'][0]['uuid']
usages_before = self._get_usages(rp_uuid)
for usage in usages_before.values():
self.assertEqual(0, usage)
# Create a server.
server = self._build_minimal_create_server_request(self.api,
'local-delete-test', self.image_id, self.flavor_id, 'none')
server = self.admin_api.post_server({'server': server})
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
# Assert usages are non zero now.
usages_during = self._get_usages(rp_uuid)
for usage in usages_during.values():
self.assertNotEqual(0, usage)
# Force-down compute to trigger local delete.
self.compute.stop()
compute_service_id = self.admin_api.get_services(
host=self.compute.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute_service_id, {'forced_down': True})
# Delete the server (will be a local delete because compute is down).
self.api.delete_server(server['id'])
# Assert usages are still non-zero.
usages_during = self._get_usages(rp_uuid)
for usage in usages_during.values():
self.assertNotEqual(0, usage)
# Start the compute service again. Before it comes up, it will call the
# update_available_resource code in the ResourceTracker which is what
# "heals" the allocations for the deleted instance.
self.compute.start()
# Get the allocations again to check against the original.
usages_after = self._get_usages(rp_uuid)
# They should match.
self.assertEqual(usages_before, usages_after)
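The "healing" mentioned in the comment boils down to this: when the compute service starts, the resource tracker compares the allocations recorded against its resource provider with the instances it still tracks, and removes allocations for instances that no longer exist. An illustrative sketch only, with hypothetical helper names rather than nova's actual ResourceTracker methods:

def heal_allocations(reportclient, compute_node_uuid, tracked_instance_uuids):
    # Assumed helper returning {consumer_uuid: resources}; the exact call
    # nova uses differs, but the comparison is the core idea.
    allocations = reportclient.get_allocations_for_resource_provider(
        compute_node_uuid)
    for consumer_uuid in allocations:
        if consumer_uuid not in tracked_instance_uuids:
            # The instance was deleted while this host was down; drop its
            # leftover allocations.
            reportclient.delete_allocation_for_instance(consumer_uuid)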
def test_local_delete_removes_allocations_from_api(self):
"""Tests that the compute API deletes allocations when the compute
service on which the instance was running is down.
"""
# Get allocations, make sure they are 0.
resp = self.placement_api.get('/resource_providers')
@@ -91,12 +167,7 @@ class TestLocalDeleteAllocations(test.TestCase,
# Delete the server (will be a local delete because compute is down).
self.api.delete_server(server['id'])
# Start the compute service again. Before it comes up, it will call the
# update_available_resource code in the ResourceTracker which is what
# "heals" the allocations for the deleted instance.
self.compute.start()
# Get the allocations again to check against the original.
# Get the allocations again to make sure they were deleted.
usages_after = self._get_usages(rp_uuid)
# They should match.

View File

@@ -86,12 +86,7 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
self.assertEqual(409, ex.response.status_code)
# Now delete the instance and wait for it to be gone.
# Note that we can't use self._delete_and_check_allocations here
# because of bug 1679750 where allocations are not deleted when
# an instance is deleted and the compute service it's running on
# is down.
self.api.delete_server(server['id'])
self._wait_until_deleted(server)
self._delete_and_check_allocations(server)
# Now we can delete the service.
self.admin_api.api_delete('/os-services/%s' % service['id'])
@@ -120,10 +115,3 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
# in a 404 response.
resp = self.placement_api.get('/resource_providers/%s' % rp_uuid)
self.assertEqual(200, resp.status)
# Assert the allocations still exist for the server against the
# compute node resource provider. Once the bug is fixed, there should
# be no allocations for the server.
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual(1, len(allocations))
allocation = allocations[rp_uuid]['resources']
self.assertFlavorMatchesAllocation(flavor, allocation)

View File

@@ -0,0 +1,12 @@
---
upgrade:
- |
The ``nova-api`` service now requires the ``[placement]`` section to be
configured in nova.conf if you are using a separate config file just for
that service. This is because the ``nova-api`` service now needs to talk
to the placement service in order to delete resource provider allocations
when an instance is deleted while the ``nova-compute`` service on which it
is running is down. If ``[placement]`` is not configured for ``nova-api``,
the delete still succeeds but the allocation cleanup is skipped and new
warnings appear in the logs until it is configured. See bug
https://bugs.launchpad.net/nova/+bug/1679750 for more details.
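For operators, a minimal sketch of the kind of ``[placement]`` section the note refers to; every value below is a placeholder and the exact auth options depend on the deployment:

[placement]
os_region_name = RegionOne
auth_type = password
auth_url = http://controller/identity/v3
project_name = service
project_domain_name = Default
username = placement
user_domain_name = Default
password = PLACEMENT_PASS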