From a817b78dc44cf2cb4157531b2d92b03a4d0ca7d1 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Wed, 11 Apr 2018 18:00:39 -0400 Subject: [PATCH] Block deleting compute services which are hosting instances This change makes "DELETE /os-services/{service_id}" fail with a 409 response when attempting to delete a nova-compute service which is still hosting instances. Deleting a compute service also results in deleting the related compute_nodes table entry for that service host. The compute node resource provider in placement is tied to the compute node via the UUID, and if we allow deleting the compute service and node then the resource provider for that node is effectively orphaned in Placement, along with the instances which have allocations against that resource provider. Furthermore, restarting the compute service will create a new service and compute_nodes record, and the compute node would have a new UUID and resource provider. This will affect scheduling for that host since Placement will be reporting it as having available capacity which in reality is not accurate. A release note is included for the (justified) behavior change in the API. A new microversion should not be required for this since admins should not have to opt out of broken behavior. Since this API did not previously expect to return a 409 response, the "expected_errors" decorator is updated and again, should not require a microversion per the guidelines: https://docs.openstack.org/nova/latest/contributor/microversions.html#when-a-microversion-is-not-needed Conflicts: nova/tests/functional/wsgi/test_services.py NOTE(mriedem): This is due to the rc_fields move from change Iea182341f9419cb514a044f76864d6bec60a3683 in Rocky. Change-Id: I0bd63b655ad3d3d39af8d15c781ce0a45efc8e3a Closes-Bug: #1763183 (cherry picked from commit 42f62f1ed2ad76829eb9d40a8b9646a523f6381f) --- api-ref/source/os-services.inc | 7 ++++- nova/api/openstack/compute/services.py | 18 ++++++++++++- nova/tests/functional/wsgi/test_services.py | 27 ++++++++++++++----- ...elete-with-instances-d7c5c47e4ce31239.yaml | 9 +++++++ 4 files changed, 53 insertions(+), 8 deletions(-) create mode 100644 releasenotes/notes/bug-1763183-service-delete-with-instances-d7c5c47e4ce31239.yaml diff --git a/api-ref/source/os-services.inc b/api-ref/source/os-services.inc index 298dc0750e3e..0f2c5cbc2711 100644 --- a/api-ref/source/os-services.inc +++ b/api-ref/source/os-services.inc @@ -318,6 +318,10 @@ Delete Compute Service Deletes a service. If it's a ``nova-compute`` service, then the corresponding host will be removed from all the host aggregates as well. +Attempts to delete a ``nova-compute`` service which is still hosting instances +will result in a 409 HTTPConflict response. The instances will need to be +migrated or deleted before a compute service can be deleted. + .. important:: Be sure to stop the actual ``nova-compute`` process on the physical host *before* deleting the service with this API. Failing to do so can lead to the running service re-creating @@ -325,7 +329,8 @@ corresponding host will be removed from all the host aggregates as well. Normal response codes: 204 -Error response codes: badRequest(400), unauthorized(401), forbidden(403), itemNotFound(404) +Error response codes: badRequest(400), unauthorized(401), forbidden(403), +itemNotFound(404), conflict(409) Request ------- diff --git a/nova/api/openstack/compute/services.py b/nova/api/openstack/compute/services.py index 07db925dea07..608d783c1891 100644 --- a/nova/api/openstack/compute/services.py +++ b/nova/api/openstack/compute/services.py @@ -24,6 +24,7 @@ from nova import availability_zones from nova import compute from nova import exception from nova.i18n import _ +from nova import objects from nova.policies import services as services_policies from nova import servicegroup from nova import utils @@ -189,7 +190,7 @@ class ServiceController(wsgi.Controller): return action(body, context) @wsgi.response(204) - @wsgi.expected_errors((400, 404)) + @wsgi.expected_errors((400, 404, 409)) def delete(self, req, id): """Deletes the specified service.""" context = req.environ['nova.context'] @@ -211,6 +212,21 @@ class ServiceController(wsgi.Controller): service = self.host_api.service_get_by_id(context, id) # remove the service from all the aggregates in which it's included if service.binary == 'nova-compute': + # Check to see if there are any instances on this compute host + # because if there are, we need to block the service (and + # related compute_nodes record) delete since it will impact + # resource accounting in Placement and orphan the compute node + # resource provider. + # TODO(mriedem): Use a COUNT SQL query-based function instead + # of InstanceList.get_uuids_by_host for performance. + instance_uuids = objects.InstanceList.get_uuids_by_host( + context, service['host']) + if instance_uuids: + raise webob.exc.HTTPConflict( + explanation=_('Unable to delete compute service that ' + 'is hosting instances. Migrate or ' + 'delete the instances first.')) + aggrs = self.aggregate_api.get_aggregates_by_host(context, service.host) for ag in aggrs: diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py index eebccbdc2afe..26860056de28 100644 --- a/nova/tests/functional/wsgi/test_services.py +++ b/nova/tests/functional/wsgi/test_services.py @@ -10,9 +10,12 @@ # License for the specific language governing permissions and limitations # under the License. +import six + from nova import context as nova_context from nova import objects from nova.objects import fields as rc_fields +from nova.tests.functional.api import client as api_client from nova.tests.functional import test_servers @@ -73,12 +76,24 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase): # update_available_resource periodic task. self.admin_api.put_service(service['id'], {'forced_down': True}) compute.stop() - # FIXME(mriedem): This is bug 1763183 where the compute node has - # an instance running on it but we allow you to delete the service - # and compute node anyway, which will affect the allocations for the - # instance and orphans the compute node resource provider in Placement. - # Once the bug is fixed, this should fail until the instance is either - # migrated or deleted. + # The first attempt should fail since there is an instance on the + # compute host. + ex = self.assertRaises(api_client.OpenStackApiException, + self.admin_api.api_delete, + '/os-services/%s' % service['id']) + self.assertIn('Unable to delete compute service that is hosting ' + 'instances.', six.text_type(ex)) + self.assertEqual(409, ex.response.status_code) + + # Now delete the instance and wait for it to be gone. + # Note that we can't use self._delete_and_check_allocations here + # because of bug 1679750 where allocations are not deleted when + # an instance is deleted and the compute service it's running on + # is down. + self.api.delete_server(server['id']) + self._wait_until_deleted(server) + + # Now we can delete the service. self.admin_api.api_delete('/os-services/%s' % service['id']) # Make sure the service is deleted. diff --git a/releasenotes/notes/bug-1763183-service-delete-with-instances-d7c5c47e4ce31239.yaml b/releasenotes/notes/bug-1763183-service-delete-with-instances-d7c5c47e4ce31239.yaml new file mode 100644 index 000000000000..72af00754795 --- /dev/null +++ b/releasenotes/notes/bug-1763183-service-delete-with-instances-d7c5c47e4ce31239.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + The ``DELETE /os-services/{service_id}`` compute API will now return a + ``409 HTTPConflict`` response when trying to delete a ``nova-compute`` + service which is still hosting instances. This is because doing so would + orphan the compute node resource provider in the placement service on + which those instances have resource allocations, which affects scheduling. + See https://bugs.launchpad.net/nova/+bug/1763183 for more details.