From 3f0605c28987f46cf4a05af0140e2e5de7d5ad0a Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Tue, 2 Jul 2019 18:57:38 -0400 Subject: [PATCH] Sync COMPUTE_STATUS_DISABLED from API This adds the os-services API change which will call the compute service when the service's disabled value changes to sync the COMPUTE_STATUS_DISABLED trait on the compute node resource providers managed by the updated compute service. If the compute service is down or not yet upgraded to the service version from change Ia95de2f23f12b002b2189c9294ec190569a628ab then the API will not call the service. In this case the change from I3005b46221ac3c0e559e1072131a7e4846c9867c will sync the trait when the compute service is restarted. Since the compute service could be running the ironic driver and managing hundreds or over 1000 compute nodes, the set_host_enabled RPC call now uses the long_rpc_timeout configuration option. A functional test is added which covers the 2.53+ PUT /os-services/{service_id} API and pre-2.53 os-services APIs for enabling/disabling and forcing down a service. The functional test also covers the sync-on-restart behavior from change I3005b46221ac3c0e559e1072131a7e4846c9867c. The scheduler pre-filter added in change I317cabbe49a337848325f96df79d478fd65811d9 is also tested as part of the functional test. Closes-Bug: #1805984 Implements blueprint pre-filter-disabled-computes Change-Id: If32bca070185937ef83f689b7163d965a89ec10a --- nova/compute/api.py | 59 +++++++- nova/compute/rpcapi.py | 4 +- nova/conf/rpc.py | 1 + .../api_sample_tests/test_services.py | 6 + nova/tests/functional/wsgi/test_services.py | 132 ++++++++++++++++++ nova/tests/unit/compute/test_host_api.py | 58 +++++++- nova/tests/unit/compute/test_rpcapi.py | 4 +- 7 files changed, 258 insertions(+), 6 deletions(-) diff --git a/nova/compute/api.py b/nova/compute/api.py index 20aa27e1caf3..ee3b3a4d294c 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -107,6 +107,7 @@ CINDER_V3_ATTACH_MIN_COMPUTE_VERSION = 24 MIN_COMPUTE_TRUSTED_CERTS = 31 MIN_COMPUTE_ABORT_QUEUED_LIVE_MIGRATION = 34 MIN_COMPUTE_VOLUME_TYPE = 36 +MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED = 38 # FIXME(danms): Keep a global cache of the cells we find the # first time we look. This needs to be refreshed on a timer or @@ -5090,17 +5091,69 @@ class HostAPI(base.Base): """Get service entry for the given compute hostname.""" return objects.Service.get_by_compute_host(context, host_name) + def _update_compute_provider_status(self, context, service): + """Calls the compute service to sync the COMPUTE_STATUS_DISABLED trait. + + There are two cases where the API will not call the compute service: + + * The compute service is down. In this case the trait is synchronized + when the compute service is restarted. + * The compute service is old. In this case the trait is synchronized + when the compute service is upgraded and restarted. + + :param context: nova auth RequestContext + :param service: nova.objects.Service object which has been enabled + or disabled (see ``service_update``). + """ + # Make sure the service is up so we can make the RPC call. + if not self.servicegroup_api.service_is_up(service): + LOG.info('Compute service on host %s is down. The ' + 'COMPUTE_STATUS_DISABLED trait will be synchronized ' + 'when the service is restarted.', service.host) + return + + # Make sure the compute service is new enough for the trait sync + # behavior. + # TODO(mriedem): Remove this compat check in the U release. + if service.version < MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED: + LOG.info('Compute service on host %s is too old to sync the ' + 'COMPUTE_STATUS_DISABLED trait in Placement. The ' + 'trait will be synchronized when the service is ' + 'upgraded and restarted.', service.host) + return + + enabled = not service.disabled + # Avoid leaking errors out of the API. + try: + LOG.debug('Calling the compute service on host %s to sync the ' + 'COMPUTE_STATUS_DISABLED trait.', service.host) + self.rpcapi.set_host_enabled(context, service.host, enabled) + except Exception: + LOG.exception('An error occurred while updating host enabled ' + 'status to "%s" for compute host: %s', + 'enabled' if enabled else 'disabled', + service.host) + def service_update(self, context, service): """Performs the actual service update operation. + If the "disabled" field is changed, potentially calls the compute + service to sync the COMPUTE_STATUS_DISABLED trait on the compute node + resource providers managed by this compute service. + :param context: nova auth RequestContext :param service: nova.objects.Service object with changes already set on the object """ + # Before persisting changes and resetting the changed fields on the + # Service object, determine if the disabled field changed. + update_placement = 'disabled' in service.obj_what_changed() + # Persist the Service object changes to the database. service.save() - # TODO(mriedem): Reflect COMPUTE_STATUS_DISABLED trait changes to the - # associated compute node resource providers if the service's disabled - # status changed. + # If the disabled field changed, potentially call the compute service + # to sync the COMPUTE_STATUS_DISABLED trait. + if update_placement: + self._update_compute_provider_status(context, service) return service @target_host_cell diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index 3be891d21d0d..2106ff545890 100644 --- a/nova/compute/rpcapi.py +++ b/nova/compute/rpcapi.py @@ -913,7 +913,9 @@ class ComputeAPI(object): def set_host_enabled(self, ctxt, host, enabled): version = '5.0' cctxt = self.router.client(ctxt).prepare( - server=host, version=version) + server=host, version=version, + call_monitor_timeout=CONF.rpc_response_timeout, + timeout=CONF.long_rpc_timeout) return cctxt.call(ctxt, 'set_host_enabled', enabled=enabled) def swap_volume(self, ctxt, instance, old_volume_id, new_volume_id, diff --git a/nova/conf/rpc.py b/nova/conf/rpc.py index e5bd1b989861..8dae38aad2b4 100644 --- a/nova/conf/rpc.py +++ b/nova/conf/rpc.py @@ -28,6 +28,7 @@ Operations with RPC calls that utilize this value: * live migration * scheduling +* enabling/disabling a compute service Related options: diff --git a/nova/tests/functional/api_sample_tests/test_services.py b/nova/tests/functional/api_sample_tests/test_services.py index 68fc2c4e1243..7f6377d8bb57 100644 --- a/nova/tests/functional/api_sample_tests/test_services.py +++ b/nova/tests/functional/api_sample_tests/test_services.py @@ -37,6 +37,12 @@ class ServicesJsonTest(api_sample_base.ApiSampleTestBaseV21): test_services.fake_service_get_by_host_binary) self.stub_out("nova.db.api.service_update", test_services.fake_service_update) + # If we are not using real services, we need to stub out + # HostAPI._update_compute_provider_status so we don't actually + # try to call a fake service over RPC. + self.stub_out('nova.compute.api.HostAPI.' + '_update_compute_provider_status', + lambda *args, **kwargs: None) self.useFixture(utils_fixture.TimeFixture(test_services.fake_utcnow())) def test_services_list(self): diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py index 3afe81d8c30a..607100ad2b0f 100644 --- a/nova/tests/functional/wsgi/test_services.py +++ b/nova/tests/functional/wsgi/test_services.py @@ -11,6 +11,7 @@ # under the License. import os_resource_classes as orc +import os_traits import six from nova import context as nova_context @@ -18,6 +19,8 @@ from nova import exception from nova import objects from nova.tests.functional.api import client as api_client from nova.tests.functional import integrated_helpers +from nova.tests.unit.image import fake as fake_image +from nova import utils class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): @@ -171,3 +174,132 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): log_output = self.stdlog.logger.output self.assertIn('Error updating resources for node host1.', log_output) self.assertIn('Failed to create resource provider host1', log_output) + + +class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase): + """Tests the API, compute service and Placement interaction with the + COMPUTE_STATUS_DISABLED trait when a compute service is enable/disabled. + + This version of the test uses the 2.latest microversion for testing the + 2.53+ behavior of the PUT /os-services/{service_id} API. + """ + compute_driver = 'fake.SmallFakeDriver' + + def _update_service(self, service, disabled, forced_down=None): + """Update the service using the 2.53 request schema. + + :param service: dict representing the service resource in the API + :param disabled: True if the service should be disabled, False if the + service should be enabled + :param forced_down: Optionally change the forced_down value. + """ + status = 'disabled' if disabled else 'enabled' + req = {'status': status} + if forced_down is not None: + req['forced_down'] = forced_down + self.admin_api.put_service(service['id'], req) + + def test_compute_status_filter(self): + """Tests the compute_status_filter placement request filter""" + # Start a compute service so a compute node and resource provider is + # created. + compute = self._start_compute('host1') + # Get the UUID of the resource provider that was created. + rp_uuid = self._get_provider_uuid_by_host('host1') + # Get the service from the compute API. + services = self.admin_api.get_services(binary='nova-compute', + host='host1') + self.assertEqual(1, len(services)) + service = services[0] + + # At this point, the service should be enabled and the + # COMPUTE_STATUS_DISABLED trait should not be set on the + # resource provider in placement. + self.assertEqual('enabled', service['status']) + rp_traits = self._get_provider_traits(rp_uuid) + trait = os_traits.COMPUTE_STATUS_DISABLED + self.assertNotIn(trait, rp_traits) + + # Now disable the compute service via the API. + self._update_service(service, disabled=True) + + # The update to placement should be synchronous so check the provider + # traits and COMPUTE_STATUS_DISABLED should be set. + rp_traits = self._get_provider_traits(rp_uuid) + self.assertIn(trait, rp_traits) + + # Try creating a server which should fail because nothing is available. + networks = [{'port': self.neutron.port_1['id']}] + server_req = self._build_minimal_create_server_request( + self.api, 'test_compute_status_filter', + image_uuid=fake_image.get_valid_image_id(), networks=networks) + server = self.api.post_server({'server': server_req}) + server = self._wait_for_state_change(self.api, server, 'ERROR') + # There should be a NoValidHost fault recorded. + self.assertIn('fault', server) + self.assertIn('No valid host', server['fault']['message']) + + # Now enable the service and the trait should be gone. + self._update_service(service, disabled=False) + rp_traits = self._get_provider_traits(rp_uuid) + self.assertNotIn(trait, rp_traits) + + # Try creating another server and it should be OK. + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + + # Stop, force-down and disable the service so the API cannot call + # the compute service to sync the trait. + compute.stop() + self._update_service(service, disabled=True, forced_down=True) + # The API should have logged a message about the service being down. + self.assertIn('Compute service on host host1 is down. The ' + 'COMPUTE_STATUS_DISABLED trait will be synchronized ' + 'when the service is restarted.', + self.stdlog.logger.output) + # The trait should not be on the provider even though the node is + # disabled. + rp_traits = self._get_provider_traits(rp_uuid) + self.assertNotIn(trait, rp_traits) + # Restart the compute service which should sync and set the trait on + # the provider in placement. + self.restart_compute_service(compute) + rp_traits = self._get_provider_traits(rp_uuid) + self.assertIn(trait, rp_traits) + + +class ComputeStatusFilterTest211(ComputeStatusFilterTest): + """Extends ComputeStatusFilterTest and uses the 2.11 API for the + legacy os-services disable/enable/force-down API behavior + """ + microversion = '2.11' + + def _update_service(self, service, disabled, forced_down=None): + """Update the service using the 2.11 request schema. + + :param service: dict representing the service resource in the API + :param disabled: True if the service should be disabled, False if the + service should be enabled + :param forced_down: Optionally change the forced_down value. + """ + # Before 2.53 the service is uniquely identified by host and binary. + body = { + 'host': service['host'], + 'binary': service['binary'] + } + # Handle forced_down first if provided since the enable/disable + # behavior in the API depends on it. + if forced_down is not None: + body['forced_down'] = forced_down + self.admin_api.api_put('/os-services/force-down', body) + + if disabled: + self.admin_api.api_put('/os-services/disable', body) + else: + self.admin_api.api_put('/os-services/enable', body) + + def _get_provider_uuid_by_host(self, host): + # We have to temporarily mutate to 2.53 to get the hypervisor UUID. + with utils.temporary_mutation(self.admin_api, microversion='2.53'): + return super(ComputeStatusFilterTest211, + self)._get_provider_uuid_by_host(host) diff --git a/nova/tests/unit/compute/test_host_api.py b/nova/tests/unit/compute/test_host_api.py index f53d438ba5aa..26c7d1d1491a 100644 --- a/nova/tests/unit/compute/test_host_api.py +++ b/nova/tests/unit/compute/test_host_api.py @@ -16,6 +16,7 @@ import fixtures import mock +import oslo_messaging as messaging from oslo_utils.fixture import uuidsentinel as uuids from nova.api.openstack.compute import services @@ -327,18 +328,73 @@ class ComputeHostAPITestCase(test.TestCase): service_id = 42 expected_result = dict(test_service.fake_service, id=service_id) + @mock.patch.object(self.host_api, '_update_compute_provider_status') @mock.patch.object(self.host_api.db, 'service_get_by_host_and_binary') @mock.patch.object(self.host_api.db, 'service_update') - def _do_test(mock_service_update, mock_service_get_by_host_and_binary): + def _do_test(mock_service_update, mock_service_get_by_host_and_binary, + mock_update_compute_provider_status): mock_service_get_by_host_and_binary.return_value = expected_result mock_service_update.return_value = expected_result result = self.host_api.service_update_by_host_and_binary( self.ctxt, host_name, binary, params_to_update) self._compare_obj(result, expected_result) + mock_update_compute_provider_status.assert_called_once_with( + self.ctxt, test.MatchType(objects.Service)) _do_test() + @mock.patch('nova.compute.api.HostAPI._update_compute_provider_status', + new_callable=mock.NonCallableMock) + def test_service_update_no_update_provider_status(self, mock_ucps): + """Tests the scenario that the service is updated but the disabled + field is not changed, for example the forced_down field is only + updated. In this case _update_compute_provider_status should not be + called. + """ + service = objects.Service(forced_down=True) + self.assertIn('forced_down', service.obj_what_changed()) + with mock.patch.object(service, 'save') as mock_save: + retval = self.host_api.service_update(self.ctxt, service) + self.assertIs(retval, service) + mock_save.assert_called_once_with() + + @mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled', + new_callable=mock.NonCallableMock) + def test_update_compute_provider_status_service_too_old(self, mock_she): + """Tests the scenario that the service is up but is too old to sync the + COMPUTE_STATUS_DISABLED trait. + """ + service = objects.Service(host='fake-host') + service.version = compute.MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED - 1 + with mock.patch.object( + self.host_api.servicegroup_api, 'service_is_up', + return_value=True) as service_is_up: + self.host_api._update_compute_provider_status(self.ctxt, service) + service_is_up.assert_called_once_with(service) + self.assertIn('Compute service on host fake-host is too old to sync ' + 'the COMPUTE_STATUS_DISABLED trait in Placement.', + self.stdlog.logger.output) + + @mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled', + side_effect=messaging.MessagingTimeout) + def test_update_compute_provider_status_service_rpc_error(self, mock_she): + """Tests the scenario that the RPC call to the compute service raised + some exception. + """ + service = objects.Service(host='fake-host', disabled=True) + with mock.patch.object( + self.host_api.servicegroup_api, 'service_is_up', + return_value=True) as service_is_up: + self.host_api._update_compute_provider_status(self.ctxt, service) + service_is_up.assert_called_once_with(service) + mock_she.assert_called_once_with(self.ctxt, 'fake-host', False) + log_output = self.stdlog.logger.output + self.assertIn('An error occurred while updating host enabled ' + 'status to "disabled" for compute host: fake-host', + log_output) + self.assertIn('MessagingTimeout', log_output) + @mock.patch.object(objects.InstanceList, 'get_by_host', return_value = ['fake-responses']) def test_instance_get_all_by_host(self, mock_get): diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index 5dd86a7b8afb..68da4d20fb31 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -468,8 +468,10 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): version='5.0') def test_set_host_enabled(self): + self.flags(long_rpc_timeout=600, rpc_response_timeout=120) self._test_compute_api('set_host_enabled', 'call', - enabled='enabled', host='host') + enabled='enabled', host='host', + call_monitor_timeout=120, timeout=600) def test_get_host_uptime(self): self._test_compute_api('get_host_uptime', 'call', host='host')