Sync COMPUTE_STATUS_DISABLED from API

This adds the os-services API change which will
call the compute service when the service's disabled
value changes to sync the COMPUTE_STATUS_DISABLED trait
on the compute node resource providers managed by the
updated compute service.

If the compute service is down or not yet upgraded to
the service version from change
Ia95de2f23f12b002b2189c9294ec190569a628ab then the
API will not call the service. In this case the change
from I3005b46221ac3c0e559e1072131a7e4846c9867c will
sync the trait when the compute service is restarted.

Since the compute service could be running the ironic
driver and managing hundreds or over 1000 compute nodes,
the set_host_enabled RPC call now uses the long_rpc_timeout
configuration option.

A functional test is added which covers the 2.53+
PUT /os-services/{service_id} API and pre-2.53 os-services
APIs for enabling/disabling and forcing down a service.
The functional test also covers the sync-on-restart behavior
from change I3005b46221ac3c0e559e1072131a7e4846c9867c.
The scheduler pre-filter added in change
I317cabbe49a337848325f96df79d478fd65811d9 is also tested
as part of the functional test.

Closes-Bug: #1805984

Implements blueprint pre-filter-disabled-computes

Change-Id: If32bca070185937ef83f689b7163d965a89ec10a
This commit is contained in:
Matt Riedemann 2019-07-02 18:57:38 -04:00
parent 1c51443304
commit 3f0605c289
7 changed files with 258 additions and 6 deletions

View File

@ -107,6 +107,7 @@ CINDER_V3_ATTACH_MIN_COMPUTE_VERSION = 24
MIN_COMPUTE_TRUSTED_CERTS = 31
MIN_COMPUTE_ABORT_QUEUED_LIVE_MIGRATION = 34
MIN_COMPUTE_VOLUME_TYPE = 36
MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED = 38
# FIXME(danms): Keep a global cache of the cells we find the
# first time we look. This needs to be refreshed on a timer or
@ -5090,17 +5091,69 @@ class HostAPI(base.Base):
"""Get service entry for the given compute hostname."""
return objects.Service.get_by_compute_host(context, host_name)
def _update_compute_provider_status(self, context, service):
"""Calls the compute service to sync the COMPUTE_STATUS_DISABLED trait.
There are two cases where the API will not call the compute service:
* The compute service is down. In this case the trait is synchronized
when the compute service is restarted.
* The compute service is old. In this case the trait is synchronized
when the compute service is upgraded and restarted.
:param context: nova auth RequestContext
:param service: nova.objects.Service object which has been enabled
or disabled (see ``service_update``).
"""
# Make sure the service is up so we can make the RPC call.
if not self.servicegroup_api.service_is_up(service):
LOG.info('Compute service on host %s is down. The '
'COMPUTE_STATUS_DISABLED trait will be synchronized '
'when the service is restarted.', service.host)
return
# Make sure the compute service is new enough for the trait sync
# behavior.
# TODO(mriedem): Remove this compat check in the U release.
if service.version < MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED:
LOG.info('Compute service on host %s is too old to sync the '
'COMPUTE_STATUS_DISABLED trait in Placement. The '
'trait will be synchronized when the service is '
'upgraded and restarted.', service.host)
return
enabled = not service.disabled
# Avoid leaking errors out of the API.
try:
LOG.debug('Calling the compute service on host %s to sync the '
'COMPUTE_STATUS_DISABLED trait.', service.host)
self.rpcapi.set_host_enabled(context, service.host, enabled)
except Exception:
LOG.exception('An error occurred while updating host enabled '
'status to "%s" for compute host: %s',
'enabled' if enabled else 'disabled',
service.host)
def service_update(self, context, service):
"""Performs the actual service update operation.
If the "disabled" field is changed, potentially calls the compute
service to sync the COMPUTE_STATUS_DISABLED trait on the compute node
resource providers managed by this compute service.
:param context: nova auth RequestContext
:param service: nova.objects.Service object with changes already
set on the object
"""
# Before persisting changes and resetting the changed fields on the
# Service object, determine if the disabled field changed.
update_placement = 'disabled' in service.obj_what_changed()
# Persist the Service object changes to the database.
service.save()
# TODO(mriedem): Reflect COMPUTE_STATUS_DISABLED trait changes to the
# associated compute node resource providers if the service's disabled
# status changed.
# If the disabled field changed, potentially call the compute service
# to sync the COMPUTE_STATUS_DISABLED trait.
if update_placement:
self._update_compute_provider_status(context, service)
return service
@target_host_cell

View File

@ -913,7 +913,9 @@ class ComputeAPI(object):
def set_host_enabled(self, ctxt, host, enabled):
version = '5.0'
cctxt = self.router.client(ctxt).prepare(
server=host, version=version)
server=host, version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
return cctxt.call(ctxt, 'set_host_enabled', enabled=enabled)
def swap_volume(self, ctxt, instance, old_volume_id, new_volume_id,

View File

@ -28,6 +28,7 @@ Operations with RPC calls that utilize this value:
* live migration
* scheduling
* enabling/disabling a compute service
Related options:

View File

@ -37,6 +37,12 @@ class ServicesJsonTest(api_sample_base.ApiSampleTestBaseV21):
test_services.fake_service_get_by_host_binary)
self.stub_out("nova.db.api.service_update",
test_services.fake_service_update)
# If we are not using real services, we need to stub out
# HostAPI._update_compute_provider_status so we don't actually
# try to call a fake service over RPC.
self.stub_out('nova.compute.api.HostAPI.'
'_update_compute_provider_status',
lambda *args, **kwargs: None)
self.useFixture(utils_fixture.TimeFixture(test_services.fake_utcnow()))
def test_services_list(self):

View File

@ -11,6 +11,7 @@
# under the License.
import os_resource_classes as orc
import os_traits
import six
from nova import context as nova_context
@ -18,6 +19,8 @@ from nova import exception
from nova import objects
from nova.tests.functional.api import client as api_client
from nova.tests.functional import integrated_helpers
from nova.tests.unit.image import fake as fake_image
from nova import utils
class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
@ -171,3 +174,132 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
log_output = self.stdlog.logger.output
self.assertIn('Error updating resources for node host1.', log_output)
self.assertIn('Failed to create resource provider host1', log_output)
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
"""Tests the API, compute service and Placement interaction with the
COMPUTE_STATUS_DISABLED trait when a compute service is enable/disabled.
This version of the test uses the 2.latest microversion for testing the
2.53+ behavior of the PUT /os-services/{service_id} API.
"""
compute_driver = 'fake.SmallFakeDriver'
def _update_service(self, service, disabled, forced_down=None):
"""Update the service using the 2.53 request schema.
:param service: dict representing the service resource in the API
:param disabled: True if the service should be disabled, False if the
service should be enabled
:param forced_down: Optionally change the forced_down value.
"""
status = 'disabled' if disabled else 'enabled'
req = {'status': status}
if forced_down is not None:
req['forced_down'] = forced_down
self.admin_api.put_service(service['id'], req)
def test_compute_status_filter(self):
"""Tests the compute_status_filter placement request filter"""
# Start a compute service so a compute node and resource provider is
# created.
compute = self._start_compute('host1')
# Get the UUID of the resource provider that was created.
rp_uuid = self._get_provider_uuid_by_host('host1')
# Get the service from the compute API.
services = self.admin_api.get_services(binary='nova-compute',
host='host1')
self.assertEqual(1, len(services))
service = services[0]
# At this point, the service should be enabled and the
# COMPUTE_STATUS_DISABLED trait should not be set on the
# resource provider in placement.
self.assertEqual('enabled', service['status'])
rp_traits = self._get_provider_traits(rp_uuid)
trait = os_traits.COMPUTE_STATUS_DISABLED
self.assertNotIn(trait, rp_traits)
# Now disable the compute service via the API.
self._update_service(service, disabled=True)
# The update to placement should be synchronous so check the provider
# traits and COMPUTE_STATUS_DISABLED should be set.
rp_traits = self._get_provider_traits(rp_uuid)
self.assertIn(trait, rp_traits)
# Try creating a server which should fail because nothing is available.
networks = [{'port': self.neutron.port_1['id']}]
server_req = self._build_minimal_create_server_request(
self.api, 'test_compute_status_filter',
image_uuid=fake_image.get_valid_image_id(), networks=networks)
server = self.api.post_server({'server': server_req})
server = self._wait_for_state_change(self.api, server, 'ERROR')
# There should be a NoValidHost fault recorded.
self.assertIn('fault', server)
self.assertIn('No valid host', server['fault']['message'])
# Now enable the service and the trait should be gone.
self._update_service(service, disabled=False)
rp_traits = self._get_provider_traits(rp_uuid)
self.assertNotIn(trait, rp_traits)
# Try creating another server and it should be OK.
server = self.api.post_server({'server': server_req})
self._wait_for_state_change(self.api, server, 'ACTIVE')
# Stop, force-down and disable the service so the API cannot call
# the compute service to sync the trait.
compute.stop()
self._update_service(service, disabled=True, forced_down=True)
# The API should have logged a message about the service being down.
self.assertIn('Compute service on host host1 is down. The '
'COMPUTE_STATUS_DISABLED trait will be synchronized '
'when the service is restarted.',
self.stdlog.logger.output)
# The trait should not be on the provider even though the node is
# disabled.
rp_traits = self._get_provider_traits(rp_uuid)
self.assertNotIn(trait, rp_traits)
# Restart the compute service which should sync and set the trait on
# the provider in placement.
self.restart_compute_service(compute)
rp_traits = self._get_provider_traits(rp_uuid)
self.assertIn(trait, rp_traits)
class ComputeStatusFilterTest211(ComputeStatusFilterTest):
"""Extends ComputeStatusFilterTest and uses the 2.11 API for the
legacy os-services disable/enable/force-down API behavior
"""
microversion = '2.11'
def _update_service(self, service, disabled, forced_down=None):
"""Update the service using the 2.11 request schema.
:param service: dict representing the service resource in the API
:param disabled: True if the service should be disabled, False if the
service should be enabled
:param forced_down: Optionally change the forced_down value.
"""
# Before 2.53 the service is uniquely identified by host and binary.
body = {
'host': service['host'],
'binary': service['binary']
}
# Handle forced_down first if provided since the enable/disable
# behavior in the API depends on it.
if forced_down is not None:
body['forced_down'] = forced_down
self.admin_api.api_put('/os-services/force-down', body)
if disabled:
self.admin_api.api_put('/os-services/disable', body)
else:
self.admin_api.api_put('/os-services/enable', body)
def _get_provider_uuid_by_host(self, host):
# We have to temporarily mutate to 2.53 to get the hypervisor UUID.
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
return super(ComputeStatusFilterTest211,
self)._get_provider_uuid_by_host(host)

View File

@ -16,6 +16,7 @@
import fixtures
import mock
import oslo_messaging as messaging
from oslo_utils.fixture import uuidsentinel as uuids
from nova.api.openstack.compute import services
@ -327,18 +328,73 @@ class ComputeHostAPITestCase(test.TestCase):
service_id = 42
expected_result = dict(test_service.fake_service, id=service_id)
@mock.patch.object(self.host_api, '_update_compute_provider_status')
@mock.patch.object(self.host_api.db, 'service_get_by_host_and_binary')
@mock.patch.object(self.host_api.db, 'service_update')
def _do_test(mock_service_update, mock_service_get_by_host_and_binary):
def _do_test(mock_service_update, mock_service_get_by_host_and_binary,
mock_update_compute_provider_status):
mock_service_get_by_host_and_binary.return_value = expected_result
mock_service_update.return_value = expected_result
result = self.host_api.service_update_by_host_and_binary(
self.ctxt, host_name, binary, params_to_update)
self._compare_obj(result, expected_result)
mock_update_compute_provider_status.assert_called_once_with(
self.ctxt, test.MatchType(objects.Service))
_do_test()
@mock.patch('nova.compute.api.HostAPI._update_compute_provider_status',
new_callable=mock.NonCallableMock)
def test_service_update_no_update_provider_status(self, mock_ucps):
"""Tests the scenario that the service is updated but the disabled
field is not changed, for example the forced_down field is only
updated. In this case _update_compute_provider_status should not be
called.
"""
service = objects.Service(forced_down=True)
self.assertIn('forced_down', service.obj_what_changed())
with mock.patch.object(service, 'save') as mock_save:
retval = self.host_api.service_update(self.ctxt, service)
self.assertIs(retval, service)
mock_save.assert_called_once_with()
@mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled',
new_callable=mock.NonCallableMock)
def test_update_compute_provider_status_service_too_old(self, mock_she):
"""Tests the scenario that the service is up but is too old to sync the
COMPUTE_STATUS_DISABLED trait.
"""
service = objects.Service(host='fake-host')
service.version = compute.MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED - 1
with mock.patch.object(
self.host_api.servicegroup_api, 'service_is_up',
return_value=True) as service_is_up:
self.host_api._update_compute_provider_status(self.ctxt, service)
service_is_up.assert_called_once_with(service)
self.assertIn('Compute service on host fake-host is too old to sync '
'the COMPUTE_STATUS_DISABLED trait in Placement.',
self.stdlog.logger.output)
@mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled',
side_effect=messaging.MessagingTimeout)
def test_update_compute_provider_status_service_rpc_error(self, mock_she):
"""Tests the scenario that the RPC call to the compute service raised
some exception.
"""
service = objects.Service(host='fake-host', disabled=True)
with mock.patch.object(
self.host_api.servicegroup_api, 'service_is_up',
return_value=True) as service_is_up:
self.host_api._update_compute_provider_status(self.ctxt, service)
service_is_up.assert_called_once_with(service)
mock_she.assert_called_once_with(self.ctxt, 'fake-host', False)
log_output = self.stdlog.logger.output
self.assertIn('An error occurred while updating host enabled '
'status to "disabled" for compute host: fake-host',
log_output)
self.assertIn('MessagingTimeout', log_output)
@mock.patch.object(objects.InstanceList, 'get_by_host',
return_value = ['fake-responses'])
def test_instance_get_all_by_host(self, mock_get):

View File

@ -468,8 +468,10 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
version='5.0')
def test_set_host_enabled(self):
self.flags(long_rpc_timeout=600, rpc_response_timeout=120)
self._test_compute_api('set_host_enabled', 'call',
enabled='enabled', host='host')
enabled='enabled', host='host',
call_monitor_timeout=120, timeout=600)
def test_get_host_uptime(self):
self._test_compute_api('get_host_uptime', 'call', host='host')