Merge "Sync COMPUTE_STATUS_DISABLED from API"

This commit is contained in:
Zuul 2019-07-09 17:32:02 +00:00 committed by Gerrit Code Review
commit 499e753cf3
7 changed files with 258 additions and 6 deletions

View File

@ -106,6 +106,7 @@ AGGREGATE_ACTION_ADD = 'Add'
MIN_COMPUTE_TRUSTED_CERTS = 31
MIN_COMPUTE_ABORT_QUEUED_LIVE_MIGRATION = 34
MIN_COMPUTE_VOLUME_TYPE = 36
MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED = 38
# FIXME(danms): Keep a global cache of the cells we find the
# first time we look. This needs to be refreshed on a timer or
@ -5047,17 +5048,69 @@ class HostAPI(base.Base):
"""Get service entry for the given compute hostname."""
return objects.Service.get_by_compute_host(context, host_name)
def _update_compute_provider_status(self, context, service):
"""Calls the compute service to sync the COMPUTE_STATUS_DISABLED trait.
There are two cases where the API will not call the compute service:
* The compute service is down. In this case the trait is synchronized
when the compute service is restarted.
* The compute service is old. In this case the trait is synchronized
when the compute service is upgraded and restarted.
:param context: nova auth RequestContext
:param service: nova.objects.Service object which has been enabled
or disabled (see ``service_update``).
"""
# Make sure the service is up so we can make the RPC call.
if not self.servicegroup_api.service_is_up(service):
LOG.info('Compute service on host %s is down. The '
'COMPUTE_STATUS_DISABLED trait will be synchronized '
'when the service is restarted.', service.host)
return
# Make sure the compute service is new enough for the trait sync
# behavior.
# TODO(mriedem): Remove this compat check in the U release.
if service.version < MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED:
LOG.info('Compute service on host %s is too old to sync the '
'COMPUTE_STATUS_DISABLED trait in Placement. The '
'trait will be synchronized when the service is '
'upgraded and restarted.', service.host)
return
enabled = not service.disabled
# Avoid leaking errors out of the API.
try:
LOG.debug('Calling the compute service on host %s to sync the '
'COMPUTE_STATUS_DISABLED trait.', service.host)
self.rpcapi.set_host_enabled(context, service.host, enabled)
except Exception:
LOG.exception('An error occurred while updating host enabled '
'status to "%s" for compute host: %s',
'enabled' if enabled else 'disabled',
service.host)
def service_update(self, context, service):
"""Performs the actual service update operation.
If the "disabled" field is changed, potentially calls the compute
service to sync the COMPUTE_STATUS_DISABLED trait on the compute node
resource providers managed by this compute service.
:param context: nova auth RequestContext
:param service: nova.objects.Service object with changes already
set on the object
"""
# Before persisting changes and resetting the changed fields on the
# Service object, determine if the disabled field changed.
update_placement = 'disabled' in service.obj_what_changed()
# Persist the Service object changes to the database.
service.save()
# TODO(mriedem): Reflect COMPUTE_STATUS_DISABLED trait changes to the
# associated compute node resource providers if the service's disabled
# status changed.
# If the disabled field changed, potentially call the compute service
# to sync the COMPUTE_STATUS_DISABLED trait.
if update_placement:
self._update_compute_provider_status(context, service)
return service
@target_host_cell

View File

@ -913,7 +913,9 @@ class ComputeAPI(object):
def set_host_enabled(self, ctxt, host, enabled):
version = '5.0'
cctxt = self.router.client(ctxt).prepare(
server=host, version=version)
server=host, version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
return cctxt.call(ctxt, 'set_host_enabled', enabled=enabled)
def swap_volume(self, ctxt, instance, old_volume_id, new_volume_id,

View File

@ -28,6 +28,7 @@ Operations with RPC calls that utilize this value:
* live migration
* scheduling
* enabling/disabling a compute service
Related options:

View File

@ -37,6 +37,12 @@ class ServicesJsonTest(api_sample_base.ApiSampleTestBaseV21):
test_services.fake_service_get_by_host_binary)
self.stub_out("nova.db.api.service_update",
test_services.fake_service_update)
# If we are not using real services, we need to stub out
# HostAPI._update_compute_provider_status so we don't actually
# try to call a fake service over RPC.
self.stub_out('nova.compute.api.HostAPI.'
'_update_compute_provider_status',
lambda *args, **kwargs: None)
self.useFixture(utils_fixture.TimeFixture(test_services.fake_utcnow()))
def test_services_list(self):

View File

@ -11,6 +11,7 @@
# under the License.
import os_resource_classes as orc
import os_traits
import six
from nova import context as nova_context
@ -18,6 +19,8 @@ from nova import exception
from nova import objects
from nova.tests.functional.api import client as api_client
from nova.tests.functional import integrated_helpers
from nova.tests.unit.image import fake as fake_image
from nova import utils
class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
@ -171,3 +174,132 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
log_output = self.stdlog.logger.output
self.assertIn('Error updating resources for node host1.', log_output)
self.assertIn('Failed to create resource provider host1', log_output)
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
"""Tests the API, compute service and Placement interaction with the
COMPUTE_STATUS_DISABLED trait when a compute service is enable/disabled.
This version of the test uses the 2.latest microversion for testing the
2.53+ behavior of the PUT /os-services/{service_id} API.
"""
compute_driver = 'fake.SmallFakeDriver'
def _update_service(self, service, disabled, forced_down=None):
"""Update the service using the 2.53 request schema.
:param service: dict representing the service resource in the API
:param disabled: True if the service should be disabled, False if the
service should be enabled
:param forced_down: Optionally change the forced_down value.
"""
status = 'disabled' if disabled else 'enabled'
req = {'status': status}
if forced_down is not None:
req['forced_down'] = forced_down
self.admin_api.put_service(service['id'], req)
def test_compute_status_filter(self):
"""Tests the compute_status_filter placement request filter"""
# Start a compute service so a compute node and resource provider is
# created.
compute = self._start_compute('host1')
# Get the UUID of the resource provider that was created.
rp_uuid = self._get_provider_uuid_by_host('host1')
# Get the service from the compute API.
services = self.admin_api.get_services(binary='nova-compute',
host='host1')
self.assertEqual(1, len(services))
service = services[0]
# At this point, the service should be enabled and the
# COMPUTE_STATUS_DISABLED trait should not be set on the
# resource provider in placement.
self.assertEqual('enabled', service['status'])
rp_traits = self._get_provider_traits(rp_uuid)
trait = os_traits.COMPUTE_STATUS_DISABLED
self.assertNotIn(trait, rp_traits)
# Now disable the compute service via the API.
self._update_service(service, disabled=True)
# The update to placement should be synchronous so check the provider
# traits and COMPUTE_STATUS_DISABLED should be set.
rp_traits = self._get_provider_traits(rp_uuid)
self.assertIn(trait, rp_traits)
# Try creating a server which should fail because nothing is available.
networks = [{'port': self.neutron.port_1['id']}]
server_req = self._build_minimal_create_server_request(
self.api, 'test_compute_status_filter',
image_uuid=fake_image.get_valid_image_id(), networks=networks)
server = self.api.post_server({'server': server_req})
server = self._wait_for_state_change(self.api, server, 'ERROR')
# There should be a NoValidHost fault recorded.
self.assertIn('fault', server)
self.assertIn('No valid host', server['fault']['message'])
# Now enable the service and the trait should be gone.
self._update_service(service, disabled=False)
rp_traits = self._get_provider_traits(rp_uuid)
self.assertNotIn(trait, rp_traits)
# Try creating another server and it should be OK.
server = self.api.post_server({'server': server_req})
self._wait_for_state_change(self.api, server, 'ACTIVE')
# Stop, force-down and disable the service so the API cannot call
# the compute service to sync the trait.
compute.stop()
self._update_service(service, disabled=True, forced_down=True)
# The API should have logged a message about the service being down.
self.assertIn('Compute service on host host1 is down. The '
'COMPUTE_STATUS_DISABLED trait will be synchronized '
'when the service is restarted.',
self.stdlog.logger.output)
# The trait should not be on the provider even though the node is
# disabled.
rp_traits = self._get_provider_traits(rp_uuid)
self.assertNotIn(trait, rp_traits)
# Restart the compute service which should sync and set the trait on
# the provider in placement.
self.restart_compute_service(compute)
rp_traits = self._get_provider_traits(rp_uuid)
self.assertIn(trait, rp_traits)
class ComputeStatusFilterTest211(ComputeStatusFilterTest):
"""Extends ComputeStatusFilterTest and uses the 2.11 API for the
legacy os-services disable/enable/force-down API behavior
"""
microversion = '2.11'
def _update_service(self, service, disabled, forced_down=None):
"""Update the service using the 2.11 request schema.
:param service: dict representing the service resource in the API
:param disabled: True if the service should be disabled, False if the
service should be enabled
:param forced_down: Optionally change the forced_down value.
"""
# Before 2.53 the service is uniquely identified by host and binary.
body = {
'host': service['host'],
'binary': service['binary']
}
# Handle forced_down first if provided since the enable/disable
# behavior in the API depends on it.
if forced_down is not None:
body['forced_down'] = forced_down
self.admin_api.api_put('/os-services/force-down', body)
if disabled:
self.admin_api.api_put('/os-services/disable', body)
else:
self.admin_api.api_put('/os-services/enable', body)
def _get_provider_uuid_by_host(self, host):
# We have to temporarily mutate to 2.53 to get the hypervisor UUID.
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
return super(ComputeStatusFilterTest211,
self)._get_provider_uuid_by_host(host)

View File

@ -16,6 +16,7 @@
import fixtures
import mock
import oslo_messaging as messaging
from oslo_utils.fixture import uuidsentinel as uuids
from nova.api.openstack.compute import services
@ -327,18 +328,73 @@ class ComputeHostAPITestCase(test.TestCase):
service_id = 42
expected_result = dict(test_service.fake_service, id=service_id)
@mock.patch.object(self.host_api, '_update_compute_provider_status')
@mock.patch.object(self.host_api.db, 'service_get_by_host_and_binary')
@mock.patch.object(self.host_api.db, 'service_update')
def _do_test(mock_service_update, mock_service_get_by_host_and_binary):
def _do_test(mock_service_update, mock_service_get_by_host_and_binary,
mock_update_compute_provider_status):
mock_service_get_by_host_and_binary.return_value = expected_result
mock_service_update.return_value = expected_result
result = self.host_api.service_update_by_host_and_binary(
self.ctxt, host_name, binary, params_to_update)
self._compare_obj(result, expected_result)
mock_update_compute_provider_status.assert_called_once_with(
self.ctxt, test.MatchType(objects.Service))
_do_test()
@mock.patch('nova.compute.api.HostAPI._update_compute_provider_status',
new_callable=mock.NonCallableMock)
def test_service_update_no_update_provider_status(self, mock_ucps):
"""Tests the scenario that the service is updated but the disabled
field is not changed, for example the forced_down field is only
updated. In this case _update_compute_provider_status should not be
called.
"""
service = objects.Service(forced_down=True)
self.assertIn('forced_down', service.obj_what_changed())
with mock.patch.object(service, 'save') as mock_save:
retval = self.host_api.service_update(self.ctxt, service)
self.assertIs(retval, service)
mock_save.assert_called_once_with()
@mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled',
new_callable=mock.NonCallableMock)
def test_update_compute_provider_status_service_too_old(self, mock_she):
"""Tests the scenario that the service is up but is too old to sync the
COMPUTE_STATUS_DISABLED trait.
"""
service = objects.Service(host='fake-host')
service.version = compute.MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED - 1
with mock.patch.object(
self.host_api.servicegroup_api, 'service_is_up',
return_value=True) as service_is_up:
self.host_api._update_compute_provider_status(self.ctxt, service)
service_is_up.assert_called_once_with(service)
self.assertIn('Compute service on host fake-host is too old to sync '
'the COMPUTE_STATUS_DISABLED trait in Placement.',
self.stdlog.logger.output)
@mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled',
side_effect=messaging.MessagingTimeout)
def test_update_compute_provider_status_service_rpc_error(self, mock_she):
"""Tests the scenario that the RPC call to the compute service raised
some exception.
"""
service = objects.Service(host='fake-host', disabled=True)
with mock.patch.object(
self.host_api.servicegroup_api, 'service_is_up',
return_value=True) as service_is_up:
self.host_api._update_compute_provider_status(self.ctxt, service)
service_is_up.assert_called_once_with(service)
mock_she.assert_called_once_with(self.ctxt, 'fake-host', False)
log_output = self.stdlog.logger.output
self.assertIn('An error occurred while updating host enabled '
'status to "disabled" for compute host: fake-host',
log_output)
self.assertIn('MessagingTimeout', log_output)
@mock.patch.object(objects.InstanceList, 'get_by_host',
return_value = ['fake-responses'])
def test_instance_get_all_by_host(self, mock_get):

View File

@ -468,8 +468,10 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
version='5.0')
def test_set_host_enabled(self):
self.flags(long_rpc_timeout=600, rpc_response_timeout=120)
self._test_compute_api('set_host_enabled', 'call',
enabled='enabled', host='host')
enabled='enabled', host='host',
call_monitor_timeout=120, timeout=600)
def test_get_host_uptime(self):
self._test_compute_api('get_host_uptime', 'call', host='host')