Ensure source service is up before resizing/migrating

If the source compute service is down when a resize or
cold migrate is initiated the prep_resize cast from the
selected destination compute service to the source will
fail/hang. The API can validate the source compute service
is up or fail the operation with a 409 response if the
source service is down. Note that a host status of
"MAINTENANCE" means the service is up but disabled by
an administrator which is OK for resize/cold migrate.

The solution here works the validation into the
check_instance_host decorator which surprisingly isn't
used in more places where the source host is involved
like reboot, rebuild, snapshot, etc. This change just
handles the resize method but is done in such a way that
the check_instance_host decorator could be applied to
those other methods and perform the is-up check as well.
The decorator is made backward compatible by default.

Note that Instance._save_services is added because during
resize the Instance is updated and the services field
is set but not actually changed, but Instance.save()
handles object fields differently so we need to implement
the no-op _save_services method to avoid a failure.

Change-Id: I85423c7bcacff3bc465c22686d0675529d211b59
Closes-Bug: #1856925
This commit is contained in:
Matt Riedemann 2019-12-19 13:29:50 -05:00
parent d5a786f540
commit ea2ea492a3
12 changed files with 169 additions and 36 deletions

View File

@ -56,7 +56,7 @@ class MigrateServerController(wsgi.Controller):
host_name = body['migrate'].get('host')
instance = common.get_instance(self.compute_api, context, id,
expected_attrs=['flavor'])
expected_attrs=['flavor', 'services'])
if common.instance_has_port_with_resource_request(
instance.uuid, self.network_api):
@ -76,7 +76,9 @@ class MigrateServerController(wsgi.Controller):
host_name=host_name)
except (exception.TooManyInstances, exception.QuotaError) as e:
raise exc.HTTPForbidden(explanation=e.format_message())
except exception.InstanceIsLocked as e:
except (exception.InstanceIsLocked,
exception.InstanceNotReady,
exception.ServiceUnavailable) as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,

View File

@ -348,7 +348,7 @@ class ServersController(wsgi.Controller):
return response
def _get_server(self, context, req, instance_uuid, is_detail=False,
cell_down_support=False):
cell_down_support=False, columns_to_join=None):
"""Utility function for looking up an instance by uuid.
:param context: request context for auth
@ -360,6 +360,8 @@ class ServersController(wsgi.Controller):
returning a minimal instance
construct if the relevant cell is
down.
:param columns_to_join: optional list of extra fields to join on the
Instance object
"""
expected_attrs = ['flavor', 'numa_topology']
if is_detail:
@ -369,6 +371,8 @@ class ServersController(wsgi.Controller):
expected_attrs.append("trusted_certs")
expected_attrs = self._view_builder.get_show_expected_attrs(
expected_attrs)
if columns_to_join:
expected_attrs.extend(columns_to_join)
instance = common.get_instance(self.compute_api, context,
instance_uuid,
expected_attrs=expected_attrs,
@ -931,7 +935,8 @@ class ServersController(wsgi.Controller):
def _resize(self, req, instance_id, flavor_id, auto_disk_config=None):
"""Begin the resize process with given instance/flavor."""
context = req.environ["nova.context"]
instance = self._get_server(context, req, instance_id)
instance = self._get_server(context, req, instance_id,
columns_to_join=['services'])
context.can(server_policies.SERVERS % 'resize',
target={'user_id': instance.user_id,
'project_id': instance.project_id})
@ -954,7 +959,9 @@ class ServersController(wsgi.Controller):
except exception.QuotaError as error:
raise exc.HTTPForbidden(
explanation=error.format_message())
except exception.InstanceIsLocked as e:
except (exception.InstanceIsLocked,
exception.InstanceNotReady,
exception.ServiceUnavailable) as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,

View File

@ -186,13 +186,34 @@ def reject_instance_state(vm_state=None, task_state=None):
return outer
def check_instance_host(function):
@six.wraps(function)
def wrapped(self, context, instance, *args, **kwargs):
if not instance.host:
raise exception.InstanceNotReady(instance_id=instance.uuid)
return function(self, context, instance, *args, **kwargs)
return wrapped
def check_instance_host(check_is_up=False):
"""Validate the instance.host before performing the operation.
At a minimum this method will check that the instance.host is set.
:param check_is_up: If True, check that the instance.host status is UP
or MAINTENANCE (disabled but not down).
:raises: InstanceNotReady if the instance.host is not set
:raises: ServiceUnavailable if check_is_up=True and the instance.host
compute service status is not UP or MAINTENANCE
"""
def outer(function):
@six.wraps(function)
def wrapped(self, context, instance, *args, **kwargs):
if not instance.host:
raise exception.InstanceNotReady(instance_id=instance.uuid)
if check_is_up:
# Make sure the source compute service is not down otherwise we
# cannot proceed.
host_status = self.get_instance_host_status(instance)
if host_status not in (fields_obj.HostStatus.UP,
fields_obj.HostStatus.MAINTENANCE):
# ComputeServiceUnavailable would make more sense here but
# we do not want to leak hostnames to end users.
raise exception.ServiceUnavailable()
return function(self, context, instance, *args, **kwargs)
return wrapped
return outer
def check_instance_lock(function):
@ -2519,14 +2540,14 @@ class API(base.Base):
clean_shutdown=clean_shutdown)
@check_instance_lock
@check_instance_host
@check_instance_host()
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.ERROR])
def stop(self, context, instance, do_cast=True, clean_shutdown=True):
"""Stop an instance."""
self.force_stop(context, instance, do_cast, clean_shutdown)
@check_instance_lock
@check_instance_host
@check_instance_host()
@check_instance_state(vm_state=[vm_states.STOPPED])
def start(self, context, instance):
"""Start an instance."""
@ -2539,7 +2560,7 @@ class API(base.Base):
self.compute_rpcapi.start_instance(context, instance)
@check_instance_lock
@check_instance_host
@check_instance_host()
@check_instance_state(vm_state=vm_states.ALLOW_TRIGGER_CRASH_DUMP)
def trigger_crash_dump(self, context, instance):
"""Trigger crash dump in an instance."""
@ -3783,6 +3804,7 @@ class API(base.Base):
@check_instance_lock
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED])
@check_instance_host(check_is_up=True)
def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
host_name=None, auto_disk_config=None):
"""Resize (ie, migrate) a running instance.
@ -4074,12 +4096,12 @@ class API(base.Base):
self._record_action_start(context, instance, instance_actions.UNPAUSE)
self.compute_rpcapi.unpause_instance(context, instance)
@check_instance_host
@check_instance_host()
def get_diagnostics(self, context, instance):
"""Retrieve diagnostics for the given instance."""
return self.compute_rpcapi.get_diagnostics(context, instance=instance)
@check_instance_host
@check_instance_host()
def get_instance_diagnostics(self, context, instance):
"""Retrieve diagnostics for the given instance."""
return self.compute_rpcapi.get_instance_diagnostics(context,
@ -4161,7 +4183,7 @@ class API(base.Base):
instance=instance,
new_pass=password)
@check_instance_host
@check_instance_host()
@reject_instance_state(
task_state=[task_states.DELETING, task_states.MIGRATING])
def get_vnc_console(self, context, instance, console_type):
@ -4170,7 +4192,7 @@ class API(base.Base):
instance=instance, console_type=console_type)
return {'url': connect_info['access_url']}
@check_instance_host
@check_instance_host()
@reject_instance_state(
task_state=[task_states.DELETING, task_states.MIGRATING])
def get_spice_console(self, context, instance, console_type):
@ -4179,7 +4201,7 @@ class API(base.Base):
instance=instance, console_type=console_type)
return {'url': connect_info['access_url']}
@check_instance_host
@check_instance_host()
@reject_instance_state(
task_state=[task_states.DELETING, task_states.MIGRATING])
def get_rdp_console(self, context, instance, console_type):
@ -4188,7 +4210,7 @@ class API(base.Base):
instance=instance, console_type=console_type)
return {'url': connect_info['access_url']}
@check_instance_host
@check_instance_host()
@reject_instance_state(
task_state=[task_states.DELETING, task_states.MIGRATING])
def get_serial_console(self, context, instance, console_type):
@ -4197,7 +4219,7 @@ class API(base.Base):
instance=instance, console_type=console_type)
return {'url': connect_info['access_url']}
@check_instance_host
@check_instance_host()
@reject_instance_state(
task_state=[task_states.DELETING, task_states.MIGRATING])
def get_mks_console(self, context, instance, console_type):
@ -4206,7 +4228,7 @@ class API(base.Base):
instance=instance, console_type=console_type)
return {'url': connect_info['access_url']}
@check_instance_host
@check_instance_host()
def get_console_output(self, context, instance, tail_length=None):
"""Get console output for an instance."""
return self.compute_rpcapi.get_console_output(context,
@ -5082,7 +5104,7 @@ class API(base.Base):
# We allow creating the snapshot in any vm_state as long as there is
# no task being performed on the instance and it has a host.
@check_instance_host
@check_instance_host()
@check_instance_state(vm_state=None)
def do_volume_snapshot_create(self, context, instance):
self.compute_rpcapi.volume_snapshot_create(context, instance,
@ -5104,7 +5126,7 @@ class API(base.Base):
# We allow deleting the snapshot in any vm_state as long as there is
# no task being performed on the instance and it has a host.
@check_instance_host
@check_instance_host()
@check_instance_state(vm_state=None)
def do_volume_snapshot_delete(self, context, instance):
self.compute_rpcapi.volume_snapshot_delete(context, instance,

View File

@ -686,6 +686,10 @@ class Instance(base.NovaPersistentObject, base.NovaObject,
# NOTE(gibi): tags are not saved through the instance
pass
def _save_services(self, context):
# NOTE(mriedem): services are not saved through the instance
pass
@staticmethod
def _nullify_flavor_description(flavor_info):
"""Helper method to nullify descriptions from a set of primitive

View File

@ -99,9 +99,9 @@ class TestComputeTaskNotificationSample(
'hw:numa_cpus.0': '0',
'hw:numa_mem.0': 512})
self._wait_for_notification('instance.create.end')
# Force down the compute node
# Disable the compute node
service_id = self.api.get_service_id('nova-compute')
self.admin_api.put_service_force_down(service_id, True)
self.admin_api.put_service(service_id, {'status': 'disabled'})
fake_notifier.reset()

View File

@ -387,3 +387,65 @@ class EnforceVolumeBackedForZeroDiskFlavorTestCase(
server = self.admin_api.post_server({'server': server_req})
server = self._wait_for_state_change(server, 'ERROR')
self.assertIn('No valid host', server['fault']['message'])
class ResizeCheckInstanceHostTestCase(
integrated_helpers.ProviderUsageBaseTestCase):
"""Tests for the check_instance_host decorator used during resize/migrate.
"""
compute_driver = 'fake.MediumFakeDriver'
def test_resize_source_compute_validation(self, resize=True):
flavors = self.api.get_flavors()
# Start up a compute on which to create a server.
self._start_compute('host1')
# Create a server on host1.
server = self._build_minimal_create_server_request(
name='test_resize_source_compute_validation',
image_uuid=fake_image.get_valid_image_id(),
flavor_id=flavors[0]['id'],
networks='none')
server = self.api.post_server({'server': server})
server = self._wait_for_state_change(server, 'ACTIVE')
# Check if we're cold migrating.
if resize:
req = {'resize': {'flavorRef': flavors[1]['id']}}
else:
req = {'migrate': None}
# Start up a destination compute.
self._start_compute('host2')
# First, force down the source compute service.
source_service = self.api.get_services(
binary='nova-compute', host='host1')[0]
self.api.put_service(source_service['id'], {'forced_down': True})
# Try the operation and it should fail with a 409 response.
ex = self.assertRaises(api_client.OpenStackApiException,
self.api.post_server_action, server['id'], req)
self.assertEqual(409, ex.response.status_code)
self.assertIn('Service is unavailable at this time', six.text_type(ex))
# Now bring the source compute service up but disable it. The operation
# should be allowed in this case since the service is up.
self.api.put_service(source_service['id'],
{'forced_down': False, 'status': 'disabled'})
self.api.post_server_action(server['id'], req)
server = self._wait_for_state_change(server, 'VERIFY_RESIZE')
# Revert the resize to get the server back to host1.
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
# Now shelve offload the server so it does not have a host.
self.api.post_server_action(server['id'], {'shelve': None})
self._wait_for_server_parameter(server,
{'status': 'SHELVED_OFFLOADED',
'OS-EXT-SRV-ATTR:host': None})
# Now try the operation again and it should fail with a different
# 409 response.
ex = self.assertRaises(api_client.OpenStackApiException,
self.api.post_server_action, server['id'], req)
self.assertEqual(409, ex.response.status_code)
# This error comes from check_instance_state which is processed before
# check_instance_host.
self.assertIn('while it is in vm_state shelved_offloaded',
six.text_type(ex))
def test_cold_migrate_source_compute_validation(self):
self.test_resize_source_compute_validation(resize=False)

View File

@ -52,7 +52,7 @@ class CommonMixin(object):
if action == '_migrate_live':
expected_attrs = ['numa_topology']
elif action == '_migrate':
expected_attrs = ['flavor']
expected_attrs = ['flavor', 'services']
uuid = uuidutils.generate_uuid()
self.mock_get.side_effect = exception.InstanceNotFound(
@ -75,7 +75,7 @@ class CommonMixin(object):
if action == '_migrate_live':
expected_attrs = ['numa_topology']
elif action == '_migrate':
expected_attrs = ['flavor']
expected_attrs = ['flavor', 'services']
if method is None:
method = action.replace('_', '')
@ -139,7 +139,7 @@ class CommonMixin(object):
if action == '_migrate_live':
expected_attrs = ['numa_topology']
elif action == '_migrate':
expected_attrs = ['flavor']
expected_attrs = ['flavor', 'services']
if method is None:
method = action.replace('_', '')
@ -180,7 +180,7 @@ class CommonMixin(object):
if action == '_migrate_live':
expected_attrs = ['numa_topology']
elif action == '_migrate':
expected_attrs = ['flavor']
expected_attrs = ['flavor', 'services']
if method is None:
method = action.replace('_', '')

View File

@ -131,9 +131,9 @@ class MigrateServerTestsV21(admin_only_action_common.CommonTests):
body={'migrate': None})
mock_resize.assert_called_once_with(
self.context, instance, host_name=self.host_name)
self.mock_get.assert_called_once_with(self.context, instance.uuid,
expected_attrs=['flavor'],
cell_down_support=False)
self.mock_get.assert_called_once_with(
self.context, instance.uuid, expected_attrs=['flavor', 'services'],
cell_down_support=False)
def test_migrate_too_many_instances(self):
exc_info = exception.TooManyInstances(overs='', req='', used=0,

View File

@ -138,9 +138,11 @@ class ServerActionsControllerTestV21(test.TestCase):
eval(controller_function),
self.req, instance['uuid'],
body=body_map.get(action))
expected_attrs = ['flavor', 'numa_topology']
if method == 'resize':
expected_attrs.append('services')
mock_get.assert_called_once_with(self.context, uuid,
expected_attrs=['flavor', 'numa_topology'],
expected_attrs=expected_attrs,
cell_down_support=False)
mock_method.assert_called_once_with(self.context, instance,
*args, **kwargs)

View File

@ -12509,6 +12509,8 @@ class DisabledInstanceTypesTestCase(BaseTestCase):
self.assertRaises(exception.FlavorNotFound,
self.compute_api.create, self.context, self.inst_type, None)
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=obj_fields.HostStatus.UP))
@mock.patch('nova.compute.api.API._validate_flavor_image_nostatus')
@mock.patch('nova.objects.RequestSpec')
def test_can_resize_to_visible_instance_type(self, mock_reqspec,
@ -12531,6 +12533,8 @@ class DisabledInstanceTypesTestCase(BaseTestCase):
with mock.patch('nova.conductor.api.ComputeTaskAPI.resize_instance'):
self.compute_api.resize(self.context, instance, '4')
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=obj_fields.HostStatus.UP))
def test_cannot_resize_to_disabled_instance_type(self):
instance = self._create_fake_instance_obj()
orig_get_flavor_by_flavor_id = \

View File

@ -1752,6 +1752,8 @@ class _ComputeAPIUnitTestMixIn(object):
mock_inst_save.assert_called_once_with(expected_task_state=[None])
mock_get_requested_resources.assert_not_called()
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch('nova.compute.utils.is_volume_backed_instance',
return_value=False)
@mock.patch('nova.compute.api.API._validate_flavor_image_nostatus')
@ -1966,6 +1968,8 @@ class _ComputeAPIUnitTestMixIn(object):
def test_resize_allow_cross_cell_resize_true(self):
self._test_resize(allow_cross_cell_resize=True)
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch('nova.compute.flavors.get_flavor_by_flavor_id')
@mock.patch('nova.objects.Quotas.count_as_dict')
@mock.patch('nova.objects.Quotas.limit_check_project_and_user')
@ -2023,6 +2027,8 @@ class _ComputeAPIUnitTestMixIn(object):
self._test_migrate(host_name='target_host',
allow_cross_cell_resize=True)
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch.object(objects.ComputeNodeList, 'get_all_by_host',
side_effect=exception.ComputeHostNotFound(
host='nonexistent_host'))
@ -2032,6 +2038,8 @@ class _ComputeAPIUnitTestMixIn(object):
self.compute_api.resize, self.context,
fake_inst, host_name='nonexistent_host')
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(compute_api.API, '_record_action_start')
@mock.patch.object(quotas_obj.Quotas, 'limit_check_project_and_user')
@ -2057,6 +2065,8 @@ class _ComputeAPIUnitTestMixIn(object):
mock_resize.assert_not_called()
mock_save.assert_not_called()
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(compute_api.API, '_record_action_start')
@mock.patch.object(quotas_obj.Quotas, 'limit_check_project_and_user')
@ -2083,6 +2093,8 @@ class _ComputeAPIUnitTestMixIn(object):
mock_resize.assert_not_called()
mock_save.assert_not_called()
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch.object(flavors, 'get_flavor_by_flavor_id')
def test_resize_to_zero_disk_flavor_fails(self, get_flavor_by_flavor_id):
fake_inst = self._create_instance_obj()
@ -2097,6 +2109,8 @@ class _ComputeAPIUnitTestMixIn(object):
self.compute_api.resize, self.context,
fake_inst, flavor_id='flavor-id')
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch('nova.compute.api.API._validate_flavor_image_nostatus')
@mock.patch.object(objects.RequestSpec, 'get_by_instance_uuid')
@mock.patch('nova.compute.api.API._record_action_start')
@ -2126,6 +2140,8 @@ class _ComputeAPIUnitTestMixIn(object):
do_test()
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(compute_api.API, '_record_action_start')
@mock.patch.object(quotas_obj.Quotas,
@ -2182,6 +2198,8 @@ class _ComputeAPIUnitTestMixIn(object):
mock_record.assert_not_called()
mock_resize.assert_not_called()
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch.object(flavors, 'get_flavor_by_flavor_id')
@mock.patch.object(compute_utils, 'upsize_quota_delta')
@mock.patch.object(quotas_obj.Quotas, 'count_as_dict')
@ -2207,6 +2225,8 @@ class _ComputeAPIUnitTestMixIn(object):
fake_inst, flavor_id='flavor-id')
self.assertFalse(mock_save.called)
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch.object(flavors, 'get_flavor_by_flavor_id')
@mock.patch.object(objects.Quotas, 'count_as_dict')
@mock.patch.object(objects.Quotas, 'limit_check_project_and_user')
@ -3502,6 +3522,8 @@ class _ComputeAPIUnitTestMixIn(object):
lambda obj, context, image_id, **kwargs: self.fake_image)
return self.fake_image['id']
@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
def test_resize_with_disabled_auto_disk_config_fails(self):
fake_inst = self._create_instance_with_disabled_disk_config(
object=True)

View File

@ -0,0 +1,8 @@
---
fixes:
- |
This release contains a fix for `bug 1856925`_ such that ``resize`` and
``migrate`` server actions will be rejected with a 409 ``HTTPConflict``
response if the source compute service is down.
.. _bug 1856925: https://bugs.launchpad.net/nova/+bug/1856925