Merge "Delete ARQs by UUID if Cyborg ARQ bind fails."
This commit is contained in:
commit
ccecb507ff
|
@ -212,10 +212,11 @@ class _CyborgClient(object):
|
|||
resp, err_msg = self._call_cyborg(self._client.patch,
|
||||
self.ARQ_URL, json=patch_list)
|
||||
if err_msg:
|
||||
arq_uuids = bindings.keys()
|
||||
msg = _(' Binding failed for ARQ UUIDs: ')
|
||||
err_msg = err_msg + msg + ','.join(bindings.keys())
|
||||
raise exception.AcceleratorRequestOpFailed(
|
||||
op=_('bind'), msg=err_msg)
|
||||
err_msg = err_msg + msg + ','.join(arq_uuids)
|
||||
raise exception.AcceleratorRequestBindingFailed(
|
||||
arqs=arq_uuids, msg=err_msg)
|
||||
|
||||
def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
|
||||
"""Get ARQs for the instance.
|
||||
|
@ -278,3 +279,26 @@ class _CyborgClient(object):
|
|||
msg = err_msg + _(' Instance %s') % instance_uuid
|
||||
raise exception.AcceleratorRequestOpFailed(
|
||||
op=_('delete'), msg=msg)
|
||||
|
||||
def delete_arqs_by_uuid(self, arq_uuids):
    """Delete the given ARQs, unbinding them first if needed.

    Intended for cleaning up ARQs whose binding to an instance has
    failed, which is why delete_arqs_for_instance() is not applicable.

    This Cyborg API call is NOT idempotent: calling it a second (or
    later) time for the same ARQs will throw errors.

    Failures are logged rather than raised, because this only cleans
    up Cyborg resources and should not otherwise affect instance
    spawn.

    :params arq_uuids: dict_keys() of ARQ UUIDs
    """
    uuids_csv = ','.join(arq_uuids)
    resp, err_msg = self._call_cyborg(
        self._client.delete, self.ARQ_URL, params={'arqs': uuids_csv})
    if err_msg:
        # Best effort only: no point raising an exception.
        LOG.error('Failed to delete ARQs %s', uuids_csv)
|
||||
|
|
|
@ -838,15 +838,13 @@ class ComputeTaskManager(base.Base):
|
|||
host.service_host, host.nodename, alts, instance=instance)
|
||||
|
||||
try:
|
||||
resource_provider_mapping = (
|
||||
local_reqspec.get_request_group_mapping())
|
||||
accel_uuids = self._create_and_bind_arqs(
|
||||
context, instance.uuid, instance.flavor.extra_specs,
|
||||
host.nodename, resource_provider_mapping)
|
||||
accel_uuids = self._create_and_bind_arq_for_instance(
|
||||
context, instance, host, local_reqspec)
|
||||
except Exception as exc:
|
||||
LOG.exception('Failed to reschedule. Reason: %s', exc)
|
||||
self._cleanup_when_reschedule_fails(context, instance, exc,
|
||||
legacy_request_spec, requested_networks)
|
||||
self._cleanup_when_reschedule_fails(
|
||||
context, instance, exc, legacy_request_spec,
|
||||
requested_networks)
|
||||
continue
|
||||
|
||||
self.compute_rpcapi.build_and_run_instance(context,
|
||||
|
@ -861,6 +859,22 @@ class ComputeTaskManager(base.Base):
|
|||
limits=host.limits, host_list=host_list,
|
||||
accel_uuids=accel_uuids)
|
||||
|
||||
def _create_and_bind_arq_for_instance(self, context, instance, host,
                                      request_spec):
    """Create and bind accelerator requests (ARQs) for an instance.

    Delegates to _create_and_bind_arqs; on binding failure, deletes
    the ARQ UUIDs carried by the exception via Cyborg before
    re-raising, so no orphaned ARQs are left behind.

    :returns: the result of _create_and_bind_arqs (presumably the
        accelerator request UUIDs — callers assign it to accel_uuids).
    :raises: AcceleratorRequestBindingFailed, re-raised after cleanup.
    """
    try:
        resource_provider_mapping = (
            request_spec.get_request_group_mapping())
        # Using nodename instead of hostname. See:
        # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
        return self._create_and_bind_arqs(
            context, instance.uuid, instance.flavor.extra_specs,
            host.nodename, resource_provider_mapping)
    except exception.AcceleratorRequestBindingFailed as exc:
        # If anything failed here we need to cleanup and bail out.
        cyclient = cyborg.get_client(context)
        cyclient.delete_arqs_by_uuid(exc.arqs)
        raise
|
||||
|
||||
def _schedule_instances(self, context, request_spec,
|
||||
instance_uuids=None, return_alternates=False):
|
||||
scheduler_utils.setup_instance_group(context, request_spec)
|
||||
|
@ -1619,17 +1633,10 @@ class ComputeTaskManager(base.Base):
|
|||
# this one.
|
||||
continue
|
||||
|
||||
accel_uuids = []
|
||||
try:
|
||||
resource_provider_mapping = (
|
||||
request_spec.get_request_group_mapping())
|
||||
# Using nodename instead of hostname. See:
|
||||
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
|
||||
accel_uuids = self._create_and_bind_arqs(
|
||||
context, instance.uuid, instance.flavor.extra_specs,
|
||||
host.nodename, resource_provider_mapping)
|
||||
accel_uuids = self._create_and_bind_arq_for_instance(
|
||||
context, instance, host, request_spec)
|
||||
except Exception as exc:
|
||||
# If anything failed here we need to cleanup and bail out.
|
||||
with excutils.save_and_reraise_exception():
|
||||
self._cleanup_build_artifacts(
|
||||
context, exc, instances, build_requests, request_specs,
|
||||
|
|
|
@ -2320,6 +2320,14 @@ class AcceleratorRequestOpFailed(NovaException):
|
|||
msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s")
|
||||
|
||||
|
||||
class AcceleratorRequestBindingFailed(NovaException):
    """Binding of accelerator requests (ARQs) to an instance failed.

    Carries the affected ARQ UUIDs so callers can delete exactly
    those ARQs during cleanup.
    """
    msg_fmt = _("Failed to bind accelerator requests: %(msg)s")

    def __init__(self, arqs, **kwargs):
        # NOTE(review): passing message=self.msg_fmt hands the raw
        # format string to NovaException, which may skip %(msg)s
        # interpolation — confirm against NovaException.__init__.
        super().__init__(message=self.msg_fmt, **kwargs)
        # ARQ UUIDs whose binding failed; defaults to [] when falsy.
        self.arqs = arqs or []
|
||||
|
||||
|
||||
class InvalidLibvirtGPUConfig(NovaException):
    # Raised for malformed GPU device configuration; %(reason)s
    # carries the specific validation failure.
    msg_fmt = _('Invalid configuration for GPU devices: %(reason)s')
|
||||
|
||||
|
|
|
@ -250,13 +250,15 @@ class CyborgTestCase(test.NoDBTestCase):
|
|||
called_params = mock_cyborg_patch.call_args.kwargs['json']
|
||||
self.assertEqual(sorted(called_params), sorted(patch_list))
|
||||
|
||||
@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_bind_arqs_exception(self, mock_call_cyborg, mock_del_arqs):
    """A Cyborg error raises AcceleratorRequestBindingFailed.

    The client must not delete the ARQs itself — cleanup is the
    caller's responsibility — so delete_arqs_by_uuid must stay
    uncalled.
    """
    # If Cyborg returns invalid response, raise exception.
    bindings, _ = self._get_bound_arqs()
    mock_call_cyborg.return_value = None, 'Some error'
    self.assertRaises(exception.AcceleratorRequestBindingFailed,
                      self.client.bind_arqs, bindings=bindings)
    mock_del_arqs.assert_not_called()
|
||||
|
||||
@mock.patch('keystoneauth1.adapter.Adapter.get')
|
||||
def test_get_arqs_for_instance(self, mock_cyborg_get):
|
||||
|
@ -368,3 +370,27 @@ class CyborgTestCase(test.NoDBTestCase):
|
|||
expected_msg = ('Failed to delete accelerator requests: ' +
|
||||
err_msg + ' Instance ' + instance_uuid)
|
||||
self.assertEqual(expected_msg, exc.format_message())
|
||||
|
||||
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_by_uuid(self, mock_call_cyborg):
    """delete_arqs_by_uuid makes a single Cyborg call with the
    comma-joined UUID string as the 'arqs' parameter.
    """
    # Happy path
    mock_call_cyborg.return_value = ('Some Value', None)
    _, bound_arqs = self._get_bound_arqs()
    arq_uuids = [arq['uuid'] for arq in bound_arqs]
    arq_uuid_str = ','.join(arq_uuids)
    self.client.delete_arqs_by_uuid(arq_uuids)
    # mock.ANY stands in for the HTTP method (client.delete).
    mock_call_cyborg.assert_called_once_with(mock.ANY,
        self.client.ARQ_URL, params={'arqs': arq_uuid_str})
|
||||
|
||||
@mock.patch('nova.accelerator.cyborg.LOG.error')
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_by_uuid_exception(self, mock_call_cyborg, mock_log):
    """A Cyborg error during ARQ deletion is logged, not raised."""
    mock_call_cyborg.return_value = (None, 'Some error')
    _, bound_arqs = self._get_bound_arqs()
    arq_uuids = [arq['uuid'] for arq in bound_arqs]
    arq_uuid_str = ','.join(arq_uuids)
    self.client.delete_arqs_by_uuid(arq_uuids)
    # The delete attempt still goes through exactly once ...
    mock_call_cyborg.assert_called_once_with(mock.ANY,
        self.client.ARQ_URL, params={'arqs': arq_uuid_str})
    # ... and the failure surfaces only as an error log.
    mock_log.assert_called_once_with('Failed to delete ARQs %s',
                                     arq_uuid_str)
|
||||
|
|
|
@ -26,6 +26,7 @@ from oslo_utils import timeutils
|
|||
from oslo_versionedobjects import exception as ovo_exc
|
||||
import six
|
||||
|
||||
from nova.accelerator import cyborg
|
||||
from nova import block_device
|
||||
from nova.compute import flavors
|
||||
from nova.compute import rpcapi as compute_rpcapi
|
||||
|
@ -48,6 +49,7 @@ from nova import objects
|
|||
from nova.objects import base as obj_base
|
||||
from nova.objects import block_device as block_device_obj
|
||||
from nova.objects import fields
|
||||
from nova.objects import request_spec
|
||||
from nova.scheduler.client import query
|
||||
from nova.scheduler import utils as scheduler_utils
|
||||
from nova import test
|
||||
|
@ -583,7 +585,7 @@ class _BaseTaskTestCase(object):
|
|||
|
||||
mock_getaz.return_value = 'myaz'
|
||||
mock_create_bind_arqs.side_effect = (
|
||||
exc.AcceleratorRequestOpFailed(op='', msg=''))
|
||||
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
|
||||
|
||||
self.conductor.build_instances(self.context,
|
||||
instances=instances,
|
||||
|
@ -605,10 +607,10 @@ class _BaseTaskTestCase(object):
|
|||
# in the above flow. So, we compare the fields instead.
|
||||
mock_cleanup.assert_has_calls([
|
||||
mock.call(self.context, test.MatchType(objects.Instance),
|
||||
test.MatchType(exc.AcceleratorRequestOpFailed),
|
||||
test.MatchType(exc.AcceleratorRequestBindingFailed),
|
||||
test.MatchType(dict), None),
|
||||
mock.call(self.context, test.MatchType(objects.Instance),
|
||||
test.MatchType(exc.AcceleratorRequestOpFailed),
|
||||
test.MatchType(exc.AcceleratorRequestBindingFailed),
|
||||
test.MatchType(dict), None),
|
||||
])
|
||||
call_list = mock_cleanup.call_args_list
|
||||
|
@ -2548,11 +2550,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
|
|||
self, mock_create_bind_arqs, mock_cleanup):
|
||||
# Exceptions in _create_and_bind_arqs result in cleanup
|
||||
mock_create_bind_arqs.side_effect = (
|
||||
exc.AcceleratorRequestOpFailed(op='', msg=''))
|
||||
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
|
||||
|
||||
try:
|
||||
self._do_schedule_and_build_instances_test(self.params)
|
||||
except exc.AcceleratorRequestOpFailed:
|
||||
except exc.AcceleratorRequestBindingFailed:
|
||||
pass
|
||||
|
||||
mock_cleanup.assert_called_once_with(
|
||||
|
@ -2561,6 +2563,26 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
|
|||
self.params['block_device_mapping'], self.params['tags'],
|
||||
mock.ANY)
|
||||
|
||||
@mock.patch.object(request_spec.RequestSpec, "get_request_group_mapping")
@mock.patch.object(cyborg, "get_client")
@mock.patch.object(
    conductor_manager.ComputeTaskManager, '_create_and_bind_arqs')
def test__create_and_bind_arq_for_instance(
        self, mock_create_bind_arqs, mock_client, mock_request_mappings):
    """On binding failure, the conductor deletes the failed ARQs
    via the Cyborg client and re-raises the exception.
    """
    # Exceptions in _create_and_bind_arqs result in cleanup
    arqs = ["fake-arq-uuid"]
    mock_create_bind_arqs.side_effect = (
        exc.AcceleratorRequestBindingFailed(arqs=arqs, msg=''))
    mock_client.return_value = mock.Mock()
    instance = mock.Mock()
    instance.uuid = "fake-uuid"
    # The exception must propagate to the caller ...
    ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
        self.conductor._create_and_bind_arq_for_instance,
        None, instance, mock.Mock(), request_spec.RequestSpec())

    self.assertIn('Failed to bind accelerator requests', ex.message)
    # ... after cleanup of exactly the ARQs named in the exception.
    mock_client.return_value.delete_arqs_by_uuid.assert_called_with(arqs)
|
||||
|
||||
def test_map_instance_to_cell_already_mapped(self):
|
||||
"""Tests a scenario where an instance is already mapped to a cell
|
||||
during scheduling.
|
||||
|
|
Loading…
Reference in New Issue