Block servers with vGPU and device profile in heal_allocations

Nested allocations are only partially supported by the nova-manage
placement heal_allocations CLI. This patch documents the missing
support and blocks healing instances that have a vGPU or Cyborg device
profile request in their embedded flavor. Blocking is needed because if
--force is used with such an instance, the tool could recreate an
allocation that ignores some of these resources.
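
For illustration only (not part of this change), the new guard boils
down to two flavor extra spec lookups. The helper name below is made
up, but the keys and the truthiness check match the code added in this
patch:

def flavor_triggers_heal_block(extra_specs):
    # heal_allocations now refuses such instances with exit code 8.
    return bool(
        extra_specs.get("resources:VGPU")
        or extra_specs.get("accel:device_profile")
    )

# e.g. flavor_triggers_heal_block({"resources:VGPU": "1"})        -> True
# e.g. flavor_triggers_heal_block({"hw:cpu_policy": "dedicated"}) -> False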

Change-Id: I89ac90d2ea8bc268940869dbbc90352bfad5c0de
Related-Bug: bug/1939020
Balazs Gibizer 2021-08-05 15:20:02 +02:00
parent 2ffd973860
commit 59c2262ca5
6 changed files with 110 additions and 9 deletions


@@ -180,8 +180,13 @@ things:
* `Migration-based allocations`_ would be lost if manually deleted during a
resize. These are allocations tracked by the migration resource record
on the source compute service during a migration.
* Healing allocations does not supported nested resource allocations before the
20.0.0 (Train) release.
* Healing allocations only partially supports nested allocations. Nested
  allocations due to Neutron ports having QoS policies are supported since the
  20.0.0 (Train) release, but nested allocations due to vGPU or Cyborg device
  profile requests in the flavor are not. Similarly, if you use
  ``provider.yaml`` files on compute hosts to define additional resources and
  those resources are defined on child resource providers, then instances
  using such resources are not supported.
If you do use the ``heal_allocations`` command to cleanup allocations for a
specific trouble instance, it is recommended to take note of what the
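
To illustrate why forced healing is unsafe for the unsupported cases in the
new bullet above (a sketch only; the provider UUIDs and amounts below are
invented): a vGPU instance has a nested allocation spanning the root compute
node provider and a child provider, while an allocation rebuilt purely from
the flavor would only target the root provider and silently drop the vGPU
part.

# Nested allocation as it exists in Placement for a vGPU instance:
existing_allocation = {
    "allocations": {
        "<root-compute-node-rp-uuid>": {
            "resources": {"VCPU": 2, "MEMORY_MB": 4096, "DISK_GB": 20}},
        "<child-pgpu-rp-uuid>": {
            "resources": {"VGPU": 1}},
    },
}

# What a forced heal based only on the flavor would recreate:
recreated_allocation = {
    "allocations": {
        "<root-compute-node-rp-uuid>": {
            "resources": {"VCPU": 2, "MEMORY_MB": 4096, "DISK_GB": 20}},
    },
}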


@@ -1144,6 +1144,15 @@ state transition. For each instance found, allocations are created against
the compute node resource provider for that instance based on the flavor
associated with the instance.
.. note::
Nested allocations are only partially supported. Nested allocations due to
Neutron ports having QoS policies are supported since the 20.0.0 (Train)
release, but nested allocations due to vGPU or Cyborg device profile
requests in the flavor are not. Similarly, if you use ``provider.yaml``
files on compute hosts to define additional resources and those resources
are defined on child resource providers, then instances using such
resources are not supported.
Also if the instance has any port attached that has resource request
(e.g. :neutron-doc:`Quality of Service (QoS): Guaranteed Bandwidth
<admin/config-qos-min-bw.html>`) but the corresponding
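
To make the distinction in the note above concrete (illustrative values
only): nested allocations that originate from a Neutron port's
resource_request can be rebuilt by the tool, while requests that only exist
as flavor extra specs cannot.

# Supported: the port itself describes the nested resource need, e.g. a
# QoS minimum bandwidth port (example values):
qos_port_resource_request = {
    "resources": {
        "NET_BW_EGR_KILOBIT_PER_SEC": 1000,
        "NET_BW_IGR_KILOBIT_PER_SEC": 1000,
    },
    "required": ["CUSTOM_PHYSNET_PHYSNET0", "CUSTOM_VNIC_TYPE_NORMAL"],
}

# Not supported: the need is only expressed in the flavor, so healing is
# refused rather than guessing the right child provider:
blocked_flavor_extra_specs = {
    "resources:VGPU": "1",
    "accel:device_profile": "myprofile",
}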


@@ -1715,6 +1715,18 @@ class PlacementCommands(object):
allocations['user_id'] = instance.user_id
return allocations
@staticmethod
def ensure_instance_has_no_vgpu_request(instance):
if instance.flavor.extra_specs.get("resources:VGPU"):
raise exception.HealvGPUAllocationNotSupported(
instance_uuid=instance.uuid)
@staticmethod
def ensure_instance_has_no_cyborg_device_profile_request(instance):
if instance.flavor.extra_specs.get("accel:device_profile"):
raise exception.HealDeviceProfileAllocationNotSupported(
instance_uuid=instance.uuid)
def _heal_allocations_for_instance(self, ctxt, instance, node_cache,
output, placement, dry_run,
heal_port_allocations, neutron,
@@ -1771,6 +1783,9 @@ class PlacementCommands(object):
output(_('Instance %s is not on a host.') % instance.uuid)
return
self.ensure_instance_has_no_vgpu_request(instance)
self.ensure_instance_has_no_cyborg_device_profile_request(instance)
try:
allocations = placement.get_allocs_for_consumer(
ctxt, instance.uuid)
@@ -1887,7 +1902,7 @@ class PlacementCommands(object):
:param max_count: batch size (limit per instance query)
:param unlimited: True if all instances in the cell should be
processed, else False to just process $max_count instances
:param outout: function that takes a single message for verbose output
:param output: function that takes a single message for verbose output
:param placement: nova.scheduler.client.report.SchedulerReportClient
to communicate with the Placement service API.
:param dry_run: Process instances and print output but do not commit
@@ -2027,6 +2042,7 @@ class PlacementCommands(object):
* 5: Unable to query ports from neutron
* 6: Unable to update ports in neutron
* 7: Cannot roll back neutron port updates. Manual steps needed.
* 8: Cannot heal instance with vGPU or Cyborg resource request
* 127: Invalid input.
"""
# NOTE(mriedem): Thoughts on ways to expand this:
@@ -2155,6 +2171,12 @@ class PlacementCommands(object):
except exception.UnableToRollbackPortUpdates as e:
print(e.format_message())
return 7
except (
exception.HealvGPUAllocationNotSupported,
exception.HealDeviceProfileAllocationNotSupported,
) as e:
print(e.format_message())
return 8
# Make sure we don't go over the max count. Note that we
# don't include instances that already have allocations in the
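
As a usage illustration (not part of the patch): an operator-side wrapper
can react to the new return code. The nova-manage options used below already
exist; only the helper itself is hypothetical.

import subprocess

def needs_manual_healing(instance_uuid):
    """Heal one instance; return True if it was rejected with exit code 8."""
    result = subprocess.run(
        ["nova-manage", "placement", "heal_allocations",
         "--instance", instance_uuid, "--verbose"])
    # 8 is the return code added by this patch for instances with a vGPU
    # or Cyborg device profile request in the embedded flavor.
    return result.returncode == 8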


@@ -2235,6 +2235,24 @@ class MissingDomainCapabilityFeatureException(NovaException):
"including <%(feature)s> feature.")
class HealAllocationException(NovaException):
msg_fmt = _("Healing instance allocation failed.")
class HealvGPUAllocationNotSupported(HealAllocationException):
msg_fmt = _(
"Healing allocation for instance %(instance_uuid)s with vGPU resource "
"request is not supported."
)
class HealDeviceProfileAllocationNotSupported(HealAllocationException):
msg_fmt = _(
"Healing allocation for instance %(instance_uuid)s with Cyborg device "
"profile request is not supported."
)
class HealPortAllocationException(NovaException):
msg_fmt = _("Healing port allocation failed.")


@@ -780,6 +780,50 @@ class TestNovaManagePlacementHealAllocations(
)
self.assertEqual(4, result, self.output.getvalue())
def test_instance_with_vgpu_is_blocked(self):
# We cannot boot with vGPU in these tests, so we manipulate
# instance.flavor directly after the boot to simulate an instance with
# a vGPU request.
server, _ = self._boot_and_remove_allocations(self.flavor, 'cell1')
instance = objects.Instance.get_by_uuid(
context.get_admin_context(), server['id'])
instance.flavor.extra_specs["resources:VGPU"] = 1
instance.save()
result = self.cli.heal_allocations(
verbose=True, instance_uuid=server['id'],
force=True
)
self.assertIn(
f"Healing allocation for instance {server['id']} with vGPU "
f"resource request is not supported.",
self.output.getvalue()
)
self.assertEqual(8, result, self.output.getvalue())
def test_instance_with_cyborg_dev_profile_is_blocked(self):
# We cannot boot with a Cyborg device in these tests, so we manipulate
# instance.flavor directly after the boot to simulate an instance with
# a Cyborg device profile request.
server, _ = self._boot_and_remove_allocations(self.flavor, 'cell1')
instance = objects.Instance.get_by_uuid(
context.get_admin_context(), server['id'])
instance.flavor.extra_specs["accel:device_profile"] = "foo"
instance.save()
result = self.cli.heal_allocations(
verbose=True, instance_uuid=server['id'],
force=True
)
self.assertIn(
f"Healing allocation for instance {server['id']} with Cyborg "
f"device profile request is not supported.",
self.output.getvalue()
)
self.assertEqual(8, result, self.output.getvalue())
class TestNovaManagePlacementHealPortAllocations(
test_servers.PortResourceRequestBasedSchedulingTestBase):


@@ -2399,7 +2399,8 @@ class TestNovaManagePlacement(test.NoDBTestCase):
return_value=objects.InstanceList(objects=[
objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None)]))
task_state=None,
flavor=objects.Flavor(extra_specs={}))]))
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocs_for_consumer', return_value={})
@mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename',
@@ -2419,7 +2420,7 @@ class TestNovaManagePlacement(test.NoDBTestCase):
return_value=objects.InstanceList(objects=[
objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None, flavor=objects.Flavor(),
task_state=None, flavor=objects.Flavor(extra_specs={}),
project_id='fake-project', user_id='fake-user')]))
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocs_for_consumer', return_value={})
@@ -2463,7 +2464,7 @@ class TestNovaManagePlacement(test.NoDBTestCase):
new=mock.Mock(return_value=objects.InstanceList(objects=[
objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None, flavor=objects.Flavor(),
task_state=None, flavor=objects.Flavor(extra_specs={}),
project_id='fake-project', user_id='fake-user')])))
def test_heal_allocations_get_allocs_placement_fails(self):
self.assertEqual(3, self.cli.heal_allocations())
@@ -2480,7 +2481,7 @@ class TestNovaManagePlacement(test.NoDBTestCase):
side_effect=[
objects.InstanceList(objects=[objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None, flavor=objects.Flavor(),
task_state=None, flavor=objects.Flavor(extra_specs={}),
project_id='fake-project', user_id='fake-user')]),
objects.InstanceList(objects=[])])
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
@@ -2505,7 +2506,8 @@ class TestNovaManagePlacement(test.NoDBTestCase):
objects.Instance(
uuid=uuidsentinel.instance, host='fake',
node='fake', task_state=None,
project_id='fake-project', user_id='fake-user')]),
project_id='fake-project', user_id='fake-user',
flavor=objects.Flavor(extra_specs={}))]),
objects.InstanceList()))
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocs_for_consumer')
@@ -2559,7 +2561,8 @@ class TestNovaManagePlacement(test.NoDBTestCase):
objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None, project_id='fake-project',
user_id='fake-user')]))
user_id='fake-user',
flavor=objects.Flavor(extra_specs={}))]))
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocs_for_consumer')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.put',