Block servers with vGPU and device profile in heal_allocations

Nested allocations are only partially supported by the nova-manage
placement heal_allocations CLI. This patch documents the missing
support and blocks healing instances that have a vGPU or Cyborg device
profile request in their embedded flavor. Blocking is needed because if
--force is used with such an instance, the tool could recreate an
allocation that ignores some of these resources.
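
For illustration only (not part of this change), the new guard boils
down to two flavor extra spec lookups. The helper name below is made
up, but the keys and the truthiness check match the code added in this
patch:

def flavor_triggers_heal_block(extra_specs):
    # heal_allocations now refuses such instances with exit code 8.
    return bool(
        extra_specs.get("resources:VGPU")
        or extra_specs.get("accel:device_profile")
    )

# e.g. flavor_triggers_heal_block({"resources:VGPU": "1"})        -> True
# e.g. flavor_triggers_heal_block({"hw:cpu_policy": "dedicated"}) -> False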

Change-Id: I89ac90d2ea8bc268940869dbbc90352bfad5c0de
Related-Bug: bug/1939020
Balazs Gibizer 2021-08-05 15:20:02 +02:00
parent 2ffd973860
commit 59c2262ca5
6 changed files with 110 additions and 9 deletions


@@ -180,8 +180,13 @@ things:
* `Migration-based allocations`_ would be lost if manually deleted during a
resize. These are allocations tracked by the migration resource record
on the source compute service during a migration.
* Healing allocations does not supported nested resource allocations before the
20.0.0 (Train) release.
* Healing allocations only partially supports nested allocations. Nested
  allocations due to Neutron ports having QoS policies are supported since the
  20.0.0 (Train) release, but nested allocations due to vGPU or Cyborg device
  profile requests in the flavor are not. Similarly, if you use
  ``provider.yaml`` files on compute hosts to define additional resources and
  those resources are defined on child resource providers, then instances
  using such resources are not supported.
If you do use the ``heal_allocations`` command to cleanup allocations for a
specific trouble instance, it is recommended to take note of what the
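
To illustrate why forced healing is unsafe for the unsupported cases in the
new bullet above (a sketch only; the provider UUIDs and amounts below are
invented): a vGPU instance has a nested allocation spanning the root compute
node provider and a child provider, while an allocation rebuilt purely from
the flavor would only target the root provider and silently drop the vGPU
part.

# Nested allocation as it exists in Placement for a vGPU instance:
existing_allocation = {
    "allocations": {
        "<root-compute-node-rp-uuid>": {
            "resources": {"VCPU": 2, "MEMORY_MB": 4096, "DISK_GB": 20}},
        "<child-pgpu-rp-uuid>": {
            "resources": {"VGPU": 1}},
    },
}

# What a forced heal based only on the flavor would recreate:
recreated_allocation = {
    "allocations": {
        "<root-compute-node-rp-uuid>": {
            "resources": {"VCPU": 2, "MEMORY_MB": 4096, "DISK_GB": 20}},
    },
}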


@@ -1144,6 +1144,15 @@ state transition. For each instance found, allocations are created against
the compute node resource provider for that instance based on the flavor
associated with the instance.
.. note::
Nested allocations are only partially supported. Nested allocations due to
Neutron ports having QoS policies are supported since the 20.0.0 (Train)
release, but nested allocations due to vGPU or Cyborg device profile
requests in the flavor are not. Similarly, if you use ``provider.yaml``
files on compute hosts to define additional resources and those resources
are defined on child resource providers, then instances using such
resources are not supported.
Also if the instance has any port attached that has resource request
(e.g. :neutron-doc:`Quality of Service (QoS): Guaranteed Bandwidth
<admin/config-qos-min-bw.html>`) but the corresponding
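
To make the distinction in the note above concrete (illustrative values
only): nested allocations that originate from a Neutron port's
resource_request can be rebuilt by the tool, while requests that only exist
as flavor extra specs cannot.

# Supported: the port itself describes the nested resource need, e.g. a
# QoS minimum bandwidth port (example values):
qos_port_resource_request = {
    "resources": {
        "NET_BW_EGR_KILOBIT_PER_SEC": 1000,
        "NET_BW_IGR_KILOBIT_PER_SEC": 1000,
    },
    "required": ["CUSTOM_PHYSNET_PHYSNET0", "CUSTOM_VNIC_TYPE_NORMAL"],
}

# Not supported: the need is only expressed in the flavor, so healing is
# refused rather than guessing the right child provider:
blocked_flavor_extra_specs = {
    "resources:VGPU": "1",
    "accel:device_profile": "myprofile",
}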


@@ -1715,6 +1715,18 @@ class PlacementCommands(object):
allocations['user_id'] = instance.user_id
return allocations
@staticmethod
def ensure_instance_has_no_vgpu_request(instance):
if instance.flavor.extra_specs.get("resources:VGPU"):
raise exception.HealvGPUAllocationNotSupported(
instance_uuid=instance.uuid)
@staticmethod
def ensure_instance_has_no_cyborg_device_profile_request(instance):
if instance.flavor.extra_specs.get("accel:device_profile"):
raise exception.HealDeviceProfileAllocationNotSupported(
instance_uuid=instance.uuid)
def _heal_allocations_for_instance(self, ctxt, instance, node_cache,
output, placement, dry_run,
heal_port_allocations, neutron,
@@ -1771,6 +1783,9 @@ class PlacementCommands(object):
output(_('Instance %s is not on a host.') % instance.uuid)
return
self.ensure_instance_has_no_vgpu_request(instance)
self.ensure_instance_has_no_cyborg_device_profile_request(instance)
try:
allocations = placement.get_allocs_for_consumer(
ctxt, instance.uuid)
@@ -1887,7 +1902,7 @@ class PlacementCommands(object):
:param max_count: batch size (limit per instance query)
:param unlimited: True if all instances in the cell should be
processed, else False to just process $max_count instances
:param outout: function that takes a single message for verbose output
:param output: function that takes a single message for verbose output
:param placement: nova.scheduler.client.report.SchedulerReportClient
to communicate with the Placement service API.
:param dry_run: Process instances and print output but do not commit
@@ -2027,6 +2042,7 @@ class PlacementCommands(object):
* 5: Unable to query ports from neutron
* 6: Unable to update ports in neutron
* 7: Cannot roll back neutron port updates. Manual steps needed.
* 8: Cannot heal instance with vGPU or Cyborg resource request
* 127: Invalid input.
"""
# NOTE(mriedem): Thoughts on ways to expand this:
@@ -2155,6 +2171,12 @@ class PlacementCommands(object):
except exception.UnableToRollbackPortUpdates as e:
print(e.format_message())
return 7
except (
exception.HealvGPUAllocationNotSupported,
exception.HealDeviceProfileAllocationNotSupported,
) as e:
print(e.format_message())
return 8
# Make sure we don't go over the max count. Note that we
# don't include instances that already have allocations in the
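
As a usage illustration (not part of the patch): an operator-side wrapper
can react to the new return code. The nova-manage options used below already
exist; only the helper itself is hypothetical.

import subprocess

def needs_manual_healing(instance_uuid):
    """Heal one instance; return True if it was rejected with exit code 8."""
    result = subprocess.run(
        ["nova-manage", "placement", "heal_allocations",
         "--instance", instance_uuid, "--verbose"])
    # 8 is the return code added by this patch for instances with a vGPU
    # or Cyborg device profile request in the embedded flavor.
    return result.returncode == 8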


@@ -2235,6 +2235,24 @@ class MissingDomainCapabilityFeatureException(NovaException):
"including <%(feature)s> feature.")
class HealAllocationException(NovaException):
msg_fmt = _("Healing instance allocation failed.")
class HealvGPUAllocationNotSupported(HealAllocationException):
msg_fmt = _(
"Healing allocation for instance %(instance_uuid)s with vGPU resource "
"request is not supported."
)
class HealDeviceProfileAllocationNotSupported(HealAllocationException):
msg_fmt = _(
"Healing allocation for instance %(instance_uuid)s with Cyborg device "
"profile request is not supported."
)
class HealPortAllocationException(NovaException):
msg_fmt = _("Healing port allocation failed.")


@@ -780,6 +780,50 @@ class TestNovaManagePlacementHealAllocations(
)
self.assertEqual(4, result, self.output.getvalue())
def test_instance_with_vgpu_is_blocked(self):
# We cannot boot with vGPU in these tests, so we manipulate
# instance.flavor directly after the boot to simulate an instance with
# a vGPU request.
server, _ = self._boot_and_remove_allocations(self.flavor, 'cell1')
instance = objects.Instance.get_by_uuid(
context.get_admin_context(), server['id'])
instance.flavor.extra_specs["resources:VGPU"] = 1
instance.save()
result = self.cli.heal_allocations(
verbose=True, instance_uuid=server['id'],
force=True
)
self.assertIn(
f"Healing allocation for instance {server['id']} with vGPU "
f"resource request is not supported.",
self.output.getvalue()
)
self.assertEqual(8, result, self.output.getvalue())
def test_instance_with_cyborg_dev_profile_is_blocked(self):
# We cannot boot with a Cyborg device in these tests, so we manipulate
# instance.flavor directly after the boot to simulate an instance with
# a Cyborg device profile request.
server, _ = self._boot_and_remove_allocations(self.flavor, 'cell1')
instance = objects.Instance.get_by_uuid(
context.get_admin_context(), server['id'])
instance.flavor.extra_specs["accel:device_profile"] = "foo"
instance.save()
result = self.cli.heal_allocations(
verbose=True, instance_uuid=server['id'],
force=True
)
self.assertIn(
f"Healing allocation for instance {server['id']} with Cyborg "
f"device profile request is not supported.",
self.output.getvalue()
)
self.assertEqual(8, result, self.output.getvalue())
class TestNovaManagePlacementHealPortAllocations(
test_servers.PortResourceRequestBasedSchedulingTestBase):


@@ -2399,7 +2399,8 @@ class TestNovaManagePlacement(test.NoDBTestCase):
return_value=objects.InstanceList(objects=[
objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None)]))
task_state=None,
flavor=objects.Flavor(extra_specs={}))]))
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocs_for_consumer', return_value={})
@mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename',
@@ -2419,7 +2420,7 @@ class TestNovaManagePlacement(test.NoDBTestCase):
return_value=objects.InstanceList(objects=[
objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None, flavor=objects.Flavor(),
task_state=None, flavor=objects.Flavor(extra_specs={}),
project_id='fake-project', user_id='fake-user')]))
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocs_for_consumer', return_value={})
@@ -2463,7 +2464,7 @@ class TestNovaManagePlacement(test.NoDBTestCase):
new=mock.Mock(return_value=objects.InstanceList(objects=[
objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None, flavor=objects.Flavor(),
task_state=None, flavor=objects.Flavor(extra_specs={}),
project_id='fake-project', user_id='fake-user')])))
def test_heal_allocations_get_allocs_placement_fails(self):
self.assertEqual(3, self.cli.heal_allocations())
@@ -2480,7 +2481,7 @@ class TestNovaManagePlacement(test.NoDBTestCase):
side_effect=[
objects.InstanceList(objects=[objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None, flavor=objects.Flavor(),
task_state=None, flavor=objects.Flavor(extra_specs={}),
project_id='fake-project', user_id='fake-user')]),
objects.InstanceList(objects=[])])
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
@@ -2505,7 +2506,8 @@ class TestNovaManagePlacement(test.NoDBTestCase):
objects.Instance(
uuid=uuidsentinel.instance, host='fake',
node='fake', task_state=None,
project_id='fake-project', user_id='fake-user')]),
project_id='fake-project', user_id='fake-user',
flavor=objects.Flavor(extra_specs={}))]),
objects.InstanceList()))
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocs_for_consumer')
@@ -2559,7 +2561,8 @@ class TestNovaManagePlacement(test.NoDBTestCase):
objects.Instance(
uuid=uuidsentinel.instance, host='fake', node='fake',
task_state=None, project_id='fake-project',
user_id='fake-user')]))
user_id='fake-user',
flavor=objects.Flavor(extra_specs={}))]))
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocs_for_consumer')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.put',