From 60851e44649e463bfda25d9dea84443467e4a30c Mon Sep 17 00:00:00 2001
From: Sylvain Bauza
Date: Mon, 30 Oct 2023 18:11:46 +0100
Subject: [PATCH] libvirt: Cap with max_instances GPU types

We want to cap the maximum number of mdevs we can create per type. If
some type already has enough capacity, then other GPUs supporting it
won't be used and their existing ResourceProviders will be deleted.

Closes-Bug: #2041519
Change-Id: I069879a333152bb849c248b3dcb56357a11d0324
---
 nova/conf/devices.py                        |  9 +++
 nova/tests/functional/libvirt/test_vgpu.py  | 78 +++++++++++++++++++++
 nova/tests/unit/virt/libvirt/test_driver.py | 41 +++++++++++
 nova/virt/libvirt/driver.py                 | 48 +++++++++++--
 4 files changed, 170 insertions(+), 6 deletions(-)

diff --git a/nova/conf/devices.py b/nova/conf/devices.py
index f3ef187c0201..265aa1d02ba0 100644
--- a/nova/conf/devices.py
+++ b/nova/conf/devices.py
@@ -89,6 +89,15 @@ def register_dynamic_opts(conf):
                  'CUSTOM_ if it is not VGPU.')
         conf.register_opt(class_opt, group='mdev_%s' % mdev_type)
 
+        # Register the '[mdev_$(MDEV_TYPE)]/max_instances' opts
+        max_inst_opt = cfg.IntOpt(
+            'max_instances',
+            default=None, min=1,
+            help='Maximum number of mediated devices that can be created '
+                 'using this type. If not set, we use the maximum allowed '
+                 'by the type.')
+        conf.register_opt(max_inst_opt, group='mdev_%s' % mdev_type)
+
 
 def list_opts():
     return {devices_group: mdev_opts}
diff --git a/nova/tests/functional/libvirt/test_vgpu.py b/nova/tests/functional/libvirt/test_vgpu.py
index 8f108d216b89..0fcdf47e9189 100644
--- a/nova/tests/functional/libvirt/test_vgpu.py
+++ b/nova/tests/functional/libvirt/test_vgpu.py
@@ -429,6 +429,84 @@ class VGPUMultipleTypesTests(VGPUTestBase):
             self.assertEqual(expected[trait], mdev_info['parent'])
 
 
+class VGPULimitMultipleTypesTests(VGPUTestBase):
+
+    def setUp(self):
+        super(VGPULimitMultipleTypesTests, self).setUp()
+        extra_spec = {"resources:VGPU": "1"}
+        self.flavor = self._create_flavor(extra_spec=extra_spec)
+
+        self.flags(
+            enabled_mdev_types=[fakelibvirt.NVIDIA_11_VGPU_TYPE,
+                                fakelibvirt.NVIDIA_12_VGPU_TYPE],
+            group='devices')
+        # we need to call the below again to ensure the updated
+        # 'device_addresses' value is read and the new groups created
+        nova.conf.devices.register_dynamic_opts(CONF)
+        # host1 will have 2 physical GPUs:
+        #  - 0000:81:00.0 will only support nvidia-11
+        #  - 0000:81:01.0 will only support nvidia-12
+        MDEVCAP_DEV1_PCI_ADDR = self.libvirt2pci_address(
+            fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
+        MDEVCAP_DEV2_PCI_ADDR = self.libvirt2pci_address(
+            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
+        self.flags(device_addresses=[MDEVCAP_DEV1_PCI_ADDR],
+                   group='mdev_nvidia-11')
+        self.flags(device_addresses=[MDEVCAP_DEV2_PCI_ADDR],
+                   group='mdev_nvidia-12')
+
+        # Start the compute with support for both types
+        pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
+            multiple_gpu_types=True)
+        self.compute1 = self.start_compute_with_vgpu('host1', pci_info)
+
+    def test_create_servers_with_vgpu(self):
+        physdev1_rp_uuid = self._get_provider_uuid_by_name(
+            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
+        physdev2_rp_uuid = self._get_provider_uuid_by_name(
+            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
+
+        # Just for asserting the inventories we currently have.
+        physdev1_inventory = self._get_provider_inventory(physdev1_rp_uuid)
+        self.assertEqual(16, physdev1_inventory[orc.VGPU]['total'])
+        physdev2_inventory = self._get_provider_inventory(physdev2_rp_uuid)
+        self.assertEqual(8, physdev2_inventory[orc.VGPU]['total'])
+
+        # Now, let's limit the capacity for the first type to 2
+        self.flags(max_instances=2, group='mdev_nvidia-11')
+        # Restart the compute service to update the Resource Providers
+        self.compute2 = self.restart_compute_service('host1')
+        # Make sure we can still create an instance
+        server = self._create_server(
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            flavor_id=self.flavor, networks='auto', host=self.compute1.host)
+        mdevs = self.compute1.driver._get_mediated_devices()
+        self.assertEqual(1, len(mdevs))
+
+        # ... but looking at Placement, only the 2nd GPU can now be used
+        # because nvidia-11 was limited to 2 while the GPU supporting it
+        # had a capacity of 8.
+        physdev2_inventory = self._get_provider_inventory(physdev2_rp_uuid)
+        self.assertEqual(8, physdev2_inventory[orc.VGPU]['total'])
+
+        # Get the instance we just created
+        inst = objects.Instance.get_by_uuid(self.context, server['id'])
+        expected_rp_name = (self.compute1.host + '_' +
+                            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
+        # Yes, indeed we use the 2nd GPU
+        self.assert_mdev_usage(self.compute1, expected_amount=1,
+                               expected_rc=orc.VGPU, instance=inst,
+                               expected_rp_name=expected_rp_name)
+        # ... and what happened to the first GPU inventory? Well, the whole
+        # Resource Provider disappeared!
+        provider = self._get_resource_provider_by_uuid(physdev1_rp_uuid)
+        self.assertEqual(404, provider['errors'][0]['status'])
+        self.assertIn(
+            "No resource provider with uuid %s found" % physdev1_rp_uuid,
+            provider['errors'][0]['detail'])
+
+
 class VGPULiveMigrationTests(base.LibvirtMigrationMixin, VGPUTestBase):
 
     # Use the right minimum versions for live-migration
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index b43860f5b9f7..6f0d68ffa01f 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -26833,6 +26833,42 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         self._test_get_gpu_inventories(drvr, expected, ['nvidia-11',
                                                         'nvidia-12'])
 
+    def test_get_gpu_inventories_with_max_instances_per_type(self):
+        self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
+                   group='devices')
+        # we need to call the below again to ensure the updated
+        # 'device_addresses' value is read and the new groups created
+        nova.conf.devices.register_dynamic_opts(CONF)
+        self.flags(device_addresses=['0000:06:00.0'], group='mdev_nvidia-11')
+        self.flags(device_addresses=['0000:07:00.0'], group='mdev_nvidia-12')
+        # We will cap the max vGPUs for nvidia-11 to 2 but we leave nvidia-12
+        # uncapped
+        self.flags(max_instances=2, group='mdev_nvidia-11')
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        expected = {
+            # we don't accept this PCI device because max_instances is less
+            # than its capacity
+            'pci_0000_06_00_0': {'total': 0,
+                                 'max_unit': 0,
+                                 'min_unit': 1,
+                                 'step_size': 1,
+                                 'reserved': 0,
+                                 'allocation_ratio': 1.0,
+                                 },
+            # the second GPU supports nvidia-12 but the existing mdev is not
+            # using this type, so we only count the availableInstances value
+            # for nvidia-12.
+            'pci_0000_07_00_0': {'total': 10,
+                                 'max_unit': 10,
+                                 'min_unit': 1,
+                                 'step_size': 1,
+                                 'reserved': 0,
+                                 'allocation_ratio': 1.0,
+                                 },
+        }
+        self._test_get_gpu_inventories(drvr, expected, ['nvidia-11',
+                                                        'nvidia-12'])
+
     @mock.patch.object(libvirt_driver.LOG, 'warning')
     def test_get_supported_vgpu_types(self, mock_warning):
         # Verify that by default we don't support vGPU types
@@ -26846,6 +26882,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         # devices or mdev classes *yet* if we don't have a vgpu type section.
         self.assertEqual({}, drvr.pgpu_type_mapping)
         self.assertEqual({}, drvr.mdev_class_mapping)
+        self.assertEqual({}, drvr.mdev_type_max_mapping)
         # Remember, we only support the VGPU resource class if we only have
         # one needed vGPU type without a specific vgpu type section.
         self.assertEqual({orc.VGPU}, drvr.mdev_classes)
@@ -26865,6 +26902,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         self.assertEqual(['nvidia-11'], drvr._get_supported_vgpu_types())
         self.assertEqual({}, drvr.pgpu_type_mapping)
         self.assertEqual({}, drvr.mdev_class_mapping)
+        self.assertEqual({}, drvr.mdev_type_max_mapping)
         # Here we only support one vGPU type
         self.assertEqual({orc.VGPU}, drvr.mdev_classes)
         msg = ("The mdev type '%(type)s' was listed in '[devices] "
@@ -26882,6 +26920,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
         self.flags(device_addresses=['0000:85:00.0'], group='mdev_nvidia-12')
         self.flags(mdev_class='CUSTOM_NOTVGPU', group='mdev_nvidia-12')
+        self.flags(max_instances=2, group='mdev_nvidia-11')
         self.assertEqual(['nvidia-11', 'nvidia-12'],
                          drvr._get_supported_vgpu_types())
         self.assertEqual({'0000:84:00.0': 'nvidia-11',
                           '0000:85:00.0': 'nvidia-12'}, drvr.pgpu_type_mapping)
         self.assertEqual({'0000:84:00.0': 'CUSTOM_NOTVGPU',
                           '0000:85:00.0': 'CUSTOM_NOTVGPU'}, drvr.mdev_class_mapping)
         self.assertEqual({orc.VGPU, 'CUSTOM_NOTVGPU'}, drvr.mdev_classes)
+        # nvidia-12 is unlimited
+        self.assertEqual({'nvidia-11': 2}, drvr.mdev_type_max_mapping)
         mock_warning.assert_not_called()
 
     def test_get_supported_vgpu_types_with_duplicate_types(self):
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 88969306a3b0..c35e7fbfdb66 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -538,6 +538,8 @@ class LibvirtDriver(driver.ComputeDriver):
         )
         # This set is for knowing all the mdev classes the operator provides
         self.mdev_classes = set([])
+        # This dict is for knowing how many mdevs can be created per type
+        self.mdev_type_max_mapping = collections.defaultdict(str)
         self.supported_vgpu_types = self._get_supported_vgpu_types()
 
         # This dict is for knowing which mdevs are already claimed by some
@@ -8217,6 +8219,9 @@ class LibvirtDriver(driver.ComputeDriver):
                 self.mdev_classes = {first_group.mdev_class}
                 return [first_type]
             mdev_class = group.mdev_class
+            # By default, max_instances is None
+            if group.max_instances:
+                self.mdev_type_max_mapping[vgpu_type] = group.max_instances
             for device_address in group.device_addresses:
                 if device_address in self.pgpu_type_mapping:
                     raise exception.InvalidLibvirtMdevConfig(
@@ -8367,18 +8372,44 @@ class LibvirtDriver(driver.ComputeDriver):
         if not enabled_mdev_types:
             return {}
         inventories = {}
+        # Count how many mdevs we are currently supporting per type
+        type_limit_mapping: ty.Dict[str, int] = collections.defaultdict(int)
         count_per_parent = self._count_mediated_devices(enabled_mdev_types)
         for dev_name, count in count_per_parent.items():
+            mdev_type = self._get_vgpu_type_per_pgpu(dev_name)
+            type_limit_mapping[mdev_type] += count
             inventories[dev_name] = {'total': count}
         # Filter how many available mdevs we can create for all the supported
         # types.
         count_per_dev = self._count_mdev_capable_devices(enabled_mdev_types)
         # Combine the counts into the dict that we return to the caller.
         for dev_name, count in count_per_dev.items():
+            mdev_type = self._get_vgpu_type_per_pgpu(dev_name)
+            mdev_limit = self.mdev_type_max_mapping.get(mdev_type)
+            # Some GPU types can have a defined limit. For the others, we
+            # consider them unlimited.
+            # NOTE(sbauza): Instead of not accepting GPUs if their capacity
+            # is more than the limit, we could just accept them by capping
+            # their total value at the limit.
+            if (mdev_limit and
+                    type_limit_mapping[mdev_type] + count > mdev_limit):
+                # We don't have space for creating new mediated devices
+                LOG.debug("Skipping %s because the available count of "
+                          "mediated devices (%s) is above the maximum we can "
+                          "still use (%s)",
+                          dev_name, count,
+                          mdev_limit - type_limit_mapping[mdev_type])
+                # We want the resource provider to be deleted, so we pass the
+                # inventory with a total of 0 so _ensure_pgpu_providers() will
+                # delete it.
+                inventories[dev_name] = {'total': 0}
+                continue
+            type_limit_mapping[mdev_type] += count
             inv_per_parent = inventories.setdefault(
                 dev_name, {'total': 0})
             inv_per_parent['total'] += count
-            inv_per_parent.update({
+        for dev_name in inventories:
+            inventories[dev_name].update({
                 'min_unit': 1,
                 'step_size': 1,
                 'reserved': 0,
@@ -8386,7 +8417,7 @@ class LibvirtDriver(driver.ComputeDriver):
                 # since we can't overallocate vGPU resources
                 'allocation_ratio': 1.0,
                 # FIXME(sbauza): Some vendors could support only one
-                'max_unit': inv_per_parent['total'],
+                'max_unit': inventories[dev_name]['total'],
             })
         return inventories
 
@@ -9388,12 +9419,17 @@ class LibvirtDriver(driver.ComputeDriver):
         # Dict of PGPU RPs keyed by their libvirt PCI name
         pgpu_rps = {}
         for pgpu_dev_id, inventory in inventories_dict.items():
-            # Skip (and omit) inventories with total=0 because placement does
-            # not allow setting total=0 for inventory.
-            if not inventory['total']:
-                continue
             # For each physical GPU, we make sure to have a child provider
             pgpu_rp_name = '%s_%s' % (nodename, pgpu_dev_id)
+            # Skip (and omit) inventories with total=0 because placement does
+            # not allow setting total=0 for inventory. If the inventory
+            # already exists, we delete it instead.
+            if not inventory['total']:
+                if provider_tree.exists(pgpu_rp_name):
+                    LOG.debug('Deleting %s resource provider since it no '
+                              'longer has any inventory', pgpu_rp_name)
+                    provider_tree.remove(pgpu_rp_name)
+                continue
            if not provider_tree.exists(pgpu_rp_name):
                 # This is the first time creating the child provider so add
                 # it to the tree under the root node provider.
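
For reference, a minimal nova.conf sketch of how an operator would use the new option once this patch is applied. The type names and PCI addresses below are only examples borrowed from the functional test above; any enabled mdev type with its own device addresses works the same way:

    [devices]
    enabled_mdev_types = nvidia-11, nvidia-12

    [mdev_nvidia-11]
    device_addresses = 0000:81:00.0
    # Cap nvidia-11 to at most 2 mediated devices on this host
    max_instances = 2

    [mdev_nvidia-12]
    device_addresses = 0000:81:01.0
    # No max_instances set, so nvidia-12 keeps the maximum allowed by the type

With such a configuration, a pGPU whose available nvidia-11 instances would push the type above its cap is reported with a total of 0 by _get_gpu_inventories(), and _ensure_pgpu_providers() then removes its child resource provider, which is what the functional test above asserts.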