From 60851e44649e463bfda25d9dea84443467e4a30c Mon Sep 17 00:00:00 2001
From: Sylvain Bauza
Date: Mon, 30 Oct 2023 18:11:46 +0100
Subject: [PATCH] libvirt: Cap with max_instances GPU types

We want to cap the maximum number of mdevs we can create per type. If
some type already has enough capacity, then other GPUs supporting it
won't be used and their existing ResourceProviders will be deleted.

Closes-Bug: #2041519
Change-Id: I069879a333152bb849c248b3dcb56357a11d0324
---
 nova/conf/devices.py                        |  9 +++
 nova/tests/functional/libvirt/test_vgpu.py  | 78 +++++++++++++++++++++
 nova/tests/unit/virt/libvirt/test_driver.py | 41 +++++++++++
 nova/virt/libvirt/driver.py                 | 48 +++++++++++--
 4 files changed, 170 insertions(+), 6 deletions(-)

diff --git a/nova/conf/devices.py b/nova/conf/devices.py
index f3ef187c0201..265aa1d02ba0 100644
--- a/nova/conf/devices.py
+++ b/nova/conf/devices.py
@@ -89,6 +89,15 @@ def register_dynamic_opts(conf):
                  'CUSTOM_ if it is not VGPU.')
         conf.register_opt(class_opt, group='mdev_%s' % mdev_type)
 
+        # Register the '[mdev_$(MDEV_TYPE)]/max_instances' opts
+        max_inst_opt = cfg.IntOpt(
+            'max_instances',
+            default=None, min=1,
+            help='Maximum number of mediated devices that can be created '
+                 'using this type. If not set, we use the maximum allowed '
+                 'by the type.')
+        conf.register_opt(max_inst_opt, group='mdev_%s' % mdev_type)
+
 
 def list_opts():
     return {devices_group: mdev_opts}
diff --git a/nova/tests/functional/libvirt/test_vgpu.py b/nova/tests/functional/libvirt/test_vgpu.py
index 8f108d216b89..0fcdf47e9189 100644
--- a/nova/tests/functional/libvirt/test_vgpu.py
+++ b/nova/tests/functional/libvirt/test_vgpu.py
@@ -429,6 +429,84 @@ class VGPUMultipleTypesTests(VGPUTestBase):
             self.assertEqual(expected[trait], mdev_info['parent'])
 
 
+class VGPULimitMultipleTypesTests(VGPUTestBase):
+
+    def setUp(self):
+        super(VGPULimitMultipleTypesTests, self).setUp()
+        extra_spec = {"resources:VGPU": "1"}
+        self.flavor = self._create_flavor(extra_spec=extra_spec)
+
+        self.flags(
+            enabled_mdev_types=[fakelibvirt.NVIDIA_11_VGPU_TYPE,
+                                fakelibvirt.NVIDIA_12_VGPU_TYPE],
+            group='devices')
+        # we need to call the below again to ensure the updated
+        # 'device_addresses' value is read and the new groups created
+        nova.conf.devices.register_dynamic_opts(CONF)
+        # host1 will have 2 physical GPUs:
+        #  - 0000:81:00.0 will only support nvidia-11
+        #  - 0000:81:01.0 will only support nvidia-12
+        MDEVCAP_DEV1_PCI_ADDR = self.libvirt2pci_address(
+            fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
+        MDEVCAP_DEV2_PCI_ADDR = self.libvirt2pci_address(
+            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
+        self.flags(device_addresses=[MDEVCAP_DEV1_PCI_ADDR],
+                   group='mdev_nvidia-11')
+        self.flags(device_addresses=[MDEVCAP_DEV2_PCI_ADDR],
+                   group='mdev_nvidia-12')
+
+        # Start the compute with support for both types
+        pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
+            multiple_gpu_types=True)
+        self.compute1 = self.start_compute_with_vgpu('host1', pci_info)
+
+    def test_create_servers_with_vgpu(self):
+        physdev1_rp_uuid = self._get_provider_uuid_by_name(
+            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
+        physdev2_rp_uuid = self._get_provider_uuid_by_name(
+            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
+
+        # Just for asserting the inventories we currently have.
+        physdev1_inventory = self._get_provider_inventory(physdev1_rp_uuid)
+        self.assertEqual(16, physdev1_inventory[orc.VGPU]['total'])
+        physdev2_inventory = self._get_provider_inventory(physdev2_rp_uuid)
+        self.assertEqual(8, physdev2_inventory[orc.VGPU]['total'])
+
+        # Now, let's limit the capacity for the first type to 2
+        self.flags(max_instances=2, group='mdev_nvidia-11')
+        # Restart the compute service to update the Resource Providers
+        self.compute2 = self.restart_compute_service('host1')
+        # Make sure we can still create an instance
+        server = self._create_server(
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            flavor_id=self.flavor, networks='auto', host=self.compute1.host)
+        mdevs = self.compute1.driver._get_mediated_devices()
+        self.assertEqual(1, len(mdevs))
+
+        # ... but looking at Placement, only the 2nd GPU can now be used
+        # because nvidia-11 was limited to 2 while the GPU supporting it
+        # had a capacity of 8.
+        physdev2_inventory = self._get_provider_inventory(physdev2_rp_uuid)
+        self.assertEqual(8, physdev2_inventory[orc.VGPU]['total'])
+
+        # Get the instance we just created
+        inst = objects.Instance.get_by_uuid(self.context, server['id'])
+        expected_rp_name = (self.compute1.host + '_' +
+                            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
+        # Yes, indeed we use the 2nd GPU
+        self.assert_mdev_usage(self.compute1, expected_amount=1,
+                               expected_rc=orc.VGPU, instance=inst,
+                               expected_rp_name=expected_rp_name)
+        # ... and what happened to the first GPU inventory? Well, the whole
+        # Resource Provider disappeared!
+        provider = self._get_resource_provider_by_uuid(physdev1_rp_uuid)
+        self.assertEqual(404, provider['errors'][0]['status'])
+        self.assertIn(
+            "No resource provider with uuid %s found" % physdev1_rp_uuid,
+            provider['errors'][0]['detail'])
+
+
 class VGPULiveMigrationTests(base.LibvirtMigrationMixin, VGPUTestBase):
 
     # Use the right minimum versions for live-migration
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index b43860f5b9f7..6f0d68ffa01f 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -26833,6 +26833,42 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         self._test_get_gpu_inventories(drvr, expected, ['nvidia-11',
                                                         'nvidia-12'])
 
+    def test_get_gpu_inventories_with_max_instances_per_type(self):
+        self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
+                   group='devices')
+        # we need to call the below again to ensure the updated
+        # 'device_addresses' value is read and the new groups created
+        nova.conf.devices.register_dynamic_opts(CONF)
+        self.flags(device_addresses=['0000:06:00.0'], group='mdev_nvidia-11')
+        self.flags(device_addresses=['0000:07:00.0'], group='mdev_nvidia-12')
+        # We will cap the max vGPUs for nvidia-11 to 2 but we leave nvidia-12
+        # uncapped
+        self.flags(max_instances=2, group='mdev_nvidia-11')
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        expected = {
+            # we don't accept this PCI device because max_instances is less
+            # than its capacity
+            'pci_0000_06_00_0': {'total': 0,
+                                 'max_unit': 0,
+                                 'min_unit': 1,
+                                 'step_size': 1,
+                                 'reserved': 0,
+                                 'allocation_ratio': 1.0,
+                                 },
+            # the second GPU supports nvidia-12 but the existing mdev is not
+            # using this type, so we only count the availableInstances value
+            # for nvidia-12.
+            'pci_0000_07_00_0': {'total': 10,
+                                 'max_unit': 10,
+                                 'min_unit': 1,
+                                 'step_size': 1,
+                                 'reserved': 0,
+                                 'allocation_ratio': 1.0,
+                                 },
+        }
+        self._test_get_gpu_inventories(drvr, expected, ['nvidia-11',
+                                                        'nvidia-12'])
+
     @mock.patch.object(libvirt_driver.LOG, 'warning')
     def test_get_supported_vgpu_types(self, mock_warning):
         # Verify that by default we don't support vGPU types
@@ -26846,6 +26882,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         # devices or mdev classes *yet* if we don't have a vgpu type section.
         self.assertEqual({}, drvr.pgpu_type_mapping)
         self.assertEqual({}, drvr.mdev_class_mapping)
+        self.assertEqual({}, drvr.mdev_type_max_mapping)
         # Remember, we only support the VGPU resource class if we only have
         # one needed vGPU type without a specific vgpu type section.
         self.assertEqual({orc.VGPU}, drvr.mdev_classes)
@@ -26865,6 +26902,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         self.assertEqual(['nvidia-11'], drvr._get_supported_vgpu_types())
         self.assertEqual({}, drvr.pgpu_type_mapping)
         self.assertEqual({}, drvr.mdev_class_mapping)
+        self.assertEqual({}, drvr.mdev_type_max_mapping)
         # Here we only support one vGPU type
         self.assertEqual({orc.VGPU}, drvr.mdev_classes)
         msg = ("The mdev type '%(type)s' was listed in '[devices] "
@@ -26882,6 +26920,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
         self.flags(device_addresses=['0000:85:00.0'], group='mdev_nvidia-12')
         self.flags(mdev_class='CUSTOM_NOTVGPU', group='mdev_nvidia-12')
+        self.flags(max_instances=2, group='mdev_nvidia-11')
         self.assertEqual(['nvidia-11', 'nvidia-12'],
                          drvr._get_supported_vgpu_types())
         self.assertEqual({'0000:84:00.0': 'nvidia-11',
                           '0000:85:00.0': 'nvidia-12'}, drvr.pgpu_type_mapping)
         self.assertEqual({'0000:84:00.0': 'CUSTOM_NOTVGPU',
                           '0000:85:00.0': 'CUSTOM_NOTVGPU'}, drvr.mdev_class_mapping)
         self.assertEqual({orc.VGPU, 'CUSTOM_NOTVGPU'}, drvr.mdev_classes)
+        # nvidia-12 is unlimited
+        self.assertEqual({'nvidia-11': 2}, drvr.mdev_type_max_mapping)
         mock_warning.assert_not_called()
 
     def test_get_supported_vgpu_types_with_duplicate_types(self):
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 88969306a3b0..c35e7fbfdb66 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -538,6 +538,8 @@ class LibvirtDriver(driver.ComputeDriver):
         )
         # This set is for knowing all the mdev classes the operator provides
         self.mdev_classes = set([])
+        # This dict is for knowing how many mdevs can be created per type
+        self.mdev_type_max_mapping = collections.defaultdict(str)
         self.supported_vgpu_types = self._get_supported_vgpu_types()
 
         # This dict is for knowing which mdevs are already claimed by some
@@ -8217,6 +8219,9 @@ class LibvirtDriver(driver.ComputeDriver):
                 self.mdev_classes = {first_group.mdev_class}
                 return [first_type]
             mdev_class = group.mdev_class
+            # By default, max_instances is None
+            if group.max_instances:
+                self.mdev_type_max_mapping[vgpu_type] = group.max_instances
             for device_address in group.device_addresses:
                 if device_address in self.pgpu_type_mapping:
                     raise exception.InvalidLibvirtMdevConfig(
@@ -8367,18 +8372,44 @@ class LibvirtDriver(driver.ComputeDriver):
         if not enabled_mdev_types:
             return {}
         inventories = {}
+        # Count how many mdevs we are currently supporting per type
+        type_limit_mapping: ty.Dict[str, int] = collections.defaultdict(int)
         count_per_parent = self._count_mediated_devices(enabled_mdev_types)
         for dev_name, count in count_per_parent.items():
+            mdev_type = self._get_vgpu_type_per_pgpu(dev_name)
+            type_limit_mapping[mdev_type] += count
             inventories[dev_name] = {'total': count}
         # Filter how many available mdevs we can create for all the supported
         # types.
         count_per_dev = self._count_mdev_capable_devices(enabled_mdev_types)
         # Combine the counts into the dict that we return to the caller.
         for dev_name, count in count_per_dev.items():
+            mdev_type = self._get_vgpu_type_per_pgpu(dev_name)
+            mdev_limit = self.mdev_type_max_mapping.get(mdev_type)
+            # Some GPU types can have a defined limit. For the others, we
+            # consider them unlimited.
+            # NOTE(sbauza): Instead of not accepting GPUs if their capacity
+            # is more than the limit, we could just accept them by capping
+            # their total value at the limit.
+            if (mdev_limit and
+                    type_limit_mapping[mdev_type] + count > mdev_limit):
+                # We don't have space for creating new mediated devices
+                LOG.debug("Skipping %s because the available count of "
+                          "mediated devices (%s) is above the maximum we can "
+                          "still use (%s)",
+                          dev_name, count,
+                          mdev_limit - type_limit_mapping[mdev_type])
+                # We want the resource provider to be deleted, so we pass the
+                # inventory with a total of 0 so _ensure_pgpu_providers() will
+                # delete it.
+                inventories[dev_name] = {'total': 0}
+                continue
+            type_limit_mapping[mdev_type] += count
             inv_per_parent = inventories.setdefault(
                 dev_name, {'total': 0})
             inv_per_parent['total'] += count
-            inv_per_parent.update({
+        for dev_name in inventories:
+            inventories[dev_name].update({
                 'min_unit': 1,
                 'step_size': 1,
                 'reserved': 0,
@@ -8386,7 +8417,7 @@ class LibvirtDriver(driver.ComputeDriver):
                 # since we can't overallocate vGPU resources
                 'allocation_ratio': 1.0,
                 # FIXME(sbauza): Some vendors could support only one
-                'max_unit': inv_per_parent['total'],
+                'max_unit': inventories[dev_name]['total'],
             })
         return inventories
 
@@ -9388,12 +9419,17 @@ class LibvirtDriver(driver.ComputeDriver):
         # Dict of PGPU RPs keyed by their libvirt PCI name
         pgpu_rps = {}
         for pgpu_dev_id, inventory in inventories_dict.items():
-            # Skip (and omit) inventories with total=0 because placement does
-            # not allow setting total=0 for inventory.
-            if not inventory['total']:
-                continue
             # For each physical GPU, we make sure to have a child provider
             pgpu_rp_name = '%s_%s' % (nodename, pgpu_dev_id)
+            # Skip (and omit) inventories with total=0 because placement does
+            # not allow setting total=0 for inventory. If the inventory
+            # already exists, we delete it instead.
+            if not inventory['total']:
+                if provider_tree.exists(pgpu_rp_name):
+                    LOG.debug('Deleting %s resource provider since it no '
+                              'longer has any inventory', pgpu_rp_name)
+                    provider_tree.remove(pgpu_rp_name)
+                continue
            if not provider_tree.exists(pgpu_rp_name):
                 # This is the first time creating the child provider so add
                 # it to the tree under the root node provider.
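
For reference, a minimal nova.conf sketch of how an operator would use the new option once this patch is applied. The type names and PCI addresses below are only examples borrowed from the functional test above; any enabled mdev type with its own device addresses works the same way:

    [devices]
    enabled_mdev_types = nvidia-11, nvidia-12

    [mdev_nvidia-11]
    device_addresses = 0000:81:00.0
    # Cap nvidia-11 to at most 2 mediated devices on this host
    max_instances = 2

    [mdev_nvidia-12]
    device_addresses = 0000:81:01.0
    # No max_instances set, so nvidia-12 keeps the maximum allowed by the type

With such a configuration, a pGPU whose available nvidia-11 instances would push the type above its cap is reported with a total of 0 by _get_gpu_inventories(), and _ensure_pgpu_providers() then removes its child resource provider, which is what the functional test above asserts.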