vgpu: Allow device_addresses to not be set

Sometimes a GPU may have a long list of PCI addresses (say, an SR-IOV
GPU), or operators may have a long list of GPUs. To make their lives
easier, let's allow device_addresses to be optional.

This means that a valid configuration could be:

    [devices]
    enabled_mdev_types = nvidia-35, nvidia-36

    [mdev_nvidia-35]

    [mdev_nvidia-36]

NOTE(sbauza): we have a slight coverage gap for testing what happens
if the groups aren't set, but I'll add that in a follow-up patch.

Related-Bug: #2041519
Change-Id: I73762a0295212ee003db2149d6a9cf701023464f
This commit is contained in:
Sylvain Bauza 2023-11-28 11:52:57 +01:00
parent 60851e4464
commit d445eaf9dd
3 changed files with 130 additions and 73 deletions

View File

@ -29,15 +29,20 @@ guest instance.
If more than one single mdev type is provided, then for each *mdev type* an
additional section, ``[mdev_$(MDEV_TYPE)]``, must be added to the configuration
file. Each section then **must** be configured with a single configuration
option, ``device_addresses``, which should be a list of PCI addresses
corresponding to the physical GPU(s) or mdev-capable hardware to assign to this
type.
file. Each section then can be configured with a single configuration option,
``device_addresses``, which should be a list of PCI addresses corresponding to
the physical GPU(s) or mdev-capable hardware to assign to this type. If
`device_addresses` is not provided, then the related GPU type will be the
default for all the found GPUs that aren't used by other types.
If one or more sections are missing (meaning that a specific type is not wanted
to use for at least one physical device) or if no device addresses are provided
, then Nova will only use the first type that was provided by
``[devices]/enabled_mdev_types``.
to use for at least one physical device), then Nova will only use the first
type that was provided by ``[devices]/enabled_mdev_types``.
If two or more sections are not set with ``device_addresses`` values, then only
the first one will be used for defaulting all the non-defined GPUs to use this
type.
If the same PCI address is provided for two different types, nova-compute will
return an InvalidLibvirtMdevConfig exception at restart.
@ -54,6 +59,17 @@ will be accepted. A valid configuration could then be::
[vgpu_nvidia-36]
device_addresses = 0000:86:00.0
Another valid configuration could be::
[devices]
enabled_mdev_types = nvidia-35, nvidia-36
[mdev_nvidia-35]
[mdev_nvidia-36]
device_addresses = 0000:86:00.0
""")
]

View File

@ -26870,7 +26870,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
'nvidia-12'])
@mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types(self, mock_warning):
def test_get_supported_vgpu_types_fails(self, mock_warning):
# Verify that by default we don't support vGPU types
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual([], drvr._get_supported_vgpu_types())
@ -26889,34 +26889,34 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
# Since the operator wanted to only support one type, it's fine to not
# provide config groups
mock_warning.assert_not_called()
# For further checking
mock_warning.reset_mock()
# Now two types without forgetting to provide the pGPU addresses
@mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types_two_types_unset(self, mock_warning):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
# Now two types without providing the pGPU addresses
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
self.assertEqual(['nvidia-11'], drvr._get_supported_vgpu_types())
self.assertEqual({}, drvr.pgpu_type_mapping)
self.assertEqual({}, drvr.mdev_class_mapping)
self.assertEqual({}, drvr.mdev_type_max_mapping)
# Here we only support one vGPU type
self.assertEqual({orc.VGPU}, drvr.mdev_classes)
msg = ("The mdev type '%(type)s' was listed in '[devices] "
"enabled_mdev_types' but no corresponding "
"'[mdev_%(type)s]' group or "
"'[mdev_%(type)s] device_addresses' "
"option was defined. Only the first type '%(ftype)s' "
"will be used." % {'type': 'nvidia-12',
'ftype': 'nvidia-11'})
msg = ("Mdev type default already set to "
" %(default_type)s so %(this_type)s will not "
"be used." % {
'default_type': 'nvidia-11',
'this_type': 'nvidia-12'})
mock_warning.assert_called_once_with(msg)
# For further checking
mock_warning.reset_mock()
# And now do it correctly !
@mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types(self, mock_warning):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
self.flags(device_addresses=['0000:85:00.0'], group='mdev_nvidia-12')
self.flags(mdev_class='CUSTOM_NOTVGPU', group='mdev_nvidia-12')
@ -26957,20 +26957,6 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
libvirt_driver.LibvirtDriver,
fake.FakeVirtAPI(), False)
@mock.patch.object(nova.conf.devices, 'register_dynamic_opts')
def test_get_supported_vgpu_types_registering_dynamic_opts(self, rdo):
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
drvr._get_supported_vgpu_types()
# Okay below is confusing, but remember, ._get_supported_vgpu_types()
# is first called by the LibvirtDriver object creation, so when
# calling the above drvr._get_supported_vgpu_types() method, it will
# be the second time that register_dynamic_opts() will be called.
rdo.assert_has_calls([mock.call(CONF), mock.call(CONF)])
@mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types_with_a_single_type(self, mock_warning):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
@ -26990,6 +26976,49 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
self.assertEqual({'CUSTOM_NOTVGPU'}, drvr.mdev_classes)
mock_warning.assert_not_called()
def test_get_supported_vgpu_types_with_default_type(self):
# Checks that an enabled mdev type whose group has no 'device_addresses'
# is recorded as the default type, while a type that does list
# addresses is still mapped to those specific addresses.
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
# Enable nvidia-11 as the default type for all GPUs but 0000:84:00.0
# (only the nvidia-12 group is given an explicit address below).
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-12')
# Both types remain supported: one as the default, one explicitly mapped.
self.assertEqual(['nvidia-11', 'nvidia-12'],
drvr._get_supported_vgpu_types())
# Only the explicitly configured address appears in the mapping ...
self.assertEqual({'0000:84:00.0': 'nvidia-12'}, drvr.pgpu_type_mapping)
# ... and the type without addresses is stored as the default.
self.assertEqual('nvidia-11', drvr.pgpu_type_default)
@mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types_with_duplicate_default_type(
self, mock_warning):
# Checks that when two enabled mdev types both omit 'device_addresses',
# only the first one becomes the default; the second one is dropped from
# the supported list and a single warning is logged about it.
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12', 'nvidia-13'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
# Add a specific GPU for a third type
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-13')
# As both nvidia-11 and nvidia-12 aren't set with device_addresses,
# only one of them should be the default.
# nvidia-12 won't be supported since none of the GPUs will use it.
self.assertEqual(['nvidia-11', 'nvidia-13'],
drvr._get_supported_vgpu_types())
self.assertEqual({'0000:84:00.0': 'nvidia-13'}, drvr.pgpu_type_mapping)
# There can be only one :-)
self.assertEqual('nvidia-11', drvr.pgpu_type_default)
# The expected text has to match the driver's warning exactly, including
# the double space produced by the adjacent string literals.
msg = ("Mdev type default already set to "
" %(default_type)s so %(this_type)s will not "
"be used." % {
'default_type': 'nvidia-11',
'this_type': 'nvidia-12'})
mock_warning.assert_called_once_with(msg)
def test_get_vgpu_type_per_pgpu(self):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
device = 'pci_0000_84_00_0'
@ -27046,6 +27075,20 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
# 0000:86:00.0 wasn't configured
self.assertIsNone(drvr._get_vgpu_type_per_pgpu('pci_0000_86_00_0'))
def test_get_vgpu_type_per_pgpu_with_default_type(self):
# Checks the per-pGPU lookup when one type is pinned to an address and
# another type has no 'device_addresses': the pinned address returns its
# own type and any other address returns the default type.
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
# nvidia-11 is pinned to a single GPU; nvidia-12 gets no addresses.
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
# The driver is created only after the flags are set; no explicit
# _get_supported_vgpu_types() call is made here, so the configuration is
# presumably read during object creation.
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual('nvidia-11',
drvr._get_vgpu_type_per_pgpu('pci_0000_84_00_0'))
# Any GPU but 0000:84:00.0 defaults now to nvidia-12
self.assertEqual('nvidia-12',
drvr._get_vgpu_type_per_pgpu('pci_0000_85_00_0'))
def test_get_resource_class_for_device(self):
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')

View File

@ -540,6 +540,8 @@ class LibvirtDriver(driver.ComputeDriver):
self.mdev_classes = set([])
# this is for knowing how many mdevs can be created by a type
self.mdev_type_max_mapping = collections.defaultdict(str)
# if we have a wildcard, we default to use this mdev type
self.pgpu_type_default = None
self.supported_vgpu_types = self._get_supported_vgpu_types()
# This dict is for knowing which mdevs are already claimed by some
@ -8190,38 +8192,36 @@ class LibvirtDriver(driver.ComputeDriver):
# be calling this method before init_host()
nova.conf.devices.register_dynamic_opts(CONF)
enabled_mdev_types = []
for vgpu_type in CONF.devices.enabled_mdev_types:
enabled_mdev_types.append(vgpu_type)
# NOTE(sbauza) group is now always set because we register the
# dynamic options above
group = getattr(CONF, 'mdev_%s' % vgpu_type, None)
if group is None or not group.device_addresses:
first_type = CONF.devices.enabled_mdev_types[0]
if len(CONF.devices.enabled_mdev_types) > 1:
# Only provide the warning if the operator provided more
# than one type as it's not needed to provide groups
# if you only use one vGPU type.
msg = ("The mdev type '%(type)s' was listed in '[devices] "
"enabled_mdev_types' but no corresponding "
"'[mdev_%(type)s]' group or "
"'[mdev_%(type)s] device_addresses' "
"option was defined. Only the first type "
"'%(ftype)s' will be used." % {'type': vgpu_type,
'ftype': first_type})
LOG.warning(msg)
# We need to reset the mapping tables that we started to
# provide keys and values from previously processed vGPUs but
# since there is a problem for this vGPU type, we only want to
# support only the first type.
self.pgpu_type_mapping.clear()
self.mdev_class_mapping.clear()
first_group = getattr(CONF, 'mdev_%s' % first_type, None)
if first_group is None:
self.mdev_classes = {orc.VGPU}
else:
self.mdev_classes = {first_group.mdev_class}
return [first_type]
if group is None:
# Should never happen but if so, just fails early.
raise exception.InvalidLibvirtMdevConfig(
reason="can't find '[devices]/mdev_%s group' "
"in the configuration" % group
)
mdev_class = group.mdev_class
# By default, max_instances is None
if group.max_instances:
self.mdev_type_max_mapping[vgpu_type] = group.max_instances
if not group.device_addresses:
if not self.pgpu_type_default:
self.pgpu_type_default = vgpu_type
self.mdev_classes.add(mdev_class)
else:
msg = ("Mdev type default already set to "
" %(default_type)s so %(this_type)s will not "
"be used." % {
'default_type': self.pgpu_type_default,
'this_type': vgpu_type})
LOG.warning(msg)
# we remove the type from the supported list.
enabled_mdev_types.remove(vgpu_type)
continue
for device_address in group.device_addresses:
if device_address in self.pgpu_type_mapping:
raise exception.InvalidLibvirtMdevConfig(
@ -8238,7 +8238,7 @@ class LibvirtDriver(driver.ComputeDriver):
self.pgpu_type_mapping[device_address] = vgpu_type
self.mdev_class_mapping[device_address] = mdev_class
self.mdev_classes.add(mdev_class)
return CONF.devices.enabled_mdev_types
return enabled_mdev_types
@staticmethod
def _get_pci_id_from_libvirt_name(
@ -8272,16 +8272,14 @@ class LibvirtDriver(driver.ComputeDriver):
if not self.supported_vgpu_types:
return
if len(self.supported_vgpu_types) == 1:
first_type = self.supported_vgpu_types[0]
group = getattr(CONF, 'mdev_%s' % first_type, None)
if group is None or not group.device_addresses:
return first_type
device_address = self._get_pci_id_from_libvirt_name(device_address)
if not device_address:
return
return self.pgpu_type_mapping.get(device_address)
mdev_type = self.pgpu_type_mapping.get(device_address)
# if we can't find the mdev type by the config, do we have a default
# type because of a config group not using device_addresses ?
# NOTE(sbauza): By default pgpu_type_default is None if unset
return mdev_type or self.pgpu_type_default
def _get_resource_class_for_device(self, device_address):
"""Returns the resource class for the inventory of this device.