diff --git a/doc/source/admin/virtual-gpu.rst b/doc/source/admin/virtual-gpu.rst
index c7f295dda09f..a26c3be65123 100644
--- a/doc/source/admin/virtual-gpu.rst
+++ b/doc/source/admin/virtual-gpu.rst
@@ -293,6 +293,36 @@ Caveats
This information is correct as of the 17.0.0 Queens release. Where
improvements have been made or issues fixed, they are noted per item.
+* When live-migrating an instance using vGPUs, the libvirt guest domain XML
+ isn't updated with the new mediated device UUID to use for the target.
+
+ .. versionchanged:: 29.0.0
+
+ In the 2024.1 Caracal release, Nova now `supports vGPU live-migrations`_. In
+ order to do this, both the source and target compute services need to have
+ minimum versions of libvirt-8.6.0, QEMU-8.1.0 and Linux kernel 5.18.0. You
+ need to ensure that you use only a single common vGPU type between the two
+ computes. Where multiple mdev types are configured on the source and
+ destination host, custom traits or custom resource classes must be
+ configured, reported by the host and requested by the instance to make sure
+ that the Placement API correctly returns the supported GPU using the right
+ vGPU type for a migration. Last but not least, if you want to live-migrate
+ NVIDIA mediated devices, you need to update
+ :oslo.config:option:`libvirt.live_migration_downtime`,
+ :oslo.config:option:`libvirt.live_migration_downtime_steps` and
+ :oslo.config:option:`libvirt.live_migration_downtime_delay`:
+
+ .. code-block:: ini
+
+ live_migration_downtime = 500000
+ live_migration_downtime_steps = 3
+ live_migration_downtime_delay = 3
+
+ You can see an example of a working live-migration `here`__.
+
+ .. __: http://sbauza.github.io/vgpu/vgpu_live_migration.html
+
+
* Suspending a guest that has vGPUs doesn't yet work because of a libvirt
limitation (it can't hot-unplug mediated devices from a guest). Workarounds
using other instance actions (like snapshotting the instance or shelving it)
@@ -355,6 +385,7 @@ For nested vGPUs:
.. _bug 1778563: https://bugs.launchpad.net/nova/+bug/1778563
.. _bug 1762688: https://bugs.launchpad.net/nova/+bug/1762688
.. _bug 1948705: https://bugs.launchpad.net/nova/+bug/1948705
+.. _supports vGPU live-migrations: https://specs.openstack.org/openstack/nova-specs/specs/2024.1/approved/libvirt-mdev-live-migrate.html
.. Links
.. _Intel GVT-g: https://01.org/igvt-g
diff --git a/nova/tests/functional/libvirt/test_vgpu.py b/nova/tests/functional/libvirt/test_vgpu.py
index 7def9bc6d875..8f108d216b89 100644
--- a/nova/tests/functional/libvirt/test_vgpu.py
+++ b/nova/tests/functional/libvirt/test_vgpu.py
@@ -576,10 +576,9 @@ class VGPULiveMigrationTests(base.LibvirtMigrationMixin, VGPUTestBase):
mdevs = self.src.driver._get_all_assigned_mediated_devices(inst)
self.assertEqual(1, len(mdevs))
self._live_migrate(self.server, 'completed')
- # FIXME(sbauza): The domain is fully copied to the destination so the
- # XML contains the original mdev but given the 'devices' attribute on
- # the fixture doesn't have it, that's why we have a KeyError.
- self.assertRaises(KeyError, self.assert_mdev_usage, self.dest, 0)
+ # Now the destination XML is updated, so the destination mdev is
+ # correctly used.
+ self.assert_mdev_usage(self.dest, 1)
class VGPULiveMigrationTestsLMFailed(VGPULiveMigrationTests):
diff --git a/nova/tests/unit/virt/libvirt/test_migration.py b/nova/tests/unit/virt/libvirt/test_migration.py
index 155c25998687..ca4fb02a1267 100644
--- a/nova/tests/unit/virt/libvirt/test_migration.py
+++ b/nova/tests/unit/virt/libvirt/test_migration.py
@@ -190,6 +190,40 @@ class UtilityMigrationTestCase(test.NoDBTestCase):
new_xml = new_xml.replace("/dev/dax0.2", "/dev/dax2.0")
self.assertXmlEqual(res, new_xml)
+ def test_update_mdev_xml(self):
+ xml_pattern = """
+
+
+
+
+
+"""
+ data = objects.LibvirtLiveMigrateData(
+ target_mdevs={uuids.src_mdev: uuids.dst_mdev})
+ doc = etree.fromstring(xml_pattern % uuids.src_mdev)
+ res = migration._update_mdev_xml(doc, data.target_mdevs)
+ self.assertEqual(xml_pattern % uuids.dst_mdev,
+ etree.tostring(res, encoding='unicode'))
+
+ def test_update_mdev_xml_fails_on_notfound_mdev(self):
+ xml_pattern = """
+
+
+
+
+
+"""
+ data = objects.LibvirtLiveMigrateData(
+ target_mdevs={uuids.other_mdev: uuids.dst_mdev})
+ doc = etree.fromstring(xml_pattern % uuids.src_mdev)
+ # src_mdev UUID doesn't exist in target_mdevs dict
+ self.assertRaises(exception.NovaException,
+ migration._update_mdev_xml, doc, data.target_mdevs)
+
def test_update_numa_xml(self):
doc = etree.fromstring("""
diff --git a/nova/virt/libvirt/migration.py b/nova/virt/libvirt/migration.py
index 0aacec56d8da..22293c2fd973 100644
--- a/nova/virt/libvirt/migration.py
+++ b/nova/virt/libvirt/migration.py
@@ -67,6 +67,8 @@ def get_updated_guest_xml(instance, guest, migrate_data, get_volume_config,
xml_doc = _update_vif_xml(xml_doc, migrate_data, get_vif_config)
if 'dst_numa_info' in migrate_data:
xml_doc = _update_numa_xml(xml_doc, migrate_data)
+ if 'target_mdevs' in migrate_data:
+ xml_doc = _update_mdev_xml(xml_doc, migrate_data.target_mdevs)
if new_resources:
xml_doc = _update_device_resources_xml(xml_doc, new_resources)
return etree.tostring(xml_doc, encoding='unicode')
@@ -106,6 +108,28 @@ def _update_vpmems_xml(xml_doc, vpmems):
return xml_doc
+def _update_mdev_xml(xml_doc, target_mdevs):
+ for dev in xml_doc.findall('./devices/hostdev'):
+ if dev.get('type') == 'mdev':
+ address_tag = dev.find('source/address')
+ if address_tag is None:
+ continue
+ src_mdev = address_tag.get('uuid')
+ if src_mdev is not None:
+ dst_mdev = target_mdevs.get(src_mdev)
+ if dst_mdev is None:
+ # For some reason, we don't know which mdev to use
+ # so we prefer to abort the live-migration.
+ raise exception.NovaException(
+ 'Unable to find the destination mediated device UUID '
+ 'to use for this source mdev UUID : %s' % src_mdev)
+ else:
+ address_tag.set('uuid', dst_mdev)
+ LOG.debug('_update_mdev_xml output xml=%s',
+ etree.tostring(xml_doc, encoding='unicode', pretty_print=True))
+ return xml_doc
+
+
def _update_numa_xml(xml_doc, migrate_data):
LOG.debug('_update_numa_xml input xml=%s',
etree.tostring(xml_doc, encoding='unicode', pretty_print=True))
diff --git a/releasenotes/notes/bp-libvirt-mdev-live-migrate-4396dbe4d9a9775f.yaml b/releasenotes/notes/bp-libvirt-mdev-live-migrate-4396dbe4d9a9775f.yaml
new file mode 100644
index 000000000000..0ac1c909b06c
--- /dev/null
+++ b/releasenotes/notes/bp-libvirt-mdev-live-migrate-4396dbe4d9a9775f.yaml
@@ -0,0 +1,13 @@
+---
+features:
+ - |
+ Instances using vGPUs can now be correctly live-migrated by the libvirt
+ driver between compute nodes supporting the same mediated device types used
+ by the instance. In order to be able to do this, the compute hosts need to
+ support at least the minimum versions of libvirt-8.6.0, QEMU-8.1.0 and
+ Linux kernel 5.18.0. If operators use multiple vGPU types per compute, they
+ need to make sure they already use custom traits or custom resource classes
+ for the GPU resource providers and that the instance was created with a
+ flavor using either a custom resource class or asking for a custom trait in
+ order to make sure that the Placement API will provide the right target GPU
+ using the same mdev type for the instance.