XenAPI: get vGPU stats from hypervisor

This commit gets vGPU resource stats from XenAPI. It contains the following data for vGPU types: * total amount: The capacity of the vGPUs, which will be used to create inventory data for ResourceClass.VGPU. * max_heads: The maximum number of display heads, which will be used to create inventory data for ResourceClass.VGPU_DISPLAY_HEAD. * model_name: The vGPU type's name in XenAPI. The vGPU types are filtered based on model_name. * PGPU group uuid and vGPU type's uuid: The identifiers in XenAPI. We will use them to create the child resource provider's name in nested-RP, so that the vGPU resource allocation can be easily mapped back to the PGPU group and vGPU type when spawning an instance. Once nested resource providers are ready to use, we can support multiple vGPU types in a compute node. But at the moment we only enable one vGPU type in a compute node. The restriction is implemented by checking the configuration option enabled_vgpu_types. If multiple vGPU types are enabled, we will only use the first one. Change-Id: I2f9cf80b71b50a4b8e011a4a8e40474cc02baba7 blueprint: add-support-for-vgpu
2017-10-13 11:20:29 +00:00 · 2017-10-13 11:20:29 +00:00 · 6d2cd197bc
parent fa2c1567c1
commit 6d2cd197bc
5 changed files with 322 additions and 2 deletions
--- a/nova/tests/unit/virt/xenapi/test_driver.py
+++ b/nova/tests/unit/virt/xenapi/test_driver.py
@ -67,7 +67,15 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB):
                },
                'vcpus_used': 10,
                'pci_passthrough_devices': '',
-                'host_other-config': {'iscsi_iqn': 'someiqn'}}
+                'host_other-config': {'iscsi_iqn': 'someiqn'},
+                'vgpu_stats': {
+                    'c8328467-badf-43d8-8e28-0e096b0f88b1':
+                        {'uuid': '6444c6ee-3a49-42f5-bebb-606b52175e67',
+                         'total': 7,
+                         'max_heads': 1,
+                         'type_name': 'Intel GVT-g',
+                         },
+                     }}

    def test_available_resource(self):
        driver = self._get_driver()
--- a/nova/tests/unit/virt/xenapi/test_vgpu.py
+++ b/nova/tests/unit/virt/xenapi/test_vgpu.py
@ -0,0 +1,202 @@
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import mock
+
+from nova import test
+from nova.virt.xenapi import host
+
+
+class VGPUTestCase(test.NoDBTestCase):
+    """Unit tests for the vGPU stats helpers of host.HostState."""
+    @mock.patch.object(host.HostState, 'update_status',
+                       return_value='fake_stats_1')
+    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
+    def test_get_vgpu_stats_empty_cfg(self, mock_get, mock_update):
+        # No vGPU type configured: expect no XenAPI call and empty stats.
+        self.flags(enabled_vgpu_types=[], group='devices')
+        session = mock.Mock()
+
+        host_obj = host.HostState(session)
+        stats = host_obj._get_vgpu_stats()
+
+        session.call_xenapi.assert_not_called()
+        self.assertEqual(stats, {})
+
+    @mock.patch.object(host.HostState, 'update_status',
+                       return_value='fake_stats_1')
+    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
+    def test_get_vgpu_stats_single_type(self, mock_get, mock_update):
+        # A single vGPU type is configured.
+        self.flags(enabled_vgpu_types=['type_name_1'], group='devices')
+        session = mock.Mock()
+        # Two GPU groups are returned by XenAPI.
+        session.call_xenapi.side_effect = [
+            ['grp_ref1', 'grp_ref2'],  # GPU_group.get_all
+            'uuid_1',  # GPU_group.get_uuid
+            'uuid_2',  # GPU_group.get_uuid
+        ]
+        # Let it return None for the 2nd GPU group for the case
+        # that it doesn't have the specified vGPU type enabled.
+        mock_get.side_effect = ['fake_stats_1', None]
+        host_obj = host.HostState(session)
+        stats = host_obj._get_vgpu_stats()
+
+        self.assertEqual(session.call_xenapi.call_count, 3)
+        self.assertEqual(mock_update.call_count, 1)
+        self.assertEqual(mock_get.call_count, 2)
+        self.assertEqual(stats, {'uuid_1': 'fake_stats_1'})
+
+    @mock.patch.object(host.HostState, 'update_status',
+                       return_value='fake_stats_1')
+    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
+    def test_get_vgpu_stats_multi_types(self, mock_get, mock_update):
+        # When multiple vGPU types are configured, only the first is used.
+        self.flags(enabled_vgpu_types=['type_name_1', 'type_name_2'],
+                   group='devices')
+        session = mock.Mock()
+        session.call_xenapi.side_effect = [
+            ['grp_ref1'],  # GPU_group.get_all
+            'uuid_1',  # GPU_group.get_uuid
+        ]
+        mock_get.side_effect = ['fake_stats_1']
+        host_obj = host.HostState(session)
+        stats = host_obj._get_vgpu_stats()
+
+        self.assertEqual(session.call_xenapi.call_count, 2)
+        self.assertEqual(mock_update.call_count, 1)
+        self.assertEqual(stats, {'uuid_1': 'fake_stats_1'})
+        # called with the first vGPU type: 'type_name_1'
+        mock_get.assert_called_with('grp_ref1', ['type_name_1'])
+
+    @mock.patch.object(host.HostState, 'update_status',
+                       return_value='fake_stats_1')
+    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
+                       return_value=7)
+    def test_get_vgpu_stats_in_group(self, mock_get, mock_update):
+        # Test that it returns the vGPU stats for the enabled vGPU type.
+        enabled_vgpu_types = ['type_name_2']
+        session = mock.Mock()
+        session.call_xenapi.side_effect = [
+            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
+            'type_name_1',  # VGPU_type.get_model_name
+            'type_name_2',  # VGPU_type.get_model_name
+            'type_uuid_2',  # VGPU_type.get_uuid
+            '4',  # VGPU_type.get_max_heads
+        ]
+        host_obj = host.HostState(session)
+
+        stats = host_obj._get_vgpu_stats_in_group('grp_ref',
+                                                  enabled_vgpu_types)
+
+        expect_stats = {'uuid': 'type_uuid_2',
+                        'type_name': 'type_name_2',
+                        'max_heads': 4,
+                        'total': 7,
+                        }
+        self.assertEqual(session.call_xenapi.call_count, 5)
+        # It should get_uuid for the vGPU type passed via *enabled_vgpu_types*
+        # (the arg for get_uuid should be 'type_ref_2').
+        get_uuid_call = [mock.call('VGPU_type.get_uuid', 'type_ref_2')]
+        session.call_xenapi.assert_has_calls(get_uuid_call)
+        mock_get.assert_called_once()
+        self.assertEqual(expect_stats, stats)
+
+    @mock.patch.object(host.HostState, 'update_status')
+    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
+                       return_value=7)
+    def test_get_vgpu_stats_in_group_multiple(self, mock_get, mock_update):
+        # Test when multiple vGPU types are enabled in the same group.
+        # It should only return the first vGPU type's stats.
+        enabled_vgpu_types = ['type_name_1', 'type_name_2']
+        session = mock.Mock()
+        session.call_xenapi.side_effect = [
+            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
+            'type_name_1',  # VGPU_type.get_model_name
+            'type_name_2',  # VGPU_type.get_model_name
+            'type_uuid_1',  # VGPU_type.get_uuid
+            '4',  # VGPU_type.get_max_heads
+        ]
+        host_obj = host.HostState(session)
+
+        stats = host_obj._get_vgpu_stats_in_group('grp_ref',
+                                                  enabled_vgpu_types)
+
+        expect_stats = {
+            'uuid': 'type_uuid_1',
+            'type_name': 'type_name_1',
+            'max_heads': 4,
+            'total': 7,
+        }
+        self.assertEqual(session.call_xenapi.call_count, 5)
+        # It should call get_uuid for the first vGPU type (the arg for get_uuid
+        # should be 'type_ref_1').
+        get_uuid_call = [mock.call('VGPU_type.get_uuid', 'type_ref_1')]
+        session.call_xenapi.assert_has_calls(get_uuid_call)
+        mock_get.assert_called_once()
+        self.assertEqual(expect_stats, stats)
+
+    @mock.patch.object(host.HostState, 'update_status')
+    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
+                       return_value=7)
+    def test_get_vgpu_stats_in_group_cfg_not_in_grp(self, mock_get,
+                                                    mock_update):
+        # Test when the configured vGPU type is not one of the types
+        # enabled in the GPU group. It should return None.
+        enabled_vgpu_types = ['bad_type_name']
+        session = mock.Mock()
+        session.call_xenapi.side_effect = [
+            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
+            'type_name_1',  # VGPU_type.get_model_name
+            'type_name_2',  # VGPU_type.get_model_name
+        ]
+        host_obj = host.HostState(session)
+
+        stats = host_obj._get_vgpu_stats_in_group('grp_ref',
+                                                  enabled_vgpu_types)
+
+        expect_stats = None
+        self.assertEqual(session.call_xenapi.call_count, 3)
+        mock_get.assert_not_called()
+        self.assertEqual(expect_stats, stats)
+
+    @mock.patch.object(host.HostState, 'update_status')
+    def test_get_total_vgpu_in_grp(self, mock_update):
+        session = mock.Mock()
+        # The fake PGPU records returned by call_xenapi for the call
+        # "PGPU.get_all_records_where".
+        pgpu_records = {
+            'pgpu_ref1': {
+                'enabled_VGPU_types': ['type_ref1', 'type_ref2'],
+                'supported_VGPU_max_capacities': {
+                    'type_ref1': '1',
+                    'type_ref2': '3',
+                }
+            },
+            'pgpu_ref2': {
+                'enabled_VGPU_types': ['type_ref1', 'type_ref2'],
+                'supported_VGPU_max_capacities': {
+                    'type_ref1': '1',
+                    'type_ref2': '3',
+                }
+            }
+        }
+        session.call_xenapi.return_value = pgpu_records
+        host_obj = host.HostState(session)
+
+        total = host_obj._get_total_vgpu_in_grp('grp_ref', 'type_ref1')
+
+        session.call_xenapi.assert_called_with(
+            'PGPU.get_all_records_where', 'field "GPU_group" = "grp_ref"')
+        # The total amount of vGPUs is the sum of the max capacities of
+        # 'type_ref1' across all PGPUs.
+        self.assertEqual(total, 2)
--- a/nova/tests/unit/virt/xenapi/test_xenapi.py
+++ b/nova/tests/unit/virt/xenapi/test_xenapi.py
@ -2230,12 +2230,14 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):

    @mock.patch.object(host.HostState, 'get_disk_used')
    @mock.patch.object(host.HostState, '_get_passthrough_devices')
+    @mock.patch.object(host.HostState, '_get_vgpu_stats')
    @mock.patch.object(jsonutils, 'loads')
    @mock.patch.object(vm_utils, 'list_vms')
    @mock.patch.object(vm_utils, 'scan_default_sr')
    @mock.patch.object(host_management, 'get_host_data')
    def test_update_stats_caches_hostname(self, mock_host_data, mock_scan_sr,
                                          mock_list_vms, mock_loads,
+                                          mock_vgpus_stats,
                                          mock_devices, mock_dis_used):
        data = {'disk_total': 0,
                'disk_used': 0,
@ -2266,10 +2268,12 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):
            self.assertEqual(2, mock_host_data.call_count)
            self.assertEqual(2, mock_scan_sr.call_count)
            self.assertEqual(2, mock_devices.call_count)
+            self.assertEqual(2, mock_vgpus_stats.call_count)
            mock_loads.assert_called_with(data)
            mock_host_data.assert_called_with(self.conn._session)
            mock_scan_sr.assert_called_with(self.conn._session)
            mock_devices.assert_called_with()
+            mock_vgpus_stats.assert_called_with()


@mock.patch.object(host.HostState, 'update_status')
--- a/nova/virt/xenapi/fake.py
+++ b/nova/virt/xenapi/fake.py
@ -68,7 +68,8 @@ from nova.i18n import _


 _CLASSES = ['host', 'network', 'session', 'pool', 'SR', 'VBD',
-            'PBD', 'VDI', 'VIF', 'PIF', 'VM', 'VLAN', 'task']
+            'PBD', 'VDI', 'VIF', 'PIF', 'VM', 'VLAN', 'task',
+            'GPU_group', 'PGPU', 'VGPU_type']
 _after_create_functions = {}
 _destroy_functions = {}

--- a/nova/virt/xenapi/host.py
+++ b/nova/virt/xenapi/host.py
@ -220,6 +220,110 @@ class HostState(object):

        return passthrough_devices

+    def _get_vgpu_stats(self):
+        """Invoke XenAPI to get the stats for vGPUs.
+
+        Returns an empty dict if no vGPU type is configured. Otherwise
+        returns a dict keyed by GPU group uuid:
+            dict(grp_uuid_1=dict_vgpu_stats_in_grp_1,
+                 grp_uuid_2=dict_vgpu_stats_in_grp_2,
+                 ...,
+                 grp_uuid_n=dict_vgpu_stats_in_grp_n)
+        Each `dict_vgpu_stats_in_grp_x` is a dict representing the vGPU
+        stats in GPU group x, as returned by _get_vgpu_stats_in_group().
+        Groups for which that function returns no stats are omitted.
+        """
+        if not CONF.devices.enabled_vgpu_types:
+            return {}
+
+        vgpu_stats = {}
+
+        # NOTE(jianghuaw): If there are multiple vGPU types enabled in
+        # the configuration option, we only choose the first one so that
+        # we support only one vGPU type per compute node at the moment.
+        # Once we switch to use the nested resource providers, we will
+        # remove these lines to allow multiple vGPU types within multiple
+        # GPU groups (each group has a different vGPU type enabled).
+        if len(CONF.devices.enabled_vgpu_types) > 1:
+            LOG.warning('XenAPI only supports one GPU type per compute node,'
+                        ' only first type will be used.')
+        cfg_enabled_types = CONF.devices.enabled_vgpu_types[:1]
+
+        vgpu_grp_refs = self._session.call_xenapi('GPU_group.get_all')
+        for ref in vgpu_grp_refs:
+            grp_uuid = self._session.call_xenapi('GPU_group.get_uuid', ref)
+            stat = self._get_vgpu_stats_in_group(ref, cfg_enabled_types)
+            if stat:
+                vgpu_stats[grp_uuid] = stat
+
+        LOG.debug("Returning vGPU stats: %s", vgpu_stats)
+
+        return vgpu_stats
+
+    def _get_vgpu_stats_in_group(self, grp_ref, vgpu_types):
+        """Get stats for the specified vGPU types in a GPU group.
+
+        NOTE(Jianghuaw): In XenAPI, a GPU group is the minimal unit
+        from which to create a vGPU for an instance. So here, we
+        report vGPU resources for a particular GPU group. When we use
+        nested resource providers to represent the vGPU resources,
+        each GPU group will be a child resource provider under the
+        compute node.
+
+        Returns None if none of vgpu_types is enabled in the group,
+        otherwise a dict such as:
+        {'uuid': '6444c6ee-3a49-42f5-bebb-606b52175e67',
+         'total': 7, 'max_heads': 1,
+         'type_name': 'Intel GVT-g',
+         }
+        """
+        type_refs_in_grp = self._session.call_xenapi(
+            'GPU_group.get_enabled_VGPU_types', grp_ref)
+
+        type_names_in_grp = {self._session.call_xenapi(
+                                 'VGPU_type.get_model_name',
+                                 type_ref): type_ref
+                             for type_ref in type_refs_in_grp}
+        # Get the vGPU types enabled both in this GPU group and in the
+        # nova conf.
+        enabled_types = set(vgpu_types) & set(type_names_in_grp)
+        if not enabled_types:
+            return
+
+        stat = {}
+        # Get the sorted enabled types, so that we can always choose the same
+        # type when there are multiple enabled vGPU types.
+        sorted_types = sorted(enabled_types)
+        chosen_type = sorted_types[0]
+        if len(sorted_types) > 1:
+            LOG.warning('XenAPI only supports one vGPU type per GPU group,'
+                        ' but enabled multiple vGPU types: %(available)s.'
+                        ' Choosing the first one: %(chosen)s.',
+                       dict(available=sorted_types,
+                            chosen=chosen_type))
+        type_ref = type_names_in_grp[chosen_type]
+        type_uuid = self._session.call_xenapi('VGPU_type.get_uuid', type_ref)
+        stat['uuid'] = type_uuid
+        stat['type_name'] = chosen_type
+        stat['max_heads'] = int(self._session.call_xenapi(
+            'VGPU_type.get_max_heads', type_ref))
+
+        stat['total'] = self._get_total_vgpu_in_grp(grp_ref, type_ref)
+        return stat
+
+    def _get_total_vgpu_in_grp(self, grp_ref, type_ref):
+        """Get the total capacity of a vGPU type over the group's PGPUs."""
+        pgpu_recs = self._session.call_xenapi(
+            'PGPU.get_all_records_where', 'field "GPU_group" = "%s"' % grp_ref)
+
+        total = 0
+        for pgpu_ref in pgpu_recs:
+            pgpu_rec = pgpu_recs[pgpu_ref]
+            if type_ref in pgpu_rec['enabled_VGPU_types']:
+                cap = pgpu_rec['supported_VGPU_max_capacities'][type_ref]
+                total += int(cap)
+        return total
+
    def get_host_stats(self, refresh=False):
        """Return the current state of the host. If 'refresh' is
        True, run the update first.
@ -309,6 +413,7 @@ class HostState(object):
                vcpus_used = vcpus_used + int(vm_rec['VCPUs_max'])
            data['vcpus_used'] = vcpus_used
            data['pci_passthrough_devices'] = self._get_passthrough_devices()
+            data['vgpu_stats'] = self._get_vgpu_stats()
            self._stats = data