Merge "XenAPI: get vGPU stats from hypervisor"

2017-12-04 18:26:37 +00:00 · 2017-12-04 18:26:37 +00:00 · 2c4a1a390a
parent 6ee125ca81 6d2cd197bc
commit 2c4a1a390a
5 changed files with 322 additions and 2 deletions
--- a/nova/tests/unit/virt/xenapi/test_driver.py
+++ b/nova/tests/unit/virt/xenapi/test_driver.py
@ -67,7 +67,15 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB):
                },
                'vcpus_used': 10,
                'pci_passthrough_devices': '',
-                'host_other-config': {'iscsi_iqn': 'someiqn'}}
+                'host_other-config': {'iscsi_iqn': 'someiqn'},
+                'vgpu_stats': {
+                    'c8328467-badf-43d8-8e28-0e096b0f88b1':
+                        {'uuid': '6444c6ee-3a49-42f5-bebb-606b52175e67',
+                         'total': 7,
+                         'max_heads': 1,
+                         'type_name': 'Intel GVT-g',
+                         },
+                     }}

    def test_available_resource(self):
        driver = self._get_driver()
--- a/nova/tests/unit/virt/xenapi/test_vgpu.py
+++ b/nova/tests/unit/virt/xenapi/test_vgpu.py
@ -0,0 +1,202 @@
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import mock
+
+from nova import test
+from nova.virt.xenapi import host
+
+
+class VGPUTestCase(test.NoDBTestCase):
+    """Unit tests for the vGPU stats helpers of host.HostState.
+
+    Covers HostState._get_vgpu_stats(), _get_vgpu_stats_in_group() and
+    _get_total_vgpu_in_grp(), using a mock.Mock() as the XenAPI session.
+
+    Every test patches HostState.update_status.
+    NOTE(review): presumably HostState.__init__ calls update_status (the
+    constructor is not visible here); the mock_update.call_count == 1
+    assertions below are consistent with that -- confirm.
+
+    mock.patch decorators are applied bottom-up, so the first mock
+    argument of each test belongs to the decorator closest to the
+    function.
+    """
+    @mock.patch.object(host.HostState, 'update_status',
+                       return_value='fake_stats_1')
+    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
+    def test_get_vgpu_stats_empty_cfg(self, mock_get, mock_update):
+        # No vGPU type is configured: an empty dict is expected and
+        # XenAPI must not be called at all.
+        self.flags(enabled_vgpu_types=[], group='devices')
+        session = mock.Mock()
+
+        host_obj = host.HostState(session)
+        stats = host_obj._get_vgpu_stats()
+
+        session.call_xenapi.assert_not_called()
+        self.assertEqual(stats, {})
+
+    @mock.patch.object(host.HostState, 'update_status',
+                       return_value='fake_stats_1')
+    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
+    def test_get_vgpu_stats_single_type(self, mock_get, mock_update):
+        # A single vGPU type is configured.
+        self.flags(enabled_vgpu_types=['type_name_1'], group='devices')
+        session = mock.Mock()
+        # Multiple GPU groups; the values are consumed in call order.
+        session.call_xenapi.side_effect = [
+            ['grp_ref1', 'grp_ref2'],  # GPU_group.get_all
+            'uuid_1',  # GPU_group.get_uuid
+            'uuid_2',  # GPU_group.get_uuid
+        ]
+        # Let it return None for the 2nd GPU group for the case
+        # that it doesn't have the specified vGPU type enabled.
+        mock_get.side_effect = ['fake_stats_1', None]
+        host_obj = host.HostState(session)
+        stats = host_obj._get_vgpu_stats()
+
+        self.assertEqual(session.call_xenapi.call_count, 3)
+        self.assertEqual(mock_update.call_count, 1)
+        self.assertEqual(mock_get.call_count, 2)
+        # Only the group that produced stats shows up in the result.
+        self.assertEqual(stats, {'uuid_1': 'fake_stats_1'})
+
+    @mock.patch.object(host.HostState, 'update_status',
+                       return_value='fake_stats_1')
+    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
+    def test_get_vgpu_stats_multi_types(self, mock_get, mock_update):
+        # When multiple vGPU types are configured, only the first one
+        # is used.
+        self.flags(enabled_vgpu_types=['type_name_1', 'type_name_2'],
+                   group='devices')
+        session = mock.Mock()
+        session.call_xenapi.side_effect = [
+            ['grp_ref1'],  # GPU_group.get_all
+            'uuid_1',  # GPU_group.get_uuid
+        ]
+        mock_get.side_effect = ['fake_stats_1']
+        host_obj = host.HostState(session)
+        stats = host_obj._get_vgpu_stats()
+
+        self.assertEqual(session.call_xenapi.call_count, 2)
+        self.assertEqual(mock_update.call_count, 1)
+        self.assertEqual(stats, {'uuid_1': 'fake_stats_1'})
+        # called with the first vGPU type: 'type_name_1'
+        mock_get.assert_called_with('grp_ref1', ['type_name_1'])
+
+    @mock.patch.object(host.HostState, 'update_status',
+                       return_value='fake_stats_1')
+    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
+                       return_value=7)
+    def test_get_vgpu_stats_in_group(self, mock_get, mock_update):
+        # Test that it returns the vGPU stats for the enabled vGPU type.
+        enabled_vgpu_types = ['type_name_2']
+        session = mock.Mock()
+        session.call_xenapi.side_effect = [
+            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
+            'type_name_1',  # VGPU_type.get_model_name
+            'type_name_2',  # VGPU_type.get_model_name
+            'type_uuid_2',  # VGPU_type.get_uuid
+            '4',  # VGPU_type.get_max_heads
+        ]
+        host_obj = host.HostState(session)
+
+        stats = host_obj._get_vgpu_stats_in_group('grp_ref',
+                                                  enabled_vgpu_types)
+
+        # 'max_heads' is expected as an int although the session
+        # returned the string '4'; 'total' is the patched return value
+        # of _get_total_vgpu_in_grp.
+        expect_stats = {'uuid': 'type_uuid_2',
+                        'type_name': 'type_name_2',
+                        'max_heads': 4,
+                        'total': 7,
+                        }
+        self.assertEqual(session.call_xenapi.call_count, 5)
+        # It should get_uuid for the vGPU type passed via *enabled_vgpu_types*
+        # (the arg for get_uuid should be 'type_ref_2').
+        get_uuid_call = [mock.call('VGPU_type.get_uuid', 'type_ref_2')]
+        session.call_xenapi.assert_has_calls(get_uuid_call)
+        mock_get.assert_called_once()
+        self.assertEqual(expect_stats, stats)
+
+    @mock.patch.object(host.HostState, 'update_status')
+    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
+                       return_value=7)
+    def test_get_vgpu_stats_in_group_multiple(self, mock_get, mock_update):
+        # Test the case where multiple vGPU types are enabled in the
+        # same group. It should only return the first vGPU type's stats.
+        enabled_vgpu_types = ['type_name_1', 'type_name_2']
+        session = mock.Mock()
+        session.call_xenapi.side_effect = [
+            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
+            'type_name_1',  # VGPU_type.get_model_name
+            'type_name_2',  # VGPU_type.get_model_name
+            'type_uuid_1',  # VGPU_type.get_uuid
+            '4',  # VGPU_type.get_max_heads
+        ]
+        host_obj = host.HostState(session)
+
+        stats = host_obj._get_vgpu_stats_in_group('grp_ref',
+                                                  enabled_vgpu_types)
+
+        expect_stats = {
+            'uuid': 'type_uuid_1',
+            'type_name': 'type_name_1',
+            'max_heads': 4,
+            'total': 7,
+        }
+        self.assertEqual(session.call_xenapi.call_count, 5)
+        # It should call get_uuid for the first vGPU type (the arg for get_uuid
+        # should be 'type_ref_1').
+        get_uuid_call = [mock.call('VGPU_type.get_uuid', 'type_ref_1')]
+        session.call_xenapi.assert_has_calls(get_uuid_call)
+        mock_get.assert_called_once()
+        self.assertEqual(expect_stats, stats)
+
+    @mock.patch.object(host.HostState, 'update_status')
+    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
+                       return_value=7)
+    def test_get_vgpu_stats_in_group_cfg_not_in_grp(self, mock_get,
+                                                    mock_update):
+        # Test the case where the configured vGPU type is not one of
+        # the types enabled in the GPU group. It should return None.
+        enabled_vgpu_types = ['bad_type_name']
+        session = mock.Mock()
+        session.call_xenapi.side_effect = [
+            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
+            'type_name_1',  # VGPU_type.get_model_name
+            'type_name_2',  # VGPU_type.get_model_name
+        ]
+        host_obj = host.HostState(session)
+
+        stats = host_obj._get_vgpu_stats_in_group('grp_ref',
+                                                  enabled_vgpu_types)
+
+        expect_stats = None
+        # Only the type listing and the two model-name lookups happen;
+        # no uuid/max_heads/total queries are made for a non-match.
+        self.assertEqual(session.call_xenapi.call_count, 3)
+        mock_get.assert_not_called()
+        self.assertEqual(expect_stats, stats)
+
+    @mock.patch.object(host.HostState, 'update_status')
+    def test_get_total_vgpu_in_grp(self, mock_update):
+        session = mock.Mock()
+        # The fake PGPU records returned by call_xenapi for the
+        # "PGPU.get_all_records_where" query.
+        pgpu_records = {
+            'pgpu_ref1': {
+                'enabled_VGPU_types': ['type_ref1', 'type_ref2'],
+                'supported_VGPU_max_capacities': {
+                    'type_ref1': '1',
+                    'type_ref2': '3',
+                }
+            },
+            'pgpu_ref2': {
+                'enabled_VGPU_types': ['type_ref1', 'type_ref2'],
+                'supported_VGPU_max_capacities': {
+                    'type_ref1': '1',
+                    'type_ref2': '3',
+                }
+            }
+        }
+        session.call_xenapi.return_value = pgpu_records
+        host_obj = host.HostState(session)
+
+        total = host_obj._get_total_vgpu_in_grp('grp_ref', 'type_ref1')
+
+        session.call_xenapi.assert_called_with(
+            'PGPU.get_all_records_where', 'field "GPU_group" = "grp_ref"')
+        # The total amount of VGPUs is equal to the sum of the available
+        # VGPUs of 'type_ref1' in all PGPUs (1 + 1 here).
+        self.assertEqual(total, 2)
--- a/nova/tests/unit/virt/xenapi/test_xenapi.py
+++ b/nova/tests/unit/virt/xenapi/test_xenapi.py
@ -2230,12 +2230,14 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):

    @mock.patch.object(host.HostState, 'get_disk_used')
    @mock.patch.object(host.HostState, '_get_passthrough_devices')
+    @mock.patch.object(host.HostState, '_get_vgpu_stats')
    @mock.patch.object(jsonutils, 'loads')
    @mock.patch.object(vm_utils, 'list_vms')
    @mock.patch.object(vm_utils, 'scan_default_sr')
    @mock.patch.object(host_management, 'get_host_data')
    def test_update_stats_caches_hostname(self, mock_host_data, mock_scan_sr,
                                          mock_list_vms, mock_loads,
+                                          mock_vgpus_stats,
                                          mock_devices, mock_dis_used):
        data = {'disk_total': 0,
                'disk_used': 0,
@ -2266,10 +2268,12 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):
            self.assertEqual(2, mock_host_data.call_count)
            self.assertEqual(2, mock_scan_sr.call_count)
            self.assertEqual(2, mock_devices.call_count)
+            self.assertEqual(2, mock_vgpus_stats.call_count)
            mock_loads.assert_called_with(data)
            mock_host_data.assert_called_with(self.conn._session)
            mock_scan_sr.assert_called_with(self.conn._session)
            mock_devices.assert_called_with()
+            mock_vgpus_stats.assert_called_with()


@mock.patch.object(host.HostState, 'update_status')
--- a/nova/virt/xenapi/fake.py
+++ b/nova/virt/xenapi/fake.py
@ -68,7 +68,8 @@ from nova.i18n import _


 _CLASSES = ['host', 'network', 'session', 'pool', 'SR', 'VBD',
-            'PBD', 'VDI', 'VIF', 'PIF', 'VM', 'VLAN', 'task']
+            'PBD', 'VDI', 'VIF', 'PIF', 'VM', 'VLAN', 'task',
+            'GPU_group', 'PGPU', 'VGPU_type']
 _after_create_functions = {}
 _destroy_functions = {}

--- a/nova/virt/xenapi/host.py
+++ b/nova/virt/xenapi/host.py
@ -220,6 +220,110 @@ class HostState(object):

        return passthrough_devices

+    def _get_vgpu_stats(self):
+        """Invoke XenAPI to get the stats for VGPUs.
+
+        The return value is a dict which has GPU groups' uuid as
+        the keys:
+            dict(grp_uuid_1=dict_vgpu_stats_in_grp_1,
+                 grp_uuid_2=dict_vgpu_stats_in_grp_2,
+                 ...,
+                 grp_uuid_n=dict_vgpu_stats_in_grp_n)
+        The `dict_vgpu_stats_in_grp_x` is a dict representing the
+        vGPU stats in GPU group x. For details, please refer to
+        the return value of _get_vgpu_stats_in_group().
+
+        An empty dict is returned, without calling XenAPI, when
+        CONF.devices.enabled_vgpu_types is empty. GPU groups for which
+        _get_vgpu_stats_in_group() returns a falsy value are left out
+        of the result.
+        """
+        if not CONF.devices.enabled_vgpu_types:
+            return {}
+
+        vgpu_stats = {}
+
+        # NOTE(jianghuaw): If there are multiple vGPU types enabled in
+        # the configuration option, we only choose the first one so that
+        # we support only one vGPU type per compute node at the moment.
+        # Once we switch to use the nested resource providers, we will
+        # remove these lines to allow multiple vGPU types within multiple
+        # GPU groups (each group has a different vGPU type enabled).
+        if len(CONF.devices.enabled_vgpu_types) > 1:
+            LOG.warning('XenAPI only supports one GPU type per compute node,'
+                        ' only first type will be used.')
+        # Slicing (rather than indexing) keeps this a list, which is the
+        # form passed on to _get_vgpu_stats_in_group().
+        cfg_enabled_types = CONF.devices.enabled_vgpu_types[:1]
+
+        vgpu_grp_refs = self._session.call_xenapi('GPU_group.get_all')
+        for ref in vgpu_grp_refs:
+            # The uuid is looked up for every group, even those that end
+            # up contributing no stats.
+            grp_uuid = self._session.call_xenapi('GPU_group.get_uuid', ref)
+            stat = self._get_vgpu_stats_in_group(ref, cfg_enabled_types)
+            if stat:
+                vgpu_stats[grp_uuid] = stat
+
+        LOG.debug("Returning vGPU stats: %s", vgpu_stats)
+
+        return vgpu_stats
+
+    def _get_vgpu_stats_in_group(self, grp_ref, vgpu_types):
+        """Get stats for the specified vGPU types in a GPU group.
+
+        NOTE(Jianghuaw): In XenAPI, a GPU group is the minimal unit
+        from where to create a vGPU for an instance. So here, we
+        report vGPU resources for a particular GPU group. When we use
+        nested resource providers to represent the vGPU resources,
+        each GPU group will be a child resource provider under the
+        compute node.
+
+        :param grp_ref: the reference of the GPU group to inspect.
+        :param vgpu_types: an iterable of vGPU type (model) names that
+                           are enabled in the nova configuration.
+        :returns: None if none of *vgpu_types* is enabled in the GPU
+                  group; otherwise a dict describing one vGPU type (the
+                  first one in sorted name order if several match).
+                  For example:
+        {'uuid': '6444c6ee-3a49-42f5-bebb-606b52175e67',
+         'total': 7,
+         'max_heads': 1,
+         'type_name': 'Intel GVT-g',
+         }
+        """
+        type_refs_in_grp = self._session.call_xenapi(
+            'GPU_group.get_enabled_VGPU_types', grp_ref)
+
+        # Map each type's model name to its reference; this issues one
+        # XenAPI call per vGPU type enabled in the group.
+        type_names_in_grp = {self._session.call_xenapi(
+                                 'VGPU_type.get_model_name',
+                                 type_ref): type_ref
+                             for type_ref in type_refs_in_grp}
+        # Get the vGPU types enabled both in this GPU group and in the
+        # nova conf.
+        enabled_types = set(vgpu_types) & set(type_names_in_grp)
+        if not enabled_types:
+            return
+
+        stat = {}
+        # Get the sorted enabled types, so that we can always choose the same
+        # type when there are multiple enabled vGPU types.
+        sorted_types = sorted(enabled_types)
+        chosen_type = sorted_types[0]
+        if len(sorted_types) > 1:
+            LOG.warning('XenAPI only supports one vGPU type per GPU group,'
+                        ' but enabled multiple vGPU types: %(available)s.'
+                        ' Choosing the first one: %(chosen)s.',
+                       dict(available=sorted_types,
+                            chosen=chosen_type))
+        type_ref = type_names_in_grp[chosen_type]
+        type_uuid = self._session.call_xenapi('VGPU_type.get_uuid', type_ref)
+        stat['uuid'] = type_uuid
+        stat['type_name'] = chosen_type
+        # The value returned by XenAPI is converted so that 'max_heads'
+        # is always reported as an int.
+        stat['max_heads'] = int(self._session.call_xenapi(
+            'VGPU_type.get_max_heads', type_ref))
+
+        stat['total'] = self._get_total_vgpu_in_grp(grp_ref, type_ref)
+        return stat
+
+    def _get_total_vgpu_in_grp(self, grp_ref, type_ref):
+        """Get the total capacity of vGPUs in the group.
+
+        :param grp_ref: the reference of the GPU group whose PGPUs are
+                        queried.
+        :param type_ref: the reference of the vGPU type to count.
+        :returns: the sum, as an int, of the
+                  'supported_VGPU_max_capacities' entry for *type_ref*
+                  over every PGPU in the group that lists *type_ref* in
+                  its 'enabled_VGPU_types'; 0 if there is none.
+        """
+        # Fetch the records of all PGPUs belonging to this GPU group.
+        pgpu_recs = self._session.call_xenapi(
+            'PGPU.get_all_records_where', 'field "GPU_group" = "%s"' % grp_ref)
+
+        total = 0
+        for pgpu_ref in pgpu_recs:
+            pgpu_rec = pgpu_recs[pgpu_ref]
+            # PGPUs that do not have this vGPU type enabled are skipped.
+            if type_ref in pgpu_rec['enabled_VGPU_types']:
+                cap = pgpu_rec['supported_VGPU_max_capacities'][type_ref]
+                # The capacity value is converted with int() before
+                # being added to the total.
+                total += int(cap)
+        return total
+
    def get_host_stats(self, refresh=False):
        """Return the current state of the host. If 'refresh' is
        True, run the update first.
@ -309,6 +413,7 @@ class HostState(object):
                vcpus_used = vcpus_used + int(vm_rec['VCPUs_max'])
            data['vcpus_used'] = vcpus_used
            data['pci_passthrough_devices'] = self._get_passthrough_devices()
+            data['vgpu_stats'] = self._get_vgpu_stats()
            self._stats = data