XenAPI: get vGPU stats from hypervisor

This commit is to get vGPU resource stats from XenAPI.
It contains the following data for vGPU types:
* total amount:
  The capacity of the vGPUs which will be used to create
  inventory data for ResourceClass.VGPU.
* max_heads:
  The maximal display heads which will be used to create
  inventory data for ResourceClass.VGPU_DISPLAY_HEAD.
* model_name:
  The vGPU type's name in XenAPI. The vGPU types are filtered
  based on model_name.
* PGPU group uuid and vGPU type's uuid:
  The identifiers in XenAPI. We will use them to create
  the child resource provider's name in nested-RP, so that
  the vGPU resource allocation can be easily mapped back
  to the PGPU group and vGPU type when spawning an instance.

Once nested resource providers are ready to use, we can
support multiple vGPU types in a compute node. But at the
moment we only enable one vGPU type in a compute node.
The restriction is implemented by checking the
enabled_vgpu_types configuration option. If multiple vGPU
types are enabled, we will only use the first one.

Change-Id: I2f9cf80b71b50a4b8e011a4a8e40474cc02baba7
blueprint: add-support-for-vgpu
This commit is contained in:
Jianghua Wang 2017-10-13 11:20:29 +00:00
parent fa2c1567c1
commit 6d2cd197bc
5 changed files with 322 additions and 2 deletions

View File

@ -67,7 +67,15 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB):
},
'vcpus_used': 10,
'pci_passthrough_devices': '',
'host_other-config': {'iscsi_iqn': 'someiqn'}}
'host_other-config': {'iscsi_iqn': 'someiqn'},
'vgpu_stats': {
'c8328467-badf-43d8-8e28-0e096b0f88b1':
{'uuid': '6444c6ee-3a49-42f5-bebb-606b52175e67',
'total': 7,
'max_heads': 1,
'type_name': 'Intel GVT-g',
},
}}
def test_available_resource(self):
driver = self._get_driver()

View File

@ -0,0 +1,202 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import mock
from nova import test
from nova.virt.xenapi import host
class VGPUTestCase(test.NoDBTestCase):
    """Unit tests for the vGPU stats helpers of host.HostState."""

    @staticmethod
    def _make_session(xenapi_results=None):
        """Build a mock XenAPI session.

        When *xenapi_results* is given, successive ``call_xenapi`` calls
        return its items in order.
        """
        fake_session = mock.Mock()
        if xenapi_results is not None:
            fake_session.call_xenapi.side_effect = xenapi_results
        return fake_session

    @mock.patch.object(host.HostState, 'update_status',
                       return_value='fake_stats_1')
    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
    def test_get_vgpu_stats_empty_cfg(self, mock_get, mock_update):
        # With no vGPU type configured, XenAPI must not be queried at all.
        self.flags(enabled_vgpu_types=[], group='devices')
        session = self._make_session()

        stats = host.HostState(session)._get_vgpu_stats()

        session.call_xenapi.assert_not_called()
        self.assertEqual({}, stats)

    @mock.patch.object(host.HostState, 'update_status',
                       return_value='fake_stats_1')
    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
    def test_get_vgpu_stats_single_type(self, mock_get, mock_update):
        # A single vGPU type is configured and the host has two GPU groups.
        self.flags(enabled_vgpu_types=['type_name_1'], group='devices')
        session = self._make_session([
            ['grp_ref1', 'grp_ref2'],  # GPU_group.get_all
            'uuid_1',  # GPU_group.get_uuid
            'uuid_2',  # GPU_group.get_uuid
        ])
        # The 2nd GPU group yields None, i.e. it does not have the
        # configured vGPU type enabled, so it must be left out.
        mock_get.side_effect = ['fake_stats_1', None]

        stats = host.HostState(session)._get_vgpu_stats()

        self.assertEqual(3, session.call_xenapi.call_count)
        self.assertEqual(1, mock_update.call_count)
        self.assertEqual(2, mock_get.call_count)
        self.assertEqual({'uuid_1': 'fake_stats_1'}, stats)

    @mock.patch.object(host.HostState, 'update_status',
                       return_value='fake_stats_1')
    @mock.patch.object(host.HostState, '_get_vgpu_stats_in_group')
    def test_get_vgpu_stats_multi_types(self, mock_get, mock_update):
        # When several vGPU types are configured only the first is used.
        self.flags(enabled_vgpu_types=['type_name_1', 'type_name_2'],
                   group='devices')
        session = self._make_session([
            ['grp_ref1'],  # GPU_group.get_all
            'uuid_1',  # GPU_group.get_uuid
        ])
        mock_get.side_effect = ['fake_stats_1']

        stats = host.HostState(session)._get_vgpu_stats()

        self.assertEqual(2, session.call_xenapi.call_count)
        self.assertEqual(1, mock_update.call_count)
        self.assertEqual({'uuid_1': 'fake_stats_1'}, stats)
        # Only the first configured type, 'type_name_1', is passed down.
        mock_get.assert_called_with('grp_ref1', ['type_name_1'])

    @mock.patch.object(host.HostState, 'update_status',
                       return_value='fake_stats_1')
    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
                       return_value=7)
    def test_get_vgpu_stats_in_group(self, mock_get, mock_update):
        # The stats of the one enabled vGPU type are returned.
        session = self._make_session([
            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
            'type_name_1',  # VGPU_type.get_model_name
            'type_name_2',  # VGPU_type.get_model_name
            'type_uuid_2',  # VGPU_type.get_uuid
            '4',  # VGPU_type.get_max_heads
        ])
        expected = {
            'uuid': 'type_uuid_2',
            'type_name': 'type_name_2',
            'max_heads': 4,
            'total': 7,
        }

        stats = host.HostState(session)._get_vgpu_stats_in_group(
            'grp_ref', ['type_name_2'])

        self.assertEqual(5, session.call_xenapi.call_count)
        # get_uuid must be issued for the type selected through the
        # enabled types argument, i.e. with 'type_ref_2'.
        session.call_xenapi.assert_has_calls(
            [mock.call('VGPU_type.get_uuid', 'type_ref_2')])
        mock_get.assert_called_once()
        self.assertEqual(expected, stats)

    @mock.patch.object(host.HostState, 'update_status')
    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
                       return_value=7)
    def test_get_vgpu_stats_in_group_multiple(self, mock_get, mock_update):
        # Several vGPU types are enabled in the same group: only the
        # first vGPU type's stats are returned.
        session = self._make_session([
            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
            'type_name_1',  # VGPU_type.get_model_name
            'type_name_2',  # VGPU_type.get_model_name
            'type_uuid_1',  # VGPU_type.get_uuid
            '4',  # VGPU_type.get_max_heads
        ])
        expected = {
            'uuid': 'type_uuid_1',
            'type_name': 'type_name_1',
            'max_heads': 4,
            'total': 7,
        }

        stats = host.HostState(session)._get_vgpu_stats_in_group(
            'grp_ref', ['type_name_1', 'type_name_2'])

        self.assertEqual(5, session.call_xenapi.call_count)
        # get_uuid must be issued for the first vGPU type, i.e. with
        # 'type_ref_1'.
        session.call_xenapi.assert_has_calls(
            [mock.call('VGPU_type.get_uuid', 'type_ref_1')])
        mock_get.assert_called_once()
        self.assertEqual(expected, stats)

    @mock.patch.object(host.HostState, 'update_status')
    @mock.patch.object(host.HostState, '_get_total_vgpu_in_grp',
                       return_value=7)
    def test_get_vgpu_stats_in_group_cfg_not_in_grp(self, mock_get,
                                                    mock_update):
        # The requested vGPU type does not belong to the GPU group, so
        # None is returned and no capacity lookup happens.
        session = self._make_session([
            ['type_ref_1', 'type_ref_2'],  # GPU_group.get_enabled_VGPU_types
            'type_name_1',  # VGPU_type.get_model_name
            'type_name_2',  # VGPU_type.get_model_name
        ])

        stats = host.HostState(session)._get_vgpu_stats_in_group(
            'grp_ref', ['bad_type_name'])

        self.assertEqual(3, session.call_xenapi.call_count)
        mock_get.assert_not_called()
        self.assertIsNone(stats)

    @mock.patch.object(host.HostState, 'update_status')
    def test_get_total_vgpu_in_grp(self, mock_update):
        # Fake PGPU records as returned by call_xenapi for
        # "PGPU.get_all_records_where": two identical PGPUs.
        pgpu_records = {
            pgpu_ref: {
                'enabled_VGPU_types': ['type_ref1', 'type_ref2'],
                'supported_VGPU_max_capacities': {
                    'type_ref1': '1',
                    'type_ref2': '3',
                },
            }
            for pgpu_ref in ('pgpu_ref1', 'pgpu_ref2')
        }
        session = self._make_session()
        session.call_xenapi.return_value = pgpu_records

        total = host.HostState(session)._get_total_vgpu_in_grp(
            'grp_ref', 'type_ref1')

        session.call_xenapi.assert_called_with(
            'PGPU.get_all_records_where', 'field "GPU_group" = "grp_ref"')
        # The total is the sum of the available vGPUs of 'type_ref1'
        # over all PGPUs: 1 + 1.
        self.assertEqual(2, total)

View File

@ -2230,12 +2230,14 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):
@mock.patch.object(host.HostState, 'get_disk_used')
@mock.patch.object(host.HostState, '_get_passthrough_devices')
@mock.patch.object(host.HostState, '_get_vgpu_stats')
@mock.patch.object(jsonutils, 'loads')
@mock.patch.object(vm_utils, 'list_vms')
@mock.patch.object(vm_utils, 'scan_default_sr')
@mock.patch.object(host_management, 'get_host_data')
def test_update_stats_caches_hostname(self, mock_host_data, mock_scan_sr,
mock_list_vms, mock_loads,
mock_vgpus_stats,
mock_devices, mock_dis_used):
data = {'disk_total': 0,
'disk_used': 0,
@ -2266,10 +2268,12 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):
self.assertEqual(2, mock_host_data.call_count)
self.assertEqual(2, mock_scan_sr.call_count)
self.assertEqual(2, mock_devices.call_count)
self.assertEqual(2, mock_vgpus_stats.call_count)
mock_loads.assert_called_with(data)
mock_host_data.assert_called_with(self.conn._session)
mock_scan_sr.assert_called_with(self.conn._session)
mock_devices.assert_called_with()
mock_vgpus_stats.assert_called_with()
@mock.patch.object(host.HostState, 'update_status')

View File

@ -68,7 +68,8 @@ from nova.i18n import _
_CLASSES = ['host', 'network', 'session', 'pool', 'SR', 'VBD',
'PBD', 'VDI', 'VIF', 'PIF', 'VM', 'VLAN', 'task']
'PBD', 'VDI', 'VIF', 'PIF', 'VM', 'VLAN', 'task',
'GPU_group', 'PGPU', 'VGPU_type']
_after_create_functions = {}
_destroy_functions = {}

View File

@ -220,6 +220,110 @@ class HostState(object):
return passthrough_devices
def _get_vgpu_stats(self):
    """Query XenAPI for the vGPU stats of every GPU group.

    Returns a dict keyed by GPU group uuid:

        dict(grp_uuid_1=dict_vgpu_stats_in_grp_1,
             grp_uuid_2=dict_vgpu_stats_in_grp_2,
             ...,
             grp_uuid_n=dict_vgpu_stats_in_grp_n)

    Each value is whatever _get_vgpu_stats_in_group() returned for
    that group; groups for which it returned nothing are left out.
    An empty dict is returned, without calling XenAPI, when the
    ``enabled_vgpu_types`` option of the ``devices`` group is empty.
    """
    configured_types = CONF.devices.enabled_vgpu_types
    if not configured_types:
        return {}

    # NOTE(jianghuaw): Only one vGPU type per compute node is supported
    # for now, so when several types are configured just the first one
    # is honoured. Once nested resource providers are in use this
    # restriction can be dropped to allow multiple vGPU types within
    # multiple GPU groups (each group with a different type enabled).
    if len(configured_types) > 1:
        LOG.warning('XenAPI only supports one GPU type per compute node,'
                    ' only first type will be used.')
    types_in_use = configured_types[:1]

    call_xenapi = self._session.call_xenapi
    stats_by_group = {}
    for grp_ref in call_xenapi('GPU_group.get_all'):
        grp_uuid = call_xenapi('GPU_group.get_uuid', grp_ref)
        grp_stats = self._get_vgpu_stats_in_group(grp_ref, types_in_use)
        if not grp_stats:
            # None of the configured types is enabled in this group.
            continue
        stats_by_group[grp_uuid] = grp_stats

    LOG.debug("Returning vGPU stats: %s", stats_by_group)
    return stats_by_group
def _get_vgpu_stats_in_group(self, grp_ref, vgpu_types):
    """Return the stats of one requested vGPU type in a GPU group.

    NOTE(Jianghuaw): In XenAPI, a GPU group is the minimal unit
    from where to create a vGPU for an instance, so vGPU resources
    are reported per GPU group. When nested resource providers are
    used to represent the vGPU resources, each GPU group will be a
    child resource provider under the compute node.

    :param grp_ref: XenAPI reference of the GPU group to inspect.
    :param vgpu_types: iterable of vGPU model names to look for.
    :returns: None when none of *vgpu_types* is enabled in the group,
        otherwise a dict such as:

            {'uuid': '6444c6ee-3a49-42f5-bebb-606b52175e67',
             'total': 7,
             'max_heads': 1,
             'type_name': 'Intel GVT-g',
            }

        where 'max_heads' is converted to an int and 'total' is the
        value of _get_total_vgpu_in_grp() for the chosen type.
    """
    call_xenapi = self._session.call_xenapi

    # Map each model name enabled in this group to its vGPU type ref.
    ref_by_name = {}
    for enabled_ref in call_xenapi('GPU_group.get_enabled_VGPU_types',
                                   grp_ref):
        model_name = call_xenapi('VGPU_type.get_model_name', enabled_ref)
        ref_by_name[model_name] = enabled_ref

    # Keep the types enabled both in this GPU group and in the nova
    # conf; sorting makes the choice below deterministic.
    candidates = sorted(set(vgpu_types).intersection(ref_by_name))
    if not candidates:
        return None

    chosen_type = candidates[0]
    if len(candidates) > 1:
        LOG.warning('XenAPI only supports one vGPU type per GPU group,'
                    ' but enabled multiple vGPU types: %(available)s.'
                    ' Choosing the first one: %(chosen)s.',
                    dict(available=candidates,
                         chosen=chosen_type))

    type_ref = ref_by_name[chosen_type]
    # The entries are evaluated top to bottom, which keeps the XenAPI
    # calls in the order: get_uuid, get_max_heads, capacity lookup.
    return {
        'uuid': call_xenapi('VGPU_type.get_uuid', type_ref),
        'type_name': chosen_type,
        'max_heads': int(call_xenapi('VGPU_type.get_max_heads', type_ref)),
        'total': self._get_total_vgpu_in_grp(grp_ref, type_ref),
    }
def _get_total_vgpu_in_grp(self, grp_ref, type_ref):
    """Return the total vGPU capacity of one vGPU type in a GPU group.

    Fetches the records of all PGPUs whose GPU_group field equals
    *grp_ref* and, for every PGPU that has *type_ref* among its
    enabled vGPU types, adds up that type's entry in
    'supported_VGPU_max_capacities' (converted to int).

    :param grp_ref: XenAPI reference of the GPU group.
    :param type_ref: XenAPI reference of the vGPU type.
    :returns: the summed capacity as an int; 0 when no PGPU matches.
    """
    records_by_ref = self._session.call_xenapi(
        'PGPU.get_all_records_where', 'field "GPU_group" = "%s"' % grp_ref)
    return sum(
        int(record['supported_VGPU_max_capacities'][type_ref])
        for record in records_by_ref.values()
        if type_ref in record['enabled_VGPU_types'])
def get_host_stats(self, refresh=False):
"""Return the current state of the host. If 'refresh' is
True, run the update first.
@ -309,6 +413,7 @@ class HostState(object):
vcpus_used = vcpus_used + int(vm_rec['VCPUs_max'])
data['vcpus_used'] = vcpus_used
data['pci_passthrough_devices'] = self._get_passthrough_devices()
data['vgpu_stats'] = self._get_vgpu_stats()
self._stats = data