vgpu type management POC

1. Set a default vgpu_type for every GPU that is virtualized
2. Support changing the vgpu_type of a GPU

Change-Id: I052d120036cd72c8224f7e1d06e96db44979f9ee
This commit is contained in:
songwenping 2023-07-28 15:48:45 +08:00
parent 9df66a96fe
commit afc3c1dcd2
9 changed files with 176 additions and 83 deletions

View File

@ -21,15 +21,15 @@ from oslo_log import log as logging
from oslo_serialization import jsonutils
import collections
import json
import os
import cyborg.conf
from cyborg.accelerator.common import utils
from cyborg.accelerator.drivers.gpu import utils as gpu_utils
from cyborg.agent import rpcapi as agent_rpcapi
from cyborg.common import constants
from cyborg.common import exception
from cyborg.conf import CONF
from cyborg import context
from cyborg.objects.driver_objects import driver_attach_handle
from cyborg.objects.driver_objects import driver_attribute
from cyborg.objects.driver_objects import driver_controlpath_id
@ -145,19 +145,19 @@ def _generate_driver_device(gpu):
driver_device_obj.model = gpu.get('model', 'miss model info')
std_board_info = {'product_id': gpu.get('product_id'),
'controller': gpu.get('controller'), }
vendor_board_info = {'vendor_info': gpu.get('vendor_info',
'gpu_vb_info')}
driver_device_obj.std_board_info = jsonutils.dumps(std_board_info)
driver_device_obj.vendor_board_info = jsonutils.dumps(
vendor_board_info)
driver_device_obj.type = constants.DEVICE_GPU
driver_device_obj.stub = gpu.get('stub', False)
driver_device_obj.controlpath_id = _generate_controlpath_id(gpu)
driver_device_obj.deployable_list = _generate_dep_list(gpu)
vendor_board_info = {'device_address': gpu.get('devices'),
'vgpu_type': gpu.get('vgpu_type')}
driver_device_obj.vendor_board_info = jsonutils.dumps(
vendor_board_info)
return driver_device_obj
def _get_supported_vgpu_types():
def _get_supported_vgpu_types(hostname):
"""Gets supported vgpu_types from cyborg.conf.
Retrieves supported vgpu_types set by the operator and generates a
@ -172,85 +172,52 @@ def _get_supported_vgpu_types():
"""
pgpu_type_mapping = collections.defaultdict(str)
pgpu_type_mapping.clear()
if not CONF.gpu_devices.enabled_vgpu_types:
return [], pgpu_type_mapping
for vgpu_type in CONF.gpu_devices.enabled_vgpu_types:
group = getattr(CONF, 'vgpu_%s' % vgpu_type, None)
if group is None or not group.device_addresses:
# Device addresses must be configured explictly now for every
# enabled vgpu type. Will improve after the disable and enable
# devices interfaces implemented.
raise exception.InvalidvGPUConfig(
reason="Missing device addresses config for vgpu type %s"
% vgpu_type
)
for device_address in group.device_addresses:
if device_address in pgpu_type_mapping:
raise exception.InvalidvGPUConfig(
reason="Duplicate types for PCI address %s"
% device_address
)
# Just checking whether the operator fat-fingered the address.
# If it's wrong, it will return an exception
try:
# Validates whether it's a PCI ID...
utils.parse_address(device_address)
except exception.PciDeviceWrongAddressFormat:
raise exception.InvalidvGPUConfig(
reason="Incorrect PCI address: %s" % device_address
)
admin_context = context.get_admin_context()
devices = agent_rpcapi.AgentAPI().get_devices(admin_context, hostname)
for device in devices:
if device.type != 'GPU' or device.vendor_board_info == 'miss_vb_info':
continue
vbi = json.loads(device.vendor_board_info)
vgpu_type = vbi.get('vgpu_type')
device_address = vbi.get('device_address')
if vgpu_type and device_address:
pgpu_type_mapping[device_address] = vgpu_type
return CONF.gpu_devices.enabled_vgpu_types, pgpu_type_mapping
def _get_vgpu_type_per_pgpu(device_address, supported_vgpu_types,
pgpu_type_mapping):
"""Provides the vGPU type the pGPU supports.
:param device_address: the PCI device address in config,
eg.'0000:af:00.0'
"""
supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types()
# Bail out quickly if we don't support vGPUs
if not supported_vgpu_types:
LOG.warning('Unable to load vGPU_type from [gpu_devices] '
'Ensure "enabled_vgpu_types" is set if the gpu'
'is virtualized.')
return
try:
# Validates whether it's a PCI ID...
utils.parse_address(device_address)
except (exception.PciDeviceWrongAddressFormat, IndexError):
# this is not a valid PCI address
LOG.warning("The PCI address %s was invalid for getting the"
"related vGPU type", device_address)
return
return pgpu_type_mapping.get(device_address)
return pgpu_type_mapping
def _discover_gpus(vendor_id):
"""param: vendor_id=VENDOR_ID means only discover Nvidia GPU on the host
"""
# init vGPU conf
cyborg.conf.devices.register_dynamic_opts(CONF)
supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types()
hostname = CONF.host
pgpu_type_mapping = _get_supported_vgpu_types(hostname)
# discover gpu devices by "lspci"
gpu_list = []
gpus = gpu_utils.get_pci_devices(gpu_utils.GPU_FLAGS, vendor_id)
LOG.info('gpus raw info: %s', gpus)
# report trait,rc and generate driver object
for gpu in gpus:
m = gpu_utils.GPU_INFO_PATTERN.match(gpu)
if m:
gpu_dict = m.groupdict()
# get hostname for deployable_name usage
gpu_dict['hostname'] = CONF.host
gpu_dict['hostname'] = hostname
# get vgpu_type from cyborg.conf, otherwise vgpu_type=None
vgpu_type = _get_vgpu_type_per_pgpu(
gpu_dict["devices"], supported_vgpu_types, pgpu_type_mapping)
vgpu_type = pgpu_type_mapping.get(gpu_dict["devices"])
LOG.info('vgpu_type is %s', vgpu_type)
mdev_path = os.path.expandvars(
'/sys/bus/pci/devices/{0}/mdev_supported_types'.
format(gpu_dict["devices"]))
valid_types = []
try:
valid_types = os.listdir(mdev_path)
LOG.info("The GPU %(gpu)s on host %(host)s is virtualized.",
{"gpu": gpu_dict['devices'], "host": hostname})
except FileNotFoundError:
LOG.info("The GPU %(gpu)s on host %(host)s is unvirtualized.",
{"gpu": gpu_dict['devices'], "host": hostname})
# generate rc and trait for pGPU
if not vgpu_type:
if not valid_types:
gpu_dict["rc"] = constants.RESOURCES["PGPU"]
traits = _get_traits(gpu_dict["vendor_id"],
gpu_dict["product_id"])
@ -258,13 +225,9 @@ def _discover_gpus(vendor_id):
else:
# get rc
gpu_dict["rc"] = constants.RESOURCES["VGPU"]
mdev_path = os.path.expandvars(
'/sys/bus/pci/devices/{0}/mdev_supported_types'.
format(gpu_dict["devices"]))
valid_types = os.listdir(mdev_path)
if vgpu_type not in valid_types:
raise exception.InvalidVGPUType(name=vgpu_type)
gpu_dict["vGPU_type"] = vgpu_type
# default set the first vgpu_type in sorted(valid_types)
vgpu_type = vgpu_type if vgpu_type else sorted(valid_types)[0]
gpu_dict["vgpu_type"] = vgpu_type
vGPU_path = os.path.expandvars(
'/sys/bus/pci/devices/{0}/mdev_supported_types/{1}/'
.format(gpu_dict["devices"], gpu_dict["vGPU_type"]))

View File

@ -89,3 +89,9 @@ class AgentManager(periodic_task.PeriodicTasks):
def remove_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid):
    """Remove a vGPU mdev through the privileged GPU helper.

    :param context: request context (not used by this method).
    :param pci_addr: forwarded as the first helper argument.
    :param asked_type: forwarded as the second helper argument.
    :param ah_uuid: forwarded as the third helper argument.
    """
    LOG.debug('Remove a vgpu mdev')
    mdev_args = (pci_addr, asked_type, ah_uuid)
    gpu_utils.remove_mdev_privileged(*mdev_args)
def get_devices(self, context, hostname):
    """Fetch the devices of ``hostname`` through the conductor API.

    :param context: request context.
    :param hostname: host whose devices are requested.
    :returns: whatever ``self.cond_api.get_host_devices`` returns.
    """
    host_devices = self.cond_api.get_host_devices(context, hostname)
    return host_devices
def update_mdev(self, context):
    """Trigger a usage update on the resource tracker.

    :param context: request context handed to ``self._rt.update_usage``.
    :returns: None.
    """
    tracker = self._rt
    tracker.update_usage(context)

View File

@ -83,3 +83,15 @@ class AgentAPI(object):
pci_addr=pci_addr,
asked_type=asked_type,
ah_uuid=ah_uuid)
def get_devices(self, context, hostname):
    """Invoke ``get_devices`` on the agent serving ``hostname``.

    :param context: request context.
    :param hostname: used both as the RPC target server and as the
        ``hostname`` argument of the remote method.
    :returns: the result of the remote ``get_devices`` call.
    """
    LOG.info('Get devices by host: (%s)', hostname)
    rpc_version = '1.0'
    call_ctx = self.client.prepare(server=hostname, version=rpc_version)
    return call_ctx.call(context, 'get_devices', hostname=hostname)
def update_mdev(self, context, hostname):
    """Invoke ``update_mdev`` on the agent serving ``hostname``.

    :param context: request context.
    :param hostname: RPC target server; it is not forwarded as an
        argument of the remote method.
    :returns: the result of the remote ``update_mdev`` call.
    """
    LOG.info('Agent update mdev for hostname: (%s)', hostname)
    rpc_version = '1.0'
    call_ctx = self.client.prepare(server=hostname, version=rpc_version)
    return call_ctx.call(context, 'update_mdev')

View File

@ -13,17 +13,24 @@
# License for the specific language governing permissions and limitations
# under the License.
from http import HTTPStatus
import json
import pecan
import subprocess
import wsme
from wsme import types as wtypes
from oslo_log import log
from cyborg.accelerator.drivers.gpu import utils
from cyborg.agent.rpcapi import AgentAPI
from cyborg.api.controllers import base
from cyborg.api.controllers import link
from cyborg.api.controllers import types
from cyborg.api import expose
from cyborg.common import authorize_wsgi
from cyborg.common import exception
from cyborg.common import policy
from cyborg import objects
LOG = log.getLogger(__name__)
@ -92,6 +99,11 @@ class DeviceCollection(base.APIBase):
class DevicesController(base.CyborgController):
"""REST controller for Devices."""
_custom_actions = {'update_type': ['PATCH'], 'get_vgpu_type': ['GET']}
def __init__(self, *args, **kwargs):
    """Initialise the base controller and create the agent RPC client.

    All positional and keyword arguments are passed through unchanged to
    the parent initialiser.
    """
    super().__init__(*args, **kwargs)
    self.agent = AgentAPI()
@authorize_wsgi.authorize_wsgi("cyborg:device", "get_one")
@expose.expose(Device, wtypes.text)
@ -128,3 +140,69 @@ class DevicesController(base.CyborgController):
obj_devices = objects.Device.list(context, filters=filters_dict)
LOG.info('[devices:get_all] Returned: %s', obj_devices)
return DeviceCollection.convert_with_links(obj_devices)
@authorize_wsgi.authorize_wsgi("cyborg:device", "get_vgpu_type", False)
@expose.expose('json', wtypes.text, body=types.jsontype, status_code=200)
def get_vgpu_type(self, uuid):
    """List the vGPU (mdev) types supported by a GPU device.

    The types are the entry names found under
    ``/sys/bus/pci/devices/<device_address>/mdev_supported_types/`` on the
    host that owns the device; the listing is obtained by running ``ls``
    on that host through ``nsenter`` + ``ssh``.

    :param uuid: UUID of a device.
    :returns: a response whose body is ``{"vgpu_types": [<name>, ...]}``.
        The list is empty when the remote listing produces no output
        (for example when the directory does not exist).
    :raises CyborgException: if the device is not a GPU.
    """
    import shlex

    context = pecan.request.context
    device = objects.Device.get(context, uuid)
    if device.type != 'GPU':
        raise exception.CyborgException("Only GPU device has vgpu_type.")
    hostname = device.hostname
    device_address = json.loads(
        device.vendor_board_info).get("device_address")
    mdev_dir = '/sys/bus/pci/devices/{0}/mdev_supported_types/'.format(
        device_address)
    # Run with an argv list instead of a shell string so that nothing
    # stored in the device record is interpreted by a local shell. ssh
    # joins the trailing arguments and hands them to the remote shell, so
    # the path is quoted for that second level as well.
    command = ['nsenter', '-m', '-t1', 'ssh', hostname,
               'ls', shlex.quote(mdev_dir)]
    # subprocess.run() waits for the child and drains both pipes, which
    # avoids leaving a zombie process or blocking on a full stderr pipe.
    # Text mode yields str lines, which are JSON serialisable (bytes are
    # not).
    proc = subprocess.run(command, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE, universal_newlines=True)
    if proc.returncode != 0:
        LOG.warning("Listing vgpu types of %(gpu)s on %(host)s failed: "
                    "%(err)s", {"gpu": device_address, "host": hostname,
                                "err": proc.stderr.strip()})
    vgpu_types = [entry.strip() for entry in proc.stdout.splitlines()
                  if entry.strip()]
    ret = {'vgpu_types': vgpu_types}
    return wsme.api.Response(ret, status_code=HTTPStatus.OK,
                             return_type=wsme.types.DictType)
@authorize_wsgi.authorize_wsgi("cyborg:device", "update_type", False)
@expose.expose('json', wtypes.text, body=types.jsontype, status_code=200)
def update_type(self, uuid, req_dev):
    """Update vgpu_type for a gpu device.

    The requested type must be one of the entries found under
    ``/sys/bus/pci/devices/<device_address>/mdev_supported_types/`` on the
    host owning the device. On success the new type is stored in the
    device's ``vendor_board_info`` and the agent on that host is asked to
    update its mdev usage.

    :param uuid: UUID of a device.
    :param req_dev: type of device to update, e.g.
        ``{"vgpu_type": "nvidia-182"}``.
    :returns: the updated device, converted with links.
    :raises VGPUTypeIsNeed: if the request carries no ``vgpu_type``.
    :raises CyborgException: if the device is not a GPU.
    :raises DeviceInUse: if any attach handle of the device is in use.
    :raises VGPUTypeNotExist: if the requested type is not listed among
        the supported types of the GPU.
    """
    import shlex

    # The action is registered as PATCH in _custom_actions.
    LOG.info("[device update_type] PATCH request = (%s)", req_dev)
    vgpu_type = req_dev.get('vgpu_type')
    if not vgpu_type:
        raise exception.VGPUTypeIsNeed()
    context = pecan.request.context
    device = objects.Device.get(context, uuid)
    if device.type != 'GPU':
        raise exception.CyborgException(
            "Only GPU device can update vgpu_type")

    # Refuse to change the type while any attach handle is still in use.
    deployables = objects.Deployable.get_list_by_device_id(
        context, device_id=device.id)
    for deployable in deployables:
        attach_handles = objects.AttachHandle.get_ah_list_by_deployable_id(
            context, deployable.id)
        if any(handle.in_use for handle in attach_handles):
            raise exception.DeviceInUse(device=uuid)

    LOG.info("[device.vendor_board_info = (%s)", device.vendor_board_info)
    vbi = json.loads(device.vendor_board_info)
    device_address = vbi.get("device_address")
    hostname = device.hostname

    # SECURITY: vgpu_type comes straight from the request body, so it must
    # never be interpolated into a command line. List the supported types
    # instead and require an exact name match; this also rejects values
    # such as "../" that merely resolve to an existing path.
    mdev_dir = '/sys/bus/pci/devices/{0}/mdev_supported_types/'.format(
        device_address)
    # argv list (no local shell); the path is quoted because ssh passes
    # the trailing arguments to the remote shell.
    command = ['nsenter', '-m', '-t1', 'ssh', hostname,
               'ls', shlex.quote(mdev_dir)]
    proc = subprocess.run(command, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE, universal_newlines=True)
    if proc.returncode != 0:
        LOG.warning("Listing vgpu types of %(gpu)s on %(host)s failed: "
                    "%(err)s", {"gpu": device_address, "host": hostname,
                                "err": proc.stderr.strip()})
    supported_types = proc.stdout.split()
    if vgpu_type not in supported_types:
        raise exception.VGPUTypeNotExist(gpu=device_address,
                                         vgpu_type=vgpu_type)

    vbi['vgpu_type'] = vgpu_type
    device.vendor_board_info = json.dumps(vbi)
    device.save(context)
    self.agent.update_mdev(context, hostname)
    return Device.convert_with_links(device)

View File

@ -425,3 +425,16 @@ class FPGAProgramError(CyborgException):
class PciDeviceNotFoundById(NotFound):
_msg_fmt = _("PCI device %(id)s not found")
class DeviceInUse(Conflict):
    """Conflict error reporting that the named device is in use."""

    _msg_fmt = _("Device %(device)s is in use.")
class VGPUTypeIsNeed(Invalid):
    """Invalid-request error reporting that a vgpu type is needed."""

    _msg_fmt = _("The vgpu type is need.")
class VGPUTypeNotExist(Invalid):
    """Invalid-request error: the vgpu type is not available for the GPU.

    The message expects the ``vgpu_type`` and ``gpu`` keyword arguments.
    """

    _msg_fmt = _(
        "The vgpu type %(vgpu_type)s is not available "
        "for GPU %(gpu)s."
    )

View File

@ -85,16 +85,19 @@ class PlacementClient(object):
def _ensure_traits(self, trait_names):
# TODO(Xinran): maintain a reference count of how many RPs use
# this trait and do the deletion only when the last RP is deleted.
for trait in trait_names:
resp = self.put("/traits/%s" % trait, None, version='1.6')
for trait_name in trait_names:
trait = self.get("/traits/%s" % trait_name, version='1.6')
if trait:
LOG.info("Trait %(trait)s already existed",
{"trait": trait_name})
continue
resp = self.put("/traits/%s" % trait_name, None, version='1.6')
if resp.status_code == 201:
LOG.info("Created trait %(trait)s", {"trait": trait})
elif resp.status_code == 204:
LOG.info("Trait %(trait)s already existed", {"trait": trait})
LOG.info("Created trait %(trait)s", {"trait": trait_name})
else:
raise Exception(
"Failed to create trait %s: HTTP %d: %s" %
(trait, resp.status_code, resp.text))
(trait_name, resp.status_code, resp.text))
def _put_rp_traits(self, rp_uuid, traits_json):
generation = self.get_resource_provider(

View File

@ -48,6 +48,12 @@ device_policies = [
policy.RuleDefault('cyborg:device:get_all',
'rule:allow',
description='Retrieve all device records'),
policy.RuleDefault('cyborg:device:update_type',
'rule:allow',
description='Update vgpu_type of GPU device'),
policy.RuleDefault('cyborg:device:get_vgpu_type',
'rule:allow',
description='Get vgpu_type of GPU device'),
]
deployable_policies = [

View File

@ -104,6 +104,9 @@ class ConductorManager(object):
"""
ExtARQ.apply_patch(context, patch_list, valid_fields)
def get_host_devices(self, context, hostname):
    """Return the result of ``DriverDevice.list`` for ``hostname``.

    :param context: request context.
    :param hostname: host name passed on to ``DriverDevice.list``.
    """
    host_devices = DriverDevice.list(context, hostname)
    return host_devices
def report_data(self, context, hostname, driver_device_list):
"""Update the Cyborg DB in one hostname according to the
discovered device list.

View File

@ -118,3 +118,12 @@ class ConductorAPI(object):
cctxt = self.client.prepare(topic=self.topic)
return cctxt.call(context, 'arq_apply_patch', patch_list=patch_list,
valid_fields=valid_fields)
def get_host_devices(self, context, hostname):
    """Ask the conductor service for the devices of a host.

    :param context: request context.
    :param hostname: host name
    :returns: the result of the remote ``get_host_devices`` call.
    """
    call_ctx = self.client.prepare(topic=self.topic)
    rpc_kwargs = {'hostname': hostname}
    return call_ctx.call(context, 'get_host_devices', **rpc_kwargs)