vGPU type management POC

1. Set a default vgpu_type for every GPU that is virtualized. 2. Support changing the vgpu_type of a GPU. Change-Id: I052d120036cd72c8224f7e1d06e96db44979f9ee
2023-07-28 15:48:45 +08:00 · 2023-07-28 15:48:45 +08:00 · afc3c1dcd2
parent 9df66a96fe
commit afc3c1dcd2
9 changed files with 176 additions and 83 deletions
--- a/cyborg/accelerator/drivers/gpu/nvidia/sysinfo.py
+++ b/cyborg/accelerator/drivers/gpu/nvidia/sysinfo.py
@ -21,15 +21,15 @@ from oslo_log import log as logging
 from oslo_serialization import jsonutils

 import collections
+import json
 import os

-import cyborg.conf
-
 from cyborg.accelerator.common import utils
 from cyborg.accelerator.drivers.gpu import utils as gpu_utils
+from cyborg.agent import rpcapi as agent_rpcapi
 from cyborg.common import constants
-from cyborg.common import exception
 from cyborg.conf import CONF
+from cyborg import context
 from cyborg.objects.driver_objects import driver_attach_handle
 from cyborg.objects.driver_objects import driver_attribute
 from cyborg.objects.driver_objects import driver_controlpath_id
@ -145,19 +145,19 @@ def _generate_driver_device(gpu):
    driver_device_obj.model = gpu.get('model', 'miss model info')
    std_board_info = {'product_id': gpu.get('product_id'),
                      'controller': gpu.get('controller'), }
-    vendor_board_info = {'vendor_info': gpu.get('vendor_info',
-                         'gpu_vb_info')}
    driver_device_obj.std_board_info = jsonutils.dumps(std_board_info)
-    driver_device_obj.vendor_board_info = jsonutils.dumps(
-        vendor_board_info)
    driver_device_obj.type = constants.DEVICE_GPU
    driver_device_obj.stub = gpu.get('stub', False)
    driver_device_obj.controlpath_id = _generate_controlpath_id(gpu)
    driver_device_obj.deployable_list = _generate_dep_list(gpu)
+    vendor_board_info = {'device_address': gpu.get('devices'),
+                         'vgpu_type': gpu.get('vgpu_type')}
+    driver_device_obj.vendor_board_info = jsonutils.dumps(
+        vendor_board_info)
    return driver_device_obj


-def _get_supported_vgpu_types():
+def _get_supported_vgpu_types(hostname):
    """Gets supported vgpu_types from cyborg.conf.

    Retrieves supported vgpu_types set by the operator and generates a
@ -172,85 +172,52 @@ def _get_supported_vgpu_types():
    """
    pgpu_type_mapping = collections.defaultdict(str)
    pgpu_type_mapping.clear()
-    if not CONF.gpu_devices.enabled_vgpu_types:
-        return [], pgpu_type_mapping
-
-    for vgpu_type in CONF.gpu_devices.enabled_vgpu_types:
-        group = getattr(CONF, 'vgpu_%s' % vgpu_type, None)
-        if group is None or not group.device_addresses:
-            # Device addresses must be configured explictly now for every
-            # enabled vgpu type. Will improve after the disable and enable
-            # devices interfaces implemented.
-            raise exception.InvalidvGPUConfig(
-                reason="Missing device addresses config for vgpu type %s"
-                % vgpu_type
-            )
-        for device_address in group.device_addresses:
-            if device_address in pgpu_type_mapping:
-                raise exception.InvalidvGPUConfig(
-                    reason="Duplicate types for PCI address %s"
-                    % device_address
-                )
-            # Just checking whether the operator fat-fingered the address.
-            # If it's wrong, it will return an exception
-            try:
-                # Validates whether it's a PCI ID...
-                utils.parse_address(device_address)
-            except exception.PciDeviceWrongAddressFormat:
-                raise exception.InvalidvGPUConfig(
-                    reason="Incorrect PCI address: %s" % device_address
-                )
+    admin_context = context.get_admin_context()
+    devices = agent_rpcapi.AgentAPI().get_devices(admin_context, hostname)
+    for device in devices:
+        if device.type != 'GPU' or device.vendor_board_info == 'miss_vb_info':
+            continue
+        vbi = json.loads(device.vendor_board_info)
+        vgpu_type = vbi.get('vgpu_type')
+        device_address = vbi.get('device_address')
+        if vgpu_type and device_address:
            pgpu_type_mapping[device_address] = vgpu_type
-    return CONF.gpu_devices.enabled_vgpu_types, pgpu_type_mapping
-
-
-def _get_vgpu_type_per_pgpu(device_address, supported_vgpu_types,
-                            pgpu_type_mapping):
-    """Provides the vGPU type the pGPU supports.
-
-    :param device_address: the PCI device address in config,
-                           eg.'0000:af:00.0'
-    """
-    supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types()
-    # Bail out quickly if we don't support vGPUs
-    if not supported_vgpu_types:
-        LOG.warning('Unable to load vGPU_type from [gpu_devices] '
-                    'Ensure "enabled_vgpu_types" is set if the gpu'
-                    'is virtualized.')
-        return
-
-    try:
-        # Validates whether it's a PCI ID...
-        utils.parse_address(device_address)
-    except (exception.PciDeviceWrongAddressFormat, IndexError):
-        # this is not a valid PCI address
-        LOG.warning("The PCI address %s was invalid for getting the"
-                    "related vGPU type", device_address)
-        return
-    return pgpu_type_mapping.get(device_address)
+    return pgpu_type_mapping


 def _discover_gpus(vendor_id):
    """param: vendor_id=VENDOR_ID means only discover Nvidia GPU on the host
    """
-    # init vGPU conf
-    cyborg.conf.devices.register_dynamic_opts(CONF)
-    supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types()
+    hostname = CONF.host
+    pgpu_type_mapping = _get_supported_vgpu_types(hostname)
    # discover gpu devices by "lspci"
    gpu_list = []
    gpus = gpu_utils.get_pci_devices(gpu_utils.GPU_FLAGS, vendor_id)
+    LOG.info('gpus raw info: %s', gpus)
    # report trait,rc and generate driver object
    for gpu in gpus:
        m = gpu_utils.GPU_INFO_PATTERN.match(gpu)
        if m:
            gpu_dict = m.groupdict()
            # get hostname for deployable_name usage
-            gpu_dict['hostname'] = CONF.host
+            gpu_dict['hostname'] = hostname
            # get vgpu_type from cyborg.conf, otherwise vgpu_type=None
-            vgpu_type = _get_vgpu_type_per_pgpu(
-                gpu_dict["devices"], supported_vgpu_types, pgpu_type_mapping)
+            vgpu_type = pgpu_type_mapping.get(gpu_dict["devices"])
+            LOG.info('vgpu_type is %s', vgpu_type)
+            mdev_path = os.path.expandvars(
+                '/sys/bus/pci/devices/{0}/mdev_supported_types'.
+                    format(gpu_dict["devices"]))
+            valid_types = []
+            try:
+                valid_types = os.listdir(mdev_path)
+                LOG.info("The GPU %(gpu)s on host %(host)s is virtualized.",
+                         {"gpu": gpu_dict['devices'], "host": hostname})
+            except FileNotFoundError:
+                LOG.info("The GPU %(gpu)s on host %(host)s is unvirtualized.",
+                         {"gpu": gpu_dict['devices'], "host": hostname})
+
            # generate rc and trait for pGPU
-            if not vgpu_type:
+            if not valid_types:
                gpu_dict["rc"] = constants.RESOURCES["PGPU"]
                traits = _get_traits(gpu_dict["vendor_id"],
                                     gpu_dict["product_id"])
@ -258,13 +225,9 @@ def _discover_gpus(vendor_id):
            else:
                # get rc
                gpu_dict["rc"] = constants.RESOURCES["VGPU"]
-                mdev_path = os.path.expandvars(
-                    '/sys/bus/pci/devices/{0}/mdev_supported_types'.
-                    format(gpu_dict["devices"]))
-                valid_types = os.listdir(mdev_path)
-                if vgpu_type not in valid_types:
-                    raise exception.InvalidVGPUType(name=vgpu_type)
-                gpu_dict["vGPU_type"] = vgpu_type
+                # default set the first vgpu_type in sorted(valid_types)
+                vgpu_type = vgpu_type if vgpu_type else sorted(valid_types)[0]
+                gpu_dict["vgpu_type"] = vgpu_type
                vGPU_path = os.path.expandvars(
                    '/sys/bus/pci/devices/{0}/mdev_supported_types/{1}/'
                    .format(gpu_dict["devices"], gpu_dict["vGPU_type"]))
--- a/cyborg/agent/manager.py
+++ b/cyborg/agent/manager.py
@ -89,3 +89,9 @@ class AgentManager(periodic_task.PeriodicTasks):
    def remove_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid):
        LOG.debug('Remove a vgpu mdev')
        gpu_utils.remove_mdev_privileged(pci_addr, asked_type, ah_uuid)
+
+    def get_devices(self, context, hostname):
+        return self.cond_api.get_host_devices(context, hostname)
+
+    def update_mdev(self, context):
+        self._rt.update_usage(context)
--- a/cyborg/agent/rpcapi.py
+++ b/cyborg/agent/rpcapi.py
@ -83,3 +83,15 @@ class AgentAPI(object):
                          pci_addr=pci_addr,
                          asked_type=asked_type,
                          ah_uuid=ah_uuid)
+
+    def get_devices(self, context, hostname):
+        LOG.info('Get devices by host: (%s)', hostname)
+        version = '1.0'
+        cctxt = self.client.prepare(server=hostname, version=version)
+        return cctxt.call(context, 'get_devices', hostname=hostname)
+
+    def update_mdev(self, context, hostname):
+        LOG.info('Agent update mdev for hostname: (%s)', hostname)
+        version = '1.0'
+        cctxt = self.client.prepare(server=hostname, version=version)
+        return cctxt.call(context, 'update_mdev')
--- a/cyborg/api/controllers/v2/devices.py
+++ b/cyborg/api/controllers/v2/devices.py
@ -13,17 +13,24 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.

+from http import HTTPStatus
+import json
 import pecan
+import subprocess
 import wsme
 from wsme import types as wtypes

 from oslo_log import log

+from cyborg.accelerator.drivers.gpu import utils
+from cyborg.agent.rpcapi import AgentAPI
 from cyborg.api.controllers import base
 from cyborg.api.controllers import link
 from cyborg.api.controllers import types
 from cyborg.api import expose
 from cyborg.common import authorize_wsgi
+from cyborg.common import exception
+from cyborg.common import policy
 from cyborg import objects

 LOG = log.getLogger(__name__)
@ -92,6 +99,11 @@ class DeviceCollection(base.APIBase):

 class DevicesController(base.CyborgController):
    """REST controller for Devices."""
+    _custom_actions = {'update_type': ['PATCH'], 'get_vgpu_type': ['GET']}
+
+    def __init__(self, *args, **kwargs):
+        super(DevicesController, self).__init__(*args, **kwargs)
+        self.agent = AgentAPI()

    @authorize_wsgi.authorize_wsgi("cyborg:device", "get_one")
    @expose.expose(Device, wtypes.text)
@ -128,3 +140,69 @@ class DevicesController(base.CyborgController):
        obj_devices = objects.Device.list(context, filters=filters_dict)
        LOG.info('[devices:get_all] Returned: %s', obj_devices)
        return DeviceCollection.convert_with_links(obj_devices)
+
+    @authorize_wsgi.authorize_wsgi("cyborg:device", "get_vgpu_type", False)
+    @expose.expose('json', wtypes.text, body=types.jsontype, status_code=200)
+    def get_vgpu_type(self, uuid):
+        """List the vgpu_types supported by a GPU device.
+        :param uuid: UUID of a device.
+        """
+        context = pecan.request.context
+        device = objects.Device.get(context, uuid)
+        if device.type != 'GPU':
+            raise exception.CyborgException("Only GPU device has vgpu_type.")
+        hostname = device.hostname
+        device_address = json.loads(device.vendor_board_info).get("device_address")
+        command = 'nsenter -m -t1 ssh {1} ls /sys/bus/pci/devices/{0}/mdev_supported_types/'.format(
+            device_address, hostname)
+        p = subprocess.Popen(
+            command,
+            shell=True, stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        out = p.stdout.readlines()
+        vgpu_types = []
+        for vgpu_type in out:
+            vgpu_types.append(vgpu_type.strip())
+        ret = {'vgpu_types': vgpu_types}
+        return wsme.api.Response(ret, status_code=HTTPStatus.OK,
+                                 return_type=wsme.types.DictType)
+
+    @authorize_wsgi.authorize_wsgi("cyborg:device", "update_type", False)
+    @expose.expose('json', wtypes.text, body=types.jsontype, status_code=200)
+    def update_type(self, uuid, req_dev):
+        """Update the vgpu_type of a GPU device.
+        :param uuid: UUID of a device.
+        :param req_dev: request body with the new type, e.g. {"vgpu_type": "nvidia-182"}
+        """
+        LOG.info("[device update_type] PUT request = (%s)", req_dev)
+        vgpu_type = req_dev.get('vgpu_type')
+        if not vgpu_type:
+            raise exception.VGPUTypeIsNeed()
+        context = pecan.request.context
+        device = objects.Device.get(context, uuid)
+        if device.type != 'GPU':
+            raise exception.CyborgException("Only GPU device can update vgpu_type")
+        deployables = objects.Deployable.get_list_by_device_id(context, device_id=device.id)
+        for deployable in deployables:
+            attach_handlers = objects.AttachHandle.get_ah_list_by_deployable_id(context, deployable.id)
+            for attach_handler in attach_handlers:
+                if attach_handler.in_use:
+                    raise exception.DeviceInUse(device=uuid)
+        LOG.info("[device.vendor_board_info = (%s)", device.vendor_board_info)
+        vbi = json.loads(device.vendor_board_info)
+        device_address = vbi.get("device_address")
+        hostname = device.hostname
+        command = 'nsenter -m -t1 ssh {2} ls /sys/bus/pci/devices/{0}/mdev_supported_types/{1}'.format(
+            device_address, vgpu_type, hostname)
+        p = subprocess.Popen(
+            command,
+            shell=True, stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        out, err = p.communicate()
+        if not out:
+            raise exception.VGPUTypeNotExist(gpu=device_address, vgpu_type=vgpu_type)
+        vbi.update({'vgpu_type': vgpu_type})
+        device.vendor_board_info = json.dumps(vbi)
+        device.save(context)
+        self.agent.update_mdev(context, hostname)
+        return Device.convert_with_links(device)
--- a/cyborg/common/exception.py
+++ b/cyborg/common/exception.py
@ -425,3 +425,16 @@ class FPGAProgramError(CyborgException):

 class PciDeviceNotFoundById(NotFound):
    _msg_fmt = _("PCI device %(id)s not found")
+
+
+class DeviceInUse(Conflict):
+    _msg_fmt = _("Device %(device)s is in use.")
+
+
+class VGPUTypeIsNeed(Invalid):
+    _msg_fmt = _("The vgpu type is need.")
+
+
+class VGPUTypeNotExist(Invalid):
+    _msg_fmt = _("The vgpu type %(vgpu_type)s is not available for "
+                 "GPU %(gpu)s.")
--- a/cyborg/common/placement_client.py
+++ b/cyborg/common/placement_client.py
@ -85,16 +85,19 @@ class PlacementClient(object):
    def _ensure_traits(self, trait_names):
        # TODO(Xinran): maintain a reference count of how many RPs use
        # this trait and do the deletion only when the last RP is deleted.
-        for trait in trait_names:
-            resp = self.put("/traits/%s" % trait, None, version='1.6')
+        for trait_name in trait_names:
+            trait = self.get("/traits/%s" % trait_name, version='1.6')
+            if trait:
+                LOG.info("Trait %(trait)s already existed",
+                         {"trait": trait_name})
+                continue
+            resp = self.put("/traits/%s" % trait_name, None, version='1.6')
            if resp.status_code == 201:
-                LOG.info("Created trait %(trait)s", {"trait": trait})
-            elif resp.status_code == 204:
-                LOG.info("Trait %(trait)s already existed", {"trait": trait})
+                LOG.info("Created trait %(trait)s", {"trait": trait_name})
            else:
                raise Exception(
                    "Failed to create trait %s: HTTP %d: %s" %
-                    (trait, resp.status_code, resp.text))
+                    (trait_name, resp.status_code, resp.text))

    def _put_rp_traits(self, rp_uuid, traits_json):
        generation = self.get_resource_provider(
--- a/cyborg/common/policy.py
+++ b/cyborg/common/policy.py
@ -48,6 +48,12 @@ device_policies = [
    policy.RuleDefault('cyborg:device:get_all',
                       'rule:allow',
                       description='Retrieve all device records'),
+    policy.RuleDefault('cyborg:device:update_type',
+                       'rule:allow',
+                       description='Update vgpu_type of GPU device'),
+    policy.RuleDefault('cyborg:device:get_vgpu_type',
+                       'rule:allow',
+                       description='Get vgpu_type of GPU device'),
 ]

 deployable_policies = [
--- a/cyborg/conductor/manager.py
+++ b/cyborg/conductor/manager.py
@ -104,6 +104,9 @@ class ConductorManager(object):
        """
        ExtARQ.apply_patch(context, patch_list, valid_fields)

+    def get_host_devices(self, context, hostname):
+        return DriverDevice.list(context, hostname)
+
    def report_data(self, context, hostname, driver_device_list):
        """Update the Cyborg DB in one hostname according to the
        discovered device list.
--- a/cyborg/conductor/rpcapi.py
+++ b/cyborg/conductor/rpcapi.py
@ -118,3 +118,12 @@ class ConductorAPI(object):
        cctxt = self.client.prepare(topic=self.topic)
        return cctxt.call(context, 'arq_apply_patch', patch_list=patch_list,
                          valid_fields=valid_fields)
+
+    def get_host_devices(self, context, hostname):
+        """Signal to conductor service to get host devices.
+
+        :param context: request context.
+        :param hostname: host name
+        """
+        cctxt = self.client.prepare(topic=self.topic)
+        return cctxt.call(context, 'get_host_devices', hostname=hostname)