ARQ bind and unbind support for vGPU

This patch is part of the vGPU support feature in Cyborg.
It implements ARQ bind and unbind for vGPU resources.

Co-Authored-By: Wenping Song <songwenping@inspur.com>

Change-Id: I32c3b81345c6ce83834a83c64b88e37926724f16
This commit is contained in:
Yumeng Bao 2021-01-21 15:37:42 +08:00 committed by songwenping
parent 79e1928554
commit 4b34d897d2
9 changed files with 93 additions and 5 deletions

View File

@ -19,6 +19,7 @@ from oslo_log import log as logging
import re
import cyborg.common.exception as exception
import cyborg.conf
import cyborg.privsep
@ -41,6 +42,27 @@ def lspci_privileged():
return processutils.execute(*cmd)
@cyborg.privsep.sys_admin_pctxt.entrypoint
def create_mdev_privileged(pci_addr, mdev_type, ah_uuid):
    """Create a mediated device (mdev) by writing its UUID to sysfs.

    The UUID is written to the ``create`` node of ``mdev_type`` under the
    ``/sys/class/mdev_bus`` entry named ``pci_addr``.

    :param pci_addr: name of the parent device entry under
        ``/sys/class/mdev_bus``.
    :param mdev_type: name of the entry under ``mdev_supported_types``.
    :param ah_uuid: UUID string written to the ``create`` node.
    :returns: ``ah_uuid``, unchanged.
    :raises AttachHandleUUIDNeeded: if ``ah_uuid`` is ``None``.
    """
    # Reject a missing UUID before touching sysfs.
    if ah_uuid is None:
        raise exception.AttachHandleUUIDNeeded()

    create_node = (
        '/sys/class/mdev_bus/{0}/mdev_supported_types/{1}/create'
    ).format(pci_addr, mdev_type)
    with open(create_node, 'w') as sysfs_node:
        sysfs_node.write(ah_uuid)
    return ah_uuid
@cyborg.privsep.sys_admin_pctxt.entrypoint
def remove_mdev_privileged(physical_device, mdev_type, medv_uuid):
    """Remove a mediated device (mdev) by writing "1" to its sysfs node.

    The target is the ``remove`` node of device ``medv_uuid`` listed under
    ``mdev_type`` for the ``/sys/class/mdev_bus`` entry ``physical_device``.

    :param physical_device: name of the parent device entry under
        ``/sys/class/mdev_bus``.
    :param mdev_type: name of the entry under ``mdev_supported_types``.
    :param medv_uuid: name of the mdev entry under ``devices`` to remove.
    """
    remove_node = (
        '/sys/class/mdev_bus/{0}/mdev_supported_types/{1}/devices/{2}/remove'
    ).format(physical_device, mdev_type, medv_uuid)
    with open(remove_node, 'w') as sysfs_node:
        sysfs_node.write("1")
def get_pci_devices(pci_flags, vendor_id=None):
device_for_vendor_out = []
all_device_out = []

View File

@ -21,6 +21,7 @@ from oslo_service import periodic_task
from oslo_utils import uuidutils
from cyborg.accelerator.drivers.fpga.base import FPGADriver
from cyborg.accelerator.drivers.gpu import utils as gpu_utils
from cyborg.agent.resource_tracker import ResourceTracker
from cyborg.agent.rpcapi import AgentAPI
from cyborg.common import exception
@ -80,3 +81,11 @@ class AgentManager(periodic_task.PeriodicTasks):
def update_available_resource(self, context, startup=True):
"""Update all kinds of accelerator resources from their drivers."""
self._rt.update_usage(context)
def create_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid):
    """Create a vGPU mediated device through the privileged GPU helper.

    :param context: request context; not used by this method.
    :param pci_addr: forwarded to ``gpu_utils.create_mdev_privileged``.
    :param asked_type: forwarded as the mdev type.
    :param ah_uuid: forwarded as the UUID of the mdev to create.
    :returns: ``None``; the helper's return value is discarded.
    """
    LOG.debug('Instantiate a mediated device')
    gpu_utils.create_mdev_privileged(pci_addr, asked_type, ah_uuid)
def remove_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid):
    """Remove a vGPU mediated device through the privileged GPU helper.

    :param context: request context; not used by this method.
    :param pci_addr: forwarded to ``gpu_utils.remove_mdev_privileged``.
    :param asked_type: forwarded as the mdev type.
    :param ah_uuid: forwarded as the UUID of the mdev to remove.
    :returns: ``None``.
    """
    LOG.debug('Remove a vgpu mdev')
    gpu_utils.remove_mdev_privileged(pci_addr, asked_type, ah_uuid)

View File

@ -61,3 +61,25 @@ class AgentAPI(object):
controlpath_id=controlpath_id,
bitstream_uuid=bitstream_uuid,
driver_name=driver_name)
def create_vgpu_mdev(self, context, hostname, pci_addr,
                     asked_type, ah_uuid):
    """Ask the agent on ``hostname`` to create a vGPU mediated device.

    Issues a blocking ``create_vgpu_mdev`` RPC call (version 1.0) to the
    given server.

    :param context: request context passed through to the RPC call.
    :param hostname: server the RPC client is prepared for.
    :param pci_addr: sent to the agent as ``pci_addr``.
    :param asked_type: sent to the agent as ``asked_type``.
    :param ah_uuid: sent to the agent as ``ah_uuid``.
    :returns: whatever the RPC call returns.
    """
    # Adjacent literals are concatenated verbatim, so the first fragment
    # must end with its own separator to keep the fields apart.
    LOG.debug('Agent create_vgpu_mdev: hostname: (%s), pci_address: (%s), '
              'gpu_id: (%s)', hostname, pci_addr, ah_uuid)
    version = '1.0'
    cctxt = self.client.prepare(server=hostname, version=version)
    return cctxt.call(context, 'create_vgpu_mdev',
                      pci_addr=pci_addr,
                      asked_type=asked_type,
                      ah_uuid=ah_uuid)
def remove_vgpu_mdev(self, context, hostname, pci_addr,
                     asked_type, ah_uuid):
    """Ask the agent on ``hostname`` to remove a vGPU mediated device.

    Issues a blocking ``remove_vgpu_mdev`` RPC call (version 1.0) to the
    given server.

    :param context: request context passed through to the RPC call.
    :param hostname: server the RPC client is prepared for.
    :param pci_addr: sent to the agent as ``pci_addr``.
    :param asked_type: sent to the agent as ``asked_type``.
    :param ah_uuid: sent to the agent as ``ah_uuid``.
    :returns: whatever the RPC call returns.
    """
    LOG.debug('Agent remove_vgpu_mdev: hostname: (%s) gpu_id: (%s)',
              hostname, ah_uuid)
    rpc_kwargs = {
        'pci_addr': pci_addr,
        'asked_type': asked_type,
        'ah_uuid': ah_uuid,
    }
    target = self.client.prepare(server=hostname, version='1.0')
    return target.call(context, 'remove_vgpu_mdev', **rpc_kwargs)

View File

@ -60,6 +60,7 @@ class ARQ(base.APIBase):
"""The UUID of the instance project_id associated with this ARQ, if any"""
attach_handle_type = wtypes.text
attach_handle_uuid = wtypes.text
attach_handle_info = {wtypes.text: wtypes.text}
links = wsme.wsattr([link.Link], readonly=True)

View File

@ -92,6 +92,10 @@ class AttachHandleAlreadyExists(CyborgException):
_msg_fmt = _("AttachHandle with uuid %(uuid)s already exists.")
class AttachHandleUUIDNeeded(CyborgException):
    """Error for an operation that needs an attach handle UUID but got none."""

    _msg_fmt = _("Need to provide AttachHandle uuid.")
class ControlpathIDAlreadyExists(CyborgException):
_msg_fmt = _("ControlpathID with uuid %(uuid)s already exists.")

View File

@ -370,7 +370,7 @@ class ConductorManager(object):
"resource_providers?name=" + hostname).json()
pr_uuid = provider["resource_providers"][0]["uuid"]
return pr_uuid
except IndexError:
except (IndexError, KeyError):
raise exception.PlacementResourceProviderNotFound(
resource_provider=hostname)

View File

@ -48,6 +48,7 @@ class ARQ(base.CyborgObject, object_base.VersionedObjectDictCompat):
# Fields populated by Cyborg after binding
'attach_handle_type': object_fields.StringField(nullable=True),
'attach_handle_uuid': object_fields.StringField(nullable=True),
'attach_handle_info': object_fields.DictOfStringsField(nullable=True),
}

View File

@ -13,11 +13,14 @@
# License for the specific language governing permissions and limitations
# under the License.
import json
from openstack import connection
from oslo_log import log as logging
from oslo_utils import versionutils
from oslo_versionedobjects import base as object_base
from cyborg.agent.rpcapi import AgentAPI
from cyborg.common import constants
from cyborg.common.constants import ARQ_STATES_TRANSFORM_MATRIX
from cyborg.common import exception
@ -78,6 +81,10 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
if target_version < (1, 2) and 'deployable_id' in primitive:
del primitive['deployable_id']
def __init__(self, *args, **kwargs):
    """Initialize the object and give it an agent RPC API handle.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer.
    """
    super(ExtARQ, self).__init__(*args, **kwargs)
    # RPC API handle through which this object calls the agent.
    self.agent = AgentAPI()
def create(self, context, device_profile_id=None):
"""Create an ExtARQ record in the DB."""
if 'device_profile_name' not in self.arq and not device_profile_id:
@ -213,6 +220,16 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
try:
ah = AttachHandle.allocate(context, deployable.id)
self.attach_handle_id = ah.id
# if attach_handle is a vgpu, create the mdev in the sys path
if ah.attach_type == 'MDEV':
attach_info = json.loads(ah.attach_info)
pci_addr = "{}:{}:{}.{}".format(
attach_info['domain'], attach_info['bus'],
attach_info['device'], attach_info['function'])
hostname = self.arq.hostname
asked_type = attach_info['asked_type']
self.agent.create_vgpu_mdev(
context, hostname, pci_addr, asked_type, ah.uuid)
except Exception as e:
LOG.error("Failed to allocate attach handle for ARQ %s"
"from deployable %s. Reason: %s",
@ -237,9 +254,17 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
# if (self.arq.state == constants.ARQ_DELETING
# or self.arq.state == ARQ_UNBOUND):
def _deallocate_attach_handle(self, context, ah_id):
def _deallocate_attach_handle(self, context, ah_id, hostname):
try:
attach_handle = AttachHandle.get_by_id(context, ah_id)
if attach_handle.attach_type == 'MDEV':
attach_info = json.loads(attach_handle.attach_info)
pci_addr = "{}:{}:{}.{}".format(
attach_info['domain'], attach_info['bus'],
attach_info['device'], attach_info['function'])
self.agent.remove_vgpu_mdev(
context, hostname, pci_addr,
attach_info['asked_type'], attach_handle.uuid)
attach_handle.deallocate(context)
except Exception as e:
LOG.error("Failed to deallocate attach handle %s for ARQ %s."
@ -252,6 +277,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
def unbind(self, context):
arq = self.arq
hostname = arq.hostname
arq.hostname = None
arq.device_rp_uuid = None
arq.instance_uuid = None
@ -260,7 +286,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
# Unbind: mark attach handles as freed
ah_id = self.attach_handle_id
if ah_id:
self._deallocate_attach_handle(context, ah_id)
self._deallocate_attach_handle(context, ah_id, hostname)
self.attach_handle_id = None
self.deployable_id = None
self.save(context)
@ -285,6 +311,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
if db_ah is not None:
db_extarq['attach_handle_type'] = db_ah['attach_type']
db_extarq['attach_handle_info'] = db_ah['attach_info']
db_extarq['attach_handle_uuid'] = db_ah['uuid']
else:
raise exception.ResourceNotFound(
resource='Attach Handle',

View File

@ -352,7 +352,8 @@ class TestExtARQObject(base.DbTestCase):
self, mock_deallocate, mock_ah, mock_check_state):
obj_extarq = self.fake_obj_extarqs[0]
mock_ah.return_value = self.fake_obj_ahs[0]
obj_extarq._deallocate_attach_handle(self.context, mock_ah.id)
obj_extarq._deallocate_attach_handle(
self.context, mock_ah.id, obj_extarq.arq.hostname)
mock_check_state.assert_not_called()
@mock.patch('logging.LoggerAdapter.error')
@ -370,7 +371,8 @@ class TestExtARQObject(base.DbTestCase):
mock_deallocate.side_effect = e
self.assertRaises(
exception.ResourceNotFound,
obj_extarq._deallocate_attach_handle, self.context, mock_ah.id)
obj_extarq._deallocate_attach_handle, self.context, mock_ah.id,
obj_extarq.arq.hostname)
mock_log.assert_called_once_with(
msg, mock_ah.id, obj_extarq.arq.uuid, str(e))
mock_check_state.assert_called_once_with(