cyborg/cyborg/agent/resource_tracker.py

77 lines
2.6 KiB
Python

# Copyright (c) 2018 Intel.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
Track resources like FPGA GPU and QAT for a host. Provides the
conductor with useful information about availability through the accelerator
model.
"""
from oslo_log import log as logging
from stevedore import driver
from stevedore.extension import ExtensionManager
from cyborg.common import exception
from cyborg.common import utils
from cyborg.conf import CONF
LOG = logging.getLogger(__name__)
AGENT_RESOURCE_SEMAPHORE = "agent_resources"
class ResourceTracker(object):
"""Agent helper class for keeping track of resource usage as instances
are built and destroyed.
"""
def __init__(self, host, cond_api):
self.host = host
self.conductor_api = cond_api
self.acc_drivers = []
self._initialize_drivers()
def _initialize_drivers(self, enabled_drivers=[]):
"""
Load accelerator drivers.
:return: [nvidia_gpu_driver_obj, intel_fpga_driver_obj]
"""
acc_drivers = []
if not enabled_drivers:
enabled_drivers = CONF.agent.enabled_drivers
valid_drivers = ExtensionManager(
namespace='cyborg.accelerator.driver').names()
for d in enabled_drivers:
if d not in valid_drivers:
raise exception.InvalidDriver(name=d)
acc_driver = driver.DriverManager(
namespace='cyborg.accelerator.driver', name=d,
invoke_on_load=True).driver
acc_drivers.append(acc_driver)
self.acc_drivers = acc_drivers
@utils.synchronized(AGENT_RESOURCE_SEMAPHORE)
def update_usage(self, context):
"""Update the resource usage periodically.
"""
acc_list = []
for acc_driver in self.acc_drivers:
acc_list.extend(acc_driver.discover())
# Call conductor_api here to diff and report acc data. Now, we actually
# do not have the method report_data.
if acc_list:
self.conductor_api.report_data(context, self.host, acc_list)