Provide a proper way to choose VF in CNI
This commit fixes the incorrect way of choosing a VF in the SR-IOV binding driver. Previously the sriov-device-plugin chose a device in its own way while the CNI chose the first available VF. These entities did not know anything about each other's choice. Now the SR-IOV binding driver gets the list of devices used by kubelet with the help of the Pod Resources Client, then chooses a device from this list that is not yet used by the pod and passes the appropriate VF into the container's network namespace. This commit also contains tools for cluster upgrade. Change-Id: I5b24981f715966369b05b8ab157f8bfe02afc2d4 Closes-Bug: 1826865 Signed-off-by: Danil Golov <d.golov@samsung.com>
This commit is contained in:
parent
fc20b3d7ed
commit
5206717f08
|
@ -131,6 +131,33 @@ We have to add to the sriov section following mapping:
|
|||
device_plugin_resource_prefix = samsung.com
|
||||
physnet_resource_mappings = physnet1:numa0
|
||||
|
||||
5. Enable Kubelet Pod Resources feature
|
||||
|
||||
To use SR-IOV functionality properly it is necessary to enable Kubelet Pod
|
||||
Resources feature. Pod Resources is a service provided by Kubelet via gRPC
|
||||
server that allows requesting the list of resources allocated for each pod and
|
||||
container on the node. These resources are devices allocated by k8s device
|
||||
plugins. The service was implemented mainly for monitoring purposes, but it is also
|
||||
suitable for the SR-IOV binding driver, allowing it to know which VF was allocated
|
||||
for particular container.
|
||||
|
||||
To enable the Pod Resources service it is necessary to add
|
||||
``--feature-gates KubeletPodResources=true`` into ``/etc/sysconfig/kubelet``.
|
||||
This file could look like::
|
||||
|
||||
KUBELET_EXTRA_ARGS="--feature-gates KubeletPodResources=true"
|
||||
|
||||
Note that it is important to set the right value for the parameter ``kubelet_root_dir``
|
||||
in ``kuryr.conf``. By default it is ``/var/lib/kubelet``.
|
||||
In case of using containerized CNI it is necessary to mount
|
||||
``'kubelet_root_dir'/pod-resources`` directory into CNI container.
|
||||
|
||||
To use this feature, add ``enable_pod_resource_service`` to kuryr.conf.
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[sriov]
|
||||
enable_pod_resource_service = True
|
||||
|
||||
6. Use privileged user
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ CLI interface for kuryr status commands.
|
|||
|
||||
from __future__ import print_function
|
||||
|
||||
import copy
|
||||
import sys
|
||||
import textwrap
|
||||
import traceback
|
||||
|
@ -108,6 +109,8 @@ class UpgradeCommands(object):
|
|||
|
||||
if obj.obj_name() != objects.vif.PodState.obj_name():
|
||||
old_count += 1
|
||||
elif not self._has_valid_sriov_annot(obj):
|
||||
old_count += 1
|
||||
|
||||
if malformed_count == 0 and old_count == 0:
|
||||
return UpgradeCheckResult(0, 'All annotations are updated.')
|
||||
|
@ -193,16 +196,43 @@ class UpgradeCommands(object):
|
|||
t.add_row(cell)
|
||||
print(t)
|
||||
|
||||
def _has_valid_sriov_annot(self, state):
|
||||
for obj in state.vifs.values():
|
||||
if obj.obj_name() != objects.vif.VIFSriov.obj_name():
|
||||
continue
|
||||
if hasattr(obj, 'pod_name') and hasattr(obj, 'pod_link'):
|
||||
continue
|
||||
return False
|
||||
return True
|
||||
|
||||
def _convert_sriov(self, state):
|
||||
new_state = copy.deepcopy(state)
|
||||
for iface, obj in new_state.additional_vifs.items():
|
||||
if obj.obj_name() != objects.vif.VIFSriov.obj_name():
|
||||
continue
|
||||
if hasattr(obj, 'pod_name') and hasattr(obj, 'pod_link'):
|
||||
continue
|
||||
new_obj = objects.vif.VIFSriov()
|
||||
new_obj.__dict__ = obj.__dict__.copy()
|
||||
new_state.additional_vifs[iface] = new_obj
|
||||
return new_state
|
||||
|
||||
def update_annotations(self):
|
||||
def test_fn(obj):
|
||||
return obj.obj_name() != objects.vif.PodState.obj_name()
|
||||
return (obj.obj_name() != objects.vif.PodState.obj_name() or
|
||||
not self._has_valid_sriov_annot(obj))
|
||||
|
||||
def update_fn(obj):
|
||||
return vif.PodState(default_vif=obj)
|
||||
if obj.obj_name() != objects.vif.PodState.obj_name():
|
||||
return vif.PodState(default_vif=obj)
|
||||
return self._convert_sriov(obj)
|
||||
|
||||
self._convert_annotations(test_fn, update_fn)
|
||||
|
||||
def downgrade_annotations(self):
|
||||
# NOTE(danil): There is no need to downgrade sriov vifs
|
||||
# when annotations has old format. After downgrade annotations
|
||||
# will have only one default vif and it could not be sriov vif
|
||||
def test_fn(obj):
|
||||
return obj.obj_name() == objects.vif.PodState.obj_name()
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ from oslo_concurrency import lockutils
|
|||
from oslo_concurrency import processutils
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log as logging
|
||||
from oslo_serialization import jsonutils
|
||||
|
||||
from kuryr_kubernetes import clients
|
||||
from kuryr_kubernetes.cni.binding import base as b_base
|
||||
|
@ -48,13 +49,8 @@ class VIFSriovDriver(object):
|
|||
|
||||
@release_lock_object
|
||||
def connect(self, vif, ifname, netns, container_id):
|
||||
physnet = vif.physnet
|
||||
pf_names = self._get_host_pf_names(physnet)
|
||||
vf_name, vf_index, pf, pci_info = self._get_available_vf_info(pf_names)
|
||||
|
||||
if not vf_name:
|
||||
raise exceptions.CNIError(
|
||||
"No free interfaces for physnet {} available".format(physnet))
|
||||
pci = self._choose_pci(vif, ifname, netns)
|
||||
vf_name, vf_index, pf, pci_info = self._get_vf_info(pci)
|
||||
|
||||
LOG.debug("Connect {} as {} (port_id={}) in container_id={}".format(
|
||||
vf_name, ifname, vif.id, container_id))
|
||||
|
@ -74,6 +70,9 @@ class VIFSriovDriver(object):
|
|||
iface.mtu = vif.network.mtu
|
||||
iface.up()
|
||||
|
||||
pod_link = vif.pod_link
|
||||
self._annotate_device(pod_link, pci)
|
||||
|
||||
self._save_pci_info(vif.id, pci_info)
|
||||
|
||||
def disconnect(self, vif, ifname, netns, container_id):
|
||||
|
@ -82,48 +81,101 @@ class VIFSriovDriver(object):
|
|||
# it to all-zero state
|
||||
self._remove_pci_info(vif.id)
|
||||
|
||||
def _get_host_pf_names(self, physnet):
|
||||
"""Return a list of PFs, that belong to a physnet"""
|
||||
def _choose_pci(self, vif, ifname, netns):
|
||||
pr_client = clients.get_pod_resources_client()
|
||||
pod_resources_list = pr_client.list()
|
||||
resources = pod_resources_list.pod_resources
|
||||
pod_name = vif.pod_name
|
||||
pod_link = vif.pod_link
|
||||
physnet = vif.physnet
|
||||
resource_name = self._get_resource_by_physnet(physnet)
|
||||
resource = self._make_resource(resource_name)
|
||||
LOG.debug("Vif %s will correspond to pci device belonging to "
|
||||
"resource %s", vif, resource)
|
||||
pod_devices = self._get_pod_devices(pod_link)
|
||||
pod_resource = None
|
||||
container_devices = None
|
||||
for res in resources:
|
||||
if res.name == pod_name:
|
||||
pod_resource = res
|
||||
break
|
||||
if not pod_resource:
|
||||
raise exceptions.CNIError(
|
||||
"No resources are discovered for pod {}".format(pod_name))
|
||||
LOG.debug("Looking for PCI device used by kubelet service and not "
|
||||
"used by pod %s yet ...", pod_name)
|
||||
for container in pod_resource.containers:
|
||||
try:
|
||||
container_devices = container.devices
|
||||
except Exception:
|
||||
LOG.warning("No devices in container %s",
|
||||
container.name)
|
||||
continue
|
||||
|
||||
if physnet not in self._device_pf_mapping:
|
||||
raise cfg.Error(
|
||||
"No mapping for physnet {} in {}".format(
|
||||
physnet, self._device_pf_mapping))
|
||||
return self._device_pf_mapping[physnet]
|
||||
|
||||
def _get_available_vf_info(self, pf_names):
|
||||
"""Scan /sys for unacquired VF among PFs in pf_names"""
|
||||
|
||||
for pf in pf_names:
|
||||
pf_sys_path = '/sys/class/net/{}/device'.format(pf)
|
||||
nvfs = self._get_total_vfs(pf)
|
||||
for vf_index in range(nvfs):
|
||||
vf_sys_path = os.path.join(pf_sys_path,
|
||||
'virtfn{}'.format(vf_index),
|
||||
'net')
|
||||
# TODO(kzaitsev): use /var/run/kuryr/smth
|
||||
lock_path = os.path.join("/tmp",
|
||||
"{}.{}".format(pf, vf_index))
|
||||
self._acquire(lock_path)
|
||||
LOG.debug("Aquired %s lock", lock_path)
|
||||
try:
|
||||
vf_names = os.listdir(vf_sys_path)
|
||||
except OSError:
|
||||
LOG.debug("Could not open %s. "
|
||||
"Skipping vf %s for pf %s", vf_sys_path,
|
||||
vf_index, pf)
|
||||
self._release()
|
||||
for dev in container_devices:
|
||||
if dev.resource_name != resource:
|
||||
continue
|
||||
if not vf_names:
|
||||
LOG.debug("No interfaces in %s. "
|
||||
"Skipping vf %s for pf %s", vf_sys_path,
|
||||
vf_index, pf)
|
||||
self._release()
|
||||
continue
|
||||
vf_name = vf_names[0]
|
||||
pci_info = self._get_pci_info(pf, vf_index)
|
||||
LOG.debug("Aquiring vf %s of pf %s", vf_index, pf)
|
||||
return vf_name, vf_index, pf, pci_info
|
||||
|
||||
for pci in dev.device_ids:
|
||||
if pci in pod_devices:
|
||||
continue
|
||||
LOG.debug("Appropriate PCI device %s is found", pci)
|
||||
return pci
|
||||
|
||||
def _get_resource_by_physnet(self, physnet):
|
||||
mapping = config.CONF.sriov.physnet_resource_mappings
|
||||
try:
|
||||
resource_name = mapping[physnet]
|
||||
except KeyError:
|
||||
LOG.exception("No resource name for physnet %s", physnet)
|
||||
raise
|
||||
return resource_name
|
||||
|
||||
def _make_resource(self, res_name):
|
||||
res_prefix = config.CONF.sriov.device_plugin_resource_prefix
|
||||
return res_prefix + '/' + res_name
|
||||
|
||||
def _get_pod_devices(self, pod_link):
|
||||
k8s = clients.get_kubernetes_client()
|
||||
pod = k8s.get(pod_link)
|
||||
annotations = pod['metadata']['annotations']
|
||||
try:
|
||||
json_devices = annotations[constants.K8S_ANNOTATION_PCI_DEVICES]
|
||||
devices = jsonutils.loads(json_devices)
|
||||
except KeyError:
|
||||
devices = []
|
||||
except Exception as ex:
|
||||
LOG.exception("Exception while getting annotations: %s", ex)
|
||||
return devices
|
||||
|
||||
def _annotate_device(self, pod_link, pci):
|
||||
k8s = clients.get_kubernetes_client()
|
||||
pod_devices = self._get_pod_devices(pod_link)
|
||||
pod_devices.append(pci)
|
||||
pod_devices = jsonutils.dumps(pod_devices)
|
||||
|
||||
LOG.debug("Trying to annotate pod %s with pci %s", pod_link, pci)
|
||||
k8s.annotate(pod_link,
|
||||
{constants.K8S_ANNOTATION_PCI_DEVICES: pod_devices})
|
||||
|
||||
def _get_vf_info(self, pci):
|
||||
vf_sys_path = '/sys/bus/pci/devices/{}/net/'.format(pci)
|
||||
vf_names = os.listdir(vf_sys_path)
|
||||
vf_name = vf_names[0]
|
||||
|
||||
pfysfn_path = '/sys/bus/pci/devices/{}/physfn/net/'.format(pci)
|
||||
pf_names = os.listdir(pfysfn_path)
|
||||
pf_name = pf_names[0]
|
||||
|
||||
nvfs = self._get_total_vfs(pf_name)
|
||||
pf_sys_path = '/sys/class/net/{}/device'.format(pf_name)
|
||||
for vf_index in range(nvfs):
|
||||
virtfn_path = os.path.join(pf_sys_path,
|
||||
'virtfn{}'.format(vf_index))
|
||||
vf_pci = os.path.basename(os.readlink(virtfn_path))
|
||||
if vf_pci == pci:
|
||||
pci_info = self._get_pci_info(pf_name, vf_index)
|
||||
return vf_name, vf_index, pf_name, pci_info
|
||||
return None, None, None, None
|
||||
|
||||
def _get_pci_info(self, pf, vf_index):
|
||||
|
|
|
@ -279,6 +279,8 @@ class CNIDaemonServiceManager(cotyledon.ServiceManager):
|
|||
|
||||
os_vif.initialize()
|
||||
clients.setup_kubernetes_client()
|
||||
if CONF.sriov.enable_pod_resource_service:
|
||||
clients.setup_pod_resources_client()
|
||||
|
||||
self.manager = multiprocessing.Manager()
|
||||
registry = self.manager.dict() # For Watcher->Server communication.
|
||||
|
|
|
@ -258,6 +258,9 @@ sriov_opts = [
|
|||
cfg.StrOpt('kubelet_root_dir',
|
||||
help=_("The root directory of the Kubelet daemon"),
|
||||
default='/var/lib/kubelet'),
|
||||
cfg.BoolOpt('enable_pod_resource_service',
|
||||
help=_("Enable PodResources service"),
|
||||
default=False),
|
||||
cfg.DictOpt('default_physnet_subnets',
|
||||
help=_("A mapping of default subnets for certain physnets "
|
||||
"in a form of physnet-name:<SUBNET-ID>"),
|
||||
|
|
|
@ -52,6 +52,7 @@ K8S_ANNOTATION_NPWG_CRD_SUBNET_ID = 'subnetId'
|
|||
K8S_ANNOTATION_NPWG_CRD_DRIVER_TYPE = 'driverType'
|
||||
|
||||
K8S_ANNOTATION_NODE_PCI_DEVICE_INFO = 'openstack.org/kuryr-pci-info'
|
||||
K8S_ANNOTATION_PCI_DEVICES = K8S_ANNOTATION_PREFIX + '-pci-devices'
|
||||
|
||||
K8S_OS_VIF_NOOP_PLUGIN = "noop"
|
||||
|
||||
|
|
|
@ -58,6 +58,8 @@ class SriovVIFDriver(neutron_vif.NeutronPodVIFDriver):
|
|||
c_utils.tag_neutron_resources('ports', [port['id']])
|
||||
vif = ovu.neutron_to_osvif_vif(vif_plugin, port, subnets)
|
||||
vif.physnet = physnet
|
||||
vif.pod_name = pod_name
|
||||
vif.pod_link = pod['metadata']['selfLink']
|
||||
|
||||
LOG.debug("{} vifs are available for the pod {}".format(
|
||||
amount, pod_name))
|
||||
|
|
|
@ -73,10 +73,13 @@ class VIFMacvlanNested(obj_osvif.VIFBase):
|
|||
@obj_base.VersionedObjectRegistry.register
|
||||
class VIFSriov(obj_osvif.VIFDirect):
|
||||
# This is OVO based SRIOV vif.
|
||||
|
||||
VERSION = '1.0'
|
||||
# Version 1.0: Initial version
|
||||
# Version 1.1: Added pod_name field and pod_link field.
|
||||
VERSION = '1.1'
|
||||
|
||||
fields = {
|
||||
# physnet of the VIF
|
||||
'physnet': obj_fields.StringField(),
|
||||
'pod_name': obj_fields.StringField(),
|
||||
'pod_link': obj_fields.StringField(),
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ from os_vif import objects as osv_objects
|
|||
from oslo_config import cfg
|
||||
|
||||
from kuryr_kubernetes.cni.binding import base
|
||||
from kuryr_kubernetes.cni.binding import sriov
|
||||
from kuryr_kubernetes import objects
|
||||
from kuryr_kubernetes.tests import base as test_base
|
||||
from kuryr_kubernetes.tests import fake
|
||||
|
@ -213,22 +214,50 @@ class TestSriovDriver(TestDriverMixin, test_base.TestCase):
|
|||
def setUp(self):
|
||||
super(TestSriovDriver, self).setUp()
|
||||
self.vif = fake._fake_vif(objects.vif.VIFSriov)
|
||||
self.vif.physnet = 'test_physnet'
|
||||
self.vif.physnet = 'physnet2'
|
||||
self.pci_info = mock.Mock()
|
||||
self.vif.pod_link = 'pod_link'
|
||||
self.vif.pod_name = 'pod_1'
|
||||
self.pci = mock.Mock()
|
||||
|
||||
self.device_ids = ['pci_dev_1']
|
||||
self.device = mock.Mock()
|
||||
self.device.device_ids = self.device_ids
|
||||
self.device.resource_name = 'intel.com/sriov'
|
||||
|
||||
self.cont_devs = [self.device]
|
||||
self.container = mock.Mock()
|
||||
self.container.devices = self.cont_devs
|
||||
|
||||
self.pod_containers = [self.container]
|
||||
self.pod_resource = mock.Mock()
|
||||
self.pod_resource.containers = self.pod_containers
|
||||
self.pod_resource.name = 'pod_1'
|
||||
|
||||
self.resources = [self.pod_resource]
|
||||
|
||||
CONF.set_override('physnet_resource_mappings', 'physnet2:sriov',
|
||||
group='sriov')
|
||||
self.addCleanup(CONF.clear_override, 'physnet_resource_mappings',
|
||||
group='sriov')
|
||||
CONF.set_override('device_plugin_resource_prefix', 'intel.com',
|
||||
group='sriov')
|
||||
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
|
||||
'_get_host_pf_names')
|
||||
'_annotate_device')
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
|
||||
'_get_available_vf_info')
|
||||
'_choose_pci')
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
|
||||
'_get_vf_info')
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
|
||||
'_set_vf_mac')
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
|
||||
'_save_pci_info')
|
||||
def test_connect(self, m_save_pci_info, m_set_vf_mac, m_avail_vf_info,
|
||||
m_host_pf_names):
|
||||
m_avail_vf_info.return_value = [self.ifname, 1,
|
||||
'h_interface', self.pci_info]
|
||||
m_host_pf_names.return_value = 'h_interface'
|
||||
def test_connect(self, m_save_pci_info, m_set_vf_mac, m_vf_info,
|
||||
m_choose_pci, m_annot_dev):
|
||||
m_vf_info.return_value = [self.ifname, 1, 'h_interface',
|
||||
self.pci_info]
|
||||
m_choose_pci.return_value = self.pci
|
||||
self._test_connect()
|
||||
|
||||
self.assertEqual(self.ifname, self.m_c_iface.ifname)
|
||||
|
@ -237,9 +266,41 @@ class TestSriovDriver(TestDriverMixin, test_base.TestCase):
|
|||
m_set_vf_mac.assert_called_once_with('h_interface', 1,
|
||||
str(self.vif.address))
|
||||
m_save_pci_info.assert_called_once_with(self.vif.id, self.pci_info)
|
||||
m_annot_dev.assert_called_once_with(self.vif.pod_link, self.pci)
|
||||
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
|
||||
'_remove_pci_info')
|
||||
def test_disconnect(self, m_remove_pci):
|
||||
m_remove_pci.return_value = None
|
||||
self._test_disconnect()
|
||||
|
||||
@mock.patch('kuryr_kubernetes.clients.get_pod_resources_client')
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
|
||||
'_get_resource_by_physnet')
|
||||
def test_choose_pci(self, m_get_res_ph, m_get_prc):
|
||||
cls = sriov.VIFSriovDriver
|
||||
m_driver = mock.Mock(spec=cls)
|
||||
|
||||
m_driver._make_resource.return_value = 'intel.com/sriov'
|
||||
m_driver._get_pod_devices.return_value = ['pci_dev_2']
|
||||
|
||||
pod_resources_list = mock.Mock()
|
||||
pod_resources_list.pod_resources = self.resources
|
||||
pod_resources_client = mock.Mock()
|
||||
pod_resources_client.list.return_value = pod_resources_list
|
||||
m_get_prc.return_value = pod_resources_client
|
||||
|
||||
self.assertEqual('pci_dev_1', cls._choose_pci(m_driver, self.vif,
|
||||
self.ifname, self.netns))
|
||||
|
||||
def test_get_resource_by_physnet(self):
|
||||
cls = sriov.VIFSriovDriver
|
||||
m_driver = mock.Mock(spec=cls)
|
||||
self.assertEqual(
|
||||
'sriov', cls._get_resource_by_physnet(m_driver, self.vif.physnet))
|
||||
|
||||
def test_make_resource(self):
|
||||
cls = sriov.VIFSriovDriver
|
||||
m_driver = mock.Mock(spec=cls)
|
||||
self.assertEqual('intel.com/sriov', cls._make_resource(m_driver,
|
||||
'sriov'))
|
||||
|
|
Loading…
Reference in New Issue