Provide a proper way to choose VF in CNI

This commit fixes the incorrect way of choosing a VF in the
SR-IOV binding driver.

Previously the sriov-device-plugin chose a device in its own
way while the CNI chose the first available VF. These entities
did not know anything about each other's choice.

Now the SR-IOV binding driver gets the list of devices used by
kubelet with the help of the Pod Resources client, then chooses
a device from this list that is not yet used by the pod and
passes the appropriate VF into the container's network
namespace.

This commit also contains tools for cluster upgrade.

Change-Id: I5b24981f715966369b05b8ab157f8bfe02afc2d4
Closes-Bug: 1826865
Signed-off-by: Danil Golov <d.golov@samsung.com>
Danil Golov 2019-04-29 17:29:25 +03:00
parent fc20b3d7ed
commit 5206717f08
9 changed files with 240 additions and 59 deletions


@ -131,6 +131,33 @@ We have to add to the sriov section following mapping:
device_plugin_resource_prefix = samsung.com
physnet_resource_mappings = physnet1:numa0
5. Enable Kubelet Pod Resources feature

To use SR-IOV functionality properly it is necessary to enable the Kubelet
Pod Resources feature. Pod Resources is a service provided by Kubelet via a
gRPC server that allows requesting the list of resources allocated for each
pod and container on the node. These resources are devices allocated by k8s
device plugins. The service was implemented mainly for monitoring purposes,
but it is also suitable for the SR-IOV binding driver, allowing it to know
which VF was allocated for a particular container.
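
The following is a minimal sketch of such a query, assuming gRPC stubs
generated from kubelet's ``podresources`` v1alpha1 proto are importable
(the ``api_pb2``/``api_pb2_grpc`` module names and their location are
assumptions for illustration):

.. code-block:: python

    import grpc

    # Assumed location of the stubs generated from kubelet's
    # podresources v1alpha1 api.proto.
    from kuryr_kubernetes.pod_resources import api_pb2
    from kuryr_kubernetes.pod_resources import api_pb2_grpc

    # Kubelet serves the Pod Resources gRPC service on a unix socket
    # under '<kubelet_root_dir>/pod-resources'.
    SOCKET = 'unix:/var/lib/kubelet/pod-resources/kubelet.sock'

    channel = grpc.insecure_channel(SOCKET)
    stub = api_pb2_grpc.PodResourcesListerStub(channel)
    response = stub.List(api_pb2.ListPodResourcesRequest())
    for pod in response.pod_resources:
        for container in pod.containers:
            for device in container.devices:
                # e.g. 'intel.com/sriov' -> ['0000:02:00.1']
                print(pod.name, container.name,
                      device.resource_name, device.device_ids)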

To enable the Pod Resources service it is necessary to add
``--feature-gates KubeletPodResources=true`` to ``/etc/sysconfig/kubelet``.
This file could look like::

    KUBELET_EXTRA_ARGS="--feature-gates KubeletPodResources=true"

Note that it is important to set the right value for the ``kubelet_root_dir``
parameter in ``kuryr.conf``; by default it is ``/var/lib/kubelet``. When
using a containerized CNI it is necessary to mount the
``'kubelet_root_dir'/pod-resources`` directory into the CNI container.

To use this feature set ``enable_pod_resource_service`` to ``True`` in
``kuryr.conf``:

.. code-block:: ini

    [sriov]
    enable_pod_resource_service = True

6. Use privileged user


@ -18,6 +18,7 @@ CLI interface for kuryr status commands.
from __future__ import print_function
import copy
import sys
import textwrap
import traceback
@ -108,6 +109,8 @@ class UpgradeCommands(object):
if obj.obj_name() != objects.vif.PodState.obj_name():
old_count += 1
elif not self._has_valid_sriov_annot(obj):
old_count += 1
if malformed_count == 0 and old_count == 0:
return UpgradeCheckResult(0, 'All annotations are updated.')
@ -193,16 +196,43 @@ class UpgradeCommands(object):
t.add_row(cell)
print(t)
def _has_valid_sriov_annot(self, state):
for obj in state.vifs.values():
if obj.obj_name() != objects.vif.VIFSriov.obj_name():
continue
if hasattr(obj, 'pod_name') and hasattr(obj, 'pod_link'):
continue
return False
return True
def _convert_sriov(self, state):
new_state = copy.deepcopy(state)
for iface, obj in new_state.additional_vifs.items():
if obj.obj_name() != objects.vif.VIFSriov.obj_name():
continue
if hasattr(obj, 'pod_name') and hasattr(obj, 'pod_link'):
continue
new_obj = objects.vif.VIFSriov()
new_obj.__dict__ = obj.__dict__.copy()
new_state.additional_vifs[iface] = new_obj
return new_state
def update_annotations(self):
def test_fn(obj):
return obj.obj_name() != objects.vif.PodState.obj_name()
return (obj.obj_name() != objects.vif.PodState.obj_name() or
not self._has_valid_sriov_annot(obj))
def update_fn(obj):
return vif.PodState(default_vif=obj)
if obj.obj_name() != objects.vif.PodState.obj_name():
return vif.PodState(default_vif=obj)
return self._convert_sriov(obj)
self._convert_annotations(test_fn, update_fn)
def downgrade_annotations(self):
# NOTE(danil): There is no need to downgrade sriov vifs
# when annotations have the old format. After a downgrade the
# annotations will contain only one default vif, and it cannot
# be a sriov vif.
def test_fn(obj):
return obj.obj_name() == objects.vif.PodState.obj_name()


@ -20,6 +20,7 @@ from oslo_concurrency import lockutils
from oslo_concurrency import processutils
from oslo_config import cfg
from oslo_log import log as logging
from oslo_serialization import jsonutils
from kuryr_kubernetes import clients
from kuryr_kubernetes.cni.binding import base as b_base
@ -48,13 +49,8 @@ class VIFSriovDriver(object):
@release_lock_object
def connect(self, vif, ifname, netns, container_id):
physnet = vif.physnet
pf_names = self._get_host_pf_names(physnet)
vf_name, vf_index, pf, pci_info = self._get_available_vf_info(pf_names)
if not vf_name:
raise exceptions.CNIError(
"No free interfaces for physnet {} available".format(physnet))
pci = self._choose_pci(vif, ifname, netns)
vf_name, vf_index, pf, pci_info = self._get_vf_info(pci)
LOG.debug("Connect {} as {} (port_id={}) in container_id={}".format(
vf_name, ifname, vif.id, container_id))
@ -74,6 +70,9 @@ class VIFSriovDriver(object):
iface.mtu = vif.network.mtu
iface.up()
pod_link = vif.pod_link
self._annotate_device(pod_link, pci)
self._save_pci_info(vif.id, pci_info)
def disconnect(self, vif, ifname, netns, container_id):
@ -82,48 +81,101 @@ class VIFSriovDriver(object):
# it to all-zero state
self._remove_pci_info(vif.id)
def _get_host_pf_names(self, physnet):
"""Return a list of PFs, that belong to a physnet"""
def _choose_pci(self, vif, ifname, netns):
pr_client = clients.get_pod_resources_client()
pod_resources_list = pr_client.list()
resources = pod_resources_list.pod_resources
pod_name = vif.pod_name
pod_link = vif.pod_link
physnet = vif.physnet
resource_name = self._get_resource_by_physnet(physnet)
resource = self._make_resource(resource_name)
LOG.debug("Vif %s will correspond to pci device belonging to "
"resource %s", vif, resource)
pod_devices = self._get_pod_devices(pod_link)
pod_resource = None
container_devices = None
for res in resources:
if res.name == pod_name:
pod_resource = res
break
if not pod_resource:
raise exceptions.CNIError(
"No resources are discovered for pod {}".format(pod_name))
LOG.debug("Looking for PCI device used by kubelet service and not "
"used by pod %s yet ...", pod_name)
for container in pod_resource.containers:
try:
container_devices = container.devices
except Exception:
LOG.warning("No devices in container %s",
container.name)
continue
if physnet not in self._device_pf_mapping:
raise cfg.Error(
"No mapping for physnet {} in {}".format(
physnet, self._device_pf_mapping))
return self._device_pf_mapping[physnet]
def _get_available_vf_info(self, pf_names):
"""Scan /sys for unacquired VF among PFs in pf_names"""
for pf in pf_names:
pf_sys_path = '/sys/class/net/{}/device'.format(pf)
nvfs = self._get_total_vfs(pf)
for vf_index in range(nvfs):
vf_sys_path = os.path.join(pf_sys_path,
'virtfn{}'.format(vf_index),
'net')
# TODO(kzaitsev): use /var/run/kuryr/smth
lock_path = os.path.join("/tmp",
"{}.{}".format(pf, vf_index))
self._acquire(lock_path)
LOG.debug("Aquired %s lock", lock_path)
try:
vf_names = os.listdir(vf_sys_path)
except OSError:
LOG.debug("Could not open %s. "
"Skipping vf %s for pf %s", vf_sys_path,
vf_index, pf)
self._release()
for dev in container_devices:
if dev.resource_name != resource:
continue
if not vf_names:
LOG.debug("No interfaces in %s. "
"Skipping vf %s for pf %s", vf_sys_path,
vf_index, pf)
self._release()
continue
vf_name = vf_names[0]
pci_info = self._get_pci_info(pf, vf_index)
LOG.debug("Aquiring vf %s of pf %s", vf_index, pf)
return vf_name, vf_index, pf, pci_info
for pci in dev.device_ids:
if pci in pod_devices:
continue
LOG.debug("Appropriate PCI device %s is found", pci)
return pci
def _get_resource_by_physnet(self, physnet):
mapping = config.CONF.sriov.physnet_resource_mappings
try:
resource_name = mapping[physnet]
except KeyError:
LOG.exception("No resource name for physnet %s", physnet)
raise
return resource_name
def _make_resource(self, res_name):
res_prefix = config.CONF.sriov.device_plugin_resource_prefix
return res_prefix + '/' + res_name
def _get_pod_devices(self, pod_link):
k8s = clients.get_kubernetes_client()
pod = k8s.get(pod_link)
annotations = pod['metadata']['annotations']
try:
json_devices = annotations[constants.K8S_ANNOTATION_PCI_DEVICES]
devices = jsonutils.loads(json_devices)
except KeyError:
devices = []
except Exception as ex:
LOG.exception("Exception while getting annotations: %s", ex)
return devices
def _annotate_device(self, pod_link, pci):
k8s = clients.get_kubernetes_client()
pod_devices = self._get_pod_devices(pod_link)
pod_devices.append(pci)
pod_devices = jsonutils.dumps(pod_devices)
LOG.debug("Trying to annotate pod %s with pci %s", pod_link, pci)
k8s.annotate(pod_link,
{constants.K8S_ANNOTATION_PCI_DEVICES: pod_devices})
def _get_vf_info(self, pci):
vf_sys_path = '/sys/bus/pci/devices/{}/net/'.format(pci)
vf_names = os.listdir(vf_sys_path)
vf_name = vf_names[0]
physfn_path = '/sys/bus/pci/devices/{}/physfn/net/'.format(pci)
pf_names = os.listdir(physfn_path)
pf_name = pf_names[0]
nvfs = self._get_total_vfs(pf_name)
pf_sys_path = '/sys/class/net/{}/device'.format(pf_name)
for vf_index in range(nvfs):
virtfn_path = os.path.join(pf_sys_path,
'virtfn{}'.format(vf_index))
vf_pci = os.path.basename(os.readlink(virtfn_path))
if vf_pci == pci:
pci_info = self._get_pci_info(pf_name, vf_index)
return vf_name, vf_index, pf_name, pci_info
return None, None, None, None
def _get_pci_info(self, pf, vf_index):


@ -279,6 +279,8 @@ class CNIDaemonServiceManager(cotyledon.ServiceManager):
os_vif.initialize()
clients.setup_kubernetes_client()
if CONF.sriov.enable_pod_resource_service:
clients.setup_pod_resources_client()
self.manager = multiprocessing.Manager()
registry = self.manager.dict() # For Watcher->Server communication.


@ -258,6 +258,9 @@ sriov_opts = [
cfg.StrOpt('kubelet_root_dir',
help=_("The root directory of the Kubelet daemon"),
default='/var/lib/kubelet'),
cfg.BoolOpt('enable_pod_resource_service',
help=_("Enable PodResources service"),
default=False),
cfg.DictOpt('default_physnet_subnets',
help=_("A mapping of default subnets for certain physnets "
"in a form of physnet-name:<SUBNET-ID>"),


@ -52,6 +52,7 @@ K8S_ANNOTATION_NPWG_CRD_SUBNET_ID = 'subnetId'
K8S_ANNOTATION_NPWG_CRD_DRIVER_TYPE = 'driverType'
K8S_ANNOTATION_NODE_PCI_DEVICE_INFO = 'openstack.org/kuryr-pci-info'
K8S_ANNOTATION_PCI_DEVICES = K8S_ANNOTATION_PREFIX + '-pci-devices'
K8S_OS_VIF_NOOP_PLUGIN = "noop"


@ -58,6 +58,8 @@ class SriovVIFDriver(neutron_vif.NeutronPodVIFDriver):
c_utils.tag_neutron_resources('ports', [port['id']])
vif = ovu.neutron_to_osvif_vif(vif_plugin, port, subnets)
vif.physnet = physnet
vif.pod_name = pod_name
vif.pod_link = pod['metadata']['selfLink']
LOG.debug("{} vifs are available for the pod {}".format(
amount, pod_name))


@ -73,10 +73,13 @@ class VIFMacvlanNested(obj_osvif.VIFBase):
@obj_base.VersionedObjectRegistry.register
class VIFSriov(obj_osvif.VIFDirect):
# This is OVO based SRIOV vif.
VERSION = '1.0'
# Version 1.0: Initial version
# Version 1.1: Added pod_name field and pod_link field.
VERSION = '1.1'
fields = {
# physnet of the VIF
'physnet': obj_fields.StringField(),
'pod_name': obj_fields.StringField(),
'pod_link': obj_fields.StringField(),
}


@ -19,6 +19,7 @@ from os_vif import objects as osv_objects
from oslo_config import cfg
from kuryr_kubernetes.cni.binding import base
from kuryr_kubernetes.cni.binding import sriov
from kuryr_kubernetes import objects
from kuryr_kubernetes.tests import base as test_base
from kuryr_kubernetes.tests import fake
@ -213,22 +214,50 @@ class TestSriovDriver(TestDriverMixin, test_base.TestCase):
def setUp(self):
super(TestSriovDriver, self).setUp()
self.vif = fake._fake_vif(objects.vif.VIFSriov)
self.vif.physnet = 'test_physnet'
self.vif.physnet = 'physnet2'
self.pci_info = mock.Mock()
self.vif.pod_link = 'pod_link'
self.vif.pod_name = 'pod_1'
self.pci = mock.Mock()
self.device_ids = ['pci_dev_1']
self.device = mock.Mock()
self.device.device_ids = self.device_ids
self.device.resource_name = 'intel.com/sriov'
self.cont_devs = [self.device]
self.container = mock.Mock()
self.container.devices = self.cont_devs
self.pod_containers = [self.container]
self.pod_resource = mock.Mock()
self.pod_resource.containers = self.pod_containers
self.pod_resource.name = 'pod_1'
self.resources = [self.pod_resource]
CONF.set_override('physnet_resource_mappings', 'physnet2:sriov',
group='sriov')
self.addCleanup(CONF.clear_override, 'physnet_resource_mappings',
group='sriov')
CONF.set_override('device_plugin_resource_prefix', 'intel.com',
group='sriov')
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
'_get_host_pf_names')
'_annotate_device')
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
'_get_available_vf_info')
'_choose_pci')
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
'_get_vf_info')
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
'_set_vf_mac')
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
'_save_pci_info')
def test_connect(self, m_save_pci_info, m_set_vf_mac, m_avail_vf_info,
m_host_pf_names):
m_avail_vf_info.return_value = [self.ifname, 1,
'h_interface', self.pci_info]
m_host_pf_names.return_value = 'h_interface'
def test_connect(self, m_save_pci_info, m_set_vf_mac, m_vf_info,
m_choose_pci, m_annot_dev):
m_vf_info.return_value = [self.ifname, 1, 'h_interface',
self.pci_info]
m_choose_pci.return_value = self.pci
self._test_connect()
self.assertEqual(self.ifname, self.m_c_iface.ifname)
@ -237,9 +266,41 @@ class TestSriovDriver(TestDriverMixin, test_base.TestCase):
m_set_vf_mac.assert_called_once_with('h_interface', 1,
str(self.vif.address))
m_save_pci_info.assert_called_once_with(self.vif.id, self.pci_info)
m_annot_dev.assert_called_once_with(self.vif.pod_link, self.pci)
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
'_remove_pci_info')
def test_disconnect(self, m_remove_pci):
m_remove_pci.return_value = None
self._test_disconnect()
@mock.patch('kuryr_kubernetes.clients.get_pod_resources_client')
@mock.patch('kuryr_kubernetes.cni.binding.sriov.VIFSriovDriver.'
'_get_resource_by_physnet')
def test_choose_pci(self, m_get_res_ph, m_get_prc):
cls = sriov.VIFSriovDriver
m_driver = mock.Mock(spec=cls)
m_driver._make_resource.return_value = 'intel.com/sriov'
m_driver._get_pod_devices.return_value = ['pci_dev_2']
pod_resources_list = mock.Mock()
pod_resources_list.pod_resources = self.resources
pod_resources_client = mock.Mock()
pod_resources_client.list.return_value = pod_resources_list
m_get_prc.return_value = pod_resources_client
self.assertEqual('pci_dev_1', cls._choose_pci(m_driver, self.vif,
self.ifname, self.netns))
def test_get_resource_by_physnet(self):
cls = sriov.VIFSriovDriver
m_driver = mock.Mock(spec=cls)
self.assertEqual(
'sriov', cls._get_resource_by_physnet(m_driver, self.vif.physnet))
def test_make_resource(self):
cls = sriov.VIFSriovDriver
m_driver = mock.Mock(spec=cls)
self.assertEqual('intel.com/sriov', cls._make_resource(m_driver,
'sriov'))