Enable OVN CI

Adds basic testing for PXE/iPXE boot scenarios where the OVN
DHCP service is used instead of dnsmasq.

Also adds a release note and documentation to cover the details
and caveats of using OVN that we have discovered through this process.

Change-Id: I28cd20a7f271220d8ca335895ca9e302452fd069
Julia Kreger 2023-06-01 11:08:58 -07:00
parent 0eb3f40f10
commit 3f77091c63
8 changed files with 357 additions and 23 deletions


@@ -2032,20 +2032,36 @@ function create_ovs_taps {
# Work around: No netns exists on host until a Neutron port is created. We
# need to create one in Neutron to know what netns to tap into prior to the
# first node booting.
# NOTE(TheJulia): So.. Neutron doesn't allow a port to be created as a
# system scoped admin, which makes sense.
local port_id
port_id=$(openstack --os-cloud devstack-admin port create --network ${ironic_net_id} temp_port -c id -f value)
die_if_not_set $LINENO port_id "Failed to create neutron port"
if [[ "$Q_AGENT" != "ovn" ]]; then
# NOTE(TheJulia): So.. Neutron doesn't allow a port to be created as a
# system scoped admin, which makes sense.
local port_id
port_id=$(openstack --os-cloud devstack-admin port create --network ${ironic_net_id} temp_port -c id -f value)
die_if_not_set $LINENO port_id "Failed to create neutron port"
local tapdev
local tapdev_cmd="sudo ip netns exec qdhcp-${ironic_net_id} ip link list | grep ' tap' | cut -d':' -f2 | cut -d'@' -f1 | cut -b2- | grep '^tap'"
# retry tap device discovery to make sure the tag has been set to port
tapdev=$(test_with_retry "$tapdev_cmd" "Failed to get tap device id" 20 1)
local tag_id
tag_id=$(sudo ovs-vsctl get port ${tapdev} tag)
die_if_not_set $LINENO tag_id "Failed to get tag id"
local tapdev
local tapdev_cmd="sudo ip netns exec qdhcp-${ironic_net_id} ip link list | grep ' tap' | cut -d':' -f2 | cut -d'@' -f1 | cut -b2- | grep '^tap'"
# retry tap device discovery to make sure the tag has been set to port
tapdev=$(test_with_retry "$tapdev_cmd" "Failed to get tap device id" 20 1)
local tag_id
tag_id=$(sudo ovs-vsctl get port ${tapdev} tag)
die_if_not_set $LINENO tag_id "Failed to get tag id"
# Remove the port needed only for workaround.
openstack --os-cloud $OS_CLOUD port delete $port_id
else
# ovs-vsctl set Open_vSwitch . external-ids:ovn-cms-options=\"enable-chassis-as-gw\"
# should already be set -> external-ids:ovn-bridge-mappings=\"public:br-ex\"
# so the tl;dr is that the tag on this port, based on the OVN config,
# may be something like 841 on public, which in default devstack is
# br-ex. We don't care though; it is a VLAN tag, and previously we
# just used the integration bridge to make the connection with the
# tag. Basically the same.
# NOTE(TheJulia): Show the network data to ease troubleshooting.
# Normally, this will be the private network for devstack.
tag_id=$(openstack --os-cloud $OS_CLOUD network show $ironic_net_id -c "provider:segmentation_id" -f value)
die_if_not_set $LINENO tag_id "Failed to get tag id"
fi
local ovs_tap=ovs-tap
local brbm_tap=brbm-tap
# make sure veth pair is not existing, otherwise delete its links
@@ -2055,12 +2071,18 @@ function create_ovs_taps {
sudo ip link add $brbm_tap type veth peer name $ovs_tap
sudo ip link set dev $brbm_tap up
sudo ip link set dev $ovs_tap up
sudo ip link set dev br-int up
sudo ip link set dev br-ex up
sudo ovs-vsctl -- --if-exists del-port $ovs_tap -- add-port br-int $ovs_tap tag=$tag_id
if [[ "$Q_AGENT" != "ovn" ]]; then
sudo ovs-vsctl -- --if-exists del-port $ovs_tap -- add-port br-int $ovs_tap tag=$tag_id
else
# OVN defaults to everything on "public" which is br-ex
sudo ovs-vsctl -- --if-exists del-port $ovs_tap -- add-port br-ex $ovs_tap tag=$tag_id
fi
sudo ovs-vsctl -- --if-exists del-port $brbm_tap -- add-port $IRONIC_VM_NETWORK_BRIDGE $brbm_tap
# Remove the port needed only for workaround.
openstack --os-cloud $OS_CLOUD port delete $port_id
# Finally, share the fixed tenant network across all tenants. This allows the host
# to serve TFTP to a single network namespace via the tap device created above.
@@ -2209,10 +2231,54 @@ SUBSHELL
replace_range=${SUBNETPOOL_PREFIX_V6}
fi
fi
pub_router_id=$(openstack --os-cloud $OS_CLOUD router show $Q_ROUTER_NAME -f value -c id)
# Select the text starting at "src ", and grabbing the following field.
r_net_gateway=$(sudo ip netns exec qrouter-$pub_router_id ip -$IRONIC_IP_VERSION route get $dns_server |grep dev | sed s/^.*src\ // |awk '{ print $1 }')
sudo ip route replace $replace_range via $r_net_gateway
if [[ "$Q_AGENT" != "ovn" ]]; then
pub_router_id=$(openstack --os-cloud $OS_CLOUD router show $Q_ROUTER_NAME -f value -c id)
# Select the text starting at "src ", and grabbing the following field.
r_net_gateway=$(sudo ip netns exec qrouter-$pub_router_id ip -$IRONIC_IP_VERSION route get $dns_server |grep dev | sed s/^.*src\ // |awk '{ print $1 }')
sudo ip route replace $replace_range via $r_net_gateway
else
openstack router set --disable-snat --external-gateway public $Q_ROUTER_NAME
# Need to handle the json dict we get from the API (yeah, wut?!)
# and transform that so jq can do the needful. We also can't store
# it as a variable in ram, otherwise bash tries to escape it.
# Example gw_info (wrapped for readability)
# {'network_id': '3caae040-c0ac-4e8a-883b-12470f3fdeae',
# 'external_fixed_ips': [
# {'subnet_id': 'a8b6641e-53e1-4156-aa2f-2cda79dd4f4a',
# 'ip_address': '172.24.5.36'},
# {'subnet_id': 'bddf41dc-47f6-4124-9afd-744fa7343320',
# 'ip_address': '2001:db8::4d'}], 'enable_snat': False}
# Transform to json jq can parse, which includes replacing Python
# boolean strings.
external_gw_v4=$(openstack router show $Q_ROUTER_NAME -c external_gateway_info -f json | jq -r .external_gateway_info.external_fixed_ips[0].ip_address)
sudo ip addr
sudo ip link set dev br-ex up || true
# This route is only used *if* we actually provision with a
# dedicated ironic provisioning network, which does not *always*
# happen, i.e. it is job specific config.
mtu=$(($PUBLIC_BRIDGE_MTU - 30))
v4mss=$(($PUBLIC_BRIDGE_MTU - 40 ))
# FIXME(TheJulia): We have a conundrum with MTUs.
# 1) OVN mtu handling is not great
# (https://bugs.launchpad.net/neutron/+bug/2032817)
# 2) We previously restricted the MTU by 100 bytes for
# VXLAN tunnel overhead for multinode jobs, which sort
# of conflicts. If we use the stock public bridge MTU,
# we would likely be okay in the grand scheme of things,
# but until we figure out that path, we need
# to clamp things for now.
# NOTE(TheJulia): v6mss should be -60 once it is supported.
# NOTE(TheJulia): The route commands below set *and* lock a
# default MTU (disabling PMTU discovery) and set a Maximum Segment
# Size to advertise for packets sent along the path. Normally
# the MSS is derived from the outbound interface MTU, which is wrong
# in this scenario. We're tracking this as LP#2032817.
sudo ip route add $IRONIC_PROVISION_SUBNET_PREFIX via $external_gw_v4 mtu lock $mtu advmss $v4mss
# This is the fallback for the default neutron address space, since
# the route above only applies if the job config creates a dedicated
# provisioning network.
sudo ip route add $IPV4_ADDRS_SAFE_TO_USE via $external_gw_v4 mtu lock $mtu advmss $v4mss
fi
fi
# Here is a good place to restart tcpdump to begin capturing packets.
# See: https://docs.openstack.org/devstack/latest/debugging.html


@@ -37,7 +37,25 @@ if is_service_enabled ir-api ir-cond; then
if [[ "$IRONIC_BAREMETAL_BASIC_OPS" == "True" && "$IRONIC_IS_HARDWARE" == "False" ]]; then
echo_summary "Precreating bridge: $IRONIC_VM_NETWORK_BRIDGE"
install_package openvswitch-switch
if [[ "$Q_BUILD_OVS_FROM_GIT" == "True" ]]; then
if [[ "$Q_AGENT" == "ovn" ]]; then
# If we're here, we were requested to install from git
# for OVN *and* OVS, but that means basic setup has not been
# performed yet. As such, we need to do that and start
# OVN/OVS, whereas if we just need to ensure OVS is present,
# vendor packaging does that for us. We start early here
# because neutron setup for this is deferred until too late
# for our plugin to set up the test environment.
echo_summary "Setting up OVN..."
init_ovn
start_ovn
fi
else
# NOTE(TheJulia): We are likely doing this to ensure
# OVS is running.
echo_summary "Installing OVS to pre-create bridge"
install_package openvswitch-switch
fi
sudo ovs-vsctl -- --may-exist add-br $IRONIC_VM_NETWORK_BRIDGE
fi


@@ -60,6 +60,7 @@ Advanced Topics
Role Based Access Control <secure-rbac>
Deploying with Anaconda <anaconda-deploy-interface>
Steps <steps>
OVN Networking <ovn-networking>
.. toctree::
:hidden:


@@ -0,0 +1,159 @@
=====================
Use of OVN Networking
=====================
Overview
========
OVN is largely considered an evolution of OVS. While it is recommended that
operators continue to utilize OVS with Ironic, OVN offers an attractive
superset of capabilities and shifts some networking configuration away from
configuration files towards a service model aimed at delivering a more
scalable software defined networking experience. However, as with all newer
technologies, there are caveats and issues. The purpose of this documentation
is to convey OVN's state and capabilities, and to provide operators with
the context required to navigate their path forward.
.. Warning:: OVN is under quite a bit of active development, and this
information may grow out of date quickly. We've provided links
to help spread the information and enable operators to learn
the current status.
Challenges
==========
DHCP
----
Historically, while OVN has included a DHCP server, this DHCP server has not
had the capability to handle clients needing custom attributes such as those
used by PXE and iPXE to enable network boot operations.
Typically, this has resulted in operators who use OVN with bare metal
continuing to operate the ``neutron-dhcp-agent`` service, along with setting
the appropriate OVN configuration to prevent OVN from responding to DHCP
requests for baremetal ports. Please see the
:neutron-doc:`disable_ovn_dhcp_for_baremetal_ports option <configuration/ovn.html#ovn.disable_ovn_dhcp_for_baremetal_ports>`
for more information on this setting.
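As a minimal sketch, assuming the option name documented by Neutron above and
a typical ML2 plugin configuration file location, such a deployment would
carry something along these lines::

    [ovn]
    disable_ovn_dhcp_for_baremetal_ports = True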
As of the 2023.2 release of Ironic, the Ironic project *can* confirm that
OVN's DHCP server does work for PXE and iPXE operations when using **IPv4**,
OVS version **3.1.1**, and OVN version **23.06.0**.
Support for IPv6 is presently pending changes to Neutron, as IPv6 requires
additional configuration options and a different pattern of behavior, and
thus has not been tested. You are advised to continue to use the
``neutron-dhcp-agent`` if you need IPv6 at this time. This support is
currently being worked on in Neutron via
`change 890683 <https://review.opendev.org/c/openstack/neutron/+/890683>`_ and
`bug 20305201 <https://bugs.launchpad.net/neutron/+bug/20305201>`_.
Maximum Transmission Units
--------------------------
OVN's handling of MTUs has been identified by the OVN project as being
incomplete. In practice, it assumes the MTU is not further constrained beyond
the gateway, which sort of works in some cases for virtual machines, but
might not be applicable with baremetal because the traffic may pass
through segments with lower or higher MTUs.
Ideally, your environment should have consistent MTUs. If you cannot have
consistent MTUs, we recommend clamping the MTU and Maximum Segment Size
(MSS) using your front end router to ensure ingress traffic is sized and
fragmented appropriately. Egress traffic should inherit its MTU size
based upon the DHCP service configuration.
Items to keep track of regarding MTU handling:
* Bug `2032817 <https://bugs.launchpad.net/neutron/+bug/2032817>`_
* OVN `TODO document <https://github.com/ovn-org/ovn/blob/main/TODO.rst>`_
To clamp the MTU and MSS on a Linux based router, you can utilize the
following command::

    ip route add $network via $OVN_ROUTER advmss $MAX_SEGMENT_SIZE mtu lock $MTU
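As a worked example, mirroring what the DevStack code in this change does,
the route MTU is clamped to the bridge MTU minus 30 bytes and the advertised
MSS to the MTU minus 40 bytes (20 byte IPv4 header plus 20 byte TCP header).
Assuming a 1500 byte public bridge MTU and placeholder network and next hop
values::

    ip route add 10.1.0.0/20 via 172.24.5.1 advmss 1460 mtu lock 1470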
NAT of TFTP
-----------
Because the NAT and connection tracking layer is applied differently with
OVN, and the router does not appear as a namespace or to the local OS kernel,
you will not be able to enable NAT translation for bare metal networks
under the direct management of OVN, unless you run a separate
TFTP service from within that network.
This is a result of the kernel on the OVN gateway being unable to associate
and handle return packets directly as part of the connection tracking layer.
No direct workaround for this is known, but generally Ironic encourages the
use of Virtual Media where possible to sidestep this sort of issue and ensure
a higher operational security posture for the deployment. Users of the
``redfish`` hardware type can learn about
:ref:`redfish-virtual-media` in our Redfish documentation.
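One practical mitigation, and what the DevStack change in this commit does
for CI, is to avoid NAT for provisioning traffic entirely by disabling SNAT
on the project router, so the provisioning network is routed rather than
translated. The router name below is illustrative::

    openstack router set --disable-snat --external-gateway public router1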
.. Warning::
Creation of floating IPs, such as those which may be used to grant SSH
access to an internal node on a network (for example, by Tempest),
establishes a 1:1 NAT rule. When this is the case, TFTP packets
*cannot* transit OVN and network boot operations will fail.
Rescue
------
Due to the aforementioned NAT issues, we know rescue operations may not work.
This is being tracked as `bug 2033083 <https://bugs.launchpad.net/ironic/+bug/2033083>`_.
PXE boot of GRUB
----------------
Initial testing has revealed that EFI booting GRUB2 via OVN does not appear
to work. For some reason, GRUB2 believes the network mask is
incorrect based upon the DHCP interaction, which results in the belief
that the TFTP server is locally attached.
For example, if a client is assigned ``10.1.0.13/28``, with a default
gateway of ``10.1.0.1`` and a TFTP server of ``10.203.101.230``,
then GRUB2 believes its default route is ``10.0.0.0/8``.
This is being tracked as `bug 2033430 <https://bugs.launchpad.net/ironic/+bug/2033430>`_
until we're better able to understand the root cause and file a bug with the
appropriate project.
Required Configuration
======================
OVN is designed to provide packet handling in a distributed fashion for
each compute hypervisor in a cloud of virtual machines. However, with bare
metal instances, you will likely need to have a pool of dedicated
"network nodes" to handle OVN traffic.
Chassis as Gateway
------------------
The network node chassis must be configured to operate as a gateway.
This can be configured manually, but *should* (as far as Ironic is aware) be
configured by Neutron and set on interfaces matching the bridge mappings. At
least, it works that way in DevStack.
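If you need to inspect or set this by hand on a gateway chassis, the relevant
values live in the local Open vSwitch database. The following sketch mirrors
the defaults referenced by the DevStack code in this change; the bridge
mapping shown is an assumption for your environment::

    sudo ovs-vsctl set Open_vSwitch . external-ids:ovn-cms-options="enable-chassis-as-gw"
    sudo ovs-vsctl set Open_vSwitch . external-ids:ovn-bridge-mappings="public:br-ex"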
ML2 Plugins
-----------
The ``ovn-router`` and ``trunk`` plugins supplied with Neutron
*must* be enabled.
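In a DevStack based environment, such as the CI jobs added in this change,
that translates to the following ``local.conf`` setting; in a standalone
Neutron deployment the equivalent would typically be the ``service_plugins``
list in ``neutron.conf``::

    ML2_L3_PLUGIN="ovn-router,trunk"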
If you need to attach to the network...
---------------------------------------
For example, if you need to bind something into a network for baremetal,
above and beyond a dedicated interface, you will need to make the attachment
on the ``br-ex`` bridge, as opposed to the ``br-int`` integration bridge as
one would have done with OVS.
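For reference, the DevStack change in this commit does exactly this for the
virtual bare metal test environment; a sketch with a placeholder port name
and VLAN tag::

    sudo ovs-vsctl -- --if-exists del-port my-tap -- add-port br-ex my-tap tag=100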
Unknowns
========
It is presently unknown whether OVN can perform and enable VXLAN
attachments to physical ports on integrated devices; operators are therefore
advised to continue to use ``vlan`` networking for their hosts with existing
ML2 integrations.


@@ -8,6 +8,15 @@ with the Networking service for DHCP, PXE boot and other requirements.
This section covers configuring Networking for a single flat network for bare
metal provisioning.
.. Warning:: This documentation is geared towards the use of OVS with Neutron
along with the ``neutron-dhcp-agent``. It *is* possible to use OVN
with ``neutron-dhcp-agent`` and, depending on the version of OVN
and Neutron, OVN's own DHCP service for IPv4 clients, but that
is considered an advanced topic, and we encourage operators
interested in the use of OVN to fully understand its capabilities
and state before attempting to utilize such a configuration.
Please see :doc:`/admin/ovn-networking` for more details.
It is recommended to use the baremetal ML2 mechanism driver and L2 agent for
proper integration with the Networking service. Documentation regarding
installation and configuration of the baremetal mechanism driver and L2 agent


@@ -14,3 +14,14 @@
shell: "ip -6 route > {{ zuul_output_dir }}/logs/post-job-network-routes-v6.txt"
ignore_errors: True
become: yes
- name: Get interfaces
shell: "ip -s -s link > {{ zuul_output_dir }}/logs/post-job-network-interfaces.txt"
ignore_errors: True
become: yes
- name: Get addresses
shell: "ip addr > {{ zuul_output_dir }}/logs/post-job-network-addresses.txt"
ignore_errors: True
become: yes
- name: Get OVS
shell: "ovs-vsctl show > {{ zuul_output_dir }}/logs/post-job-network-ovs.txt"
ignore_errors: True
become: yes


@@ -0,0 +1,25 @@
---
features:
- |
While Ironic has not explicitly added support for OVN, because that is
in theory a Neutron implementation detail, we have added some basic
testing and are pleased to announce that you can use OVN's DHCP service
for IPv4 based provisioning with OVN v23.06.0 and beyond. This is not
without issues, and we've added
`ovn documentation <https://docs.openstack.org/ironic/latest/admin/ovn-networking.html>`_
as a result to help provide as much clarity as possible for Ironic operators.
issues:
- |
Use of OVN may require disabling SNAT for provisioning with IPv4 when
using TFTP. This is due to the Linux kernel and how IP packet handling
occurs with OVN. No solution to this issue is known, so use of
provisioning technologies which do *not* use TFTP is advisable.
- |
Use of OVN may require careful attention to the MTUs of networks, as
oversized packets may be dropped. That being said, this is more likely
an issue in testing than with actual physical baremetal in a production
deployment.
- |
Use of OVN for IPv6 based PXE/iPXE is not supported by Neutron.
The Ironic project expects this to be addressed during the Caracal
(2024.1) development cycle.


@@ -347,7 +347,7 @@
# could be an issue with the lookup in ironic-python-agent
- job:
name: ironic-tempest-ipa-wholedisk-bios-agent_ipmitool
description: ironic-tempest-ipa-wholedisk-bios-agent_ipmitool
description: Gate-ish job with classic name. Executes rescue!
parent: ironic-base
vars:
devstack_localrc:
@@ -369,7 +369,7 @@
# ubuntu focal for the time being.
- job:
name: ironic-tempest-wholedisk-bios-snmp-pxe
description: SNMP power, no-op management and whole disk images.
description: SNMP power, iPXE, OVN, no-op management and whole disk images.
parent: ironic-base
nodeset: openstack-single-node-focal
vars:
@@ -381,14 +381,59 @@
IRONIC_AUTOMATED_CLEAN_ENABLED: False
IRONIC_ENFORCE_SCOPE: True
IRONIC_BOOT_MODE: bios
Q_AGENT: ovn
Q_ML2_TENANT_NETWORK_TYPE: vlan
Q_ML2_PLUGIN_MECHANISM_DRIVERS: ovn
ENABLE_CHASSIS_AS_GW: True
ML2_L3_PLUGIN: "ovn-router,trunk"
OVN_DBS_LOG_LEVEL: dbg
OVN_BUILD_FROM_SOURCE: True
Q_BUILD_OVS_FROM_GIT: True
# NOTE(TheJulia): Ubuntu ships an out of date OVN package, so
# we need to build from source. These are the minimum versions,
# representing the June 2023 release. Ubuntu Kinetic ships Q3 2022
# releases, i.e. OVN 22.09, so this can likely be removed sometime
# *after* Ubuntu Mantic, which ships OVN 23.03.
OVN_BRANCH: v23.06.0
OVS_BRANCH: v3.1.1
devstack_services:
q-agt: False
q-dhcp: False
q-l3: False
ovn-controller: True
ovn-northd: True
q-ovn-metadata-agent: True
- job:
name: ironic-tempest-partition-uefi-ipmi-pxe
description: IPMI power, UEFI, partition image.
description: IPMI power, UEFI, iPXE, OVN, partition image.
parent: ironic-base
vars:
devstack_localrc:
IRONIC_AUTOMATED_CLEAN_ENABLED: False
Q_AGENT: ovn
Q_ML2_TENANT_NETWORK_TYPE: vlan
Q_ML2_PLUGIN_MECHANISM_DRIVERS: ovn
ENABLE_CHASSIS_AS_GW: True
ML2_L3_PLUGIN: "ovn-router,trunk"
OVN_DBS_LOG_LEVEL: dbg
OVN_BUILD_FROM_SOURCE: True
Q_BUILD_OVS_FROM_GIT: True
# NOTE(TheJulia): Ubuntu ships an out of date OVN package, so
# we need to build from source. These are the minimum versions,
# representing the June 2023 release. Ubuntu Kinetic ships Q3 2022
# releases, i.e. OVN 22.09, so this can likely be removed sometime
# *after* Ubuntu Mantic, which ships OVN 23.03.
OVN_BRANCH: v23.06.0
OVS_BRANCH: v3.1.1
devstack_services:
q-agt: False
q-dhcp: False
q-l3: False
ovn-controller: True
ovn-northd: True
ovn-vswitchd: True
q-ovn-metadata-agent: True
- job:
name: ironic-tempest-bfv