Keep floating IP reachability during pacemaker migration.

The neutron-netns-cleanup-clone and neutron-ovs-cleanup-clone resources
are one-shot resources that only need to run during boot-up and
shutdown of a node.  Triggering a stop on them has the side effect of
removing entries from the OVS database, making floating IPs unreachable.

Those are triggered by the
/usr/lib/ocf/resource.d/neutron/{OVSCleanup,NetnsCleanup} pacemaker
resources.  They, in turn, use the
/usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup scripts (not the
systemd unit files).  We temporarily disable any action by configuring
the executable to be "/bin/echo" instead of
/usr/bin/neutron-{ovs,netns}-cleanup, and by removing the "--force"
option in neutron-netns-cleanup.

As those resources are cloned resources, we need to make sure that the
modification is done on all controller nodes before we take any action
on the bootstrap controller node.  To do that we move most of Step1 to
Step0 and make the bootstrap node actions happen at Step1 of the
pacemaker controller upgrade.

Furthermore, we make sure that the external bridges, if OVS is used,
are not left in secure fail mode: we set them to standalone during the
upgrade, and the ovs agents set them back to whatever is needed at the
end of the upgrade.
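
For example, for a single external bridge named br-ex (a sketch; the
upgrade scripts below derive the actual bridge list from the configured
bridge_mappings):

    ovs-vsctl get-fail-mode br-ex            # e.g. "secure" beforehand
    ovs-vsctl set-fail-mode br-ex standalone
    # In standalone mode the bridge keeps forwarding traffic even while
    # its OpenFlow controller (the neutron ovs agent) is not connected.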

Finally, we need to take care of the os-net-config upgrade.  It can
add new parameters to the interface definitions, which would force a
restart of the interfaces.  To avoid that we add the --no-activate
option.  Currently no major change in os-net-config is required for
the overcloud to continue running after the upgrade.
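
The resulting call (see update_os_net_config below) is roughly:

    os-net-config --no-activate -c /etc/os-net-config/config.json -v \
        --detailed-exit-codes
    # With --detailed-exit-codes, a return code of 2 means the
    # interface files were updated without being activated; the changes
    # only take effect at the next ifdown/ifup or reboot.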

Co-Authored-By: Raoul Scarazzini <rscarazz@redhat.com>
Change-Id: Ib5d7b447808b51f6e436eaf6d661606132155a23
Depends-On: Ieb5ad6ad429c8388a1cbbd650339b6eecd9b7997
Closes-Bug: #1698373
Author: Sofer Athlan-Guyot
Date: 2017-06-16 14:53:14 +02:00
Commit: bce61783bc (parent 0616f2d0be)
7 changed files with 172 additions and 60 deletions

major_upgrade_controller_pacemaker_0.sh (new file)

@@ -0,0 +1,53 @@
#!/bin/bash
set -eu
check_cluster
check_pcsd
if [[ -n $(is_bootstrap_node) ]]; then
check_clean_cluster
fi
check_python_rpm
check_galera_root_password
check_disk_for_mysql_dump
# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
# they are completely open (ACCEPT)
# Now when we run the convergence step while migrating to Newton we enable the firewall
# by default and this will actually first load the rules from /etc/sysconfig/iptables
# and only afterwards, it will start adding all the rules permitting openstack traffic.
# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
# Let's simply move the existing file out of the way, it will be recreated by
# puppet in newton with the proper firewall rules anyway
if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
fi
# We want to disable fencing during the cluster --stop as it might fence
# nodes where a service fails to stop, which could be fatal during an upgrade
# procedure. So we remember the stonith state. If it was enabled we reenable it
# at the end of this script
if [[ -n $(is_bootstrap_node) ]]; then
STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
# We create this empty file if stonith was set to true so we can reenable stonith in step2
rm -f /var/tmp/stonith-true
if [ "$STONITH_STATE" == "true" ]; then
touch /var/tmp/stonith-true
fi
pcs property set stonith-enabled=false
fi
# Before migration and service stopping we make sure that the external
# bridge is set to standalone in ovs. This is because we don't do a
# rolling upgrade, but we don't want the floating ip network to be cut
# off.
for br in $(get_all_bridges); do
# It will be set back to whatever is needed at the end of the upgrade
# by the ovs-agent processes.
ovs-vsctl set-fail-mode ${br} standalone
done
# Make sure openvswitch *cleanup* does not happen.
deactivate_cleanup_services

major_upgrade_controller_pacemaker_1.sh

@@ -2,45 +2,7 @@
set -eu
check_cluster
check_pcsd
if [[ -n $(is_bootstrap_node) ]]; then
check_clean_cluster
fi
check_python_rpm
check_galera_root_password
check_disk_for_mysql_dump
# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
# they are completely open (ACCEPT)
# Now when we run the convergence step while migrating to Newton we enable the firewall
# by default and this will actually first load the rules from /etc/sysconfig/iptables
# and only afterwards, it will start adding all the rules permitting openstack traffic.
# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
# Let's simply move the existing file out of the way, it will be recreated by
# puppet in newton with the proper firewall rules anyway
if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
fi
# We want to disable fencing during the cluster --stop as it might fence
# nodes where a service fails to stop, which could be fatal during an upgrade
# procedure. So we remember the stonith state. If it was enabled we reenable it
# at the end of this script
if [[ -n $(is_bootstrap_node) ]]; then
STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
# We create this empty file if stonith was set to true so we can reenable stonith in step2
rm -f /var/tmp/stonith-true
if [ $STONITH_STATE == "true" ]; then
touch /var/tmp/stonith-true
fi
pcs property set stonith-enabled=false
fi
# Migrate to HA NG
if [[ -n $(is_bootstrap_node) ]]; then
migrate_full_to_ng_ha
fi

major_upgrade_controller_pacemaker_2.sh

@@ -11,6 +11,10 @@ cluster_sync_timeout=1800
# systemctl try-restart is a noop
for service in $(services_to_migrate); do
if [[ ${service%%-clone} =~ .*-cleanup ]]; then
# we don't want to stop {netns,ovs}-cleanup
continue
fi
manage_systemd_service stop "${service%%-clone}"
# So the reason for not reusing check_resource_systemd is that
# I have observed systemctl is-active returning unknown with at least
@@ -107,6 +111,7 @@ if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR
fi
update_os_net_config
# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1669714
update_network
@@ -119,6 +124,7 @@ if grep -q '^pipeline = ssl_header_handler faultwrap osvolumeversionapp' /etc/ci
sed -i '$ { /^$/d }' /etc/cinder/api-paste.ini
fi
restore_cleanup_service_definition
yum -y install python-zaqarclient # needed for os-collect-config
yum -y -q update

major_upgrade_controller_pacemaker_3.sh

@@ -2,7 +2,7 @@
set -eu
# We need to start the systemd services we explicitly stopped at step _0.sh
# We add the enablement of the systemd services here because if a node gets rebooted
# before the convergence step for whatever reason the migrated services will
# not be enabled and we potentially have a bigger disruption.
@@ -10,7 +10,14 @@ services=$(services_to_migrate)
if [[ ${keep_sahara_services_on_upgrade} =~ [Ff]alse ]] ; then
services=${services%%openstack-sahara*}
fi
for service in $services; do
if [[ ${service%%-clone} =~ .*-cleanup ]]; then
# we don't want to start {netns,ovs}-cleanup
log_debug "Skipping ${service}"
continue
fi
manage_systemd_service start "${service%%-clone}"
manage_systemd_service enable "${service%%-clone}"
check_resource_systemd "${service%%-clone}" started 600

major_upgrade_pacemaker.yaml

@@ -60,7 +60,7 @@ resources:
      rolling_update:
        max_batch_size: 1

  ControllerPacemakerUpgradeConfig_Step0:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
@@ -81,12 +81,31 @@ resources:
                MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade}
          - get_file: pacemaker_common_functions.sh
          - get_file: major_upgrade_check.sh
          - get_file: major_upgrade_pacemaker_migrations.sh
          - get_file: major_upgrade_controller_pacemaker_0.sh

  ControllerPacemakerUpgradeDeployment_Step0:
    type: OS::Heat::SoftwareDeploymentGroup
    depends_on: CephMonUpgradeDeployment
    properties:
      servers: {get_param: [servers, Controller]}
      config: {get_resource: ControllerPacemakerUpgradeConfig_Step0}
      input_values: {get_param: input_values}

  ControllerPacemakerUpgradeConfig_Step1:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
      config:
        list_join:
        - '#!/bin/bash'
        - - get_file: pacemaker_common_functions.sh
          - get_file: major_upgrade_pacemaker_migrations.sh
          - get_file: major_upgrade_controller_pacemaker_1.sh

  ControllerPacemakerUpgradeDeployment_Step1:
    type: OS::Heat::SoftwareDeploymentGroup
    depends_on: ControllerPacemakerUpgradeDeployment_Step0
    properties:
      servers: {get_param: [servers, Controller]}
      config: {get_resource: ControllerPacemakerUpgradeConfig_Step1}

major_upgrade_pacemaker_migrations.sh

@@ -101,6 +101,42 @@ function services_to_migrate {
echo $PCMK_RESOURCE_TODELETE
}
# Those are one-shot type services.
function neutron_cleanup_services {
echo "
neutron-netns-cleanup-clone
neutron-ovs-cleanup-clone
"
}
function deactivate_cleanup_services {
for service in $(neutron_cleanup_services); do
log_debug "Changing ocf configuration for '${service}'"
# We prevent any stop action by changing the exec to a noop.
local sysconfig_name=${service%-clone}
# This is loaded by /usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup
echo "exec=/bin/echo" >> /etc/sysconfig/${sysconfig_name}
done
# Run the sed only once, outside the loop: a second "sed -i.orig" pass
# would overwrite the pristine .orig backup with the already-modified
# file and break restore_cleanup_service_definition below.
sed -i.orig -e 's/clean --force/clean/' /usr/lib/ocf/lib/neutron/neutron-netns-cleanup
}
function restore_cleanup_service_definition {
for service in $(neutron_cleanup_services); do
log_debug "Restoring original ocf configuration for '${service}'"
local sysconfig_file=/etc/sysconfig/${service%-clone}
if [ -e "${sysconfig_file}" ]; then
sed -e '/exec=\/bin\/echo/d' $sysconfig_file | \
sed -e '/^ *$/d' > /tmp/$service
if test -s /tmp/$service; then
cp /tmp/$service $sysconfig_file
else
rm -f $sysconfig_file
fi
[ ! -e /tmp/$service ] || rm -f /tmp/$service
fi
done
mv /usr/lib/ocf/lib/neutron/neutron-netns-cleanup{.orig,}
}
# This function will migrate a mitaka system where all the resources are managed
# via pacemaker to a newton setup where only a few services will be managed by pacemaker
# On a high-level it will operate as follows:
@@ -148,7 +184,7 @@ function migrate_full_to_ng_ha {
# that will move to systemd.
# We want the systemd resources to be stopped before doing "yum update",
# that way "systemctl try-restart <service>" is a no-op because the
# service was down already
PCS_STATUS_OUTPUT="$(pcs status)"
for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do
if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then

pacemaker_common_functions.sh

@@ -4,6 +4,28 @@ set -eu
DEBUG="true" # set false if the verbosity is a problem
SCRIPT_NAME=$(basename $0)
# This block gets the existing OVS bridges used for the fail-mode handling during the upgrade.
function get_all_bridges {
local bridges_def=""
local bridges=""
if which ovs-vsctl &>/dev/null; then
if [ -e /etc/neutron/plugins/ml2/openvswitch_agent.ini ]; then
local raw_bridge_def=$(crudini --get /etc/neutron/plugins/ml2/openvswitch_agent.ini ovs bridge_mappings)
local bridges=""
while IFS=: read physnet bridge; do bridges_def="${bridges_def} ${bridge}" ; done \
< <(echo "${raw_bridge_def}" | sed 's/,/\n/g')
local existing_bridges="$(ovs-vsctl -f table -d bare --column=name --no-headings find Bridge)"
for br in ${bridges_def}; do
if echo "${existing_bridges}" | grep -q $br; then
bridges="${bridges} ${br}"
fi
done
fi
fi
echo "${bridges}"
}
function log_debug {
if [[ $DEBUG = "true" ]]; then
echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1"
@@ -325,25 +347,32 @@ function special_case_ovs_upgrade_if_needed {
}
# update os-net-config before ovs see https://bugs.launchpad.net/tripleo/+bug/1695893
function update_os_net_config() {
set +e
local need_update="$(yum check-update | grep os-net-config)"
if [ -n "${need_update}" ]; then
yum -q -y update os-net-config
local return_code=$?
log_debug "yum update os-net-config return code: $return_code"
# We're just making sure that os-net-config won't ifdown/ifup
# network interfaces. The current set of changes (Tue Oct 3
# 17:38:37 CEST 2017) doesn't require the os-net-config changes
# to be taken live; they will be applied at the next reboot.
os-net-config --no-activate -c /etc/os-net-config/config.json -v \
--detailed-exit-codes
local os_net_retval=$?
if [[ $os_net_retval == 2 ]]; then
log_debug "os-net-config: interface configuration files updated successfully"
elif [[ $os_net_retval != 0 ]]; then
log_debug "ERROR: os-net-config configuration failed"
exit $os_net_retval
fi
fi
set -e
}
function update_network() {
set +e
yum -q -y update os-net-config
return_code=$?
echo "yum update os-net-config return code: $return_code"
# Writes any changes caused by alterations to os-net-config and bounces the
# interfaces *before* restarting the cluster.
os-net-config -c /etc/os-net-config/config.json -v --detailed-exit-codes
RETVAL=$?
if [[ $RETVAL == 2 ]]; then
echo "os-net-config: interface configuration files updated successfully"
elif [[ $RETVAL != 0 ]]; then
echo "ERROR: os-net-config configuration failed"
exit $RETVAL
fi
set -e
# special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
special_case_ovs_upgrade_if_needed
}