Merge "Keep floating ip reachability during pacemaker migration." into stable/newton

Jenkins 2017-10-10 03:53:18 +00:00 committed by Gerrit Code Review
commit 186e2cbb15
7 changed files with 172 additions and 60 deletions

View File

@ -0,0 +1,53 @@
#!/bin/bash
set -eu
check_cluster
check_pcsd
if [[ -n $(is_bootstrap_node) ]]; then
check_clean_cluster
fi
check_python_rpm
check_galera_root_password
check_disk_for_mysql_dump
# M/N upgrade only: by default RHEL/CentOS ships an /etc/sysconfig/iptables file
# which allows only ssh and icmp (INPUT chain). During the install of OSP9/Mitaka
# the live iptables rules are usually not the ones in /etc/sysconfig/iptables;
# they are completely open (ACCEPT).
# When we run the convergence step while migrating to Newton we enable the firewall
# by default, which first loads the rules from /etc/sysconfig/iptables
# and only afterwards starts adding the rules permitting openstack traffic.
# This causes an outage of roughly one minute in our environment, which disrupts the cluster.
# Simply move the existing file out of the way; it will be recreated by
# puppet in Newton with the proper firewall rules anyway.
if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
fi
# We want to disable fencing during the cluster --stop as it might fence
# nodes where a service fails to stop, which could be fatal during an upgrade
# procedure. So we remember the stonith state. If it was enabled we reenable it
# at the end of this script
if [[ -n $(is_bootstrap_node) ]]; then
STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
# We create this empty file if stonith was set to true so we can re-enable stonith in step2
rm -f /var/tmp/stonith-true
if [ "$STONITH_STATE" == "true" ]; then
touch /var/tmp/stonith-true
fi
pcs property set stonith-enabled=false
fi
# Before migration and service stopping we make sure that the external
# bridge is set to standalone in ovs. This is because we don't do a
# rolling upgrade, but we don't want the floating ip network to be cut
# off.
for br in $(get_all_bridges); do
# It will be set to whatever is needed at the end of the upgrade by the
# ovs-agent processes.
ovs-vsctl set-fail-mode ${br} standalone
done
# Make sure openvswitch *cleanup* does not happen.
deactivate_cleanup_services
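For context on the fail-mode change above: the fail mode controls what an OVS bridge does while it has no OpenFlow controller connected. In secure mode it stops forwarding new flows; in standalone mode it falls back to acting as an ordinary learning switch, which is what keeps floating-ip traffic flowing while the agents are down. A quick way to inspect and set it by hand, assuming a bridge named br-ex:

ovs-vsctl get-fail-mode br-ex            # prints "secure", "standalone", or nothing if unset
ovs-vsctl set-fail-mode br-ex standalone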

View File

@ -2,45 +2,7 @@
set -eu
check_cluster
check_pcsd
if [[ -n $(is_bootstrap_node) ]]; then
check_clean_cluster
fi
check_python_rpm
check_galera_root_password
check_disk_for_mysql_dump
# M/N upgrade only: by default RHEL/CentOS ships an /etc/sysconfig/iptables file
# which allows only ssh and icmp (INPUT chain). During the install of OSP9/Mitaka
# the live iptables rules are usually not the ones in /etc/sysconfig/iptables;
# they are completely open (ACCEPT).
# When we run the convergence step while migrating to Newton we enable the firewall
# by default, which first loads the rules from /etc/sysconfig/iptables
# and only afterwards starts adding the rules permitting openstack traffic.
# This causes an outage of roughly one minute in our environment, which disrupts the cluster.
# Simply move the existing file out of the way; it will be recreated by
# puppet in Newton with the proper firewall rules anyway.
if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
fi
# We want to disable fencing during the cluster --stop as it might fence
# nodes where a service fails to stop, which could be fatal during an upgrade
# procedure. So we remember the stonith state. If it was enabled we reenable it
# at the end of this script
if [[ -n $(is_bootstrap_node) ]]; then
STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
# We create this empty file if stonith was set to true so we can re-enable stonith in step2
rm -f /var/tmp/stonith-true
if [ "$STONITH_STATE" == "true" ]; then
touch /var/tmp/stonith-true
fi
pcs property set stonith-enabled=false
fi
# Migrate to HA NG
if [[ -n $(is_bootstrap_node) ]]; then
migrate_full_to_ng_ha
fi
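Both scripts above gate cluster-wide actions on is_bootstrap_node so they run on exactly one controller. The helper itself is not part of this diff; a minimal sketch of the usual tripleo pattern it follows, assuming hiera and facter are available on the node:

function is_bootstrap_node {
    # Echo a non-empty string only on the elected bootstrap node,
    # which is why callers test it with [[ -n $(is_bootstrap_node) ]].
    if [ "$(hiera bootstrap_nodeid)" = "$(facter hostname)" ]; then
        echo "true"
    fi
}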

View File

@ -11,6 +11,10 @@ cluster_sync_timeout=1800
# systemctl try-restart is a noop
for service in $(services_to_migrate); do
if [[ ${service%%-clone} =~ .*-cleanup ]]; then
# we don't want to stop {netns,ovs}-cleanup
continue
fi
manage_systemd_service stop "${service%%-clone}"
# So the reason for not reusing check_resource_systemd is that
# I have observed systemctl is-active returning unknown with at least
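Background on the try-restart remark: systemctl try-restart restarts a unit only if it is already running, and exits 0 without doing anything otherwise, so stopping the services before the package update turns any try-restart issued by RPM scriptlets into a no-op. A hypothetical illustration with one unit:

systemctl stop openstack-cinder-api
systemctl try-restart openstack-cinder-api   # unit is stopped, so this is a no-op and returns 0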
@ -107,6 +111,7 @@ if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR
fi
update_os_net_config
# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1669714
update_network
@ -119,6 +124,7 @@ if grep -q '^pipeline = ssl_header_handler faultwrap osvolumeversionapp' /etc/ci
sed -i '$ { /^$/d }' /etc/cinder/api-paste.ini
fi
restore_cleanup_service_definition
yum -y install python-zaqarclient # needed for os-collect-config
yum -y -q update
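A later step re-enables stonith from the flag file written in step 0; that code is not part of the hunks shown here, but presumably amounts to a sketch like this:

if [[ -n $(is_bootstrap_node) ]]; then
    if [ -f /var/tmp/stonith-true ]; then
        pcs property set stonith-enabled=true
    fi
    rm -f /var/tmp/stonith-true
fi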

View File

@ -2,7 +2,7 @@
set -eu
# We need to start the systemd services we explicitly stopped at step _1.sh
# We need to start the systemd services we explicitly stopped at step _0.sh
# We add the enablement of the systemd services here because if a node gets rebooted
# before the convergence step for whatever reason the migrated services will
# not be enabled and we potentially have a bigger disruption.
@ -10,7 +10,14 @@ services=$(services_to_migrate)
if [[ ${keep_sahara_services_on_upgrade} =~ [Ff]alse ]] ; then
services=${services%%openstack-sahara*}
fi
for service in $services; do
if [[ ${service%%-clone} =~ .*-cleanup ]]; then
# we don't want to start {netns,ovs}-cleanup
log_debug "Skipping ${service}"
continue
fi
manage_systemd_service start "${service%%-clone}"
manage_systemd_service enable "${service%%-clone}"
check_resource_systemd "${service%%-clone}" started 600
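The ${service%%-clone} expansion used throughout strips the pacemaker clone suffix to recover the plain systemd unit name. A quick illustration:

service="neutron-metadata-agent-clone"
echo "${service%%-clone}"   # prints: neutron-metadata-agent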

View File

@ -60,7 +60,7 @@ resources:
rolling_update:
max_batch_size: 1
ControllerPacemakerUpgradeConfig_Step1:
ControllerPacemakerUpgradeConfig_Step0:
type: OS::Heat::SoftwareConfig
properties:
group: script
@ -81,12 +81,31 @@ resources:
MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade}
- get_file: pacemaker_common_functions.sh
- get_file: major_upgrade_check.sh
- get_file: major_upgrade_pacemaker_migrations.sh
- get_file: major_upgrade_controller_pacemaker_0.sh
ControllerPacemakerUpgradeDeployment_Step0:
type: OS::Heat::SoftwareDeploymentGroup
depends_on: CephMonUpgradeDeployment
properties:
servers: {get_param: [servers, Controller]}
config: {get_resource: ControllerPacemakerUpgradeConfig_Step0}
input_values: {get_param: input_values}
ControllerPacemakerUpgradeConfig_Step1:
type: OS::Heat::SoftwareConfig
properties:
group: script
config:
list_join:
- '#!/bin/bash'
- - get_file: pacemaker_common_functions.sh
- get_file: major_upgrade_pacemaker_migrations.sh
- get_file: major_upgrade_controller_pacemaker_1.sh
ControllerPacemakerUpgradeDeployment_Step1:
type: OS::Heat::SoftwareDeploymentGroup
depends_on: CephMonUpgradeDeployment
depends_on: ControllerPacemakerUpgradeDeployment_Step0
properties:
servers: {get_param: [servers, Controller]}
config: {get_resource: ControllerPacemakerUpgradeConfig_Step1}
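The depends_on pointing at ControllerPacemakerUpgradeDeployment_Step0 is what serializes the two deployment groups. One way to watch them complete in order during the upgrade, assuming the default overcloud stack name:

openstack stack resource list -n 5 overcloud | grep ControllerPacemakerUpgrade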

View File

@ -101,6 +101,42 @@ function services_to_migrate {
echo $PCMK_RESOURCE_TODELETE
}
# These are oneshot-type services: they run once at boot rather than staying resident.
function neutron_cleanup_services {
echo "
neutron-netns-cleanup-clone
neutron-ovs-cleanup-clone
"
}
function deactivate_cleanup_services {
for service in $(neutron_cleanup_services); do
log_debug "Changing ocf configuration for '${service}'"
# We prevent any stop action by changing the exec to a noop.
local sysconfig_name=${service%-clone}
# This is loaded by /usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup
echo "exec=/bin/echo" >> /etc/sysconfig/${sysconfig_name}
done
# Run the sed once, outside the loop: a second sed -i.orig pass would
# overwrite the pristine .orig backup that restore_cleanup_service_definition
# later moves back into place. Dropping --force keeps a stray cleanup run
# from tearing the namespaces down.
sed -i.orig -e 's/clean --force/clean/' /usr/lib/ocf/lib/neutron/neutron-netns-cleanup
}
function restore_cleanup_service_definition {
for service in $(neutron_cleanup_services); do
log_debug "Restoring original ocf configuration for '${service}'"
local sysconfig_file=/etc/sysconfig/${service%-clone}
if [ -e "${sysconfig_file}" ]; then
sed -e '/exec=\/bin\/echo/d' $sysconfig_file | \
sed -e '/^ *$/d' > /tmp/$service
if test -s /tmp/$service; then
cp /tmp/$service $sysconfig_file
else
rm -f $sysconfig_file
fi
[ ! -e /tmp/$service ] || rm -f /tmp/$service
fi
done
mv /usr/lib/ocf/lib/neutron/neutron-netns-cleanup{.orig,}
}
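A minimal round-trip check of the two helpers above, assuming a test node where /etc/sysconfig/neutron-ovs-cleanup did not exist beforehand:

deactivate_cleanup_services
grep '^exec=/bin/echo' /etc/sysconfig/neutron-ovs-cleanup    # the noop override is in place
restore_cleanup_service_definition
test -e /etc/sysconfig/neutron-ovs-cleanup || echo "override removed"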
# This function migrates a Mitaka system, where all the resources are managed
# via pacemaker, to a Newton setup, where only a few services are managed by pacemaker.
# On a high level it operates as follows:
@ -148,7 +184,7 @@ function migrate_full_to_ng_ha {
# that will move to systemd.
# We want the systemd resources be stopped before doing "yum update",
# that way "systemctl try-restart <service>" is no-op because the
# service was down already
PCS_STATUS_OUTPUT="$(pcs status)"
for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do
if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then

View File

@ -4,6 +4,28 @@ set -eu
DEBUG="true" # set false if the verbosity is a problem
SCRIPT_NAME=$(basename $0)
# This block gets the list of OVS bridges for fail-mode handling during the upgrade.
function get_all_bridges {
local bridges_def=""
local bridges=""
if which ovs-vsctl &>/dev/null; then
if [ -e /etc/neutron/plugins/ml2/openvswitch_agent.ini ]; then
local raw_bridge_def=$(crudini --get /etc/neutron/plugins/ml2/openvswitch_agent.ini ovs bridge_mappings)
while IFS=: read physnet bridge; do bridges_def="${bridges_def} ${bridge}"; done \
< <(echo "${raw_bridge_def}" | sed 's/,/\n/g')
local existing_bridges="$(ovs-vsctl -f table -d bare --column=name --no-headings find Bridge)"
for br in ${bridges_def}; do
if echo "${existing_bridges}" | grep -q $br; then
bridges="${bridges} ${br}"
fi
done
fi
fi
echo "${bridges}"
}
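bridge_mappings in openvswitch_agent.ini is a comma-separated list of physnet:bridge pairs. The parsing loop above can be exercised on its own with an example value:

raw_bridge_def="datacentre:br-ex,tenant:br-vlan"   # example mapping
while IFS=: read physnet bridge; do
    echo "${bridge}"
done < <(echo "${raw_bridge_def}" | sed 's/,/\n/g')
# prints: br-ex
#         br-vlan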
function log_debug {
if [[ $DEBUG = "true" ]]; then
echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1"
@ -325,25 +347,32 @@ function special_case_ovs_upgrade_if_needed {
}
# update os-net-config before ovs see https://bugs.launchpad.net/tripleo/+bug/1695893
function update_os_net_config() {
set +e
local need_update="$(yum check-update | grep os-net-config)"
if [ -n "${need_update}" ]; then
yum -q -y update os-net-config
local return_code=$?
log_debug "yum update os-net-config return code: $return_code"
# We just make sure that os-net-config won't ifdown/ifup
# network interfaces. The current set of changes (Tue Oct 3
# 17:38:37 CEST 2017) doesn't require the os-net-config changes
# to be applied live; they will be applied at the next reboot.
os-net-config --no-activate -c /etc/os-net-config/config.json -v \
--detailed-exit-codes
local os_net_retval=$?
if [[ $os_net_retval == 2 ]]; then
log_debug "os-net-config: interface configuration files updated successfully"
elif [[ $os_net_retval != 0 ]]; then
log_debug "ERROR: os-net-config configuration failed"
exit $os_net_retval
fi
fi
set -e
}
function update_network() {
set +e
yum -q -y update os-net-config
return_code=$?
echo "yum update os-net-config return code: $return_code"
# Writes any changes caused by alterations to os-net-config and bounces the
# interfaces *before* restarting the cluster.
os-net-config -c /etc/os-net-config/config.json -v --detailed-exit-codes
RETVAL=$?
if [[ $RETVAL == 2 ]]; then
echo "os-net-config: interface configuration files updated successfully"
elif [[ $RETVAL != 0 ]]; then
echo "ERROR: os-net-config configuration failed"
exit $RETVAL
fi
set -e
# special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
special_case_ovs_upgrade_if_needed
}
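For reference, the exit-code handling in both functions follows the os-net-config --detailed-exit-codes contract: 0 means no changes were needed, 2 means configuration files were rewritten successfully, anything else is an error. Condensed into one sketch:

os-net-config --no-activate -c /etc/os-net-config/config.json -v --detailed-exit-codes
case $? in
    0) echo "nothing to do" ;;
    2) echo "config files updated; applied at next reboot because of --no-activate" ;;
    *) echo "os-net-config failed"; exit 1 ;;
esac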