Merge "Keep floating ip reachability during pacemaker migration." into stable/newton
This commit is contained in:
commit
186e2cbb15
|
@ -0,0 +1,53 @@
|
|||
#!/bin/bash
# Pre-upgrade (step 0) checks and preparation on controller nodes for the
# Mitaka -> Newton pacemaker upgrade.

set -eu

check_cluster
check_pcsd
if [[ -n $(is_bootstrap_node) ]]; then
    check_clean_cluster
fi
check_python_rpm
check_galera_root_password
check_disk_for_mysql_dump

# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
# they are completely open (ACCEPT)
# Now when we run the convergence step while migrating to Newton we enable the firewall
# by default and this will actually first load the rules from /etc/sysconfig/iptables
# and only afterwards, it will start adding all the rules permitting openstack traffic.
# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
# Let's simply move the existing file out of the way, it will be recreated by
# puppet in newton with the proper firewall rules anyway
if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
    # Best-effort: the file may legitimately be absent, so never fail here.
    mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || true
fi

# We want to disable fencing during the cluster --stop as it might fence
# nodes where a service fails to stop, which could be fatal during an upgrade
# procedure. So we remember the stonith state. If it was enabled we reenable it
# at the end of this script
if [[ -n $(is_bootstrap_node) ]]; then
    STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
    # We create this empty file if stonith was set to true so we can reenable stonith in step2
    rm -f /var/tmp/stonith-true
    # Quoted: pcs output may be empty, and an unquoted empty operand would make
    # the test a syntax error, aborting the script under "set -e".
    if [ "${STONITH_STATE}" == "true" ]; then
        touch /var/tmp/stonith-true
    fi
    pcs property set stonith-enabled=false
fi

# Before migration and service stopping we make sure that the external
# bridge is set to standalone in ovs. This is because we don't do a
# rolling upgrade, but we don't want the floating ip network to be cut
# off.
for br in $(get_all_bridges); do
    # It will be set to whatever is needed at the end of the upgrade by
    # ovs-agent processes.
    ovs-vsctl set-fail-mode "${br}" standalone
done

# Make sure openvswitch *cleanup* does not happen.
deactivate_cleanup_services
|
|
@ -2,45 +2,7 @@
|
|||
|
||||
set -eu
|
||||
|
||||
check_cluster
|
||||
check_pcsd
|
||||
if [[ -n $(is_bootstrap_node) ]]; then
|
||||
check_clean_cluster
|
||||
fi
|
||||
check_python_rpm
|
||||
check_galera_root_password
|
||||
check_disk_for_mysql_dump
|
||||
|
||||
# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
|
||||
# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
|
||||
# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
|
||||
# they are completely open (ACCEPT)
|
||||
# Now when we run the convergence step while migrating to Newton we enable the firewall
|
||||
# by default and this will actually first load the rules from /etc/sysconfig/iptables
|
||||
# and only afterwards, it will start adding all the rules permitting openstack traffic.
|
||||
# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
|
||||
# Let's simply move the existing file out of the way, it will be recreated by
|
||||
# puppet in newton with the proper firewall rules anyway
|
||||
if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
|
||||
mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
|
||||
fi
|
||||
|
||||
# We want to disable fencing during the cluster --stop as it might fence
|
||||
# nodes where a service fails to stop, which could be fatal during an upgrade
|
||||
# procedure. So we remember the stonith state. If it was enabled we reenable it
|
||||
# at the end of this script
|
||||
if [[ -n $(is_bootstrap_node) ]]; then
|
||||
STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
|
||||
# We create this empty file if stonith was set to true so we can reenable stonith in step2
|
||||
rm -f /var/tmp/stonith-true
|
||||
if [ $STONITH_STATE == "true" ]; then
|
||||
touch /var/tmp/stonith-true
|
||||
fi
|
||||
pcs property set stonith-enabled=false
|
||||
fi
|
||||
|
||||
# Migrate to HA NG
|
||||
if [[ -n $(is_bootstrap_node) ]]; then
|
||||
migrate_full_to_ng_ha
|
||||
fi
|
||||
|
||||
|
|
|
@ -11,6 +11,10 @@ cluster_sync_timeout=1800
|
|||
# systemctl try-restart is a noop
|
||||
|
||||
for service in $(services_to_migrate); do
|
||||
if [[ ${service%%-clone} =~ .*-cleanup ]]; then
|
||||
# we don't want to stop {netns,ovs}-cleanup
|
||||
continue
|
||||
fi
|
||||
manage_systemd_service stop "${service%%-clone}"
|
||||
# So the reason for not reusing check_resource_systemd is that
|
||||
# I have observed systemctl is-active returning unknown with at least
|
||||
|
@ -107,6 +111,7 @@ if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
|
|||
mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR
|
||||
fi
|
||||
|
||||
update_os_net_config
|
||||
# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1669714
|
||||
update_network
|
||||
|
||||
|
@ -119,6 +124,7 @@ if grep -q '^pipeline = ssl_header_handler faultwrap osvolumeversionapp' /etc/ci
|
|||
sed -i '$ { /^$/d }' /etc/cinder/api-paste.ini
|
||||
fi
|
||||
|
||||
restore_cleanup_service_definition
|
||||
|
||||
yum -y install python-zaqarclient # needed for os-collect-config
|
||||
yum -y -q update
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
set -eu
|
||||
|
||||
# We need to start the systemd services we explicitely stopped at step _1.sh
|
||||
# We need to start the systemd services we explicitly stopped at step _0.sh
|
||||
# We add the enablement of the systemd services here because if a node gets rebooted
|
||||
# before the convergence step for whatever reason the migrated services will
|
||||
# not be enabled and we potentially have a bigger disruption.
|
||||
|
@ -10,7 +10,14 @@ services=$(services_to_migrate)
|
|||
if [[ ${keep_sahara_services_on_upgrade} =~ [Ff]alse ]] ; then
|
||||
services=${services%%openstack-sahara*}
|
||||
fi
|
||||
|
||||
for service in $services; do
|
||||
if [[ ${service%%-clone} =~ .*-cleanup ]]; then
|
||||
# we don't want to start {netns,ovs}-cleanup
|
||||
log_debug "Skipping ${service}"
|
||||
continue
|
||||
fi
|
||||
|
||||
manage_systemd_service start "${service%%-clone}"
|
||||
manage_systemd_service enable "${service%%-clone}"
|
||||
check_resource_systemd "${service%%-clone}" started 600
|
||||
|
|
|
@ -60,7 +60,7 @@ resources:
|
|||
rolling_update:
|
||||
max_batch_size: 1
|
||||
|
||||
ControllerPacemakerUpgradeConfig_Step1:
|
||||
ControllerPacemakerUpgradeConfig_Step0:
|
||||
type: OS::Heat::SoftwareConfig
|
||||
properties:
|
||||
group: script
|
||||
|
@ -81,12 +81,31 @@ resources:
|
|||
MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade}
|
||||
- get_file: pacemaker_common_functions.sh
|
||||
- get_file: major_upgrade_check.sh
|
||||
- get_file: major_upgrade_pacemaker_migrations.sh
|
||||
- get_file: major_upgrade_controller_pacemaker_0.sh
|
||||
|
||||
ControllerPacemakerUpgradeDeployment_Step0:
|
||||
type: OS::Heat::SoftwareDeploymentGroup
|
||||
depends_on: CephMonUpgradeDeployment
|
||||
properties:
|
||||
servers: {get_param: [servers, Controller]}
|
||||
config: {get_resource: ControllerPacemakerUpgradeConfig_Step0}
|
||||
input_values: {get_param: input_values}
|
||||
|
||||
ControllerPacemakerUpgradeConfig_Step1:
|
||||
type: OS::Heat::SoftwareConfig
|
||||
properties:
|
||||
group: script
|
||||
config:
|
||||
list_join:
|
||||
- '#!/bin/bash'
|
||||
- - get_file: pacemaker_common_functions.sh
|
||||
- get_file: major_upgrade_pacemaker_migrations.sh
|
||||
- get_file: major_upgrade_controller_pacemaker_1.sh
|
||||
|
||||
ControllerPacemakerUpgradeDeployment_Step1:
|
||||
type: OS::Heat::SoftwareDeploymentGroup
|
||||
depends_on: CephMonUpgradeDeployment
|
||||
depends_on: ControllerPacemakerUpgradeDeployment_Step0
|
||||
properties:
|
||||
servers: {get_param: [servers, Controller]}
|
||||
config: {get_resource: ControllerPacemakerUpgradeConfig_Step1}
|
||||
|
|
|
@ -101,6 +101,42 @@ function services_to_migrate {
|
|||
echo $PCMK_RESOURCE_TODELETE
|
||||
}
|
||||
|
||||
# Those are oneshot type services.
|
||||
# List the oneshot neutron cleanup pacemaker clone resources, one per line.
# Callers consume the output through word-splitting ($(neutron_cleanup_services)).
function neutron_cleanup_services {
    local svc
    for svc in neutron-netns-cleanup-clone neutron-ovs-cleanup-clone; do
        printf '%s\n' "${svc}"
    done
}
|
||||
|
||||
# Neuter the neutron netns/ovs cleanup OCF resources so that stopping the
# cluster cannot run their destructive "clean" actions (which would wipe
# namespaces/flows and cut floating-ip traffic during the upgrade).
# Reverted later by restore_cleanup_service_definition.
function deactivate_cleanup_services {
    for service in $(neutron_cleanup_services); do
        log_debug "Changing ocf configuration for '${service}'"
        # We prevent any stop action by changing the exec to a noop.
        local sysconfig_name=${service%-clone}
        # This is loaded by /usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup
        echo "exec=/bin/echo" >> "/etc/sysconfig/${sysconfig_name}"
    done
    # Patch the shared OCF helper exactly once, OUTSIDE the loop: a second
    # "sed -i.orig" pass would overwrite the pristine .orig backup with the
    # already-patched file, and restore_cleanup_service_definition depends on
    # .orig holding the original content.
    sed -i.orig -e 's/clean --force/clean/' /usr/lib/ocf/lib/neutron/neutron-netns-cleanup
}
|
||||
|
||||
# Undo deactivate_cleanup_services: strip the "exec=/bin/echo" override from
# each cleanup service's sysconfig file (deleting the file if nothing else is
# left in it) and restore the original OCF helper script from its .orig backup.
function restore_cleanup_service_definition {
    local tmpfile
    for service in $(neutron_cleanup_services); do
        log_debug "Restoring original ocf configuration for '${service}'"
        local sysconfig_file=/etc/sysconfig/${service%-clone}
        if [ -e "${sysconfig_file}" ]; then
            # mktemp instead of a predictable /tmp/$service name (symlink-attack safe).
            tmpfile=$(mktemp)
            sed -e '/exec=\/bin\/echo/d' "${sysconfig_file}" | \
                sed -e '/^ *$/d' > "${tmpfile}"
            if test -s "${tmpfile}"; then
                # Other settings remain: keep the file without our override.
                cp "${tmpfile}" "${sysconfig_file}"
            else
                # Our override was the only content: remove the file entirely.
                rm -f "${sysconfig_file}"
            fi
            rm -f "${tmpfile}"
        fi
    done
    # Only restore if the backup exists; under "set -eu" an unconditional mv
    # would abort the whole upgrade step when the backup is missing (e.g. on
    # a re-run after a previous successful restore).
    if [ -e /usr/lib/ocf/lib/neutron/neutron-netns-cleanup.orig ]; then
        mv /usr/lib/ocf/lib/neutron/neutron-netns-cleanup{.orig,}
    fi
}
|
||||
# This function will migrate a mitaka system where all the resources are managed
|
||||
# via pacemaker to a newton setup where only a few services will be managed by pacemaker
|
||||
# On a high-level it will operate as follows:
|
||||
|
@ -148,7 +184,7 @@ function migrate_full_to_ng_ha {
|
|||
# that will move to systemd.
|
||||
# We want the systemd resources be stopped before doing "yum update",
|
||||
# that way "systemctl try-restart <service>" is no-op because the
|
||||
# service was down already
|
||||
# service was down already
|
||||
PCS_STATUS_OUTPUT="$(pcs status)"
|
||||
for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do
|
||||
if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then
|
||||
|
|
|
@ -4,6 +4,28 @@ set -eu
|
|||
|
||||
DEBUG="true" # set false if the verbosity is a problem
|
||||
SCRIPT_NAME=$(basename $0)
|
||||
|
||||
# This block get default for ovs fail mode handling during upgrade.
|
||||
# Print (on stdout, space-separated) the OVS bridges named in neutron's
# bridge_mappings that actually exist in the local OVS database.
# Prints an empty string when ovs-vsctl or the agent config is absent.
function get_all_bridges {
    local bridges_def=""
    local bridges=""
    if which ovs-vsctl &>/dev/null; then
        if [ -e /etc/neutron/plugins/ml2/openvswitch_agent.ini ]; then
            # bridge_mappings looks like "physnet1:br-ex,physnet2:br-foo";
            # keep only the bridge half of each pair.
            local raw_bridge_def
            raw_bridge_def=$(crudini --get /etc/neutron/plugins/ml2/openvswitch_agent.ini ovs bridge_mappings)
            while IFS=: read physnet bridge; do bridges_def="${bridges_def} ${bridge}" ; done \
                < <(echo "${raw_bridge_def}" | sed 's/,/\n/g')
            local existing_bridges
            existing_bridges="$(ovs-vsctl -f table -d bare --column=name --no-headings find Bridge)"
            for br in ${bridges_def}; do
                # -w: whole-word match; a plain substring grep would wrongly
                # accept "br-ex" when only "br-ext" exists.
                if echo "${existing_bridges}" | grep -qw -- "${br}"; then
                    bridges="${bridges} ${br}"
                fi
            done
        fi
    fi
    echo "${bridges}"
}
|
||||
|
||||
function log_debug {
|
||||
if [[ $DEBUG = "true" ]]; then
|
||||
echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1"
|
||||
|
@ -325,25 +347,32 @@ function special_case_ovs_upgrade_if_needed {
|
|||
}
|
||||
|
||||
# update os-net-config before ovs see https://bugs.launchpad.net/tripleo/+bug/1695893
|
||||
# Update the os-net-config package (if an update is pending) and regenerate
# the interface configuration files WITHOUT bouncing any interface
# (--no-activate); changes are picked up at the next reboot.
# See https://bugs.launchpad.net/tripleo/+bug/1695893
function update_os_net_config() {
    # yum check-update exits non-zero when updates are pending, hence set +e.
    set +e
    # NOTE: the yum subcommand is "check-update"; the previous spelling
    # "check-upgrade" is not a yum command, so the pending update was never
    # detected and this function silently did nothing.
    local need_update
    need_update="$(yum check-update | grep os-net-config)"
    if [ -n "${need_update}" ]; then
        yum -q -y update os-net-config
        # Split from the yum line so $? reflects yum, not "local".
        local return_code=$?
        log_debug "yum update os-net-config return code: $return_code"

        # We're just making sure that os-net-config won't ifdown/ifup
        # network interfaces. The current set of changes (Tue Oct 3
        # 17:38:37 CEST 2017) doesn't require the os-net-config change
        # to be taken live. They will be applied at next reboot.
        os-net-config --no-activate -c /etc/os-net-config/config.json -v \
            --detailed-exit-codes
        local os_net_retval=$?
        # With --detailed-exit-codes, 2 means "files changed successfully".
        if [[ $os_net_retval == 2 ]]; then
            log_debug "os-net-config: interface configuration files updated successfully"
        elif [[ $os_net_retval != 0 ]]; then
            log_debug "ERROR: os-net-config configuration failed"
            exit $os_net_retval
        fi
    fi
    set -e
}
|
||||
|
||||
# Update os-net-config and apply the resulting network configuration live,
# bouncing interfaces *before* the cluster is restarted, then apply the
# OVS special-case workaround.
function update_network() {
    set +e
    yum -q -y update os-net-config
    return_code=$?
    echo "yum update os-net-config return code: $return_code"

    # Writes any changes caused by alterations to os-net-config and bounces the
    # interfaces *before* restarting the cluster.
    os-net-config -c /etc/os-net-config/config.json -v --detailed-exit-codes

    RETVAL=$?
    case "$RETVAL" in
        2)
            # --detailed-exit-codes: 2 == configuration files were rewritten.
            echo "os-net-config: interface configuration files updated successfully"
            ;;
        0)
            : # nothing changed, nothing to report
            ;;
        *)
            echo "ERROR: os-net-config configuration failed"
            exit $RETVAL
            ;;
    esac
    set -e

    # special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
    special_case_ovs_upgrade_if_needed
}
|
||||
|
|
Loading…
Reference in New Issue