From bce61783bc175e98b535c678d90829344dab5c47 Mon Sep 17 00:00:00 2001
From: Sofer Athlan-Guyot
Date: Fri, 16 Jun 2017 14:53:14 +0200
Subject: [PATCH] Keep floating ip reachability during pacemaker migration.

The neutron-netns-cleanup-clone and neutron-ovs-cleanup-clone resources
are one-shot resources that only need to be activated during bootup and
shutdown of a node. Triggering a stop has the side effect of removing
entries in the ovs db, making floating ips unreachable.

They are triggered by the
/usr/lib/ocf/resource.d/neutron/{OVSCleanup,NetnsCleanup} pacemaker
resources. They, in turn, use the
/usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup scripts (not the
systemd unit files).

We temporarily disable any action by configuring the executable to be
"/bin/echo" instead of /usr/bin/neutron-{ovs,netns}-cleanup and by
removing the "--force" option in neutron-netns-cleanup.

As those are cloned resources we need to make sure that the
modification is done on all controller nodes before we take action on
the controller bootstrap node. To do that we move most of Step1 to
Step0 and make the bootstrap node action happen at Step1 of the
pacemaker controller upgrade.

Furthermore we make sure that the external bridges, if ovs is used, are
not in secure mode: we set them to standalone during the upgrade
process, and they are set back to whatever they were before once the
upgrade completes.

Finally we need to take care of the os-net-config upgrade. It can add
new parameters to the interface definitions, which would force a
restart of the interfaces. To avoid that we add the --no-activate
option. Currently no major changes in os-net-config are required for
the overcloud to continue running after the upgrade.

Co-Authored-By: Raoul Scarazzini
Change-Id: Ib5d7b447808b51f6e436eaf6d661606132155a23
Depends-On: Ieb5ad6ad429c8388a1cbbd650339b6eecd9b7997
Closes-Bug: #1698373
---
 .../major_upgrade_controller_pacemaker_0.sh     | 53 +++++++++++++++
 .../major_upgrade_controller_pacemaker_1.sh     | 38 -----------
 .../major_upgrade_controller_pacemaker_2.sh     |  6 ++
 .../major_upgrade_controller_pacemaker_6.sh     |  9 ++-
 .../tasks/major_upgrade_pacemaker.yaml          | 23 ++++++-
 .../major_upgrade_pacemaker_migrations.sh       | 38 ++++++++++-
 .../tasks/pacemaker_common_functions.sh         | 65 ++++++++++++++-----
 7 files changed, 172 insertions(+), 60 deletions(-)
 create mode 100644 extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh
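A quick way to eyeball the result of the new step 0 on a controller node is
sketched below. This is purely illustrative and not part of the scripts in
this patch; it only assumes that ovs-vsctl is installed on the node and that
step 0 has already run there.

    # Print the fail mode of every ovs bridge: the bridges coming from the
    # neutron bridge_mappings (e.g. the external bridge) should now report
    # "standalone"; other bridges such as br-int are not touched.
    for br in $(ovs-vsctl list-br); do
        echo "${br}: $(ovs-vsctl get-fail-mode ${br})"
    done

    # The cleanup wrappers should be neutered (exec overridden with /bin/echo)
    # until step 2 restores the original sysconfig files.
    grep -H '^exec=' /etc/sysconfig/neutron-ovs-cleanup /etc/sysconfig/neutron-netns-cleanup
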
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh
new file mode 100644
index 0000000000..efed76d8ff
--- /dev/null
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+set -eu
+
+check_cluster
+check_pcsd
+if [[ -n $(is_bootstrap_node) ]]; then
+    check_clean_cluster
+fi
+check_python_rpm
+check_galera_root_password
+check_disk_for_mysql_dump
+
+# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
+# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
+# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
+# they are completely open (ACCEPT)
+# Now when we run the convergence step while migrating to Newton we enable the firewall
+# by default and this will actually first load the rules from /etc/sysconfig/iptables
+# and only afterwards, it will start adding all the rules permitting openstack traffic.
+# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
+# Let's simply move the existing file out of the way, it will be recreated by
+# puppet in newton with the proper firewall rules anyway
+if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
+    mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
+fi
+
+# We want to disable fencing during the cluster --stop as it might fence
+# nodes where a service fails to stop, which could be fatal during an upgrade
+# procedure. So we remember the stonith state. If it was enabled we reenable it
+# at the end of this script
+if [[ -n $(is_bootstrap_node) ]]; then
+    STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
+    # We create this empty file if stonith was set to true so we can reenable stonith in step2
+    rm -f /var/tmp/stonith-true
+    if [ $STONITH_STATE == "true" ]; then
+        touch /var/tmp/stonith-true
+    fi
+    pcs property set stonith-enabled=false
+fi
+
+# Before migration and service stopping we make sure that the external
+# bridge is set to standalone in ovs. This is because we don't do a
+# rolling upgrade, but we don't want the floating ip network to be cut
+# off.
+for br in $(get_all_bridges); do
+    # It will be set to whatever is needed at the end of the upgrade by
+    # the ovs-agent processes.
+    ovs-vsctl set-fail-mode ${br} standalone
+done
+
+# Make sure openvswitch *cleanup* does not happen.
+deactivate_cleanup_services
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
index 2eb36139a5..3401b90725 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
@@ -2,45 +2,7 @@
 
 set -eu
 
-check_cluster
-check_pcsd
-if [[ -n $(is_bootstrap_node) ]]; then
-    check_clean_cluster
-fi
-check_python_rpm
-check_galera_root_password
-check_disk_for_mysql_dump
-
-# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
-# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
-# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
-# they are completely open (ACCEPT)
-# Now when we run the convergence step while migrating to Newton we enable the firewall
-# by default and this will actually first load the rules from /etc/sysconfig/iptables
-# and only afterwards, it will start adding all the rules permitting openstack traffic.
-# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
-# Let's simply move the existing file out of the way, it will be recreated by
-# puppet in newton with the proper firewall rules anyway
-if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
-    mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
-fi
-
-# We want to disable fencing during the cluster --stop as it might fence
-# nodes where a service fails to stop, which could be fatal during an upgrade
-# procedure. So we remember the stonith state. If it was enabled we reenable it
-# at the end of this script
-if [[ -n $(is_bootstrap_node) ]]; then
-    STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
-    # We create this empty file if stonith was set to true so we can reenable stonith in step2
-    rm -f /var/tmp/stonith-true
-    if [ $STONITH_STATE == "true" ]; then
-        touch /var/tmp/stonith-true
-    fi
-    pcs property set stonith-enabled=false
-fi
-
 # Migrate to HA NG
 if [[ -n $(is_bootstrap_node) ]]; then
     migrate_full_to_ng_ha
 fi
-
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
index fa1e223adf..534a0a2f9f 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
@@ -11,6 +11,10 @@ cluster_sync_timeout=1800
 # systemctl try-restart is a noop
 for service in $(services_to_migrate); do
+    if [[ ${service%%-clone} =~ .*-cleanup ]]; then
+        # we don't want to stop {netns,ovs}-cleanup
+        continue
+    fi
     manage_systemd_service stop "${service%%-clone}"
     # So the reason for not reusing check_resource_systemd is that
     # I have observed systemctl is-active returning unknown with at least
@@ -107,6 +111,7 @@ if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
     mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR
 fi
 
+update_os_net_config
 # Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1669714
 update_network
 
@@ -119,6 +124,7 @@ if grep -q '^pipeline = ssl_header_handler faultwrap osvolumeversionapp' /etc/ci
     sed -i '$ { /^$/d }' /etc/cinder/api-paste.ini
 fi
 
+restore_cleanup_service_definition
 yum -y install python-zaqarclient  # needed for os-collect-config
 yum -y -q update
 
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
index 719ac2a39a..f70f10de68 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
@@ -2,7 +2,7 @@
 
 set -eu
 
-# We need to start the systemd services we explicitely stopped at step _1.sh
+# We need to start the systemd services we explicitly stopped at step _0.sh
 # We add the enablement of the systemd services here because if a node gets rebooted
 # before the convergence step for whatever reason the migrated services will
 # not be enabled and we potentially have a bigger disruption.
@@ -10,7 +10,14 @@ services=$(services_to_migrate)
 if [[ ${keep_sahara_services_on_upgrade} =~ [Ff]alse ]] ; then
     services=${services%%openstack-sahara*}
 fi
+
 for service in $services; do
+    if [[ ${service%%-clone} =~ .*-cleanup ]]; then
+        # we don't want to start {netns,ovs}-cleanup
+        log_debug "Skipping ${service}"
+        continue
+    fi
+
     manage_systemd_service start "${service%%-clone}"
     manage_systemd_service enable "${service%%-clone}"
     check_resource_systemd "${service%%-clone}" started 600
diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml
index 370660f172..c8f21463c4 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker.yaml
+++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml
@@ -60,7 +60,7 @@ resources:
       rolling_update:
         max_batch_size: 1
 
-  ControllerPacemakerUpgradeConfig_Step1:
+  ControllerPacemakerUpgradeConfig_Step0:
     type: OS::Heat::SoftwareConfig
     properties:
       group: script
@@ -81,12 +81,31 @@ resources:
               MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade}
           - get_file: pacemaker_common_functions.sh
           - get_file: major_upgrade_check.sh
+          - get_file: major_upgrade_pacemaker_migrations.sh
+          - get_file: major_upgrade_controller_pacemaker_0.sh
+
+  ControllerPacemakerUpgradeDeployment_Step0:
+    type: OS::Heat::SoftwareDeploymentGroup
+    depends_on: CephMonUpgradeDeployment
+    properties:
+      servers: {get_param: [servers, Controller]}
+      config: {get_resource: ControllerPacemakerUpgradeConfig_Step0}
+      input_values: {get_param: input_values}
+
+  ControllerPacemakerUpgradeConfig_Step1:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config:
+        list_join:
+        - '#!/bin/bash'
+        - - get_file: pacemaker_common_functions.sh
           - get_file: major_upgrade_pacemaker_migrations.sh
           - get_file: major_upgrade_controller_pacemaker_1.sh
 
   ControllerPacemakerUpgradeDeployment_Step1:
     type: OS::Heat::SoftwareDeploymentGroup
-    depends_on: CephMonUpgradeDeployment
+    depends_on: ControllerPacemakerUpgradeDeployment_Step0
     properties:
       servers: {get_param: [servers, Controller]}
       config: {get_resource: ControllerPacemakerUpgradeConfig_Step1}
diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
index d7b50aa192..ec9fc0ac55 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
+++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
@@ -101,6 +101,42 @@ function services_to_migrate {
     echo $PCMK_RESOURCE_TODELETE
 }
 
+# These are one-shot services.
+function neutron_cleanup_services {
+    echo "
+    neutron-netns-cleanup-clone
+    neutron-ovs-cleanup-clone
+    "
+}
+
+function deactivate_cleanup_services {
+    for service in $(neutron_cleanup_services); do
+        log_debug "Changing ocf configuration for '${service}'"
+        # We prevent any stop action by changing the exec to a noop.
+        local sysconfig_name=${service%-clone}
+        # This is loaded by /usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup
+        echo "exec=/bin/echo" >> /etc/sysconfig/${sysconfig_name}
+        sed -i.orig -e 's/clean --force/clean/' /usr/lib/ocf/lib/neutron/neutron-netns-cleanup
+    done
+}
+
+function restore_cleanup_service_definition {
+    for service in $(neutron_cleanup_services); do
+        log_debug "Restoring original ocf configuration for '${service}'"
+        local sysconfig_file=/etc/sysconfig/${service%-clone}
+        if [ -e "${sysconfig_file}" ]; then
+            sed -e '/exec=\/bin\/echo/d' $sysconfig_file | \
+                sed -e '/^ *$/d' > /tmp/$service
+            if test -s /tmp/$service; then
+                cp /tmp/$service $sysconfig_file
+            else
+                rm -f $sysconfig_file
+            fi
+            [ ! -e /tmp/$service ] || rm -f /tmp/$service
+        fi
+    done
+    mv /usr/lib/ocf/lib/neutron/neutron-netns-cleanup{.orig,}
+}
 
 # This function will migrate a mitaka system where all the resources are managed
 # via pacemaker to a newton setup where only a few services will be managed by pacemaker
 # On a high-level it will operate as follows:
@@ -148,7 +184,7 @@ function migrate_full_to_ng_ha {
     # that will move to systemd.
     # We want the systemd resources be stopped before doing "yum update",
     # that way "systemctl try-restart <service>" is no-op because the
-    # service was down already 
+    # service was down already
     PCS_STATUS_OUTPUT="$(pcs status)"
     for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do
         if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then
diff --git a/extraconfig/tasks/pacemaker_common_functions.sh b/extraconfig/tasks/pacemaker_common_functions.sh
index adf24e735e..cc8cf8c63a 100755
--- a/extraconfig/tasks/pacemaker_common_functions.sh
+++ b/extraconfig/tasks/pacemaker_common_functions.sh
@@ -4,6 +4,28 @@ set -eu
 
 DEBUG="true" # set false if the verbosity is a problem
 SCRIPT_NAME=$(basename $0)
+
+# This block gets the ovs bridges used for fail mode handling during the upgrade.
+function get_all_bridges {
+    local bridges_def=""
+    local bridges=""
+    if which ovs-vsctl &>/dev/null; then
+        if [ -e /etc/neutron/plugins/ml2/openvswitch_agent.ini ]; then
+            local raw_bridge_def=$(crudini --get /etc/neutron/plugins/ml2/openvswitch_agent.ini ovs bridge_mappings)
+            local bridges=""
+            while IFS=: read physnet bridge; do bridges_def="${bridges_def} ${bridge}" ; done \
+                < <(echo "${raw_bridge_def}" | sed 's/,/\n/g')
+            local existing_bridges="$(ovs-vsctl -f table -d bare --column=name --no-headings find Bridge)"
+            for br in ${bridges_def}; do
+                if echo "${existing_bridges}" | grep -q $br; then
+                    bridges="${bridges} ${br}"
+                fi
+            done
+        fi
+    fi
+    echo "${bridges}"
+}
+
 function log_debug {
     if [[ $DEBUG = "true" ]]; then
         echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1"
@@ -325,25 +347,32 @@ function special_case_ovs_upgrade_if_needed {
 }
 
 # update os-net-config before ovs see https://bugs.launchpad.net/tripleo/+bug/1695893
+function update_os_net_config() {
+    set +e
+    local need_update="$(yum check-update | grep os-net-config)"
+    if [ -n "${need_update}" ]; then
+        yum -q -y update os-net-config
+        local return_code=$?
+        log_debug "yum update os-net-config return code: $return_code"
+
+        # We just make sure that os-net-config won't ifdown/ifup
+        # network interfaces. The current set of changes (Tue Oct 3
+        # 17:38:37 CEST 2017) doesn't require the os-net-config change
+        # to be taken live. They will be applied at the next reboot.
+        os-net-config --no-activate -c /etc/os-net-config/config.json -v \
+            --detailed-exit-codes
+        local os_net_retval=$?
+        if [[ $os_net_retval == 2 ]]; then
+            log_debug "os-net-config: interface configuration files updated successfully"
+        elif [[ $os_net_retval != 0 ]]; then
+            log_debug "ERROR: os-net-config configuration failed"
+            exit $os_net_retval
+        fi
+    fi
+    set -e
+}
+
 function update_network() {
-    set +e
-    yum -q -y update os-net-config
-    return_code=$?
-    echo "yum update os-net-config return code: $return_code"
-
-    # Writes any changes caused by alterations to os-net-config and bounces the
-    # interfaces *before* restarting the cluster.
-    os-net-config -c /etc/os-net-config/config.json -v --detailed-exit-codes
-
-    RETVAL=$?
-    if [[ $RETVAL == 2 ]]; then
-        echo "os-net-config: interface configuration files updated successfully"
-    elif [[ $RETVAL != 0 ]]; then
-        echo "ERROR: os-net-config configuration failed"
-        exit $RETVAL
-    fi
-    set -e
-
     # special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
     special_case_ovs_upgrade_if_needed
 }
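
Side note on the skip logic used in steps 2 and 6 above: ${service%%-clone}
strips the "-clone" suffix and the regex match then filters out the cleanup
resources. A minimal standalone sketch (the service list below is made up for
the example and is not the real output of services_to_migrate) behaves like
this:

    #!/bin/bash
    # Only the *-cleanup entries are skipped; everything else is processed.
    for service in neutron-netns-cleanup-clone neutron-ovs-cleanup-clone openstack-cinder-api-clone; do
        if [[ ${service%%-clone} =~ .*-cleanup ]]; then
            echo "skipping ${service}"
            continue
        fi
        echo "handling ${service%%-clone}"
    done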