From bce61783bc175e98b535c678d90829344dab5c47 Mon Sep 17 00:00:00 2001
From: Sofer Athlan-Guyot
Date: Fri, 16 Jun 2017 14:53:14 +0200
Subject: [PATCH] Keep floating ip reachability during pacemaker migration.

The neutron-netns-cleanup-clone and neutron-ovs-cleanup-clone resources
are one-shot resources that only need to be activated during bootup and
shutdown of a node. Triggering a stop has the side effect of removing
entries in the ovs db, making floating ips unreachable.

They are triggered by the
/usr/lib/ocf/resource.d/neutron/{OVSCleanup,NetnsCleanup} pacemaker
resources. They, in turn, use the
/usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup scripts (not the
systemd unit files).

We temporarily disable any action by configuring the executable to be
"/bin/echo" instead of /usr/bin/neutron-{ovs,netns}-cleanup and by
removing the "--force" option in neutron-netns-cleanup.

As those are cloned resources we need to make sure that the
modification is done on all controller nodes before we take action on
the controller bootstrap node. To do that we move most of Step1 to
Step0 and make the bootstrap node action happen at Step1 of the
pacemaker controller upgrade.

Furthermore we make sure that the external bridges, if ovs is used, are
not in secure mode: we set them to standalone during the upgrade
process, and they are set back to whatever they were before once the
upgrade completes.

Finally we need to take care of the os-net-config upgrade. It can add
new parameters to the interface definitions, which would force a
restart of the interfaces. To avoid that we add the --no-activate
option. Currently no major changes in os-net-config are required for
the overcloud to continue running after the upgrade.

Co-Authored-By: Raoul Scarazzini
Change-Id: Ib5d7b447808b51f6e436eaf6d661606132155a23
Depends-On: Ieb5ad6ad429c8388a1cbbd650339b6eecd9b7997
Closes-Bug: #1698373
---
 .../major_upgrade_controller_pacemaker_0.sh     | 53 +++++++++++++++
 .../major_upgrade_controller_pacemaker_1.sh     | 38 -----------
 .../major_upgrade_controller_pacemaker_2.sh     |  6 ++
 .../major_upgrade_controller_pacemaker_6.sh     |  9 ++-
 .../tasks/major_upgrade_pacemaker.yaml          | 23 ++++++-
 .../major_upgrade_pacemaker_migrations.sh       | 38 ++++++++++-
 .../tasks/pacemaker_common_functions.sh         | 65 ++++++++++++++-----
 7 files changed, 172 insertions(+), 60 deletions(-)
 create mode 100644 extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh
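A quick way to eyeball the result of the new step 0 on a controller node is
sketched below. This is purely illustrative and not part of the scripts in
this patch; it only assumes that ovs-vsctl is installed on the node and that
step 0 has already run there.

    # Print the fail mode of every ovs bridge: the bridges coming from the
    # neutron bridge_mappings (e.g. the external bridge) should now report
    # "standalone"; other bridges such as br-int are not touched.
    for br in $(ovs-vsctl list-br); do
        echo "${br}: $(ovs-vsctl get-fail-mode ${br})"
    done

    # The cleanup wrappers should be neutered (exec overridden with /bin/echo)
    # until step 2 restores the original sysconfig files.
    grep -H '^exec=' /etc/sysconfig/neutron-ovs-cleanup /etc/sysconfig/neutron-netns-cleanup
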
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh
new file mode 100644
index 0000000000..efed76d8ff
--- /dev/null
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+set -eu
+
+check_cluster
+check_pcsd
+if [[ -n $(is_bootstrap_node) ]]; then
+    check_clean_cluster
+fi
+check_python_rpm
+check_galera_root_password
+check_disk_for_mysql_dump
+
+# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
+# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
+# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
+# they are completely open (ACCEPT)
+# Now when we run the convergence step while migrating to Newton we enable the firewall
+# by default and this will actually first load the rules from /etc/sysconfig/iptables
+# and only afterwards, it will start adding all the rules permitting openstack traffic.
+# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
+# Let's simply move the existing file out of the way, it will be recreated by
+# puppet in newton with the proper firewall rules anyway
+if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
+    mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
+fi
+
+# We want to disable fencing during the cluster --stop as it might fence
+# nodes where a service fails to stop, which could be fatal during an upgrade
+# procedure. So we remember the stonith state. If it was enabled we reenable it
+# at the end of this script
+if [[ -n $(is_bootstrap_node) ]]; then
+    STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
+    # We create this empty file if stonith was set to true so we can reenable stonith in step2
+    rm -f /var/tmp/stonith-true
+    if [ $STONITH_STATE == "true" ]; then
+        touch /var/tmp/stonith-true
+    fi
+    pcs property set stonith-enabled=false
+fi
+
+# Before migration and service stopping we make sure that the external
+# bridge is set to standalone in ovs. This is because we don't do a
+# rolling upgrade, but we don't want the floating ip network to be cut
+# off.
+for br in $(get_all_bridges); do
+    # It will be set to whatever is needed at the end of the upgrade by
+    # the ovs-agent processes.
+    ovs-vsctl set-fail-mode ${br} standalone
+done
+
+# Make sure openvswitch *cleanup* does not happen.
+deactivate_cleanup_services
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
index 2eb36139a5..3401b90725 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
@@ -2,45 +2,7 @@
 
 set -eu
 
-check_cluster
-check_pcsd
-if [[ -n $(is_bootstrap_node) ]]; then
-    check_clean_cluster
-fi
-check_python_rpm
-check_galera_root_password
-check_disk_for_mysql_dump
-
-# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
-# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
-# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
-# they are completely open (ACCEPT)
-# Now when we run the convergence step while migrating to Newton we enable the firewall
-# by default and this will actually first load the rules from /etc/sysconfig/iptables
-# and only afterwards, it will start adding all the rules permitting openstack traffic.
-# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
-# Let's simply move the existing file out of the way, it will be recreated by
-# puppet in newton with the proper firewall rules anyway
-if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
-    mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
-fi
-
-# We want to disable fencing during the cluster --stop as it might fence
-# nodes where a service fails to stop, which could be fatal during an upgrade
-# procedure. So we remember the stonith state. If it was enabled we reenable it
-# at the end of this script
-if [[ -n $(is_bootstrap_node) ]]; then
-    STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
-    # We create this empty file if stonith was set to true so we can reenable stonith in step2
-    rm -f /var/tmp/stonith-true
-    if [ $STONITH_STATE == "true" ]; then
-        touch /var/tmp/stonith-true
-    fi
-    pcs property set stonith-enabled=false
-fi
-
 # Migrate to HA NG
 if [[ -n $(is_bootstrap_node) ]]; then
     migrate_full_to_ng_ha
 fi
-
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
index fa1e223adf..534a0a2f9f 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
@@ -11,6 +11,10 @@ cluster_sync_timeout=1800
 # systemctl try-restart is a noop
 for service in $(services_to_migrate); do
+    if [[ ${service%%-clone} =~ .*-cleanup ]]; then
+        # we don't want to stop {netns,ovs}-cleanup
+        continue
+    fi
     manage_systemd_service stop "${service%%-clone}"
     # So the reason for not reusing check_resource_systemd is that
     # I have observed systemctl is-active returning unknown with at least
@@ -107,6 +111,7 @@ if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
     mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR
 fi
 
+update_os_net_config
 # Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1669714
 update_network
 
@@ -119,6 +124,7 @@ if grep -q '^pipeline = ssl_header_handler faultwrap osvolumeversionapp' /etc/ci
     sed -i '$ { /^$/d }' /etc/cinder/api-paste.ini
 fi
 
+restore_cleanup_service_definition
 yum -y install python-zaqarclient  # needed for os-collect-config
 yum -y -q update
 
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
index 719ac2a39a..f70f10de68 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
@@ -2,7 +2,7 @@
 
 set -eu
 
-# We need to start the systemd services we explicitely stopped at step _1.sh
+# We need to start the systemd services we explicitly stopped at step _0.sh
 # We add the enablement of the systemd services here because if a node gets rebooted
 # before the convergence step for whatever reason the migrated services will
 # not be enabled and we potentially have a bigger disruption.
@@ -10,7 +10,14 @@ services=$(services_to_migrate)
 if [[ ${keep_sahara_services_on_upgrade} =~ [Ff]alse ]] ; then
     services=${services%%openstack-sahara*}
 fi
+
 for service in $services; do
+    if [[ ${service%%-clone} =~ .*-cleanup ]]; then
+        # we don't want to start {netns,ovs}-cleanup
+        log_debug "Skipping ${service}"
+        continue
+    fi
+
     manage_systemd_service start "${service%%-clone}"
     manage_systemd_service enable "${service%%-clone}"
     check_resource_systemd "${service%%-clone}" started 600
diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml
index 370660f172..c8f21463c4 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker.yaml
+++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml
@@ -60,7 +60,7 @@ resources:
       rolling_update:
         max_batch_size: 1
 
-  ControllerPacemakerUpgradeConfig_Step1:
+  ControllerPacemakerUpgradeConfig_Step0:
     type: OS::Heat::SoftwareConfig
     properties:
       group: script
@@ -81,12 +81,31 @@ resources:
               MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade}
           - get_file: pacemaker_common_functions.sh
           - get_file: major_upgrade_check.sh
+          - get_file: major_upgrade_pacemaker_migrations.sh
+          - get_file: major_upgrade_controller_pacemaker_0.sh
+
+  ControllerPacemakerUpgradeDeployment_Step0:
+    type: OS::Heat::SoftwareDeploymentGroup
+    depends_on: CephMonUpgradeDeployment
+    properties:
+      servers: {get_param: [servers, Controller]}
+      config: {get_resource: ControllerPacemakerUpgradeConfig_Step0}
+      input_values: {get_param: input_values}
+
+  ControllerPacemakerUpgradeConfig_Step1:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config:
+        list_join:
+        - '#!/bin/bash'
+        - - get_file: pacemaker_common_functions.sh
           - get_file: major_upgrade_pacemaker_migrations.sh
           - get_file: major_upgrade_controller_pacemaker_1.sh
 
   ControllerPacemakerUpgradeDeployment_Step1:
     type: OS::Heat::SoftwareDeploymentGroup
-    depends_on: CephMonUpgradeDeployment
+    depends_on: ControllerPacemakerUpgradeDeployment_Step0
     properties:
       servers: {get_param: [servers, Controller]}
       config: {get_resource: ControllerPacemakerUpgradeConfig_Step1}
diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
index d7b50aa192..ec9fc0ac55 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
+++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
@@ -101,6 +101,42 @@ function services_to_migrate {
     echo $PCMK_RESOURCE_TODELETE
 }
 
+# These are one-shot services.
+function neutron_cleanup_services {
+    echo "
+    neutron-netns-cleanup-clone
+    neutron-ovs-cleanup-clone
+    "
+}
+
+function deactivate_cleanup_services {
+    for service in $(neutron_cleanup_services); do
+        log_debug "Changing ocf configuration for '${service}'"
+        # We prevent any stop action by changing the exec to a noop.
+        local sysconfig_name=${service%-clone}
+        # This is loaded by /usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup
+        echo "exec=/bin/echo" >> /etc/sysconfig/${sysconfig_name}
+        sed -i.orig -e 's/clean --force/clean/' /usr/lib/ocf/lib/neutron/neutron-netns-cleanup
+    done
+}
+
+function restore_cleanup_service_definition {
+    for service in $(neutron_cleanup_services); do
+        log_debug "Restoring original ocf configuration for '${service}'"
+        local sysconfig_file=/etc/sysconfig/${service%-clone}
+        if [ -e "${sysconfig_file}" ]; then
+            sed -e '/exec=\/bin\/echo/d' $sysconfig_file | \
+                sed -e '/^ *$/d' > /tmp/$service
+            if test -s /tmp/$service; then
+                cp /tmp/$service $sysconfig_file
+            else
+                rm -f $sysconfig_file
+            fi
+            [ ! -e /tmp/$service ] || rm -f /tmp/$service
+        fi
+    done
+    mv /usr/lib/ocf/lib/neutron/neutron-netns-cleanup{.orig,}
+}
 
 # This function will migrate a mitaka system where all the resources are managed
 # via pacemaker to a newton setup where only a few services will be managed by pacemaker
 # On a high-level it will operate as follows:
@@ -148,7 +184,7 @@ function migrate_full_to_ng_ha {
     # that will move to systemd.
     # We want the systemd resources be stopped before doing "yum update",
     # that way "systemctl try-restart <service>" is no-op because the
-    # service was down already 
+    # service was down already
     PCS_STATUS_OUTPUT="$(pcs status)"
     for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do
         if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then
diff --git a/extraconfig/tasks/pacemaker_common_functions.sh b/extraconfig/tasks/pacemaker_common_functions.sh
index adf24e735e..cc8cf8c63a 100755
--- a/extraconfig/tasks/pacemaker_common_functions.sh
+++ b/extraconfig/tasks/pacemaker_common_functions.sh
@@ -4,6 +4,28 @@ set -eu
 
 DEBUG="true" # set false if the verbosity is a problem
 SCRIPT_NAME=$(basename $0)
+
+# This block gets the ovs bridges used for fail mode handling during the upgrade.
+function get_all_bridges {
+    local bridges_def=""
+    local bridges=""
+    if which ovs-vsctl &>/dev/null; then
+        if [ -e /etc/neutron/plugins/ml2/openvswitch_agent.ini ]; then
+            local raw_bridge_def=$(crudini --get /etc/neutron/plugins/ml2/openvswitch_agent.ini ovs bridge_mappings)
+            local bridges=""
+            while IFS=: read physnet bridge; do bridges_def="${bridges_def} ${bridge}" ; done \
+                < <(echo "${raw_bridge_def}" | sed 's/,/\n/g')
+            local existing_bridges="$(ovs-vsctl -f table -d bare --column=name --no-headings find Bridge)"
+            for br in ${bridges_def}; do
+                if echo "${existing_bridges}" | grep -q $br; then
+                    bridges="${bridges} ${br}"
+                fi
+            done
+        fi
+    fi
+    echo "${bridges}"
+}
+
 function log_debug {
     if [[ $DEBUG = "true" ]]; then
         echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1"
@@ -325,25 +347,32 @@ function special_case_ovs_upgrade_if_needed {
 }
 
 # update os-net-config before ovs see https://bugs.launchpad.net/tripleo/+bug/1695893
+function update_os_net_config() {
+    set +e
+    local need_update="$(yum check-update | grep os-net-config)"
+    if [ -n "${need_update}" ]; then
+        yum -q -y update os-net-config
+        local return_code=$?
+        log_debug "yum update os-net-config return code: $return_code"
+
+        # We just make sure that os-net-config won't ifdown/ifup
+        # network interfaces. The current set of changes (Tue Oct 3
+        # 17:38:37 CEST 2017) doesn't require the os-net-config change
+        # to be taken live. They will be applied at the next reboot.
+        os-net-config --no-activate -c /etc/os-net-config/config.json -v \
+            --detailed-exit-codes
+        local os_net_retval=$?
+        if [[ $os_net_retval == 2 ]]; then
+            log_debug "os-net-config: interface configuration files updated successfully"
+        elif [[ $os_net_retval != 0 ]]; then
+            log_debug "ERROR: os-net-config configuration failed"
+            exit $os_net_retval
+        fi
+    fi
+    set -e
+}
+
 function update_network() {
-    set +e
-    yum -q -y update os-net-config
-    return_code=$?
-    echo "yum update os-net-config return code: $return_code"
-
-    # Writes any changes caused by alterations to os-net-config and bounces the
-    # interfaces *before* restarting the cluster.
-    os-net-config -c /etc/os-net-config/config.json -v --detailed-exit-codes
-
-    RETVAL=$?
-    if [[ $RETVAL == 2 ]]; then
-        echo "os-net-config: interface configuration files updated successfully"
-    elif [[ $RETVAL != 0 ]]; then
-        echo "ERROR: os-net-config configuration failed"
-        exit $RETVAL
-    fi
-    set -e
-
     # special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
     special_case_ovs_upgrade_if_needed
 }
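
Side note on the skip logic used in steps 2 and 6 above: ${service%%-clone}
strips the "-clone" suffix and the regex match then filters out the cleanup
resources. A minimal standalone sketch (the service list below is made up for
the example and is not the real output of services_to_migrate) behaves like
this:

    #!/bin/bash
    # Only the *-cleanup entries are skipped; everything else is processed.
    for service in neutron-netns-cleanup-clone neutron-ovs-cleanup-clone openstack-cinder-api-clone; do
        if [[ ${service%%-clone} =~ .*-cleanup ]]; then
            echo "skipping ${service}"
            continue
        fi
        echo "handling ${service%%-clone}"
    done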