Keep floating IP reachability during pacemaker migration.

The neutron-netns-cleanup-clone and neutron-ovs-cleanup-clone resources
are one-shot resources that only need to run during boot-up and
shutdown of a node.  Triggering a stop on them has the side effect of
removing entries from the OVS database, making floating IPs unreachable.

Those are triggered by the
/usr/lib/ocf/resource.d/neutron/{OVSCleanup,NetnsCleanup} pacemaker
resources.  They, in turn, use the
/usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup scripts (not the
systemd unit files).  We temporarily disable any action by configuring
the executable to be "/bin/echo" instead of
/usr/bin/neutron-{ovs,netns}-cleanup, and by removing the "--force"
option in neutron-netns-cleanup.

As those resources are cloned resources, we need to make sure that the
modification is done on all controller nodes before we take any action
on the bootstrap controller node.  To do that we move most of Step1 to
Step0 and make the bootstrap node actions happen at Step1 of the
pacemaker controller upgrade.

Furthermore, we make sure that the external bridges, if OVS is used,
are not left in secure fail mode: we set them to standalone during the
upgrade, and the ovs agents set them back to whatever is needed at the
end of the upgrade.
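
For example, for a single external bridge named br-ex (a sketch; the
upgrade scripts below derive the actual bridge list from the configured
bridge_mappings):

    ovs-vsctl get-fail-mode br-ex            # e.g. "secure" beforehand
    ovs-vsctl set-fail-mode br-ex standalone
    # In standalone mode the bridge keeps forwarding traffic even while
    # its OpenFlow controller (the neutron ovs agent) is not connected.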

Finally, we need to take care of the os-net-config upgrade.  It can
add new parameters to the interface definitions, which would force a
restart of the interfaces.  To avoid that we add the --no-activate
option.  Currently no major change in os-net-config is required for
the overcloud to continue running after the upgrade.
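
The resulting call (see update_os_net_config below) is roughly:

    os-net-config --no-activate -c /etc/os-net-config/config.json -v \
        --detailed-exit-codes
    # With --detailed-exit-codes, a return code of 2 means the
    # interface files were updated without being activated; the changes
    # only take effect at the next ifdown/ifup or reboot.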

Co-Authored-By: Raoul Scarazzini <rscarazz@redhat.com>
Change-Id: Ib5d7b447808b51f6e436eaf6d661606132155a23
Depends-On: Ieb5ad6ad429c8388a1cbbd650339b6eecd9b7997
Closes-Bug: #1698373
Author: Sofer Athlan-Guyot
Date: 2017-06-16 14:53:14 +02:00
Commit: bce61783bc (parent 0616f2d0be)
7 changed files with 172 additions and 60 deletions

major_upgrade_controller_pacemaker_0.sh (new file)

@@ -0,0 +1,53 @@
#!/bin/bash
set -eu
check_cluster
check_pcsd
if [[ -n $(is_bootstrap_node) ]]; then
check_clean_cluster
fi
check_python_rpm
check_galera_root_password
check_disk_for_mysql_dump
# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
# they are completely open (ACCEPT)
# Now when we run the convergence step while migrating to Newton we enable the firewall
# by default and this will actually first load the rules from /etc/sysconfig/iptables
# and only afterwards, it will start adding all the rules permitting openstack traffic.
# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
# Let's simply move the existing file out of the way, it will be recreated by
# puppet in newton with the proper firewall rules anyway
if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
fi
# We want to disable fencing during the cluster --stop as it might fence
# nodes where a service fails to stop, which could be fatal during an upgrade
# procedure. So we remember the stonith state. If it was enabled we reenable it
# at the end of this script
if [[ -n $(is_bootstrap_node) ]]; then
STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
# We create this empty file if stonith was set to true so we can reenable stonith in step2
rm -f /var/tmp/stonith-true
if [ "$STONITH_STATE" == "true" ]; then
touch /var/tmp/stonith-true
fi
pcs property set stonith-enabled=false
fi
# Before migration and service stopping we make sure that the external
# bridge is set to standalone in ovs. This is because we don't do a
# rolling upgrade, but we don't want the floating ip network to be cut
# off.
for br in $(get_all_bridges); do
# It will be set back to whatever is needed at the end of the upgrade
# by the ovs-agent processes.
ovs-vsctl set-fail-mode ${br} standalone
done
# Make sure openvswitch *cleanup* does not happen.
deactivate_cleanup_services

major_upgrade_controller_pacemaker_1.sh

@@ -2,45 +2,7 @@
set -eu
check_cluster
check_pcsd
if [[ -n $(is_bootstrap_node) ]]; then
check_clean_cluster
fi
check_python_rpm
check_galera_root_password
check_disk_for_mysql_dump
# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
# they are completely open (ACCEPT)
# Now when we run the convergence step while migrating to Newton we enable the firewall
# by default and this will actually first load the rules from /etc/sysconfig/iptables
# and only afterwards, it will start adding all the rules permitting openstack traffic.
# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
# Let's simply move the existing file out of the way, it will be recreated by
# puppet in newton with the proper firewall rules anyway
if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
fi
# We want to disable fencing during the cluster --stop as it might fence
# nodes where a service fails to stop, which could be fatal during an upgrade
# procedure. So we remember the stonith state. If it was enabled we reenable it
# at the end of this script
if [[ -n $(is_bootstrap_node) ]]; then
STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
# We create this empty file if stonith was set to true so we can reenable stonith in step2
rm -f /var/tmp/stonith-true
if [ $STONITH_STATE == "true" ]; then
touch /var/tmp/stonith-true
fi
pcs property set stonith-enabled=false
fi
# Migrate to HA NG
if [[ -n $(is_bootstrap_node) ]]; then
migrate_full_to_ng_ha
fi

major_upgrade_controller_pacemaker_2.sh

@@ -11,6 +11,10 @@ cluster_sync_timeout=1800
# systemctl try-restart is a noop
for service in $(services_to_migrate); do
if [[ ${service%%-clone} =~ .*-cleanup ]]; then
# we don't want to stop {netns,ovs}-cleanup
continue
fi
manage_systemd_service stop "${service%%-clone}"
# So the reason for not reusing check_resource_systemd is that
# I have observed systemctl is-active returning unknown with at least
@@ -107,6 +111,7 @@ if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR
fi
update_os_net_config
# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1669714
update_network
@@ -119,6 +124,7 @@ if grep -q '^pipeline = ssl_header_handler faultwrap osvolumeversionapp' /etc/ci
sed -i '$ { /^$/d }' /etc/cinder/api-paste.ini
fi
restore_cleanup_service_definition
yum -y install python-zaqarclient # needed for os-collect-config
yum -y -q update

major_upgrade_controller_pacemaker_3.sh

@@ -2,7 +2,7 @@
set -eu
# We need to start the systemd services we explicitly stopped at step _0.sh
# We add the enablement of the systemd services here because if a node gets rebooted
# before the convergence step for whatever reason the migrated services will
# not be enabled and we potentially have a bigger disruption.
@@ -10,7 +10,14 @@ services=$(services_to_migrate)
if [[ ${keep_sahara_services_on_upgrade} =~ [Ff]alse ]] ; then
services=${services%%openstack-sahara*}
fi
for service in $services; do
if [[ ${service%%-clone} =~ .*-cleanup ]]; then
# we don't want to start {netns,ovs}-cleanup
log_debug "Skipping ${service}"
continue
fi
manage_systemd_service start "${service%%-clone}"
manage_systemd_service enable "${service%%-clone}"
check_resource_systemd "${service%%-clone}" started 600

major_upgrade_pacemaker.yaml

@@ -60,7 +60,7 @@ resources:
      rolling_update:
        max_batch_size: 1

  ControllerPacemakerUpgradeConfig_Step0:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
@@ -81,12 +81,31 @@ resources:
                MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade}
          - get_file: pacemaker_common_functions.sh
          - get_file: major_upgrade_check.sh
          - get_file: major_upgrade_pacemaker_migrations.sh
          - get_file: major_upgrade_controller_pacemaker_0.sh

  ControllerPacemakerUpgradeDeployment_Step0:
    type: OS::Heat::SoftwareDeploymentGroup
    depends_on: CephMonUpgradeDeployment
    properties:
      servers: {get_param: [servers, Controller]}
      config: {get_resource: ControllerPacemakerUpgradeConfig_Step0}
      input_values: {get_param: input_values}

  ControllerPacemakerUpgradeConfig_Step1:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
      config:
        list_join:
        - '#!/bin/bash'
        - - get_file: pacemaker_common_functions.sh
          - get_file: major_upgrade_pacemaker_migrations.sh
          - get_file: major_upgrade_controller_pacemaker_1.sh

  ControllerPacemakerUpgradeDeployment_Step1:
    type: OS::Heat::SoftwareDeploymentGroup
    depends_on: ControllerPacemakerUpgradeDeployment_Step0
    properties:
      servers: {get_param: [servers, Controller]}
      config: {get_resource: ControllerPacemakerUpgradeConfig_Step1}

major_upgrade_pacemaker_migrations.sh

@@ -101,6 +101,42 @@ function services_to_migrate {
echo $PCMK_RESOURCE_TODELETE
}
# Those are one-shot type services.
function neutron_cleanup_services {
echo "
neutron-netns-cleanup-clone
neutron-ovs-cleanup-clone
"
}
function deactivate_cleanup_services {
for service in $(neutron_cleanup_services); do
log_debug "Changing ocf configuration for '${service}'"
# We prevent any stop action by changing the exec to a noop.
local sysconfig_name=${service%-clone}
# This is loaded by /usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup
echo "exec=/bin/echo" >> /etc/sysconfig/${sysconfig_name}
done
# Run the sed only once, outside the loop: a second "sed -i.orig" pass
# would overwrite the pristine .orig backup with the already-modified
# file and break restore_cleanup_service_definition below.
sed -i.orig -e 's/clean --force/clean/' /usr/lib/ocf/lib/neutron/neutron-netns-cleanup
}
function restore_cleanup_service_definition {
for service in $(neutron_cleanup_services); do
log_debug "Restoring original ocf configuration for '${service}'"
local sysconfig_file=/etc/sysconfig/${service%-clone}
if [ -e "${sysconfig_file}" ]; then
sed -e '/exec=\/bin\/echo/d' $sysconfig_file | \
sed -e '/^ *$/d' > /tmp/$service
if test -s /tmp/$service; then
cp /tmp/$service $sysconfig_file
else
rm -f $sysconfig_file
fi
[ ! -e /tmp/$service ] || rm -f /tmp/$service
fi
done
mv /usr/lib/ocf/lib/neutron/neutron-netns-cleanup{.orig,}
}
# This function will migrate a mitaka system where all the resources are managed
# via pacemaker to a newton setup where only a few services will be managed by pacemaker
# On a high-level it will operate as follows:
@@ -148,7 +184,7 @@ function migrate_full_to_ng_ha {
# that will move to systemd.
# We want the systemd resources to be stopped before doing "yum update",
# that way "systemctl try-restart <service>" is a no-op because the
# service was down already
PCS_STATUS_OUTPUT="$(pcs status)"
for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do
if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then

pacemaker_common_functions.sh

@@ -4,6 +4,28 @@ set -eu
DEBUG="true" # set false if the verbosity is a problem
SCRIPT_NAME=$(basename $0)
# This block gets the existing OVS bridges used for the fail-mode handling during the upgrade.
function get_all_bridges {
local bridges_def=""
local bridges=""
if which ovs-vsctl &>/dev/null; then
if [ -e /etc/neutron/plugins/ml2/openvswitch_agent.ini ]; then
local raw_bridge_def=$(crudini --get /etc/neutron/plugins/ml2/openvswitch_agent.ini ovs bridge_mappings)
local bridges=""
while IFS=: read physnet bridge; do bridges_def="${bridges_def} ${bridge}" ; done \
< <(echo "${raw_bridge_def}" | sed 's/,/\n/g')
local existing_bridges="$(ovs-vsctl -f table -d bare --column=name --no-headings find Bridge)"
for br in ${bridges_def}; do
if echo "${existing_bridges}" | grep -q $br; then
bridges="${bridges} ${br}"
fi
done
fi
fi
echo "${bridges}"
}
function log_debug {
if [[ $DEBUG = "true" ]]; then
echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1"
@@ -325,25 +347,32 @@ function special_case_ovs_upgrade_if_needed {
}
# update os-net-config before ovs see https://bugs.launchpad.net/tripleo/+bug/1695893
function update_os_net_config() {
set +e
local need_update="$(yum check-update | grep os-net-config)"
if [ -n "${need_update}" ]; then
yum -q -y update os-net-config
local return_code=$?
log_debug "yum update os-net-config return code: $return_code"
# We're just making sure that os-net-config won't ifdown/ifup
# network interfaces. The current set of changes (Tue Oct 3
# 17:38:37 CEST 2017) doesn't require the os-net-config changes
# to be taken live; they will be applied at the next reboot.
os-net-config --no-activate -c /etc/os-net-config/config.json -v \
--detailed-exit-codes
local os_net_retval=$?
if [[ $os_net_retval == 2 ]]; then
log_debug "os-net-config: interface configuration files updated successfully"
elif [[ $os_net_retval != 0 ]]; then
log_debug "ERROR: os-net-config configuration failed"
exit $os_net_retval
fi
fi
set -e
}
function update_network() {
set +e
yum -q -y update os-net-config
return_code=$?
echo "yum update os-net-config return code: $return_code"
# Writes any changes caused by alterations to os-net-config and bounces the
# interfaces *before* restarting the cluster.
os-net-config -c /etc/os-net-config/config.json -v --detailed-exit-codes
RETVAL=$?
if [[ $RETVAL == 2 ]]; then
echo "os-net-config: interface configuration files updated successfully"
elif [[ $RETVAL != 0 ]]; then
echo "ERROR: os-net-config configuration failed"
exit $RETVAL
fi
set -e
# special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
special_case_ovs_upgrade_if_needed
}