diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh
new file mode 100644
index 0000000000..efed76d8ff
--- /dev/null
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_0.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+set -eu
+
+check_cluster
+check_pcsd
+if [[ -n $(is_bootstrap_node) ]]; then
+    check_clean_cluster
+fi
+check_python_rpm
+check_galera_root_password
+check_disk_for_mysql_dump
+
+# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
+# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
+# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
+# they are completely open (ACCEPT)
+# Now when we run the convergence step while migrating to Newton we enable the firewall
+# by default and this will actually first load the rules from /etc/sysconfig/iptables
+# and only afterwards, it will start adding all the rules permitting openstack traffic.
+# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
+# Let's simply move the existing file out of the way, it will be recreated by
+# puppet in newton with the proper firewall rules anyway
+if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
+    mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
+fi
+
+# We want to disable fencing during the cluster --stop as it might fence
+# nodes where a service fails to stop, which could be fatal during an upgrade
+# procedure. So we remember the stonith state. If it was enabled we reenable it
+# at the end of this script
+if [[ -n $(is_bootstrap_node) ]]; then
+    STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
+    # We create this empty file if stonith was set to true so we can reenable stonith in step2
+    rm -f /var/tmp/stonith-true
+    if [ $STONITH_STATE == "true" ]; then
+        touch /var/tmp/stonith-true
+    fi
+    pcs property set stonith-enabled=false
+fi
+
+# Before migration and service stopping we make sure that the external
+# bridge is set to standalone in ovs. This is because we don't do a
+# rolling upgrade, but we don't want the floating ip network to be cut
+# off.
+for br in $(get_all_bridges); do
+    # It will be set to whatever is needed at the end of the upgrade by
+    # ovs-agent processes.
+    ovs-vsctl set-fail-mode ${br} standalone
+done
+
+# Make sure openvswitch *cleanup* does not happen.
+deactivate_cleanup_services
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
index 2eb36139a5..3401b90725 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
@@ -2,45 +2,7 @@
 
 set -eu
 
-check_cluster
-check_pcsd
-if [[ -n $(is_bootstrap_node) ]]; then
-    check_clean_cluster
-fi
-check_python_rpm
-check_galera_root_password
-check_disk_for_mysql_dump
-
-# M/N Upgrade only: By default RHEL/Centos has an /etc/sysconfig/iptables file which
-# allows ssh and icmp only (INPUT table). During the install of OSP9/Mitaka
-# usually the live iptables rules are not the ones in /etc/sysconfig/iptables but
-# they are completely open (ACCEPT)
-# Now when we run the convergence step while migrating to Newton we enable the firewall
-# by default and this will actually first load the rules from /etc/sysconfig/iptables
-# and only afterwards, it will start adding all the rules permitting openstack traffic.
-# This causes an outage of roughly 1 minute in our env, which disrupts the cluster.
-# Let's simply move the existing file out of the way, it will be recreated by
-# puppet in newton with the proper firewall rules anyway
-if [ ! -f /etc/sysconfig/iptables.m-n-upgrade ]; then
-    mv /etc/sysconfig/iptables /etc/sysconfig/iptables.m-n-upgrade || /bin/true
-fi
-
-# We want to disable fencing during the cluster --stop as it might fence
-# nodes where a service fails to stop, which could be fatal during an upgrade
-# procedure. So we remember the stonith state. If it was enabled we reenable it
-# at the end of this script
-if [[ -n $(is_bootstrap_node) ]]; then
-    STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
-    # We create this empty file if stonith was set to true so we can reenable stonith in step2
-    rm -f /var/tmp/stonith-true
-    if [ $STONITH_STATE == "true" ]; then
-        touch /var/tmp/stonith-true
-    fi
-    pcs property set stonith-enabled=false
-fi
-
 # Migrate to HA NG
 if [[ -n $(is_bootstrap_node) ]]; then
     migrate_full_to_ng_ha
 fi
-
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
index fa1e223adf..534a0a2f9f 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
@@ -11,6 +11,10 @@ cluster_sync_timeout=1800
 
 # systemctl try-restart is a noop
 for service in $(services_to_migrate); do
+    if [[ ${service%%-clone} =~ .*-cleanup ]]; then
+        # we don't want to stop {netns,ovs}-cleanup
+        continue
+    fi
     manage_systemd_service stop "${service%%-clone}"
     # So the reason for not reusing check_resource_systemd is that
     # I have observed systemctl is-active returning unknown with at least
@@ -107,6 +111,7 @@ if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
     mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR
 fi
 
+update_os_net_config
 # Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1669714
 update_network
 
@@ -119,6 +124,7 @@ if grep -q '^pipeline = ssl_header_handler faultwrap osvolumeversionapp' /etc/ci
     sed -i '$ { /^$/d }' /etc/cinder/api-paste.ini
 fi
 
+restore_cleanup_service_definition
 yum -y install python-zaqarclient  # needed for os-collect-config
 yum -y -q update
 
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
index 719ac2a39a..f70f10de68 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh
@@ -2,7 +2,7 @@
 
 set -eu
 
-# We need to start the systemd services we explicitely stopped at step _1.sh
+# We need to start the systemd services we explicitly stopped at step _0.sh
 # We add the enablement of the systemd services here because if a node gets rebooted
 # before the convergence step for whatever reason the migrated services will
 # not be enabled and we potentially have a bigger disruption.
@@ -10,7 +10,14 @@ services=$(services_to_migrate)
 if [[ ${keep_sahara_services_on_upgrade} =~ [Ff]alse ]] ; then
     services=${services%%openstack-sahara*}
 fi
+
 for service in $services; do
+    if [[ ${service%%-clone} =~ .*-cleanup ]]; then
+        # we don't want to start {netns,ovs}-cleanup
+        log_debug "Skipping ${service}"
+        continue
+    fi
+
     manage_systemd_service start "${service%%-clone}"
     manage_systemd_service enable "${service%%-clone}"
     check_resource_systemd "${service%%-clone}" started 600
diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml
index 370660f172..c8f21463c4 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker.yaml
+++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml
@@ -60,7 +60,7 @@ resources:
       rolling_update:
         max_batch_size: 1
 
-  ControllerPacemakerUpgradeConfig_Step1:
+  ControllerPacemakerUpgradeConfig_Step0:
     type: OS::Heat::SoftwareConfig
     properties:
       group: script
@@ -81,12 +81,31 @@ resources:
                 MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade}
           - get_file: pacemaker_common_functions.sh
           - get_file: major_upgrade_check.sh
+          - get_file: major_upgrade_pacemaker_migrations.sh
+          - get_file: major_upgrade_controller_pacemaker_0.sh
+
+  ControllerPacemakerUpgradeDeployment_Step0:
+    type: OS::Heat::SoftwareDeploymentGroup
+    depends_on: CephMonUpgradeDeployment
+    properties:
+      servers: {get_param: [servers, Controller]}
+      config: {get_resource: ControllerPacemakerUpgradeConfig_Step0}
+      input_values: {get_param: input_values}
+
+  ControllerPacemakerUpgradeConfig_Step1:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config:
+        list_join:
+        - '#!/bin/bash'
+        - - get_file: pacemaker_common_functions.sh
           - get_file: major_upgrade_pacemaker_migrations.sh
           - get_file: major_upgrade_controller_pacemaker_1.sh
 
   ControllerPacemakerUpgradeDeployment_Step1:
     type: OS::Heat::SoftwareDeploymentGroup
-    depends_on: CephMonUpgradeDeployment
+    depends_on: ControllerPacemakerUpgradeDeployment_Step0
     properties:
       servers: {get_param: [servers, Controller]}
       config: {get_resource: ControllerPacemakerUpgradeConfig_Step1}
diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
index d7b50aa192..ec9fc0ac55 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
+++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
@@ -101,6 +101,42 @@ function services_to_migrate {
     echo $PCMK_RESOURCE_TODELETE
 }
 
+# Those are oneshot type services.
+function neutron_cleanup_services {
+    echo "
+    neutron-netns-cleanup-clone
+    neutron-ovs-cleanup-clone
+    "
+}
+
+function deactivate_cleanup_services {
+    for service in $(neutron_cleanup_services); do
+        log_debug "Changing ocf configuration for '${service}'"
+        # We prevent any stop action by changing the exec to a noop.
+        local sysconfig_name=${service%-clone}
+        # This is loaded by /usr/lib/ocf/lib/neutron/neutron-{ovs,netns}-cleanup
+        echo "exec=/bin/echo" >> /etc/sysconfig/${sysconfig_name}
+        sed -i.orig -e 's/clean --force/clean/' /usr/lib/ocf/lib/neutron/neutron-netns-cleanup
+    done
+}
+
+function restore_cleanup_service_definition {
+    for service in $(neutron_cleanup_services); do
+        log_debug "Restoring original ocf configuration for '${service}'"
+        local sysconfig_file=/etc/sysconfig/${service%-clone}
+        if [ -e "${sysconfig_file}" ]; then
+            sed -e '/exec=\/bin\/echo/d' $sysconfig_file | \
+                sed -e '/^ *$/d' > /tmp/$service
+            if test -s /tmp/$service; then
+                cp /tmp/$service $sysconfig_file
+            else
+                rm -f $sysconfig_file
+            fi
+            [ ! -e /tmp/$service ] || rm -f /tmp/$service
+        fi
+    done
+    mv /usr/lib/ocf/lib/neutron/neutron-netns-cleanup{.orig,}
+}
 # This function will migrate a mitaka system where all the resources are managed
 # via pacemaker to a newton setup where only a few services will be managed by pacemaker
 # On a high-level it will operate as follows:
@@ -148,7 +184,7 @@ function migrate_full_to_ng_ha {
     # that will move to systemd.
     # We want the systemd resources be stopped before doing "yum update",
    # that way "systemctl try-restart <service>" is no-op because the
-    # service was down already 
+    # service was down already
     PCS_STATUS_OUTPUT="$(pcs status)"
     for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do
         if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then
diff --git a/extraconfig/tasks/pacemaker_common_functions.sh b/extraconfig/tasks/pacemaker_common_functions.sh
index adf24e735e..cc8cf8c63a 100755
--- a/extraconfig/tasks/pacemaker_common_functions.sh
+++ b/extraconfig/tasks/pacemaker_common_functions.sh
@@ -4,6 +4,28 @@ set -eu
 
 DEBUG="true" # set false if the verbosity is a problem
 SCRIPT_NAME=$(basename $0)
+
+# This block gets the list of bridges used for the ovs fail mode handling during upgrade.
+function get_all_bridges {
+    local bridges_def=""
+    local bridges=""
+    if which ovs-vsctl &>/dev/null; then
+        if [ -e /etc/neutron/plugins/ml2/openvswitch_agent.ini ]; then
+            local raw_bridge_def=$(crudini --get /etc/neutron/plugins/ml2/openvswitch_agent.ini ovs bridge_mappings)
+            local bridges=""
+            while IFS=: read physnet bridge; do bridges_def="${bridges_def} ${bridge}" ; done \
+                < <(echo "${raw_bridge_def}" | sed 's/,/\n/g')
+            local existing_bridges="$(ovs-vsctl -f table -d bare --column=name --no-headings find Bridge)"
+            for br in ${bridges_def}; do
+                if echo "${existing_bridges}" | grep -q $br; then
+                    bridges="${bridges} ${br}"
+                fi
+            done
+        fi
+    fi
+    echo "${bridges}"
+}
+
 function log_debug {
     if [[ $DEBUG = "true" ]]; then
         echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1"
@@ -325,25 +347,32 @@ function special_case_ovs_upgrade_if_needed {
 }
 
 # update os-net-config before ovs see https://bugs.launchpad.net/tripleo/+bug/1695893
+function update_os_net_config() {
+    set +e
+    local need_update="$(yum check-upgrade | grep os-net-config)"
+    if [ -n "${need_update}" ]; then
+        yum -q -y update os-net-config
+        local return_code=$?
+        log_debug "yum update os-net-config return code: $return_code"
+
+        # We're just making sure that os-net-config won't ifdown/ifup
+        # network interfaces. The current set of changes (Tue Oct 3
+        # 17:38:37 CEST 2017) doesn't require the os-net-config change
+        # to be taken live. They will be at next reboot.
+        os-net-config --no-activate -c /etc/os-net-config/config.json -v \
+            --detailed-exit-codes
+        local os_net_retval=$?
+        if [[ $os_net_retval == 2 ]]; then
+            log_debug "os-net-config: interface configuration files updated successfully"
+        elif [[ $os_net_retval != 0 ]]; then
+            log_debug "ERROR: os-net-config configuration failed"
+            exit $os_net_retval
+        fi
+    fi
+    set -e
+}
+
 function update_network() {
-    set +e
-    yum -q -y update os-net-config
-    return_code=$?
-    echo "yum update os-net-config return code: $return_code"
-
-    # Writes any changes caused by alterations to os-net-config and bounces the
-    # interfaces *before* restarting the cluster.
-    os-net-config -c /etc/os-net-config/config.json -v --detailed-exit-codes
-
-    RETVAL=$?
-    if [[ $RETVAL == 2 ]]; then
-        echo "os-net-config: interface configuration files updated successfully"
-    elif [[ $RETVAL != 0 ]]; then
-        echo "ERROR: os-net-config configuration failed"
-        exit $RETVAL
-    fi
-    set -e
-
     # special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
     special_case_ovs_upgrade_if_needed
 }
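
For reviewers who want to check a node before running step 0, here is a minimal read-only sketch of the bridge discovery and fail-mode handling introduced above. It is not part of the patch: it only prints what major_upgrade_controller_pacemaker_0.sh would change, it assumes crudini and ovs-vsctl are installed (as the new get_all_bridges helper does), and the variable names are illustrative.

    #!/bin/bash
    # Hypothetical preview helper, read-only: it changes nothing on the node.
    set -eu

    OVS_AGENT_INI=/etc/neutron/plugins/ml2/openvswitch_agent.ini

    if [ -e "${OVS_AGENT_INI}" ] && which ovs-vsctl &>/dev/null; then
        # bridge_mappings looks like "datacentre:br-ex,tenant:br-tenant"
        mappings=$(crudini --get "${OVS_AGENT_INI}" ovs bridge_mappings 2>/dev/null || true)
        for entry in ${mappings//,/ }; do
            bridge=${entry#*:}
            # Only report bridges that actually exist in this OVS instance
            if ovs-vsctl br-exists "${bridge}"; then
                current=$(ovs-vsctl get-fail-mode "${bridge}" || true)
                echo "${bridge}: fail-mode '${current:-unset}' would become 'standalone'"
            fi
        done
    fi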