diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh index e0d5aa7c9b..0c590a42cb 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh @@ -19,7 +19,7 @@ STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk pcs property set stonith-enabled=false # Migrate to HA NG -if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then +if [[ -n $(is_bootstrap_node) ]]; then migrate_full_to_ng_ha fi @@ -29,9 +29,26 @@ fi # is going to take a long time because rabbit is down. By having the service stopped # systemctl try-restart is a noop -for $service in $(services_to_migrate); do +for service in $(services_to_migrate); do manage_systemd_service stop "${service%%-clone}" - check_resource_systemd "${service%%-clone}" stopped 600 + # So the reason for not reusing check_resource_systemd is that + # I have observed systemctl is-active returning unknown with at least + # one service that was stopped (See LP 1627254) + timeout=600 + tstart=$(date +%s) + tend=$(( $tstart + $timeout )) + check_interval=3 + while (( $(date +%s) < $tend )); do + if [[ "$(systemctl is-active ${service%%-clone})" = "active" ]]; then + echo "$service still active, sleeping $check_interval seconds." + sleep $check_interval + else + # we do not care if it is inactive, unknown or failed as long as it is + # not running + break + fi + + done done # In case the mysql package is updated, the database on disk must be @@ -46,7 +63,7 @@ done # on mysql package versionning, but this can be overriden manually # to support specific upgrade scenario -if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then +if [[ -n $(is_bootstrap_node) ]]; then if [ $DO_MYSQL_UPGRADE -eq 1 ]; then mysqldump $backup_flags > "$MYSQL_BACKUP_DIR/openstack_database.sql" cp -rdp /etc/my.cnf* "$MYSQL_BACKUP_DIR" @@ -68,7 +85,7 @@ if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname) fi -# Swift isn't controled by pacemaker +# Swift isn't controlled by pacemaker systemctl_swift stop tstart=$(date +%s) diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh index aef54f847e..6055a3f968 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh @@ -76,7 +76,7 @@ systemctl_swift start # We need to start the systemd services we explicitely stopped at step _1.sh # FIXME: Should we let puppet during the convergence step do the service enabling or # should we add it here? -for $service in $(services_to_migrate); do - manage_systemd_service stop "${service%%-clone}" +for service in $(services_to_migrate); do + manage_systemd_service start "${service%%-clone}" check_resource_systemd "${service%%-clone}" started 600 done diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh index b8c5321b45..d974bb7965 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh +++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh @@ -109,10 +109,11 @@ function services_to_migrate { # during the conversion # 2. Remove all the colocation constraints and then the ordering constraints, except the # ones related to haproxy/VIPs which exist in Newton as well -# 3. Remove all the resources that won't be managed by pacemaker in newton. Note that they -# will show up as ORPHANED but they will keep running normally via systemd. They will be -# enabled to start at boot by puppet during the converge step -# 4. Take the cluster out of maintenance-mode and do a resource cleanup +# 3. Take the cluster out of maintenance-mode and do a resource cleanup +# 4. Remove all the resources that won't be managed by pacemaker in newton. The +# outcome will be +# that they are stopped and removed from pacemakers control +# 5. Do a resource cleanup to make sure the cluster is in a clean state function migrate_full_to_ng_ha { if [[ -n $(pcmk_running) ]]; then pcs property set maintenance-mode=true @@ -135,32 +136,35 @@ function migrate_full_to_ng_ha { log_debug "Deleting ordering constraint $constraint from CIB" pcs constraint remove "$constraint" done + # At this stage all the pacemaker resources are removed from the CIB. + # Once we remove the maintenance-mode those systemd resources will keep + # on running. They shall be systemd enabled via the puppet converge + # step later on + pcs property set maintenance-mode=false # At this stage there are no constraints whatsoever except the haproxy/ip ones - # which we want to keep. We now delete each resource that will move to systemd - # Note that the corresponding systemd resource will stay running, which means that - # later when we do the "yum update", things will be a bit slower because each - # "systemctl try-restart " is not a no-op any longer because the service is up - # and running and it will be restarted with rabbitmq being down. + # which we want to keep. We now disable and then delete each resource + # that will move to systemd. + # We want the systemd resources be stopped before doing "yum update", + # that way "systemctl try-restart " is no-op because the + # service was down already PCS_STATUS_OUTPUT="$(pcs status)" for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then log_debug "Deleting $resource from the CIB" - - # We need to add --force because the cluster is in maintenance mode and the resource - # is unmanaged. The if serves to make this idempotent + if ! pcs resource disable "$resource" --wait=600; then + echo_error "ERROR: resource $resource failed to be disabled" + exit 1 + fi pcs resource delete --force "$resource" else log_debug "Service $service not found as a pacemaker resource, not trying to delete." fi done - # At this stage all the pacemaker resources are removed from the CIB. Once we remove the - # maintenance-mode those systemd resources will keep on running. They shall be systemd enabled - # via the puppet converge step later on - pcs property set maintenance-mode=false - # We need to do a pcs resource cleanup here + crm_resource --wait to make sure the - # cluster is in a clean state before we stop everything, upgrade and restart everything + # We need to do a pcs resource cleanup here + crm_resource --wait to + # make sure the cluster is in a clean state before we stop everything, + # upgrade and restart everything pcs resource cleanup # We are making sure here that the cluster is stable before proceeding if ! timeout -k 10 600 crm_resource --wait; then