From b4f59ef86bdaa09554c6741661a13a8f3fb12cba Mon Sep 17 00:00:00 2001
From: Nicholas Randon <nicholas.randon@hp.com>
Date: Mon, 18 Aug 2014 19:18:31 +0100
Subject: [PATCH] Fix RabbitMQ element clustering start and stop

Prevent upstart respawning from SIGTERM due to post-stop pkill running.

Separate config files out of the install.d script into files to help
readability.

Renumber 40-rabbitmq to 51-rabbitmq and 50-rabbitmq-passwords to
52-rabbitmq-passwords so that ntp runs before these scripts.

Graceful start and stop, to prevent split-brain issues. In the non-cluster
case: just restart.

In the cluster case: stop everything gracefully. Start everything. A
non-bootstrap node joins the bootstrap node; the bootstrap node joins any
other node. This prevents getting two disjoint clusters.

"graceful" means RAM nodes sync with disk nodes before they stop. If they are
stopped unceremoniously, they lose data.

Closes-Bug: #1334314
Change-Id: Ic758256481fdd31d10f4e4a341ae93cb372a0766
---
 elements/rabbitmq-server/element-deps         |   1 +
 .../files/etc/init/rabbitmq-server.conf       |  27 +++
 .../files/etc/rabbitmq/rabbitmq-env.conf      |   4 +
 .../install.d/20-rabbitmq-server              |  35 +---
 .../var/lib/rabbitmq/.erlang.cookie           |   0
 .../configure.d/20-rabbitmq-server-selinux    |   8 +-
 .../post-configure.d/40-rabbitmq              | 101 -----------
 .../post-configure.d/51-rabbitmq              | 167 ++++++++++++++++++
 ...bbitmq-passwords => 52-rabbitmq-passwords} |   0
 .../pre-configure.d/80-rabbitmq-cluster       |   4 +-
 10 files changed, 210 insertions(+), 137 deletions(-)
 create mode 100644 elements/rabbitmq-server/files/etc/init/rabbitmq-server.conf
 create mode 100644 elements/rabbitmq-server/files/etc/rabbitmq/rabbitmq-env.conf
 rename elements/rabbitmq-server/os-apply-config/{mnt/state => }/var/lib/rabbitmq/.erlang.cookie (100%)
 delete mode 100755 elements/rabbitmq-server/os-refresh-config/post-configure.d/40-rabbitmq
 create mode 100755 elements/rabbitmq-server/os-refresh-config/post-configure.d/51-rabbitmq
 rename elements/rabbitmq-server/os-refresh-config/post-configure.d/{50-rabbitmq-passwords => 52-rabbitmq-passwords} (100%)

diff --git a/elements/rabbitmq-server/element-deps b/elements/rabbitmq-server/element-deps
index a2ba69110..49da280bc 100644
--- a/elements/rabbitmq-server/element-deps
+++ b/elements/rabbitmq-server/element-deps
@@ -5,3 +5,4 @@ os-refresh-config
 os-svc-install
 sysctl
 use-ephemeral
+ntp
diff --git a/elements/rabbitmq-server/files/etc/init/rabbitmq-server.conf b/elements/rabbitmq-server/files/etc/init/rabbitmq-server.conf
new file mode 100644
index 000000000..3d419001c
--- /dev/null
+++ b/elements/rabbitmq-server/files/etc/init/rabbitmq-server.conf
@@ -0,0 +1,27 @@
+start on runlevel [2345]
+stop on runlevel [016]
+respawn
+# The default post-start of 1 second sleep delays respawning enough to
+# not hit the default of 10 times in 5 seconds. Make it 2 times in 5s.
+respawn limit 2 5
+
+# SIGTERM caused by the post-stop pkill is an expected exit; do not respawn.
+normal exit 0 TERM
+
+env RUN_DIR=/var/run/rabbitmq
+# Full path on purpose. NOTE(review): Upstart appears not to expand $RUN_DIR
+env PID_FILE=/var/run/rabbitmq/pid
+env OS_SVC_ENABLE_CONTROL=1
+export OS_SVC_ENABLE_CONTROL
+
+pre-start script
+    [ -d "$RUN_DIR" ] || install -d -D -m 0755 -o rabbitmq -g rabbitmq "$RUN_DIR"
+end script
+exec /usr/sbin/rabbitmq-server > /var/log/rabbitmq/startup_log \
+                              2> /var/log/rabbitmq/startup_err
+
+post-start exec /usr/sbin/rabbitmqctl wait $PID_FILE >/dev/null 2>&1
+pre-stop exec /usr/sbin/rabbitmqctl stop $PID_FILE >/dev/null 2>&1
+
+# Get the Erlang nameserver too.
+post-stop exec /usr/bin/pkill -u rabbitmq >/dev/null 2>&1
diff --git a/elements/rabbitmq-server/files/etc/rabbitmq/rabbitmq-env.conf b/elements/rabbitmq-server/files/etc/rabbitmq/rabbitmq-env.conf
new file mode 100644
index 000000000..aadde2db9
--- /dev/null
+++ b/elements/rabbitmq-server/files/etc/rabbitmq/rabbitmq-env.conf
@@ -0,0 +1,4 @@
+HOME=/var/lib/rabbitmq
+LOG_BASE=/var/log/rabbitmq
+MNESIA_BASE=/var/lib/rabbitmq/mnesia
+PID_FILE=/var/run/rabbitmq/pid
diff --git a/elements/rabbitmq-server/install.d/20-rabbitmq-server b/elements/rabbitmq-server/install.d/20-rabbitmq-server
index f7e33d819..41bb0c4a6 100755
--- a/elements/rabbitmq-server/install.d/20-rabbitmq-server
+++ b/elements/rabbitmq-server/install.d/20-rabbitmq-server
@@ -8,6 +8,8 @@ install-packages rabbitmq-server
 register-state-path --leave-symlink /var/lib/rabbitmq
 register-state-path --leave-symlink /var/log/rabbitmq
 
+FILES="$(dirname $0)/../files"
+
 # Note(jang): the rabbitmq-server service is installed, but not started, since
 # the first run of os-collect-config is required to configure it properly.
 
@@ -20,28 +22,8 @@ if [ "$DISTRO_NAME" = "ubuntu" ] || [ "$DISTRO_NAME" = "debian" -a "$DIB_INIT_SY
     # that it'll be running a venv-based service to use directly. Install an upstart
     # configuration that's compatible with os-svc-enable and os-svc-restart
 
-    cat > /etc/init/rabbitmq-server.conf <<eof
-start on runlevel [2345]
-stop on runlevel [016]
-respawn
-# the default post-start of 1 second sleep delays respawning enough to
-# not hit the default of 10 times in 5 seconds. Make it 2 times in 5s.
-respawn limit 2 5
-
-env OS_SVC_ENABLE_CONTROL=1
-export OS_SVC_ENABLE_CONTROL
-
-pre-start script
-    [ -d "/var/run/rabbitmq" ] || install -d -D -m 0755 -o rabbitmq -g rabbitmq /var/run/rabbitmq
-end script
-exec /usr/sbin/rabbitmq-server > /var/log/rabbitmq/startup_log 2> /var/log/rabbitmq/startup_err
-post-start exec /usr/sbin/rabbitmqctl wait /var/run/rabbitmq/pid >/dev/null 2>&1
-pre-stop exec /usr/sbin/rabbitmqctl stop /var/run/rabbitmq/pid >/dev/null 2>&1
-
-# Get the Erlang nameserver too.
-post-stop exec /usr/bin/pkill -u rabbitmq >/dev/null 2>&1
-
-eof
+    FILE=/etc/init/rabbitmq-server.conf
+    install -g root -o root -m 0755 "${FILES}${FILE}" "${FILE}"
 fi
 
 if [ "$DIB_INIT_SYSTEM" = "systemd" ]; then
@@ -52,13 +34,8 @@ if [ "$DIB_INIT_SYSTEM" = "systemd" ]; then
     sed -i 's/\[Service\]/\[Service\]\nRestart=on-failure/g' /lib/systemd/system/rabbitmq-server.service
 fi
 
-
-cat > /etc/rabbitmq/rabbitmq-env.conf <<EOF
-HOME=/mnt/state/var/lib/rabbitmq
-LOG_BASE=/mnt/state/var/log/rabbitmq
-MNESIA_BASE=/mnt/state/var/lib/rabbitmq/mnesia
-PID_FILE=/var/run/rabbitmq/pid
-EOF
+FILE=/etc/rabbitmq/rabbitmq-env.conf
+install -g root -o root -m 0755 "${FILES}${FILE}" "${FILE}"
 
 # Enable ulimits in pam if needed
 PAM_FILE=/etc/pam.d/su
diff --git a/elements/rabbitmq-server/os-apply-config/mnt/state/var/lib/rabbitmq/.erlang.cookie b/elements/rabbitmq-server/os-apply-config/var/lib/rabbitmq/.erlang.cookie
similarity index 100%
rename from elements/rabbitmq-server/os-apply-config/mnt/state/var/lib/rabbitmq/.erlang.cookie
rename to elements/rabbitmq-server/os-apply-config/var/lib/rabbitmq/.erlang.cookie
diff --git a/elements/rabbitmq-server/os-refresh-config/configure.d/20-rabbitmq-server-selinux b/elements/rabbitmq-server/os-refresh-config/configure.d/20-rabbitmq-server-selinux
index 3eea7f249..5633e37bb 100755
--- a/elements/rabbitmq-server/os-refresh-config/configure.d/20-rabbitmq-server-selinux
+++ b/elements/rabbitmq-server/os-refresh-config/configure.d/20-rabbitmq-server-selinux
@@ -3,8 +3,8 @@ set -eu
 
 [ -x /usr/sbin/semanage ] || exit 0
 
-semanage fcontext -a -t rabbitmq_var_lib_t "/mnt/state/var/lib/rabbitmq(/.*)?"
-restorecon -Rv /mnt/state/var/lib/rabbitmq
+semanage fcontext -a -t rabbitmq_var_lib_t "/var/lib/rabbitmq(/.*)?"
+restorecon -Rv /var/lib/rabbitmq
 
-semanage fcontext -a -t rabbitmq_var_log_t "/mnt/state/var/log/rabbitmq(/.*)?"
-restorecon -Rv /mnt/state/var/log/rabbitmq
+semanage fcontext -a -t rabbitmq_var_log_t "/var/log/rabbitmq(/.*)?"
+restorecon -Rv /var/log/rabbitmq
diff --git a/elements/rabbitmq-server/os-refresh-config/post-configure.d/40-rabbitmq b/elements/rabbitmq-server/os-refresh-config/post-configure.d/40-rabbitmq
deleted file mode 100755
index 0189a9ad7..000000000
--- a/elements/rabbitmq-server/os-refresh-config/post-configure.d/40-rabbitmq
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-
-set -eux
-
-[ -d /mnt/state/var/log/rabbitmq ] || install -d -D -m 0770 -o rabbitmq -g rabbitmq /mnt/state/var/log/rabbitmq
-
-os-svc-enable -n rabbitmq-server
-os-svc-restart -n rabbitmq-server
-
-
-# Cluster setup
-# Why not using auto-configuration of cluster (specifying 'cluster_nodes' in
-# rabbitmq.conf):
-# 1) This is not robust because when joining a node, it iterates
-# through all nodes and joins to first available node, if no suitable node is
-# found, joining node is started standalone.
-# 2) This is done only for fresh nodes (first start, or reset db).
-# 3) You might end up with multiple different clusters A joins with B, C joins
-# with D
-#
-# When joining a node into rabbitmq cluster:
-# - if this node is already in cluster with current master[1] node, do nothing
-# - iterate through all nodes and check if there is a node which is in a
-# cluster[2], if such node exists, join to this node
-# - if no existing cluster is found:
-#     - if this is master node, start this node standalone
-#     - if it's not master node, try to join with master node otherwise fail (if
-#     fail we retry on next os-refresh-config run)
-#
-# [1] master node is first node in alphabetically sorted list of 'rabbit.nodes'
-# [2] cluster is any cluster with at least 2 running nodes
-
-function is_in_cluster() {
-    local node=$1
-    # Returns true if the list following "running_nodes" in rabbitmqctl
-    # cluster_status contains at least two nodes.
-    rabbitmqctl -n rabbit@$node cluster_status|grep -q "running_nodes,\[[^]]\+,"
-}
-
-function join_with() {
-    local node=$1
-    rabbitmqctl stop_app
-    rabbitmqctl join_cluster rabbit@$node || return 1
-    rabbitmqctl start_app
-}
-
-LOCAL=$(hostname -s)
-# TODO - nodes are comma separated hostnames, there is probably no type for this
-NODES=$(os-apply-config --key rabbit.nodes --type raw --key-default '' | sed 's/,/\n/g')
-MASTER=$(echo "$NODES"|sort -n|head -1)
-
-# Heat can return hostname with capital letters, cloud-init converts to lowercase. Make sure
-# we can compare them in a case-insensitive manor:
-LOCAL=${LOCAL,,}
-NODES=${NODES,,}
-MASTER=${MASTER,,}
-
-if [ -n "$NODES" ];then
-    if os-is-bootstrap-host; then
-        # if this is master node which is already clustered, do nothing
-        if is_in_cluster $LOCAL; then
-            exit 0
-        fi
-    else
-        # if this node is already in cluster with current master node, do nothing
-        if rabbitmqctl cluster_status|grep -q "$MASTER"; then
-            exit 0
-        fi
-    fi
-
-    JOINED_WITH=''
-    # find another node which is already clustered and try join with it
-    for NODE in $NODES;do
-        if [ ! "$NODE" = "$LOCAL" ] && is_in_cluster $NODE; then
-            if join_with $NODE; then
-                JOINED_WITH=$NODE
-                break
-            fi
-        fi
-    done
-
-    if [ -z "$JOINED_WITH"]; then
-        # if there is no existing cluster yet and this is master node, start this
-        # node standalone (other nodes will join to this one)
-        if os-is-bootstrap-host; then
-            rabbitmqctl start_app
-        else
-            if ! join_with $MASTER; then
-                echo "failed to join this node into cluster"
-                exit 1
-            fi
-        fi
-    fi
-
-    # wait until rabbitmq node is up
-    timeout 60 rabbitmqctl wait /var/run/rabbitmq/pid
-
-    # make sure that all queues (except those with auto-generated names) are
-    # mirrored across all nodes in running:
-    rabbitmqctl set_policy HA '^(?!amq\.).*' '{"ha-mode": "all"}'
-fi
diff --git a/elements/rabbitmq-server/os-refresh-config/post-configure.d/51-rabbitmq b/elements/rabbitmq-server/os-refresh-config/post-configure.d/51-rabbitmq
new file mode 100755
index 000000000..fba50f0fe
--- /dev/null
+++ b/elements/rabbitmq-server/os-refresh-config/post-configure.d/51-rabbitmq
@@ -0,0 +1,167 @@
+#!/bin/bash
+set -eux
+set -o pipefail
+
+LOCAL_RABBIT_HOST="$(os-apply-config --key bootstrap_host.nodeid --type netaddress --key-default '')"
+NODES=($(os-apply-config --key rabbit.nodes --type raw --key-default '' | sed 's/,/\n/g' | sort))
+TOTAL_NODES=${#NODES[@]}
+
+# Insufficient meta-data to attempt to start-up RabbitMQ.
+if [ -z "${LOCAL_RABBIT_HOST}" ]; then
+    echo "RabbitMQ bootstrap_host.nodeid is not defined in meta-data, aborting."
+    exit 255
+fi
+
+os-svc-enable -n rabbitmq-server
+
+## Non-cluster configuration set-up. ##
+if [ ${TOTAL_NODES} -le 1 ]; then
+    os-svc-restart -n rabbitmq-server
+    echo "RabbitMQ non-cluster configuration complete..."
+    exit 0
+fi
+
+## Cluster configuration set-up. ##
+function is_in_cluster() {
+    # True when the local cluster_status lists at least two running_nodes. Output
+    # is captured first so an early "grep -q" exit cannot trip pipefail.
+    local status; status="$(rabbitmqctl cluster_status 2>/dev/null)" || return 1
+    grep -q "running_nodes,\[[^]]\+," <<<"${status}"
+}
+
+# Number of nodes in the cluster according to remote node $1.
+# If $1 isn't in a cluster or it's in a cluster by itself, then this will
+# return 0.
+function cluster_size() {
+    local remote_node="${1}"
+    rabbitmqctl -n "rabbit@${remote_node}" cluster_status 2>/dev/null |
+        sed -n '/{running_nodes,\[[^]]\+,/,/\]\},/p' |
+        wc -l
+}
+
+function leave_cluster() {
+    # Invoked via "bash -c" (errexit is not active there), so chain explicitly.
+    # reset syncs all data into the cluster, then removes this node, cleaning local mnesia.
+    rabbitmqctl stop_app && rabbitmqctl reset
+}
+export -f leave_cluster
+
+function join_cluster_with() {
+    # Join the local app to rabbit@$1; $2 only appears in the failure message.
+    local peer="${1}" me="${2}"
+    rabbitmqctl stop_app
+    # Errors from join_cluster are ignored on purpose; membership is checked below.
+    rabbitmqctl join_cluster "rabbit@${peer}" 2>/dev/null || true
+    rabbitmqctl start_app
+    # Succeed only if the local node now sees at least two running nodes.
+    is_in_cluster && return 0
+    echo "Failed to join node [${me}] with [${peer}]..."
+    return 1
+}
+
+BOOTSTRAP_NODE="$(os-apply-config --key bootstrap_host.bootstrap_nodeid --type netaddress --key-default '')"
+NODE_INDEX=""
+# Find the nodes being worked on in the NODES array.
+for (( index = 0; index < ${TOTAL_NODES}; index++ )); do
+   if [ "${NODES[$index]}" == "${LOCAL_RABBIT_HOST}" ]; then
+      NODE_INDEX=${index}
+   fi
+done
+
+if [ -z "${BOOTSTRAP_NODE}" -o ${TOTAL_NODES} -lt 3 -o -z "${NODE_INDEX}" ]; then
+    #      We do not know who the bootstrap is, why are we attempting to bring up a Rabbit cluster?
+    # -OR- we do not have sufficient nodes to support HA so lets abort.
+    # -OR- we did not find our node in the array and hence did not set node_indexs.
+    echo "bootstrap_host.bootstrap_nodeid: ${BOOTSTRAP_NODE}, TOTAL_NODES: ${TOTAL_NODES}, NODE_INDEX: ${NODE_INDEX}"
+    echo "RabbitMQ cluster configuration prerequisites not met, aborting."
+    exit 255
+fi
+
+for (( index = 0; index < ${TOTAL_NODES}; index++ )); do
+    if ! ping -c1 "${NODES[$index]}"; then
+        echo "RabbitMQ host unreachable: ${NODES[$index]}"
+        HOST_UNREACHABLE=1
+    fi
+done
+[ -z "${HOST_UNREACHABLE:-}" ] || exit 1
+
+# Refuse to stop unless all nodes are running, this avoids pause_minority.
+# From the RabbitMQ docs: pause_minority
+#     Your network is maybe less reliable. You have clustered across 3 AZs
+#     in EC2, and you assume that only one AZ will fail at once. In that
+#     scenario you want the remaining two AZs to continue working and the
+#     nodes from the failed AZ to rejoin automatically and without fuss when
+#     the AZ comes back.
+# (See: os-apply-config/etc/rabbitmq/rabbitmq.config)
+#
+# We want to orchestrate nodes leaving the cluster. We'll do this using a
+# metronome.  For example, if we have 3 nodes, there will be six periods.
+# The first node may leave in period 0. The second node may leave in period
+# 1. The third node may leave in period 2.
+#
+# Metronome:   0 .. 1 .. 2 .. 3 .. 4 .. 5 ..
+# Node leaves: 0 ....... 1 ....... 2 .......
+#
+# The dead periods in between allow for $PERIOD seconds of clock
+# desynchronization. PERIOD should be about half the length of time it
+# takes for a node to join the cluster.
+PERIOD=10
+NODE_LEAVES_AT=$(( ${NODE_INDEX} * 2 ))
+# Stay in this loop until the local node is no longer part of a cluster.
+while is_in_cluster; do
+    NODES_IN_CLUSTER=$(cluster_size "${BOOTSTRAP_NODE}")
+    if [[ ${NODES_IN_CLUSTER} -gt ${TOTAL_NODES} ]]; then
+        echo "A node we don't know about appears to have joined the cluster, aborting."
+        exit 255
+    fi
+
+    METRONOME=$(( ($(date +%s) / ${PERIOD}) % (${TOTAL_NODES} * 2) ))
+    if [[ ${NODES_IN_CLUSTER} -eq ${TOTAL_NODES} && ${METRONOME} -eq ${NODE_LEAVES_AT} ]]; then
+        # Everyone is clustered and it is our slot: safe to leave. On failure or
+        # timeout restart the app (best effort) and always abort, even if that fails.
+        echo "Leaving cluster..."
+        timeout 300 bash -c leave_cluster || { rabbitmqctl start_app || true; exit 1; }
+    else
+        echo "Refusing to allow node to leave cluster..."
+    fi
+    sleep 2
+done
+
+# Restart RabbitMQ. We need to have left the cluster first or we risk data loss.
+os-svc-restart -n rabbitmq-server
+
+# We're the bootstrap node
+if [ "${LOCAL_RABBIT_HOST}" == "${BOOTSTRAP_NODE}" ]; then
+   # If we are not in a cluster keep trying to join a node.
+   # Note: This loop is required as the BOOTSTRAP_NODE may have left a running
+   #       cluster and it therefore must re-join.
+   while ! is_in_cluster; do
+       # Try to join with each node in turn.
+       COUNT=$(( (${COUNT:-0} + 1)  % ${TOTAL_NODES} ))
+       if [ ${COUNT} -ne ${NODE_INDEX} ]; then
+          join_cluster_with "${NODES[${COUNT}]}" "${LOCAL_RABBIT_HOST}" || true
+       fi
+   done
+
+   # Check that we have not got a partition i.e. The case where we do not have
+   # synced clocks and hence we can get split in the clustering A+B C. If we
+   # get this we will wait as this is more favourable than a bad/broken
+   # cluster set-up.
+   while [[ $(cluster_size "${LOCAL_RABBIT_HOST}") -ne ${TOTAL_NODES} ]]; do
+       echo "Waiting for nodes to join [${BOOTSTRAP_NODE}]..."
+       sleep 10
+   done
+else
+   # Wait until the BOOTSTRAP_NODE has at least formed a cluster with one node.
+   while [[ $(cluster_size "${BOOTSTRAP_NODE}") -lt 2 ]]; do
+       echo "Waiting for bootstrap node to initialise the cluster..."
+       sleep 10
+   done
+   is_in_cluster || join_cluster_with "${BOOTSTRAP_NODE}" "${LOCAL_RABBIT_HOST}"
+fi
+
+# Make sure that all queues (except those with auto-generated names) are
+# mirrored across all nodes in the cluster running:
+rabbitmqctl set_policy HA '^(?!amq\.).*' '{"ha-mode": "all"}'
+
+echo "RabbitMQ cluster configuration complete..."
diff --git a/elements/rabbitmq-server/os-refresh-config/post-configure.d/50-rabbitmq-passwords b/elements/rabbitmq-server/os-refresh-config/post-configure.d/52-rabbitmq-passwords
similarity index 100%
rename from elements/rabbitmq-server/os-refresh-config/post-configure.d/50-rabbitmq-passwords
rename to elements/rabbitmq-server/os-refresh-config/post-configure.d/52-rabbitmq-passwords
diff --git a/elements/rabbitmq-server/os-refresh-config/pre-configure.d/80-rabbitmq-cluster b/elements/rabbitmq-server/os-refresh-config/pre-configure.d/80-rabbitmq-cluster
index 8dce997a7..2f1f8b723 100755
--- a/elements/rabbitmq-server/os-refresh-config/pre-configure.d/80-rabbitmq-cluster
+++ b/elements/rabbitmq-server/os-refresh-config/pre-configure.d/80-rabbitmq-cluster
@@ -1,6 +1,4 @@
 #!/bin/bash
 set -eux
 
-[ -d /mnt/state/var/lib/rabbitmq ] || install -d -D -m 0770 -o rabbitmq -g rabbitmq /mnt/state/var/lib/rabbitmq
-install -m 600 -o rabbitmq -g rabbitmq /dev/null /mnt/state/var/lib/rabbitmq/.erlang.cookie
-[ -d /mnt/state/var/log/rabbitmq ] || install -d -D -m 0770 -o rabbitmq -g rabbitmq /mnt/state/var/log/rabbitmq
+install -m 600 -o rabbitmq -g rabbitmq /dev/null /var/lib/rabbitmq/.erlang.cookie