From 34447f8fa84b8e28d969cfea2638d6d888a328e8 Mon Sep 17 00:00:00 2001
From: Adam Spiers <aspiers@suse.com>
Date: Thu, 10 Mar 2016 15:53:41 +0000
Subject: [PATCH] Fix neutron-ha-tool for active/passive usage

The neutron-ha-tool Pacemaker resource primitive is only intended to be
run on a single node at a time, i.e. in active/passive mode, rather than
as a clone.  However until now, the RA didn't change behaviour depending
on whether it was supposed to be active on the current node.  So if
Pacemaker did a probe on a node where it was not expecting it to be
active, the monitor action would typically return OCF_SUCCESS, causing
messages from pengine like:

  error: Resource neutron-ha-tool (ocf::neutron-ha-tool) is active on 2 nodes attempting recovery
  warning: See http://clusterlabs.org/wiki/FAQ#Resource_is_Too_Active for more information.

and then Pacemaker could attempt unnecessary recovery according to the
value of the cluster-wide "multiple-active" option, which defaults to
"stop-start".  This would stop the resource everywhere (which is a
noop), and then start it on one node, resulting in unnecessary cluster
transitions and unnecessary runs of this RA's "start" action.

To avoid this, we introduce a state file to keep track of whether the
resource is active on the current node.  If it is not, the monitor
action skips the l3-agent check and always returns OCF_NOT_RUNNING.
This is the same technique already used by NovaEvacuate.

Change-Id: I459e49d27802552ef5424d290ef3fca51640723b
Closes-Bug: #1555711
Signed-off-by: Adam Spiers <aspiers@suse.com>
---
 ocf/neutron-ha-tool | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/ocf/neutron-ha-tool b/ocf/neutron-ha-tool
index cc81621..8cb1d9f 100644
--- a/ocf/neutron-ha-tool
+++ b/ocf/neutron-ha-tool
@@ -192,6 +192,26 @@ neutron_ha_tool_status() {
 }
 
 neutron_ha_tool_monitor() {
+    if ! [ -e "$statefile" ]; then
+        # neutron-ha-tool is run on a single node at a time, i.e. in
+        # active/passive mode.  So we use this state file to keep
+        # track of whether it's active on the current node, and if
+        # Pacemaker does a probe on a node where it's not active, we
+        # skip the l3-agent check and always return OCF_NOT_RUNNING,
+        # otherwise we'd get messages from pengine like:
+        #
+        #   error: Resource neutron-ha-tool (ocf::neutron-ha-tool) is active on
+        #       2 nodes attempting recovery
+        #   warning: See http://clusterlabs.org/wiki/FAQ#Resource_is_Too_Active
+        #       for more information.
+        #
+        # and Pacemaker could attempt unnecessary recovery according to the
+        # value of the cluster-wide "multiple-active" option.
+        ocf_log debug "neutron-ha-tool not currently active on this node; " \
+            "skipping l3-agent check"
+        return $OCF_NOT_RUNNING
+    fi
+
     INSECURE=""
     if ocf_is_true $OCF_RESKEY_os_insecure; then
         INSECURE="--insecure"
@@ -210,6 +230,12 @@ neutron_ha_tool_monitor() {
 }
 
 neutron_ha_tool_start() {
+    touch "$statefile"
+    if ! [ -e "$statefile" ]; then
+        ocf_log err "Failed to create $statefile - aborting!"
+        return $OCF_ERR_GENERIC
+    fi
+
     INSECURE=""
     if ocf_is_true $OCF_RESKEY_os_insecure; then
         INSECURE="--insecure"
@@ -238,7 +264,13 @@ neutron_ha_tool_start() {
 }
 
 neutron_ha_tool_stop() {
-    # This is a noop
+    rm -f "$statefile"
+    if [ -e "$statefile" ]; then
+        ocf_log err "Uh-oh - failed to remove $statefile!"
+        # If we can't even remove a file in tmpfs (/run), something
+        # is *really* badly wrong, so fence the node.
+        return $OCF_ERR_GENERIC
+    fi
     return $OCF_SUCCESS
 }
 
@@ -268,6 +300,8 @@ if [ -n "$OCF_RESKEY_os_cacert" ]; then
     export OS_CACERT=$OCF_RESKEY_os_cacert
 fi
 
+statefile="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.active"
+
 # What kind of method was invoked?
 case "$1" in
     start)