From 34447f8fa84b8e28d969cfea2638d6d888a328e8 Mon Sep 17 00:00:00 2001 From: Adam Spiers Date: Thu, 10 Mar 2016 15:53:41 +0000 Subject: [PATCH] Fix neutron-ha-tool for active/passive usage The neutron-ha-tool Pacemaker resource primitive is only intended to be run on a single node at a time, i.e. in active/passive mode, rather than as a clone. However, until now, the RA didn't change behaviour depending on whether it was supposed to be active on the current node. So if Pacemaker did a probe on a node where it was not expecting it to be active, the monitor action would typically return OCF_SUCCESS, causing messages from pengine like: error: Resource neutron-ha-tool (ocf::neutron-ha-tool) is active on 2 nodes attempting recovery warning: See http://clusterlabs.org/wiki/FAQ#Resource_is_Too_Active for more information. and then Pacemaker could attempt unnecessary recovery according to the value of the cluster-wide "multiple-active" option, which defaults to "stop-start". This would stop the resource everywhere (which is a noop), and then start it on one node, resulting in unnecessary cluster transitions and unnecessary runs of this RA's "start" action. To avoid this, we introduce a state file to keep track of whether it's active on the current node, and if it is not, the monitor action skips the l3-agent check and always returns OCF_NOT_RUNNING. This is the same technique already used by NovaEvacuate. Change-Id: I459e49d27802552ef5424d290ef3fca51640723b Closes-Bug: #1555711 Signed-off-by: Adam Spiers --- ocf/neutron-ha-tool | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/ocf/neutron-ha-tool b/ocf/neutron-ha-tool index cc81621..8cb1d9f 100644 --- a/ocf/neutron-ha-tool +++ b/ocf/neutron-ha-tool @@ -192,6 +192,26 @@ neutron_ha_tool_status() { } neutron_ha_tool_monitor() { + if ! [ -e "$statefile" ]; then + # neutron-ha-tool is run on a single node at a time, i.e. in + # active/passive mode. 
So we use this state file to keep + # track of whether it's active on the current node, and if + # Pacemaker does a probe on a node where it's not active, we + # skip the l3-agent check and always return OCF_NOT_RUNNING, + # otherwise we'd get messages from pengine like: + # + # error: Resource neutron-ha-tool (ocf::neutron-ha-tool) is active on + # 2 nodes attempting recovery + # warning: See http://clusterlabs.org/wiki/FAQ#Resource_is_Too_Active + # for more information. + # + # and Pacemaker could attempt unnecessary recovery according to the + # value of the cluster-wide "multiple-active" option. + ocf_log debug "neutron-ha-tool not currently active on this node; " \ + "skipping l3-agent check" + return $OCF_NOT_RUNNING + fi + INSECURE="" if ocf_is_true $OCF_RESKEY_os_insecure; then INSECURE="--insecure" @@ -210,6 +230,12 @@ neutron_ha_tool_monitor() { } neutron_ha_tool_start() { + touch "$statefile" + if ! [ -e "$statefile" ]; then + ocf_log err "Failed to create $statefile - aborting!" + return $OCF_ERR_GENERIC + fi + INSECURE="" if ocf_is_true $OCF_RESKEY_os_insecure; then INSECURE="--insecure" @@ -238,7 +264,13 @@ neutron_ha_tool_start() { } neutron_ha_tool_stop() { - # This is a noop + rm -f "$statefile" + if [ -e "$statefile" ]; then + ocf_log err "Uh-oh - failed to remove $statefile!" + # If we can't even remove a file in tmpfs (/run), something + # is *really* badly wrong, so fence the node. + return $OCF_ERR_GENERIC + fi return $OCF_SUCCESS } @@ -268,6 +300,8 @@ if [ -n "$OCF_RESKEY_os_cacert" ]; then export OS_CACERT=$OCF_RESKEY_os_cacert fi +statefile="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.active" + # What kind of method was invoked? case "$1" in start)