Check cluster_status liveness during OCF checks

Upstream PR - https://github.com/rabbitmq/rabbitmq-server/pull/819
`master`-first policy doesn't apply - OCF script is removed there.

We've observed an autoheal bug that made cluster_status become stuck
forever. This will help alleviate the problem until a proper fix for
autoheal is developed.

Change-Id: I15c9c5f2257ba7eb6414bf5d1372f5bf2b216e44
Closes-Bug: 1585128
This commit is contained in:
Alexey Lebedeff 2016-06-07 18:02:14 +03:00 committed by Dina Belova
parent e7fd4528e1
commit 3d8fae9943
1 changed files with 40 additions and 3 deletions

View File

@ -1591,6 +1591,10 @@ get_monitor() {
fi
fi
if ! is_cluster_status_ok ; then
rc=$OCF_ERR_GENERIC
fi
# Check if the list of all queues is available,
# Also report some queues stats and total virtual memory.
local queues
@ -1630,6 +1634,36 @@ get_monitor() {
return $rc
}
# Set an attribute through `attrd_updater -p`, executed via ocf_run.
#
# Arguments:
#   $1 - attribute name (required, must be non-empty)
#   $2 - value to store (required, must be non-empty)
# Returns:
#   the exit status of the ocf_run invocation
ocf_update_private_attr() {
    local name value

    # ${N:?} aborts with a shell error when the argument is missing or empty.
    name="${1:?}"
    value="${2:?}"

    ocf_run attrd_updater -p \
        --name "$name" \
        --update "$value"
}
# Run a rabbitmqctl sub-command via su_rabbit_cmd and map its exit status
# through check_timeouts.
#
# Arguments:
#   $1 - rabbitmqctl command line, appended to ${OCF_RESKEY_ctl} (required)
#   $2 - attribute name handed to check_timeouts (required)
# Returns, depending on the status reported by check_timeouts:
#   0     -> the exit status of the rabbitmqctl command itself
#   1     -> 0
#   2     -> 1
#   other -> 0
# NOTE(review): presumably 1 means "timed out but still tolerated" and 2 means
# "timed out too many times" - confirm against check_timeouts.
rabbitmqctl_with_timeout_check() {
    local command="${1:?}"
    local timeout_attr_name="${2:?}"

    # The command line is deliberately passed as ONE string argument.
    su_rabbit_cmd "${OCF_RESKEY_ctl} $command"
    local rc=$?

    # Quote every argument so that check_timeouts always receives exactly
    # three parameters, regardless of whitespace or glob characters.
    check_timeouts "$rc" "$timeout_attr_name" "$command"
    local has_timed_out=$?

    case "$has_timed_out" in
        0)
            return "$rc" ;;
        1)
            return 0 ;;
        2)
            return 1 ;;
        *)
            # Any other status used to fall out of the case statement and
            # yield 0; keep that result, but make it explicit.
            return 0 ;;
    esac
}
# Probe `rabbitmqctl cluster_status` through rabbitmqctl_with_timeout_check,
# tracking timeouts under the rabbit_cluster_status_timeouts attribute.
#
# All output of the probe (stdout and stderr) is discarded; only the exit
# status is of interest.
#
# Returns:
#   the status returned by rabbitmqctl_with_timeout_check
is_cluster_status_ok() {
    # Extend the inherited log prefix for anything logged further down.
    local LH="${LH}: is_cluster_status_ok:"

    rabbitmqctl_with_timeout_check \
        cluster_status \
        rabbit_cluster_status_timeouts \
        >/dev/null 2>&1
    local check_rc=$?

    return "$check_rc"
}
action_monitor() {
local rc=$OCF_ERR_GENERIC
@ -1670,9 +1704,12 @@ action_start() {
return $OCF_SUCCESS
fi
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts"
local attr_name_to_reset
for attr_name_to_reset in $attrs_to_zero; do
ocf_update_private_attr $attr_name_to_reset 0
done
ocf_log info "${LH} Deleting start time attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
ocf_log info "${LH} Deleting master attribute"