Check cluster_status liveness during OCF checks
Upstream PR - https://github.com/rabbitmq/rabbitmq-server/pull/819. The `master`-first policy doesn't apply - the OCF script has been removed there. We've observed an autoheal bug that made cluster_status become stuck forever. This will help alleviate the problem until a proper fix for autoheal is developed. Change-Id: I15c9c5f2257ba7eb6414bf5d1372f5bf2b216e44 Closes-Bug: 1585128
This commit is contained in:
parent
e7fd4528e1
commit
3d8fae9943
|
@ -1591,6 +1591,10 @@ get_monitor() {
|
|||
fi
|
||||
fi
|
||||
|
||||
if ! is_cluster_status_ok ; then
|
||||
rc=$OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
# Check if the list of all queues is available,
|
||||
# Also report some queues stats and total virtual memory.
|
||||
local queues
|
||||
|
@ -1630,6 +1634,36 @@ get_monitor() {
|
|||
return $rc
|
||||
}
|
||||
|
||||
# Update a node attribute by invoking attrd_updater (with its -p flag)
# through ocf_run.
#
# Arguments:
#   $1 - attribute name; must be set and non-empty
#   $2 - value to store; must be set and non-empty
# Returns:
#   the exit status of the ocf_run invocation
ocf_update_private_attr() {
    local name="${1:?}"
    local value="${2:?}"

    ocf_run attrd_updater -p --name "${name}" --update "${value}"
}
|
||||
|
||||
# Run a rabbitmqctl subcommand via su_rabbit_cmd and translate the verdict of
# check_timeouts into the function's exit status.
#
# Arguments:
#   $1 - rabbitmqctl arguments, appended to ${OCF_RESKEY_ctl}
#   $2 - name of the attribute handed to check_timeouts for this command
# Returns:
#   the command's own exit status when check_timeouts returns 0
#   0 when check_timeouts returns 1
#   1 when check_timeouts returns 2, or any other (unexpected) status
#
# NOTE(review): presumably check_timeouts returns 1 for a tolerated timeout
# and 2 once the tolerated number of timeouts is exceeded - confirm against
# its definition.
rabbitmqctl_with_timeout_check() {
    local command="${1:?}"
    local timeout_attr_name="${2:?}"
    local rc
    local has_timed_out

    su_rabbit_cmd "${OCF_RESKEY_ctl} $command"
    rc=$?

    check_timeouts "$rc" "$timeout_attr_name" "$command"
    has_timed_out=$?

    case "$has_timed_out" in
        0)
            return "$rc";;
        1)
            return 0;;
        *)
            # Fail closed: a `case` with no matching arm exits 0, which would
            # report success for any status other than 0, 1 or 2.
            return 1;;
    esac
}
|
||||
|
||||
# Probe `rabbitmqctl cluster_status` through rabbitmqctl_with_timeout_check,
# using the rabbit_cluster_status_timeouts attribute for timeout bookkeeping.
# All output of the probe is discarded; only the exit status matters.
#
# Returns:
#   the exit status of rabbitmqctl_with_timeout_check
is_cluster_status_ok() {
    # Extend the log-header prefix for the duration of this call.
    local LH="${LH}: is_cluster_status_ok:"

    {
        rabbitmqctl_with_timeout_check \
            cluster_status rabbit_cluster_status_timeouts
    } >/dev/null 2>&1
    return $?
}
|
||||
|
||||
action_monitor() {
|
||||
local rc=$OCF_ERR_GENERIC
|
||||
|
@ -1670,9 +1704,12 @@ action_start() {
|
|||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
|
||||
local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts"
|
||||
local attr_name_to_reset
|
||||
for attr_name_to_reset in $attrs_to_zero; do
|
||||
ocf_update_private_attr $attr_name_to_reset 0
|
||||
done
|
||||
|
||||
ocf_log info "${LH} Deleting start time attribute"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
|
||||
ocf_log info "${LH} Deleting master attribute"
|
||||
|
|
Loading…
Reference in New Issue