Avoid false-positive split-brain detection for mysql-wss
With change Iaa4855d769fe1e0203fcfb9981413273e0e4dda2 we detect whether the node is running as a primary component while it is not the master. While it is a good solution, sometimes we face a race condition when the node which is the 'master' gets a lower sequence number due to other nodes updating their GTID at the same time. Although it happens rarely, and mostly on slow or overloaded environments, it leads to redundant mysql restarts and service downtime for OpenStack APIs. The proper fix would be to use a master-slave resource and the corresponding script, but this is far too big a change for the bug in question. The proposed solution checks if the node is a primary component during the start and monitor operations, and also checks the number of currently running primary components by setting and querying an additional attribute `is_pc`. It triggers a monitor failure only when the node is not running with the 'master' GTID, is a primary component, and there is more than one primary component. Misc: fix function return codes to reflect the shell 'true' and 'false' numeric values. Change-Id: Id3ea32347ed37a6efffd3ee85dfb3110b2e8c8ca Closes-bug: #1651982
This commit is contained in:
parent
4f691b399f
commit
3a2cc9e24a
|
@ -317,6 +317,20 @@ update_node_gtid() {
|
|||
return 1
|
||||
}
|
||||
|
||||
# Mark this node as a Primary Component (PC) in the cluster attributes.
#
# Sets the reboot-lifetime node attribute `is_pc` to 'true' for $HOSTNAME
# via crm_attribute, so the attribute can later be queried per node.
#
# Globals:   LL (read, log prefix), HOSTNAME (read)
# Arguments: none
# Returns:   the exit status of crm_attribute
update_node_pc() {
    local LH="${LL} update_node_pc():"

    ocf_log info "${LH} Setting node PC flag to true"
    # Quote the node name so it is always passed as a single argument.
    crm_attribute --quiet --node "${HOSTNAME}" --lifetime reboot \
        --name is_pc --update 'true'
}
|
||||
|
||||
# Remove the Primary Component (PC) marker from this node.
#
# Deletes the reboot-lifetime node attribute `is_pc` for $HOSTNAME
# via crm_attribute.
#
# Globals:   LL (read, log prefix), HOSTNAME (read)
# Arguments: none
# Returns:   the exit status of crm_attribute
clear_node_pc() {
    # Define our own log header; without it the message would carry the
    # caller's LH (or none at all) instead of this function's name.
    local LH="${LL} clear_node_pc():"

    ocf_log info "${LH} Cleaning up is_pc attribute"
    # Quote the node name so it is always passed as a single argument.
    crm_attribute --quiet --node "${HOSTNAME}" --lifetime reboot \
        --name is_pc --delete
}
|
||||
|
||||
get_master_timeout() {
|
||||
local LH="${LL} get_master_timeout():"
|
||||
local timeout=$(crm_attribute --quiet --name galera_master_timeout \
|
||||
|
@ -377,11 +391,11 @@ check_if_reelection_needed() {
|
|||
--quiet --locate --resource $RESOURCE_NAME | sed -e '/(null)/d' | wc -l 2> /dev/null)
|
||||
rc=$?
|
||||
if [ $RUNNING_INSTANCES -lt 1 ]; then
|
||||
return 1
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
return 0
|
||||
return 1
|
||||
}
|
||||
|
||||
# Return 0 and the pid, if running a new cluster as a seed node
|
||||
|
@ -392,10 +406,10 @@ check_if_new_cluster() {
|
|||
grep -e "${OCF_RESKEY_datadir}.*wsrep-new-cluster" -e "wsrep-new-cluster.*${OCF_RESKEY_datadir}" | \
|
||||
awk '!/wsrep.recover|defunct/ {print $1}')
|
||||
if [ "${pid}" ]; then
|
||||
echo "${pid}"
|
||||
exit 0
|
||||
update_node_pc
|
||||
return 0
|
||||
fi
|
||||
exit 1
|
||||
return 1
|
||||
}
|
||||
|
||||
get_master() {
|
||||
|
@ -467,6 +481,7 @@ check_if_galera_pc() {
|
|||
local timeout=$(get_master_timeout)
|
||||
local GTID
|
||||
local pid
|
||||
local pcnum=0
|
||||
|
||||
ocf_log info "${LH} Checking if Primary Component"
|
||||
|
||||
|
@ -477,18 +492,29 @@ check_if_galera_pc() {
|
|||
if [ "$MASTER" = "$HOSTNAME" ]; then
|
||||
ocf_log info "${LH} I\'m Primary Component. Join me! My GTID: ${GTID}"
|
||||
echo "${GTID}"
|
||||
return 1
|
||||
return 0
|
||||
fi
|
||||
|
||||
if check_if_reelection_needed; then
|
||||
if ! check_if_reelection_needed; then
|
||||
ocf_log info "${LH} My neighbour is Primary Component with GTID: ${GTID}"
|
||||
pid=$(check_if_new_cluster)
|
||||
if [ "${pid}" ]; then
|
||||
ocf_log err "${LH} But I'm running a new cluster, PID:${pid}, this is a split-brain!"
|
||||
exit $OCF_ERR_GENERIC
|
||||
if check_if_new_cluster
|
||||
then
|
||||
for node in ${NODES}
|
||||
do
|
||||
is_pc=$(crm_attribute --quiet --node ${node} --lifetime reboot --query --name is_pc | sed -e '/(null)/d')
|
||||
if [ ${is_pc} == "true" ]
|
||||
then
|
||||
let pcnum=pcnum+1
|
||||
fi
|
||||
if [ ${pcnum} -gt 1 ]
|
||||
then
|
||||
ocf_log err "${LH} But I'm running a new cluster, PID:${pid}, this is a split-brain!"
|
||||
exit $OCF_ERR_GENERIC
|
||||
fi
|
||||
done
|
||||
fi
|
||||
echo "${GTID}"
|
||||
return 0
|
||||
return 1
|
||||
fi
|
||||
|
||||
sleep 10
|
||||
|
@ -497,7 +523,7 @@ check_if_galera_pc() {
|
|||
done
|
||||
|
||||
ocf_log info "${LH} ${HOSTNAME} is not Primary Component"
|
||||
return 0
|
||||
return 1
|
||||
}
|
||||
# Functions invoked by resource manager actions
|
||||
|
||||
|
@ -639,9 +665,10 @@ mysql_monitor() {
|
|||
fi
|
||||
|
||||
# Check if this node is the master and is running the most recent GTID
|
||||
check_if_new_cluster
|
||||
MGTID=$(check_if_galera_pc)
|
||||
rc=$?
|
||||
if [ $rc -eq 1 -a "${MGTID}" != "${GTID}" ]; then
|
||||
if [ $rc -eq 0 -a "${MGTID}" != "${GTID}" ]; then
|
||||
ocf_log err "${LH} I'm a master, and my GTID: ${GTID}, which was not expected"
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
@ -657,6 +684,8 @@ mysql_start() {
|
|||
local rc
|
||||
local dir
|
||||
local mysql_extra_params
|
||||
local set_pc=0
|
||||
|
||||
|
||||
if mysql_status info 1; then
|
||||
ocf_log info "${LH} MySQL already running"
|
||||
|
@ -703,15 +732,17 @@ mysql_start() {
|
|||
check_if_reelection_needed
|
||||
rc=$?
|
||||
|
||||
if [ $rc -eq 1 ]; then
|
||||
if [ $rc -eq 0 ]; then
|
||||
check_if_galera_pc
|
||||
rc=$?
|
||||
|
||||
if [ $rc -eq 1 ]; then
|
||||
if [ $rc -eq 0 ]; then
|
||||
mysql_extra_params="$mysql_extra_params --wsrep-new-cluster"
|
||||
set_pc=1
|
||||
fi
|
||||
fi
|
||||
|
||||
clear_node_pc
|
||||
ocf_log info "${LH} Starting MySQL"
|
||||
${OCF_RESKEY_binary} \
|
||||
--pid-file=$OCF_RESKEY_pid \
|
||||
|
@ -740,6 +771,7 @@ mysql_start() {
|
|||
done
|
||||
|
||||
ocf_log info "${LH} MySQL started"
|
||||
[ ${set_pc} -eq 1 ] && update_node_pc
|
||||
[ $rc -ne $OCF_SUCCESS ] && update_node_gtid
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
@ -750,6 +782,7 @@ mysql_cleanup() {
|
|||
crm_attribute --quiet --node $HOSTNAME --lifetime reboot --name gtid \
|
||||
--delete
|
||||
|
||||
clear_node_pc
|
||||
ocf_log debug "${LH} Delete lock file: /var/lock/subsys/mysqld"
|
||||
rm -f /var/lock/subsys/mysqld
|
||||
|
||||
|
|
Loading…
Reference in New Issue