etcd registration rework with proper locking

Plus small changes in clustercheckcron

Change-Id: Id9039514d034adab5abe627f2de93a1df6d86f85
This commit is contained in:
Proskurin Kirill 2016-11-18 13:27:49 +00:00
parent bc59213349
commit ada422b39e
4 changed files with 180 additions and 73 deletions

View File

@ -15,8 +15,11 @@
# 'clustercheckpassword!';
set -e
# Forward logs to docker log collector
exec 1>/proc/1/fd/2 2>/proc/1/fd/2
if [[ $1 == '-h' || $1 == '--help' ]];then
echo "Usage: $0 <user> <pass> <available_when_donor=0|1> <log_file> <available_when_readonly=0|1> <defaults_extra_file>"
echo "Usage: $0 <available_when_donor=0|1> <log_file> <available_when_readonly=0|1> <defaults_extra_file>"
exit
fi
@ -24,10 +27,11 @@ MYSQL_USERNAME=monitor
MYSQL_PASSWORD={{ percona.monitor_password }}
DISCOVERY_SERVICE={{ address("etcd", etcd.client_port) }}
CLUSTER_NAME={{ percona.cluster_name }}
AVAILABLE_WHEN_DONOR=${3:-0}
ERR_FILE="${4:-/dev/null}"
AVAILABLE_WHEN_READONLY=${5:-1}
DEFAULTS_EXTRA_FILE=${6:-/etc/my.cnf}
AVAILABLE_WHEN_DONOR=${1:-0}
AVAILABLE_WHEN_READONLY=${2:-1}
DEFAULTS_EXTRA_FILE=${3:-/etc/my.cnf}
CURL="curl -sS"
FIRST_RUN=1
# CLUSTER_NAME to be set in environment
# DISCOVERY_SERVICE to be set in environment
@ -54,11 +58,19 @@ hostname=$(hostname)
while true
do
if [ $FIRST_RUN -eq 1 ]; then
sleep 30
FIRST_RUN=0
fi
#
# Perform the query to check the wsrep_local_state
#
# Race condition: we need to wait until mysql is ready before querying it
WSREP_STATUS=($($MYSQL_CMDLINE -e "SHOW GLOBAL STATUS LIKE 'wsrep_%';" \
2>${ERR_FILE} | grep -A 1 -E 'wsrep_local_state$|wsrep_cluster_status$' \
| grep -A 1 -E 'wsrep_local_state$|wsrep_cluster_status$' \
| sed -n -e '2p' -e '5p' | tr '\n' ' '))
if [[ ${WSREP_STATUS[1]} == 'Primary' && ( ${WSREP_STATUS[0]} -eq 4 || \
@ -67,28 +79,33 @@ then
# Check only when set to 0 to avoid latency in response.
if [[ $AVAILABLE_WHEN_READONLY -eq 0 ]];then
READ_ONLY=$($MYSQL_CMDLINE -e "SHOW GLOBAL VARIABLES LIKE 'read_only';" \
2>${ERR_FILE} | tail -1 2>>${ERR_FILE})
READ_ONLY=$($MYSQL_CMDLINE -e "SHOW GLOBAL VARIABLES LIKE 'read_only';")
if [[ "${READ_ONLY}" == "ON" ]];then
# Percona XtraDB Cluster node local state is 'Synced', but it is in
# read-only mode. The variable AVAILABLE_WHEN_READONLY is set to 0.
# => return HTTP 503
# Shell return-code is 1
curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/?recursive=true -XDELETE >> $ERR_FILE 2>&1
date
echo "Read-only node. Destroying"
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/?recursive=true -XDELETE
fi
fi
# Percona XtraDB Cluster node local state is 'Synced' => return HTTP 200
# Shell return-code is 0
curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/ipaddr -XPUT -d value="$ipaddr" -d ttl=30 >> $ERR_FILE 2>&1
curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/hostname -XPUT -d value="$hostname" -d ttl=30 >> $ERR_FILE 2>&1
curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr -XPUT -d ttl=30 -d dir=true -d prevExist=true >> $ERR_FILE 2>&1
date
echo "Node is fine. Updating TTL"
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/ipaddr -XPUT -d value="$ipaddr" -d ttl=30
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/hostname -XPUT -d value="$hostname" -d ttl=30
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr -XPUT -d ttl=30 -d dir=true -d prevExist=true
else
# Percona XtraDB Cluster node local state is not 'Synced' => return HTTP
# 503
# Shell return-code is 1
curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/?recursive=true -XDELETE >> $ERR_FILE 2>&1
date
echo "Node state is not Synced. Destroying."
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/?recursive=true -XDELETE
fi
sleep 5

View File

@ -0,0 +1,123 @@
#!/usr/bin/env python
import functools
import logging
import os
import socket
import sys
import time
import etcd
# Retry policy consumed by the ``retry`` decorator below: total number of
# attempts and the pause, in seconds, between two attempts.
CONNECTION_ATTEMPTS = 3
CONNECTION_DELAY = 5
# Log record layout: timestamp with milliseconds, level name, message.
LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"
LOG_FORMAT = "%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s"
# basicConfig() without a stream/filename argument logs to stderr, which
# keeps stdout free for the join list printed by main().
logging.basicConfig(format=LOG_FORMAT, datefmt=LOG_DATEFMT)
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.DEBUG)
# Base etcd key under which every node of the cluster registers itself; the
# cluster name is substituted when this template is rendered.
ETCD_PATH = "/pxc-cluster/{{ percona.cluster_name }}"
# Identity of the local node as it is written to etcd: fully qualified host
# name and the address that name resolves to.
HOSTNAME = socket.getfqdn()
IPADDR = socket.gethostbyname(HOSTNAME)
def retry(f):
    """Decorator that re-invokes ``f`` when an etcd error is raised.

    The wrapped callable is tried up to ``CONNECTION_ATTEMPTS`` times in
    total.  Every attempt except the last one catches
    ``etcd.EtcdException``, logs it and sleeps ``CONNECTION_DELAY`` seconds
    before trying again.  The last attempt runs outside of the ``try`` so
    that its exception, if any, propagates to the caller.
    """
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        pause = CONNECTION_DELAY
        # All attempts but the final one are guarded.
        for _ in range(CONNECTION_ATTEMPTS - 1):
            try:
                return f(*args, **kwargs)
            except etcd.EtcdException as exc:
                LOG.warning('Etcd is not ready: %s', str(exc))
                LOG.warning('Retrying in %d seconds...', pause)
                time.sleep(pause)
        # Final, unguarded attempt.
        return f(*args, **kwargs)
    return wrap
def get_etcd_client():
return etcd.Client(host="{{ address("etcd") }}",
port={{ etcd.client_port.cont }},
allow_reconnect=True,
read_timeout=2)
@retry
def fetch_status(etcd_client):
    """Return the names of the entries registered under ``ETCD_PATH``.

    Reads ``ETCD_PATH`` through ``etcd_client`` and, for every leaf whose
    key differs from ``ETCD_PATH`` itself, strips the ``ETCD_PATH + "/"``
    prefix from the key.  The resulting list is logged and returned.
    Etcd errors are retried by the ``retry`` decorator.
    """
    prefix = ETCD_PATH + "/"
    result = []
    for leaf in etcd_client.read(ETCD_PATH).leaves:
        leaf_key = str(leaf.key)
        # The read may hand back the base key itself; it is not a member.
        if leaf_key == ETCD_PATH:
            continue
        result.append(leaf_key.replace(prefix, ''))
    LOG.info("Current cluster state is: %s", result)
    return result
def _etcd_set(etcd_client, data, ttl):
    """Store one attribute of the local node in etcd.

    :param etcd_client: client whose ``set(key, value, ttl=...)`` is called.
    :param data: indexable pair; ``data[0]`` is the attribute name appended
        to ``ETCD_PATH/IPADDR`` and ``data[1]`` is the value to store.
    :param ttl: passed through unchanged as the ``ttl`` of the key.
    """
    name = data[0]
    value = data[1]
    node_key = os.path.join(ETCD_PATH, IPADDR, name)
    etcd_client.set(node_key, value, ttl=ttl)
    LOG.info("Set %s with value '%s'", node_key, value)
def _etcd_create_dir(etcd_client, ttl):
    """(Re)create the local node's etcd directory with the given TTL.

    If ``ETCD_PATH/IPADDR`` already exists it is treated as stale and is
    deleted recursively before the directory is created again.

    :param etcd_client: client used for the get/delete/write calls.
    :param ttl: passed through unchanged as the ``ttl`` of the directory.
    """
    key = os.path.join(ETCD_PATH, IPADDR)
    try:
        etcd_client.get(key)
        LOG.warning("Found stale key '%s', deleting", key)
        etcd_client.delete(key, recursive=True, dir=True)
    except etcd.EtcdKeyNotFound:
        # Nothing to clean up: the key never existed, or it disappeared
        # between the lookup and the delete.
        pass
    # Single creation path shared by the "stale key" and "no key" cases, so
    # the write is issued exactly once and stays outside of the handler.
    etcd_client.write(key, None, ttl=ttl, dir=True)
    LOG.info("Set ttl for '%s' directory to %s", key, ttl)
@retry
def set_status(etcd_client, ttl=30):
    """Register the local node in etcd.

    Recreates the node directory and stores the ``ctime``, ``ipaddr`` and
    ``hostname`` attributes below it, all with the same TTL.  Etcd errors
    are retried by the ``retry`` decorator.

    :param etcd_client: client to use; when ``None`` a new one is created
        with ``get_etcd_client()``.
    :param ttl: TTL applied to the directory and to every attribute.
    """
    # Use the client handed in by the caller instead of silently replacing
    # it; only fall back to a fresh client when none was supplied.
    if etcd_client is None:
        etcd_client = get_etcd_client()
    _etcd_create_dir(etcd_client, ttl)
    _etcd_set(etcd_client, ('ctime', time.time()), ttl)
    _etcd_set(etcd_client, ('ipaddr', IPADDR), ttl)
    _etcd_set(etcd_client, ('hostname', HOSTNAME), ttl)
def create_join_list(status):
    """Build the comma-separated list of peers to join.

    :param status: iterable of registered node names, as returned by
        ``fetch_status``.  It is not modified.
    :returns: every entry of ``status`` except ``IPADDR``, joined with
        ``","``; an empty string when no other node is registered.
    """
    # Filtering instead of list.remove() tolerates a listing that does not
    # contain the local address and leaves the caller's list untouched.
    return ','.join(node for node in status if node != IPADDR)
def main(ttl):
    """Register this node under the bootstrap lock and print its peers.

    Takes the ``galera_bootstrap`` etcd lock, registers the local node,
    reads the list of registered nodes and prints the join list to stdout.
    Any failure is logged and turned into exit status 1.

    :param ttl: used both as the lock TTL and as the TTL of the keys
        written by ``set_status``.
    """
    lock = None
    acquired = False
    try:
        etcd_client = get_etcd_client()
        lock = etcd.Lock(etcd_client, 'galera_bootstrap')
        LOG.info("Locking...")
        lock.acquire(blocking=True, lock_ttl=ttl)
        acquired = True
        LOG.info("Successfuly acquired lock")
        set_status(etcd_client, ttl)
        status = fetch_status(etcd_client)
        # This output will be stdout == data
        print(create_join_list(status))
    except Exception as err:
        LOG.exception(err)
        # The entrypoint treats empty output as "no other nodes, I am the
        # first one"; a failure must therefore not look like a successful
        # run that found no peers.
        sys.exit(1)
    finally:
        # Only release a lock that was really taken; otherwise the cleanup
        # itself would fail and hide the original error.
        if acquired:
            lock.release()
            LOG.info("Successfuly released lock")


if __name__ == "__main__":
    main(ttl=60)

82
service/files/percona_entrypoint.sh.j2 Executable file → Normal file
View File

@ -5,7 +5,6 @@ set -ex
exec 1>/proc/1/fd/2 2>/proc/1/fd/2
MYSQL_ROOT_PASSWORD={{ db.root_password }}
DISCOVERY_SERVICE={{ address("etcd", etcd.client_port) }}
CLUSTER_NAME={{ percona.cluster_name }}
XTRABACKUP_PASSWORD={{ percona.xtrabackup_password }}
MONITOR_PASSWORD={{ percona.monitor_password }}
@ -62,19 +61,18 @@ if [ ! -e "$DATADIR/init.ok" ]; then
echo "GENERATED ROOT PASSWORD: $MYSQL_ROOT_PASSWORD"
fi
"${mysql[@]}" <<-EOSQL
-- What's done in this file shouldn't be replicated
-- or products like mysql-fabric won't work
SET @@SESSION.SQL_LOG_BIN=0;
CREATE USER 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}' ;
GRANT ALL ON *.* TO 'root'@'%' WITH GRANT OPTION ;
ALTER USER 'root'@'localhost' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}';
CREATE USER 'xtrabackup'@'localhost' IDENTIFIED BY '$XTRABACKUP_PASSWORD';
GRANT RELOAD,PROCESS,LOCK TABLES,REPLICATION CLIENT ON *.* TO 'xtrabackup'@'localhost';
GRANT REPLICATION CLIENT ON *.* TO monitor@'%' IDENTIFIED BY '$MONITOR_PASSWORD';
GRANT PROCESS ON *.* TO monitor@localhost IDENTIFIED BY '$MONITOR_PASSWORD';
DROP DATABASE IF EXISTS test ;
FLUSH PRIVILEGES ;
EOSQL
-- What's done in this file shouldn't be replicated
-- or products like mysql-fabric won't work
SET @@SESSION.SQL_LOG_BIN=0;
CREATE USER 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}' ;
GRANT ALL ON *.* TO 'root'@'%' WITH GRANT OPTION ;
ALTER USER 'root'@'localhost' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}';
CREATE USER 'xtrabackup'@'localhost' IDENTIFIED BY '$XTRABACKUP_PASSWORD';
GRANT RELOAD,PROCESS,LOCK TABLES,REPLICATION CLIENT ON *.* TO 'xtrabackup'@'localhost';
GRANT REPLICATION CLIENT ON *.* TO monitor@'%' IDENTIFIED BY '$MONITOR_PASSWORD';
DROP DATABASE IF EXISTS test ;
FLUSH PRIVILEGES ;
EOSQL
if [ ! -z "$MYSQL_ROOT_PASSWORD" ]; then
mysql+=( -p"${MYSQL_ROOT_PASSWORD}" )
fi
@ -96,8 +94,8 @@ if [ ! -e "$DATADIR/init.ok" ]; then
if [ ! -z "$MYSQL_ONETIME_PASSWORD" ]; then
"${mysql[@]}" <<-EOSQL
ALTER USER 'root'@'%' PASSWORD EXPIRE;
EOSQL
ALTER USER 'root'@'%' PASSWORD EXPIRE;
EOSQL
fi
if ! kill -s TERM "$pid" || ! wait "$pid"; then
echo >&2 'MySQL init process failed.'
@ -110,57 +108,21 @@ if [ ! -e "$DATADIR/init.ok" ]; then
fi
touch $DATADIR/init.ok
if [ -z "$DISCOVERY_SERVICE" ]; then
cluster_join=$CLUSTER_JOIN
available_nodes=$(/opt/ccp/bin/etcd_register.py)
if [ -z "$available_nodes" ]; then
echo "No available nodes found. Assuming Im first"
else
echo
echo 'Registering in the discovery service'
echo
function join { local IFS="$1"; shift; echo "$*"; }
# Read the list of registered IP addresses
ipaddr=$(hostname -i | awk ' { print $1 } ')
hostname=$(hostname)
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/queue/$CLUSTER_NAME -XPOST -d value=$ipaddr -d ttl=60
#get list of IP from queue
i=( $($CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/queue/$CLUSTER_NAME | jq -r '.node.nodes[].value') )
# this remove my ip from the list
i1=${i[@]/$ipaddr}
# Register the current IP in the discovery service
# key set to expire in 30 sec. There is a cronjob that should update them regularly
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/ipaddr -XPUT -d value="$ipaddr" -d ttl=30
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/hostname -XPUT -d value="$hostname" -d ttl=30
$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr -XPUT -d ttl=30 -d dir=true -d prevExist=true
i=( $($CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/?quorum=true | jq -r '.node.nodes[]?.key' | awk -F'/' '{print $(NF)}') )
# this remove my ip from the list
i2=${i[@]/$ipaddr}
# Combine to arrays and remove duplicates
OLDIFS="$IFS"
IFS=$'\n'
combined=(`for R in "${i1[@]}" "${i2[@]}" ; do echo "$R" ; done | sort -du`)
IFS="$OLDIFS"
cluster_join=$(join , $combined )
echo "Joining cluster $cluster_join"
bash /opt/ccp/bin/clustercheckcron monitor monitor 1 /var/lib/mysql/clustercheck.log 1 &
echo "Joining to nodes: $available_nodes"
fi
bash /opt/ccp/bin/clustercheckcron 1 1 &
mysqld --user=mysql --wsrep_cluster_name=$CLUSTER_NAME \
--wsrep_cluster_address="gcomm://$cluster_join" \
--wsrep_cluster_address="gcomm://$available_nodes" \
--wsrep_sst_method=xtrabackup-v2 \
--wsrep_sst_auth="xtrabackup:$XTRABACKUP_PASSWORD" \
--wsrep_node_address="$ipaddr" \
--pxc_strict_mode=PERMISSIVE \
$CMDARG
# vim: set ts=4 sw=4 tw=0 et :

View File

@ -31,6 +31,7 @@ service:
- mycnf
- check
- readiness
- galera-etcd-register
dependencies:
- etcd
command: /opt/ccp/bin/entrypoint.sh
@ -51,3 +52,7 @@ files:
path: /opt/ccp/bin/percona_readiness.py
content: percona_readiness.py.j2
perm: "0750"
galera-etcd-register:
path: /opt/ccp/bin/etcd_register.py
content: etcd_register.py.j2
perm: "0755"