diff --git a/service/files/clustercheckcron.j2 b/service/files/clustercheckcron.j2
index 63723b4..595d182 100644
--- a/service/files/clustercheckcron.j2
+++ b/service/files/clustercheckcron.j2
@@ -15,8 +15,11 @@
 # 'clustercheckpassword!';
 
 set -e
+# Forward logs to docker log collector
+exec 1>/proc/1/fd/2 2>/proc/1/fd/2
+
 if [[ $1 == '-h' || $1 == '--help' ]];then
-    echo "Usage: $0 "
+    echo "Usage: $0 [available_when_donor=0|1] [available_when_readonly=0|1] [defaults_extra_file]"
     exit
 fi
 
@@ -24,10 +27,11 @@ MYSQL_USERNAME=monitor
 MYSQL_PASSWORD={{ percona.monitor_password }}
 DISCOVERY_SERVICE={{ address("etcd", etcd.client_port) }}
 CLUSTER_NAME={{ percona.cluster_name }}
-AVAILABLE_WHEN_DONOR=${3:-0}
-ERR_FILE="${4:-/dev/null}"
-AVAILABLE_WHEN_READONLY=${5:-1}
-DEFAULTS_EXTRA_FILE=${6:-/etc/my.cnf}
+AVAILABLE_WHEN_DONOR=${1:-0}
+AVAILABLE_WHEN_READONLY=${2:-1}
+DEFAULTS_EXTRA_FILE=${3:-/etc/my.cnf}
+CURL="curl -sS"
+FIRST_RUN=1
 
 # CLUSTER_NAME to be set in enviroment
 # DISCOVERY_SERVICE to be set in enviroment
@@ -54,11 +58,19 @@ hostname=$(hostname)
 
 while true
 do
+
+if [ $FIRST_RUN -eq 1 ]; then
+    sleep 30
+    FIRST_RUN=0
+fi
 #
 # Perform the query to check the wsrep_local_state
 #
+
+# NOTE: the initial sleep above gives mysqld time to become ready
+
 WSREP_STATUS=($($MYSQL_CMDLINE -e "SHOW GLOBAL STATUS LIKE 'wsrep_%';" \
-    2>${ERR_FILE} | grep -A 1 -E 'wsrep_local_state$|wsrep_cluster_status$' \
+    | grep -A 1 -E 'wsrep_local_state$|wsrep_cluster_status$' \
     | sed -n -e '2p' -e '5p' | tr '\n' ' '))
 
 if [[ ${WSREP_STATUS[1]} == 'Primary' && ( ${WSREP_STATUS[0]} -eq 4 || \
@@ -67,28 +79,36 @@ then
 
     # Check only when set to 0 to avoid latency in response.
     if [[ $AVAILABLE_WHEN_READONLY -eq 0 ]];then
-        READ_ONLY=$($MYSQL_CMDLINE -e "SHOW GLOBAL VARIABLES LIKE 'read_only';" \
-                    2>${ERR_FILE} | tail -1 2>>${ERR_FILE})
+        READ_ONLY=$($MYSQL_CMDLINE -e "SHOW GLOBAL VARIABLES LIKE 'read_only';" | tail -1)
 
         if [[ "${READ_ONLY}" == "ON" ]];then
             # Percona XtraDB Cluster node local state is 'Synced', but it is in
             # read-only mode. The variable AVAILABLE_WHEN_READONLY is set to 0.
             # => return HTTP 503
             # Shell return-code is 1
-            curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/?recursive=true -XDELETE >> $ERR_FILE 2>&1
+            date
+            echo "Read-only node. Destroying"
+            $CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/?recursive=true -XDELETE
+            # Do not fall through and re-register the node we just removed
+            sleep 5
+            continue
         fi
 
     fi
     # Percona XtraDB Cluster node local state is 'Synced' => return HTTP 200
     # Shell return-code is 0
-    curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/ipaddr -XPUT -d value="$ipaddr" -d ttl=30 >> $ERR_FILE 2>&1
-    curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/hostname -XPUT -d value="$hostname" -d ttl=30 >> $ERR_FILE 2>&1
-    curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr -XPUT -d ttl=30 -d dir=true -d prevExist=true >> $ERR_FILE 2>&1
+    date
+    echo "Node is fine. Updating TTL"
+    $CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/ipaddr -XPUT -d value="$ipaddr" -d ttl=30
+    $CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/hostname -XPUT -d value="$hostname" -d ttl=30
+    $CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr -XPUT -d ttl=30 -d dir=true -d prevExist=true
 else
     # Percona XtraDB Cluster node local state is not 'Synced' => return HTTP
     # 503
     # Shell return-code is 1
-    curl http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/?recursive=true -XDELETE >> $ERR_FILE 2>&1
+    date
+    echo "Node state is not Synced. Destroying."
+    $CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/?recursive=true -XDELETE
 fi
 
 sleep 5
diff --git a/service/files/etcd_register.py.j2 b/service/files/etcd_register.py.j2
new file mode 100644
index 0000000..2d8a32a
--- /dev/null
+++ b/service/files/etcd_register.py.j2
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+"""Register this Galera node in etcd and print the peers to join.
+
+The only thing written to stdout is a comma-separated list of the other
+registered nodes (empty when there are none); logging goes to stderr.
+"""
+
+import functools
+import logging
+import os
+import socket
+import sys
+import time
+
+import etcd
+
+CONNECTION_ATTEMPTS = 3
+CONNECTION_DELAY = 5
+
+LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"
+LOG_FORMAT = "%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s"
+logging.basicConfig(format=LOG_FORMAT, datefmt=LOG_DATEFMT)
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.DEBUG)
+
+ETCD_PATH = "/pxc-cluster/{{ percona.cluster_name }}"
+HOSTNAME = socket.getfqdn()
+IPADDR = socket.gethostbyname(HOSTNAME)
+
+
+def retry(f):
+    """Retry ``f`` while it raises ``etcd.EtcdException``.
+
+    ``f`` is tried up to CONNECTION_ATTEMPTS times, CONNECTION_DELAY seconds
+    apart; the last attempt is unguarded so its exception propagates.
+    """
+    @functools.wraps(f)
+    def wrap(*args, **kwargs):
+        attempts = CONNECTION_ATTEMPTS
+        delay = CONNECTION_DELAY
+        while attempts > 1:
+            try:
+                return f(*args, **kwargs)
+            except etcd.EtcdException as e:
+                LOG.warning('Etcd is not ready: %s', str(e))
+                LOG.warning('Retrying in %d seconds...', delay)
+                time.sleep(delay)
+                attempts -= 1
+        return f(*args, **kwargs)
+    return wrap
+
+
+def get_etcd_client():
+    """Return an etcd client for the templated etcd address and port."""
+    return etcd.Client(host="{{ address("etcd") }}",
+                       port={{ etcd.client_port.cont }},
+                       allow_reconnect=True,
+                       read_timeout=2)
+
+
+@retry
+def fetch_status(etcd_client):
+    """Return the names of the entries registered under ETCD_PATH."""
+    key = ETCD_PATH
+    # Drop the ETCD_PATH entry itself and strip the path prefix
+    result = [str(i.key).replace(key + "/", '')
+              for i in etcd_client.read(key).leaves
+              if str(i.key) != key]
+    LOG.info("Current cluster state is: %s", result)
+    return result
+
+
+def _etcd_set(etcd_client, data, ttl):
+    """Store the ``(name, value)`` pair ``data`` under this node's key."""
+    key = os.path.join(ETCD_PATH, IPADDR, data[0])
+    etcd_client.set(key, data[1], ttl=ttl)
+    LOG.info("Set %s with value '%s'", key, data[1])
+
+
+def _etcd_create_dir(etcd_client, ttl):
+    """(Re)create this node's directory key with the given ttl."""
+    key = os.path.join(ETCD_PATH, IPADDR)
+    try:
+        etcd_client.get(key)
+        LOG.warning("Found stale key '%s', deleting", key)
+        etcd_client.delete(key, recursive=True, dir=True)
+    except etcd.EtcdKeyNotFound:
+        # Nothing to clean up
+        pass
+    etcd_client.write(key, None, ttl=ttl, dir=True)
+    LOG.info("Set ttl for '%s' directory to %s", key, ttl)
+
+
+@retry
+def set_status(etcd_client, ttl=30):
+    """Publish this node's ctime, ipaddr and hostname with the given ttl."""
+    # Use the client we were given instead of opening another connection
+    _etcd_create_dir(etcd_client, ttl)
+    _etcd_set(etcd_client, ('ctime', time.time()), ttl)
+    _etcd_set(etcd_client, ('ipaddr', IPADDR), ttl)
+    _etcd_set(etcd_client, ('hostname', HOSTNAME), ttl)
+
+
+def create_join_list(status):
+    """Return ``status`` without IPADDR as a comma-separated string.
+
+    The input list is left untouched and IPADDR does not have to be in it.
+    """
+    return ','.join(node for node in status if node != IPADDR)
+
+
+def main(ttl):
+    """Register this node under the bootstrap lock and print its peers.
+
+    Exits with status 1 on any failure so that the caller cannot mistake
+    an error (empty stdout) for "no other nodes registered".
+    """
+    lock = None
+    try:
+        etcd_client = get_etcd_client()
+        lock = etcd.Lock(etcd_client, 'galera_bootstrap')
+        LOG.info("Locking...")
+        lock.acquire(blocking=True, lock_ttl=ttl)
+        LOG.info("Successfuly acquired lock")
+        set_status(etcd_client, ttl)
+        status = fetch_status(etcd_client)
+        # This output will be stdout == data
+        print(create_join_list(status))
+    except Exception as err:
+        LOG.exception(err)
+        sys.exit(1)
+    finally:
+        # lock stays None when the failure happened before it was created
+        if lock is not None:
+            lock.release()
+            LOG.info("Successfuly released lock")
+
+
+if __name__ == "__main__":
+    main(ttl=60)
diff --git a/service/files/percona_entrypoint.sh.j2 b/service/files/percona_entrypoint.sh.j2
old mode 100755
new mode 100644
index 9fcdddc..255784b
--- a/service/files/percona_entrypoint.sh.j2
+++ b/service/files/percona_entrypoint.sh.j2
@@ -5,7 +5,6 @@ set -ex
 exec 1>/proc/1/fd/2 2>/proc/1/fd/2
 
 MYSQL_ROOT_PASSWORD={{ db.root_password }}
-DISCOVERY_SERVICE={{ address("etcd", etcd.client_port) }}
 CLUSTER_NAME={{ percona.cluster_name }}
 XTRABACKUP_PASSWORD={{ percona.xtrabackup_password }}
 MONITOR_PASSWORD={{ percona.monitor_password }}
@@ -62,19 +61,18 @@ if [ ! -e "$DATADIR/init.ok" ]; then
             echo "GENERATED ROOT PASSWORD: $MYSQL_ROOT_PASSWORD"
         fi
         "${mysql[@]}" <<-EOSQL
-		-- What's done in this file shouldn't be replicated
-		-- or products like mysql-fabric won't work
-		SET @@SESSION.SQL_LOG_BIN=0;
-		CREATE USER 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}' ;
-		GRANT ALL ON *.* TO 'root'@'%' WITH GRANT OPTION ;
-		ALTER USER 'root'@'localhost' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}';
-		CREATE USER 'xtrabackup'@'localhost' IDENTIFIED BY '$XTRABACKUP_PASSWORD';
-		GRANT RELOAD,PROCESS,LOCK TABLES,REPLICATION CLIENT ON *.* TO 'xtrabackup'@'localhost';
-		GRANT REPLICATION CLIENT ON *.* TO monitor@'%' IDENTIFIED BY '$MONITOR_PASSWORD';
-		GRANT PROCESS ON *.* TO monitor@localhost IDENTIFIED BY '$MONITOR_PASSWORD';
-		DROP DATABASE IF EXISTS test ;
-		FLUSH PRIVILEGES ;
-	EOSQL
+        -- What's done in this file shouldn't be replicated
+        -- or products like mysql-fabric won't work
+        SET @@SESSION.SQL_LOG_BIN=0;
+        CREATE USER 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}' ;
+        GRANT ALL ON *.* TO 'root'@'%' WITH GRANT OPTION ;
+        ALTER USER 'root'@'localhost' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}';
+        CREATE USER 'xtrabackup'@'localhost' IDENTIFIED BY '$XTRABACKUP_PASSWORD';
+        GRANT RELOAD,PROCESS,LOCK TABLES,REPLICATION CLIENT ON *.* TO 'xtrabackup'@'localhost';
+        GRANT REPLICATION CLIENT ON *.* TO monitor@'%' IDENTIFIED BY '$MONITOR_PASSWORD';
+        DROP DATABASE IF EXISTS test ;
+        FLUSH PRIVILEGES ;
+EOSQL
         if [ ! -z "$MYSQL_ROOT_PASSWORD" ]; then
             mysql+=( -p"${MYSQL_ROOT_PASSWORD}" )
         fi
@@ -96,8 +94,8 @@ if [ ! -e "$DATADIR/init.ok" ]; then
 
         if [ ! -z "$MYSQL_ONETIME_PASSWORD" ]; then
             "${mysql[@]}" <<-EOSQL
-			ALTER USER 'root'@'%' PASSWORD EXPIRE;
-		EOSQL
+                ALTER USER 'root'@'%' PASSWORD EXPIRE;
+EOSQL
         fi
         if ! kill -s TERM "$pid" || ! wait "$pid"; then
             echo >&2 'MySQL init process failed.'
@@ -110,57 +108,23 @@ if [ ! -e "$DATADIR/init.ok" ]; then
 fi
 touch $DATADIR/init.ok
 
-if [ -z "$DISCOVERY_SERVICE" ]; then
-	cluster_join=$CLUSTER_JOIN
+# Address this node advertises to the cluster (used by mysqld below)
+ipaddr=$(hostname -i | awk ' { print $1 } ')
+available_nodes=$(/opt/ccp/bin/etcd_register.py)
+if [ -z "$available_nodes" ]; then
+    echo "No available nodes found. Assuming Im first"
 else
-
-echo
-echo 'Registering in the discovery service'
-echo
-
-function join { local IFS="$1"; shift; echo "$*"; }
-
-# Read the list of registered IP addresses
-ipaddr=$(hostname -i | awk ' { print $1 } ')
-hostname=$(hostname)
-
-$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/queue/$CLUSTER_NAME -XPOST -d value=$ipaddr -d ttl=60
-
-#get list of IP from queue
-i=( $($CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/queue/$CLUSTER_NAME | jq -r '.node.nodes[].value') )
-
-# this remove my ip from the list
-i1=${i[@]/$ipaddr}
-
-# Register the current IP in the discovery service
-
-# key set to expire in 30 sec. There is a cronjob that should update them regularly
-$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/ipaddr -XPUT -d value="$ipaddr" -d ttl=30
-$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr/hostname -XPUT -d value="$hostname" -d ttl=30
-$CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/$ipaddr -XPUT -d ttl=30 -d dir=true -d prevExist=true
-
-i=( $($CURL http://$DISCOVERY_SERVICE/v2/keys/pxc-cluster/$CLUSTER_NAME/?quorum=true | jq -r '.node.nodes[]?.key' | awk -F'/' '{print $(NF)}') )
-# this remove my ip from the list
-i2=${i[@]/$ipaddr}
-
-# Combine to arrays and remove duplicates
-OLDIFS="$IFS"
-IFS=$'\n'
-combined=(`for R in "${i1[@]}" "${i2[@]}" ; do echo "$R" ; done | sort -du`)
-IFS="$OLDIFS"
-
-cluster_join=$(join , $combined )
-echo "Joining cluster $cluster_join"
-
-bash /opt/ccp/bin/clustercheckcron monitor monitor 1 /var/lib/mysql/clustercheck.log 1 &
-
+    echo "Joining to nodes: $available_nodes"
 fi
 
+bash /opt/ccp/bin/clustercheckcron 1 1 &
+
 mysqld --user=mysql --wsrep_cluster_name=$CLUSTER_NAME \
-    --wsrep_cluster_address="gcomm://$cluster_join" \
+    --wsrep_cluster_address="gcomm://$available_nodes" \
     --wsrep_sst_method=xtrabackup-v2 \
     --wsrep_sst_auth="xtrabackup:$XTRABACKUP_PASSWORD" \
     --wsrep_node_address="$ipaddr" \
     --pxc_strict_mode=PERMISSIVE \
     $CMDARG
 
+# vim: set ts=4 sw=4 tw=0 et :
diff --git a/service/galera.yaml b/service/galera.yaml
index dcfcb45..b515acd 100644
--- a/service/galera.yaml
+++ b/service/galera.yaml
@@ -31,6 +31,7 @@ service:
                   - mycnf
                   - check
                   - readiness
+                  - galera-etcd-register
               dependencies:
                   - etcd
               command: /opt/ccp/bin/entrypoint.sh
@@ -51,3 +52,7 @@ files:
         path: /opt/ccp/bin/percona_readiness.py
         content: percona_readiness.py.j2
         perm: "0750"
+    galera-etcd-register:
+        path: /opt/ccp/bin/etcd_register.py
+        content: etcd_register.py.j2
+        perm: "0755"