From ed739a5243b59596455b3488471c8cd81c15acf5 Mon Sep 17 00:00:00 2001 From: Kevin Carter Date: Thu, 16 Nov 2017 11:30:55 -0600 Subject: [PATCH] Implement a proper WSREP check for galera The galera cluster relies on WSREP for cluster consistency. While the default MySQL monitor will allow us to know when the database node is minimally functional, it does not provide the ability to query the node state allowing load balancers, operators, and deployers to know a node is healthy prior to being allowed to accept connections. This change implements the clustercheck script as provided by the fine folks at Percona. The implementation of this check follows the guidelines noted here [0]. With this in place, we'll be able to convert our haproxy check for the galera cluster nodes to use an HTTP check on port 9200 instead of the default MySQL login which will provide for a more robust and fault-tolerant cluster. [0] https://www.percona.com/doc/percona-xtradb-cluster/LATEST/howtos/virt_sandbox.html Combined backport of: - https://review.openstack.org/520665 - https://review.openstack.org/523850 Closes-Bug: #1665667 Change-Id: Ie1b3b9724dd33de1d90634166e585ecceb1f4c96 Signed-off-by: Kevin Carter --- defaults/main.yml | 12 ++ handlers/main.yml | 5 + .../notes/clustecheck-9311d05fb32f13b3.yaml | 7 ++ .../new_healthcheck-9e559565745defd0.yaml | 7 ++ tasks/galera_post_install.yml | 28 +++++ templates/clustercheck.j2 | 110 ++++++++++++++++++ templates/mysqlchk.j2 | 20 ++++ tests/test-galera-server-functional.yml | 6 + vars/redhat-7.yml | 1 + vars/suse-42.yml | 1 + vars/ubuntu-16.04.yml | 1 + 11 files changed, 198 insertions(+) create mode 100644 releasenotes/notes/clustecheck-9311d05fb32f13b3.yaml create mode 100644 releasenotes/notes/new_healthcheck-9e559565745defd0.yaml create mode 100644 templates/clustercheck.j2 create mode 100644 templates/mysqlchk.j2 diff --git a/defaults/main.yml b/defaults/main.yml index 73cb8806..6f9d98d6 100644 --- a/defaults/main.yml +++ b/defaults/main.yml 
@@ -69,6 +69,18 @@ galera_running_and_bootstrapped: false galera_monitoring_user: monitoring galera_monitoring_user_password: "" + +# WARNING: Set this to open xinetd rules for galera monitoring. +# This is REQUIRED to run a working openstack-ansible deployment. +# If it's undefined the galera cluster state can't be reported, +# and haproxy would fail to do proper load balancing on the cluster. +# Because this opens connections to the cluster status, this +# should be restricted, which we do in the integrated build. +# Please override according to your use case. +# This can be replaced with other hostnames, cidr, ips, and ips + wildcards. +# +#galera_monitoring_allowed_source: "0.0.0.0/0" + galera_root_user: root # WARNING: This option is deprecated and will be removed in v12.0 diff --git a/handlers/main.yml b/handlers/main.yml index d8e67df7..3745d6da 100644 --- a/handlers/main.yml +++ b/handlers/main.yml @@ -103,3 +103,8 @@ notify: - "Remove stale .sst" listen: "Restart all mysql" + +- name: Restart xinetd + service: + name: xinetd + state: restarted diff --git a/releasenotes/notes/clustecheck-9311d05fb32f13b3.yaml b/releasenotes/notes/clustecheck-9311d05fb32f13b3.yaml new file mode 100644 index 00000000..f40f9799 --- /dev/null +++ b/releasenotes/notes/clustecheck-9311d05fb32f13b3.yaml @@ -0,0 +1,7 @@ +--- +features: + - The galera cluster now supports cluster health checks over HTTP using port + 9200. The new cluster check ensures a node is healthy by running a simple + query against the wsrep sync status using the monitoring user. This change will + provide for a more robust cluster check ensuring we have the most fault + tolerant galera cluster possible. 
diff --git a/releasenotes/notes/new_healthcheck-9e559565745defd0.yaml b/releasenotes/notes/new_healthcheck-9e559565745defd0.yaml new file mode 100644 index 00000000..8707d297 --- /dev/null +++ b/releasenotes/notes/new_healthcheck-9e559565745defd0.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Galera healthcheck has been improved, and relies on an xinetd service. + By default, the service is inaccessible (filtered with the no_access + directive). You can override the directive by setting any xinetd + valid value to ``galera_monitoring_allowed_source``. diff --git a/tasks/galera_post_install.yml b/tasks/galera_post_install.yml index cde05529..7af2e001 100644 --- a/tasks/galera_post_install.yml +++ b/tasks/galera_post_install.yml @@ -137,3 +137,31 @@ command: "systemctl daemon-reload" when: - ansible_service_mgr == 'systemd' + +- name: Create clustercheck script + template: + src: "clustercheck.j2" + dest: "/usr/local/bin/clustercheck" + mode: "0755" + tags: + - galera-config + +- name: Create mysqlchk config + template: + src: "mysqlchk.j2" + dest: "/etc/xinetd.d/mysqlchk" + mode: "0644" + notify: + - Restart xinetd + tags: + - galera-config + +- name: Add galera service check to services + lineinfile: + dest: /etc/services + state: present + regexp: '^mysqlchk' + line: 'mysqlchk 9200/tcp # MySQL check' + backup: yes + tags: + - galera-config diff --git a/templates/clustercheck.j2 b/templates/clustercheck.j2 new file mode 100644 index 00000000..2c77e779 --- /dev/null +++ b/templates/clustercheck.j2 @@ -0,0 +1,110 @@ +#!/bin/bash +# +# Script to make a proxy (ie HAProxy) capable of monitoring Percona XtraDB Cluster nodes properly +# +# Author: Olaf van Zandwijk +# Author: Raghavendra Prabhu +# +# Documentation and download: https://github.com/olafz/percona-clustercheck +# +# Based on the original script from Unai Rodriguez +# + +# {{ ansible_managed }} + +if [[ $1 == '-h' || $1 == '--help' ]];then + echo "Usage: $0 " + exit +fi + +# if the disabled file is present, 
return 503. This allows +# admins to manually remove a node from a cluster easily. +if [ -e "/var/tmp/clustercheck.disabled" ]; then + # Shell return-code is 1 + echo -en "HTTP/1.1 503 Service Unavailable\r\n" + echo -en "Content-Type: text/plain\r\n" + echo -en "Connection: close\r\n" + echo -en "Content-Length: 51\r\n" + echo -en "\r\n" + echo -en "Percona XtraDB Cluster Node is manually disabled.\r\n" + sleep 0.1 + exit 1 +fi + +MYSQL_USERNAME="${1-{{ galera_monitoring_user }}}" +MYSQL_PASSWORD="${2-{{ galera_monitoring_user_password }}}" +AVAILABLE_WHEN_DONOR=${3:-0} +ERR_FILE="${4:-/dev/null}" +AVAILABLE_WHEN_READONLY=${5:-1} +DEFAULTS_EXTRA_FILE=${6:-/etc/my.cnf} + +#Timeout exists for instances where mysqld may be hung +TIMEOUT=10 + +EXTRA_ARGS="" +if [[ -n "$MYSQL_USERNAME" ]]; then + EXTRA_ARGS="$EXTRA_ARGS --user=${MYSQL_USERNAME}" +fi + +if [[ -n "$MYSQL_PASSWORD" ]]; then + EXTRA_ARGS="$EXTRA_ARGS --password=${MYSQL_PASSWORD}" +else + EXTRA_ARGS="$EXTRA_ARGS --password=" +fi + +if [[ -r $DEFAULTS_EXTRA_FILE ]]; then + MYSQL_CMDLINE="mysql --defaults-extra-file=$DEFAULTS_EXTRA_FILE -nNE --connect-timeout=$TIMEOUT \ + ${EXTRA_ARGS}" +else + MYSQL_CMDLINE="mysql -nNE --connect-timeout=$TIMEOUT ${EXTRA_ARGS}" +fi + +# +# Perform the query to check the wsrep_local_state +# +WSREP_STATUS=$($MYSQL_CMDLINE -e "SHOW STATUS LIKE 'wsrep_local_state';" \ + 2>${ERR_FILE} | tail -1 2>>${ERR_FILE}) + +if [[ "${WSREP_STATUS}" == "4" ]] || [[ "${WSREP_STATUS}" == "2" && ${AVAILABLE_WHEN_DONOR} == 1 ]]; then + # Check only when set to 0 to avoid latency in response. + if [[ $AVAILABLE_WHEN_READONLY -eq 0 ]];then + READ_ONLY=$($MYSQL_CMDLINE -e "SHOW GLOBAL VARIABLES LIKE 'read_only';" \ + 2>${ERR_FILE} | tail -1 2>>${ERR_FILE}) + + if [[ "${READ_ONLY}" == "ON" ]];then + # Percona XtraDB Cluster node local state is 'Synced', but it is in + # read-only mode. The variable AVAILABLE_WHEN_READONLY is set to 0. 
+ # => return HTTP 503 + # Shell return-code is 1 + echo -en "HTTP/1.1 503 Service Unavailable\r\n" + echo -en "Content-Type: text/plain\r\n" + echo -en "Connection: close\r\n" + echo -en "Content-Length: 43\r\n" + echo -en "\r\n" + echo -en "Percona XtraDB Cluster Node is read-only.\r\n" + sleep 0.1 + exit 1 + fi + fi + # Percona XtraDB Cluster node local state is 'Synced' => return HTTP 200 + # Shell return-code is 0 + echo -en "HTTP/1.1 200 OK\r\n" + echo -en "Content-Type: text/plain\r\n" + echo -en "Connection: close\r\n" + echo -en "Content-Length: 40\r\n" + echo -en "\r\n" + echo -en "Percona XtraDB Cluster Node is synced.\r\n" + sleep 0.1 + exit 0 +else + # Percona XtraDB Cluster node local state is not 'Synced' => return HTTP 503 + # Shell return-code is 1 + echo -en "HTTP/1.1 503 Service Unavailable\r\n" + echo -en "Content-Type: text/plain\r\n" + echo -en "Connection: close\r\n" + echo -en "Content-Length: 44\r\n" + echo -en "\r\n" + echo -en "Percona XtraDB Cluster Node is not synced.\r\n" + sleep 0.1 + exit 1 +fi diff --git a/templates/mysqlchk.j2 b/templates/mysqlchk.j2 new file mode 100644 index 00000000..3aa36001 --- /dev/null +++ b/templates/mysqlchk.j2 @@ -0,0 +1,20 @@ +# default: on +# description: mysqlchk +# {{ ansible_managed }} +service mysqlchk +{ + disable = no + flags = REUSE + socket_type = stream + port = 9200 + wait = no + user = nobody + server = /usr/local/bin/clustercheck + log_on_failure += USERID + {% if galera_monitoring_allowed_source is defined %} + only_from = {{ galera_monitoring_allowed_source }} + {% else %} + no_access + {% endif %} + per_source = UNLIMITED +} diff --git a/tests/test-galera-server-functional.yml b/tests/test-galera-server-functional.yml index 8eea4660..bb1fe1bf 100644 --- a/tests/test-galera-server-functional.yml +++ b/tests/test-galera-server-functional.yml @@ -27,6 +27,8 @@ --skip-column-names register: wsrep_incoming_addresses changed_when: false + tags: + - skip_ansible_lint - name: Check cluster local 
state command: | mysql -h {{ ansible_host }} \ @@ -36,6 +38,8 @@ --skip-column-names register: wsrep_local_state_comment changed_when: false + tags: + - skip_ansible_lint - name: Check cluster evs state command: | mysql -h {{ ansible_host }} \ @@ -45,6 +49,8 @@ --skip-column-names register: wsrep_evs_state changed_when: false + tags: + - skip_ansible_lint - name: Check contents assert: that: diff --git a/vars/redhat-7.yml b/vars/redhat-7.yml index 76c3dafe..ad8851a6 100644 --- a/vars/redhat-7.yml +++ b/vars/redhat-7.yml @@ -27,6 +27,7 @@ galera_server_required_distro_packages: - libgcrypt - MariaDB-client - MariaDB-devel + - xinetd galera_etc_conf_file: "/etc/mysql/my.cnf" galera_etc_include_dir: "/etc/mysql/conf.d" diff --git a/vars/suse-42.yml b/vars/suse-42.yml index 65818abe..e0b285af 100644 --- a/vars/suse-42.yml +++ b/vars/suse-42.yml @@ -26,6 +26,7 @@ galera_server_required_distro_packages: - libmysqlclient-devel - mariadb-client - qpress + - xinetd galera_etc_conf_file: "/etc/my.cnf" galera_etc_include_dir: "/etc/my.cnf.d" diff --git a/vars/ubuntu-16.04.yml b/vars/ubuntu-16.04.yml index 54fcd8db..ed1e7a34 100644 --- a/vars/ubuntu-16.04.yml +++ b/vars/ubuntu-16.04.yml @@ -38,6 +38,7 @@ galera_server_required_distro_packages: - libstdc++6 - python-software-properties - software-properties-common + - xinetd galera_etc_conf_file: "/etc/mysql/my.cnf" galera_etc_include_dir: "/etc/mysql/conf.d"