Fix multiple issues with MariaDB handling

These affected both deploy (and reconfigure) and upgrade,
resulting in WSREP issues, failed deploys, or the need to
recover the cluster.

This patch makes sure kolla-ansible (k-a) does not abruptly
terminate nodes and thereby break the cluster.
This is achieved by cleaner separation between stages
(bootstrap, restart current, deploy new) and 3 phases
for restarts (to keep the quorum).

Upgrade actions, which operate on a healthy cluster,
were moved to their own section.

Service restart was refactored.

We no longer rely on the master/slave distinction as
all nodes are masters in Galera.

This backport also includes the follow-up change:
Followup on MariaDB handling fixes

This fixes issues reported by Mark:
- possible failure with 4-node cluster (however unlikely)
- failure to stop all nodes from progressing when conditions are
  not valid (due to: "any_errors_fatal: False")

Closes-bug: #1857908
Closes-bug: #1859145
Change-Id: I83600c69141714fc412df0976f49019a857655f5
(cherry picked from commit 9f14ad651a)
This commit is contained in:
Radosław Piliszek 2020-01-03 11:20:00 +01:00
parent e2c600d9a1
commit 8acf5c132d
9 changed files with 233 additions and 239 deletions

View File

@ -17,16 +17,10 @@
restart_policy: no
volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}"
when:
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
listen: Bootstrap MariaDB cluster
notify:
- restart mariadb
# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
# https://github.com/ansible/ansible-modules-core/issues/2788
- name: wait first mariadb container
# NOTE(yoctozepto): We have to loop this to avoid breaking on connection resets
- name: Wait for first MariaDB service port liveness
wait_for:
host: "{{ api_interface_address }}"
port: "{{ mariadb_port }}"
@ -37,201 +31,61 @@
until: check_mariadb_port is success
retries: 10
delay: 6
when:
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
listen: Bootstrap MariaDB cluster
- name: Wait for MariaDB to become operational
- name: Wait for first MariaDB service to sync WSREP
become: true
command: >-
docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }}
--silent --skip-column-names
-e 'SHOW STATUS LIKE "wsrep_evs_state"'
-e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
changed_when: false
register: result
until: '"OPERATIONAL" in result.stdout'
until: result.stdout == "wsrep_local_state_comment\tSynced"
retries: 10
delay: 6
no_log: true
when:
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
listen: Bootstrap MariaDB cluster
- name: restart slave mariadb
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
- name: Creating haproxy mysql user
become: true
kolla_docker:
action: "recreate_or_restart_container"
common_options: "{{ docker_common_options }}"
name: "{{ service.container_name }}"
image: "{{ service.image }}"
volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}"
kolla_toolbox:
module_name: mysql_user
module_args:
login_host: "{{ api_interface_address }}"
login_port: "{{ mariadb_port }}"
login_user: "{{ database_user }}"
login_password: "{{ database_password }}"
name: "haproxy"
password: ""
host: "%"
priv: "*.*:USAGE"
listen: Bootstrap MariaDB cluster
- name: Restart MariaDB on existing cluster members
include_tasks: 'restart_services.yml'
when:
- groups.mariadb_port_alive_True is defined
- inventory_hostname in groups.mariadb_port_alive_True
- groups.mariadb_port_alive_True.index(inventory_hostname) % 4 == item
- kolla_action != "config"
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
loop:
- 0
- 1
- 2
- 3
# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
# https://github.com/ansible/ansible-modules-core/issues/2788
- name: wait for slave mariadb
wait_for:
host: "{{ api_interface_address }}"
port: "{{ mariadb_port }}"
connect_timeout: 1
timeout: 60
search_regex: "MariaDB"
register: check_mariadb_port
until: check_mariadb_port is success
retries: 10
delay: 6
- name: Start MariaDB on new nodes
include_tasks: 'restart_services.yml'
when:
- bootstrap_host is not defined or bootstrap_host != inventory_hostname
- groups.mariadb_port_alive_False is defined
- inventory_hostname in groups.mariadb_port_alive_False
- kolla_action != "config"
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: run upgrade on slave
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "start_container"
common_options: "{{ docker_common_options }}"
detach: False
dimensions: "{{ service.dimensions }}"
environment:
KOLLA_UPGRADE:
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
DB_HOST: "{{ api_interface_address }}"
DB_PORT: "{{ mariadb_port }}"
DB_ROOT_PASSWORD: "{{ database_password }}"
image: "{{ service.image }}"
labels:
UPGRADE:
name: "upgrade_mariadb"
restart_policy: no
volumes: "{{ service.volumes }}"
no_log: true
when:
- kolla_action == "upgrade"
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: restart master mariadb
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "recreate_or_restart_container"
common_options: "{{ docker_common_options }}"
name: "{{ service.container_name }}"
image: "{{ service.image }}"
volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}"
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
# https://github.com/ansible/ansible-modules-core/issues/2788
- name: Waiting for master mariadb
wait_for:
host: "{{ api_interface_address }}"
port: "{{ mariadb_port }}"
connect_timeout: 1
timeout: 60
search_regex: "MariaDB"
register: check_mariadb_port
until: check_mariadb_port is success
retries: 10
delay: 6
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: run upgrade on master
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "start_container"
common_options: "{{ docker_common_options }}"
detach: False
dimensions: "{{ service.dimensions }}"
environment:
KOLLA_UPGRADE:
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
DB_HOST: "{{ api_interface_address }}"
DB_PORT: "{{ mariadb_port }}"
DB_ROOT_PASSWORD: "{{ database_password }}"
image: "{{ service.image }}"
labels:
UPGRADE:
name: "upgrade_mariadb"
restart_policy: no
volumes: "{{ service.volumes }}"
no_log: true
when:
- kolla_action == "upgrade"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
# NOTE(yoctozepto): due to older (<=10.1) MariaDB releases defaulting to
# non-dynamic (compact) row format, we have to change tables on upgrade
# to dynamic to allow for current migrations to pass
# see also https://bugs.launchpad.net/nova/+bug/1856296 and
# https://mariadb.com/kb/en/library/troubleshooting-row-size-too-large-errors-with-innodb
- name: Find tables to change row format to DYNAMIC
vars:
service_name: "mariadb"
mariadb_service: "{{ mariadb_services[service_name] }}"
become: True
command: >-
docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }}
--silent --skip-column-names
-e "SELECT NAME
FROM information_schema.INNODB_SYS_TABLES
WHERE ROW_FORMAT IN('Redundant', 'Compact')
AND NAME NOT IN('SYS_DATAFILES', 'SYS_FOREIGN', 'SYS_FOREIGN_COLS', 'SYS_TABLESPACES', 'SYS_VIRTUAL', 'SYS_ZIP_DICT', 'SYS_ZIP_DICT_COLS');"
changed_when: False
register: tables_needing_row_format_change
run_once: True
no_log: True
when: kolla_action == "upgrade"
listen: restart mariadb
- name: Change row format to DYNAMIC for legacy tables
vars:
service_name: "mariadb"
mariadb_service: "{{ mariadb_services[service_name] }}"
become: True
command: >-
docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }}
--silent --skip-column-names
-e "{% for table in tables_needing_row_format_change.stdout_lines %}ALTER TABLE {{ table | regex_replace('/', '.') }} ROW_FORMAT=DYNAMIC;{% endfor %}"
changed_when: True
run_once: True
no_log: True
when:
- kolla_action == "upgrade"
- tables_needing_row_format_change.stdout != ''
listen: restart mariadb
- name: Ensure MariaDB is running normally on bootstrap host
include_tasks: 'restart_services.yml'
listen: Bootstrap MariaDB cluster

View File

@ -1,13 +1,9 @@
---
- name: Set a fact about the master host
set_fact:
master_host: "{{ groups['mariadb'][0] }}"
- include_tasks: lookup_cluster.yml
- include_tasks: bootstrap_cluster.yml
when:
- not has_cluster | bool
- not mariadb_cluster_exists
- inventory_hostname == groups['mariadb'][0]
- include_tasks: recover_cluster.yml

View File

@ -1,6 +1,5 @@
---
- name: Set a fact about the master host
set_fact:
master_host: "{{ groups['mariadb'][0] }}"
- import_tasks: check-containers.yml
# NOTE(yoctozepto): handlers prerequisite
- import_tasks: lookup_cluster.yml

View File

@ -1,17 +1,5 @@
---
- name: Cleaning up temp file on localhost
local_action: file path=/tmp/kolla_mariadb_cluster state=absent
changed_when: False
check_mode: no
run_once: True
- name: Creating temp file on localhost
local_action: copy content='' dest=/tmp/kolla_mariadb_cluster mode=0644
changed_when: False
check_mode: no
run_once: True
- name: Creating mariadb volume
- name: Create MariaDB volume
become: true
kolla_docker:
action: "create_volume"
@ -19,18 +7,69 @@
name: "mariadb"
register: mariadb_volume
- name: Writing hostname of host with existing cluster files to temp file
local_action: copy content={{ ansible_hostname }} dest=/tmp/kolla_mariadb_cluster mode=0644
changed_when: False
check_mode: no
when: mariadb_volume is not changed
- name: Divide hosts by their MariaDB volume availability
group_by:
key: mariadb_had_volume_{{ mariadb_volume is not changed }}
- name: Registering host from temp file
- name: Establish whether the cluster has already existed
set_fact:
has_cluster: "{{ lookup('file', '/tmp/kolla_mariadb_cluster') | length > 0 }}"
mariadb_cluster_exists: "{{ groups.mariadb_had_volume_True is defined }}"
- name: Cleaning up temp file on localhost
local_action: file path=/tmp/kolla_mariadb_cluster state=absent
changed_when: False
check_mode: no
run_once: True
- block:
- name: Check MariaDB service port liveness
wait_for:
host: "{{ api_interface_address }}"
port: "{{ mariadb_port }}"
connect_timeout: 1
timeout: 10
search_regex: "MariaDB"
register: check_mariadb_port_liveness
ignore_errors: yes
- name: Divide hosts by their MariaDB service port liveness
group_by:
key: mariadb_port_alive_{{ check_mariadb_port_liveness is success }}
- name: Fail on existing but stopped cluster
fail:
msg: MariaDB cluster exists but is stopped. Please start it using kolla-ansible mariadb_recovery
when:
# NOTE(yoctozepto): we allow single-node cluster to start
- groups['mariadb'] | length > 1
- mariadb_cluster_exists
- groups.mariadb_port_alive_True is not defined
- block:
- name: Check MariaDB service WSREP sync status
become: true
command: >-
docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }}
--silent --skip-column-names
-e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
changed_when: false
register: check_mariadb_sync_status
no_log: true
# NOTE(yoctozepto): this is extracted separately to properly escape
# the TAB character which likes to go wrong due to interaction between
# Python/Ansible/Jinja2/YAML, the way below works
- name: Extract MariaDB service WSREP sync status
set_fact:
mariadb_sync_status: "{{ check_mariadb_sync_status.stdout.split('\t')[1] }}"
when:
- groups.mariadb_port_alive_True is defined
- inventory_hostname in groups.mariadb_port_alive_True
- name: Divide hosts by their MariaDB service WSREP sync status
group_by:
key: mariadb_sync_status_{{ mariadb_sync_status | default('NA') }}
- name: Fail when MariaDB services are not synced across the whole cluster
fail:
msg: MariaDB cluster is not synced. Please wait for WSREP sync before proceeding.
when:
- groups.mariadb_port_alive_True is defined
- groups.mariadb_sync_status_Synced is not defined or
groups.mariadb_port_alive_True | sort != groups.mariadb_sync_status_Synced | sort
when: not mariadb_recover | default(False)

View File

@ -1,7 +1,7 @@
---
- fail:
msg: "MariaDB cluster was not found. Is your inventory correct?"
when: not has_cluster | bool
when: not mariadb_cluster_exists
- name: Cleaning up temp file on mariadb hosts
file:
@ -93,8 +93,6 @@
- set_fact:
bootstrap_host: "{{ mariadb_recover_inventory_name }}"
master_host: "{{ mariadb_recover_inventory_name }}"
changed_when: true
- name: Copying grastate.dat file from MariaDB container in bootstrap host
become: true

View File

@ -1,19 +1,4 @@
---
- name: Creating haproxy mysql user
become: true
kolla_toolbox:
module_name: mysql_user
module_args:
login_host: "{{ api_interface_address }}"
login_port: "{{ mariadb_port }}"
login_user: "{{ database_user }}"
login_password: "{{ database_password }}"
name: "haproxy"
password: ""
host: "%"
priv: "*.*:USAGE"
run_once: True
- import_tasks: wait_for_loadbalancer.yml
- name: Creating the Mariabackup database
@ -65,7 +50,3 @@
run_once: True
when:
- enable_mariabackup | bool
- name: Cleaning up facts
set_fact:
delegate_host: "bootstraped"

View File

@ -0,0 +1,46 @@
---
# Restart the MariaDB container on the current host and wait for it to come
# back: first until the service port answers, then (for selected hosts, see
# the `when` on the last task) until Galera reports the node as Synced.
#
# Variables read here but not defined in this file: mariadb_services,
# docker_common_options, api_interface_address, mariadb_port,
# database_password, mariadb_cluster_exists and mariadb_service.
# NOTE(review): mariadb_service is presumably provided by the role
# defaults - verify before using this file standalone.

- name: Restart MariaDB container
  vars:
    service_name: "mariadb"
    service: "{{ mariadb_services[service_name] }}"
  become: true
  kolla_docker:
    action: "recreate_or_restart_container"
    common_options: "{{ docker_common_options }}"
    name: "{{ service.container_name }}"
    image: "{{ service.image }}"
    volumes: "{{ service.volumes }}"
    dimensions: "{{ service.dimensions }}"

# NOTE(yoctozepto): We have to loop this to avoid breaking on connection resets
# Each attempt waits up to 60 seconds for the "MariaDB" banner on the port;
# the attempt is retried up to 10 times with a 6 second pause in between.
- name: Wait for MariaDB service port liveness
  wait_for:
    host: "{{ api_interface_address }}"
    port: "{{ mariadb_port }}"
    connect_timeout: 1
    timeout: 60
    search_regex: "MariaDB"
  register: check_mariadb_port
  until: check_mariadb_port is success
  retries: 10
  delay: 6

# Polls the WSREP state inside the container until the whole output is
# exactly "wsrep_local_state_comment<TAB>Synced". The command is read-only,
# hence changed_when is false. no_log keeps the root password, which is
# passed on the command line, out of the Ansible output.
- name: Wait for MariaDB service to sync WSREP
  become: true
  command: >-
    docker exec {{ mariadb_service.container_name }}
    mysql -uroot -p{{ database_password }}
    --silent --skip-column-names
    -e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
  changed_when: false
  register: result
  until: result.stdout == "wsrep_local_state_comment\tSynced"
  retries: 10
  delay: 6
  no_log: true
  when:
    # NOTE(yoctozepto): we don't want to wait for new nodes to fully sync
    # with an existing cluster as this could take time
    - not mariadb_cluster_exists or
      (groups.mariadb_port_alive_True is defined and
      inventory_hostname in groups.mariadb_port_alive_True)

View File

@ -1,2 +1,66 @@
---
# Upgrade flow for MariaDB: run the regular deploy tasks first, then run the
# one-shot upgrade container, and finally convert tables that still use a
# legacy InnoDB row format to DYNAMIC.

- include_tasks: deploy.yml

# Runs the service image as a separate, non-detached container named
# "upgrade_mariadb" with KOLLA_UPGRADE present in its environment.
# no_log keeps DB_ROOT_PASSWORD out of the Ansible output.
- name: Run upgrade in MariaDB container
  vars:
    service_name: "mariadb"
    service: "{{ mariadb_services[service_name] }}"
  become: true
  kolla_docker:
    action: "start_container"
    common_options: "{{ docker_common_options }}"
    detach: false
    dimensions: "{{ service.dimensions }}"
    environment:
      # Intentionally left without a value (YAML null), as in the original.
      KOLLA_UPGRADE:
      KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
      DB_HOST: "{{ api_interface_address }}"
      DB_PORT: "{{ mariadb_port }}"
      DB_ROOT_PASSWORD: "{{ database_password }}"
    image: "{{ service.image }}"
    labels:
      # Intentionally left without a value (YAML null), as in the original.
      UPGRADE:
    name: "upgrade_mariadb"
    # Quoted on purpose: a plain `no` is parsed as a boolean by YAML 1.1
    # loaders, while this option is meant to carry the string "no".
    restart_policy: "no"
    volumes: "{{ service.volumes }}"
  no_log: true

# NOTE(yoctozepto): due to older (<=10.1) MariaDB releases defaulting to
# non-dynamic (compact) row format, we have to change tables on upgrade
# to dynamic to allow for current migrations to pass
# see also https://bugs.launchpad.net/nova/+bug/1856296 and
# https://mariadb.com/kb/en/library/troubleshooting-row-size-too-large-errors-with-innodb
#
# Read-only query (changed_when is false), executed once for the whole play
# (run_once). The result is registered as tables_needing_row_format_change
# with one table name per output line.
- name: Find tables to change row format to DYNAMIC
  vars:
    service_name: "mariadb"
    mariadb_service: "{{ mariadb_services[service_name] }}"
  become: true
  command: >-
    docker exec {{ mariadb_service.container_name }}
    mysql -uroot -p{{ database_password }}
    --silent --skip-column-names
    -e "SELECT NAME
    FROM information_schema.INNODB_SYS_TABLES
    WHERE ROW_FORMAT IN('Redundant', 'Compact')
    AND NAME NOT IN('SYS_DATAFILES', 'SYS_FOREIGN', 'SYS_FOREIGN_COLS', 'SYS_TABLESPACES', 'SYS_VIRTUAL', 'SYS_ZIP_DICT', 'SYS_ZIP_DICT_COLS');"
  changed_when: false
  register: tables_needing_row_format_change
  run_once: true
  no_log: true

# Builds one ALTER TABLE statement per line found by the previous task and
# sends them in a single mysql invocation. The regex_replace turns the "/"
# in each listed name into "." before it is used as the table identifier.
# Skipped entirely when the previous query returned no output.
- name: Change row format to DYNAMIC for legacy tables
  vars:
    service_name: "mariadb"
    mariadb_service: "{{ mariadb_services[service_name] }}"
  become: true
  command: >-
    docker exec {{ mariadb_service.container_name }}
    mysql -uroot -p{{ database_password }}
    --silent --skip-column-names
    -e "{% for table in tables_needing_row_format_change.stdout_lines %}ALTER TABLE {{ table | regex_replace('/', '.') }} ROW_FORMAT=DYNAMIC;{% endfor %}"
  changed_when: true
  run_once: true
  no_log: true
  when:
    - tables_needing_row_format_change.stdout != ''

View File

@ -0,0 +1,17 @@
---
fixes:
- |
Fixes MariaDB issues in multinode scenarios which affected
deployment, reconfiguration, upgrade and Galera cluster resizing.
They usually manifested as WSREP issues in various places
and could lead to the need to recover the Galera cluster.
Note these issues were due to how MariaDB was handled during
Kolla Ansible runs and did not affect Galera cluster during normal
operations unless MariaDB was later touched by Kolla Ansible.
Users wishing to run actions on their Galera clusters using
Kolla Ansible are strongly advised to update.
For details please see the following Launchpad bug records:
`bug 1857908
<https://bugs.launchpad.net/kolla-ansible/+bug/1857908>`__ and
`bug 1859145
<https://bugs.launchpad.net/kolla-ansible/+bug/1859145>`__.