Wait for mariadb to stop after shutdown

Stein only.

Currently the kolla-ansible-centos-source-upgrade-ceph job is failing on
the stable/stein branch.

The problem occurs with MariaDB when upgrading to the Stein release,
which ships a newer version of MariaDB. It appears that when the slave
MariaDB services are shut down we do not wait for the container to
stop, so the service may not shut down cleanly. This prevents it from
starting up successfully afterwards.

This change waits for the container to stop after the shutdown command
has been executed. It also temporarily sets the container's restart
policy to 'no' to prevent it from starting up again after the shutdown,
and restores the configured policy afterwards.
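
In plain docker/shell terms the new handler sequence is roughly the
following sketch. The real tasks use kolla_docker and
kolla_container_facts, and the container name, address and password
values here are illustrative:

    # Prevent Docker from restarting the container once MariaDB stops.
    docker update --restart no mariadb
    # Ask MariaDB for a clean shutdown, then poll until the container exits.
    mysqladmin shutdown --host="$API_INTERFACE_ADDRESS" --user=root \
        --password="$DATABASE_PASSWORD"
    until [ -z "$(docker ps -q --filter name=mariadb)" ]; do sleep 2; done
    # Restore the configured restart policy (the value shown is illustrative;
    # the tasks restore docker_restart_policy) before restarting the service.
    docker update --restart unless-stopped mariadb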

This is not required in other branches, since the MariaDB shutdown
workaround was only added to the stable/stein branch for bug 1820325.

This change also addresses a second issue. The Stein release switched
from xtrabackup to mariabackup for Galera state syncing. If we run both
container versions at the same time on different hosts then we can get
an error such as the following:

sh: wsrep_sst_mariabackup: command not found

We therefore now stop the cluster and perform a recovery during an
upgrade, if we detect that xtrabackup is in use.
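
The check itself is just a grep of each host's rendered galera.cnf,
roughly as below (a sketch assuming the default /etc/kolla config
directory; the task uses node_config_directory):

    # If the existing Rocky-era config still selects xtrabackup for state
    # transfer, flag the cluster for a full shutdown and wsrep recovery
    # instead of a rolling restart.
    if grep -q xtrabackup /etc/kolla/mariadb/galera.cnf; then
        mariadb_recover=true
    fi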

Finally, we now wait for the bootstrap host to report that it is in an
OPERATIONAL state. Without this we can see errors where the MariaDB
cluster is not ready when used by other services.
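
The readiness check polls wsrep_evs_state through the client inside the
container until it reports OPERATIONAL, along these lines (a sketch;
the Ansible task retries 10 times with a 6 second delay, and the
container name and password value are illustrative):

    docker exec mariadb mysql -uroot -p"$DATABASE_PASSWORD" \
        --silent --skip-column-names \
        -e 'SHOW STATUS LIKE "wsrep_evs_state"' | grep -q OPERATIONAL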

Change-Id: I513bcf31adaee8441d43c6b578ca06f66820e52b
Closes-Bug: #1834191
Related-Bug: #1820325
Mark Goddard 2019-06-25 12:59:41 +01:00
parent 87f0da14c3
commit 99cd5ec10c
5 changed files with 294 additions and 17 deletions

View File

@@ -42,11 +42,45 @@
- bootstrap_host == inventory_hostname
listen: Bootstrap MariaDB cluster
- name: Wait for MariaDB to become operational
become: true
command: >-
docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }}
--silent --skip-column-names
-e 'SHOW STATUS LIKE "wsrep_evs_state"'
changed_when: false
register: result
until: '"OPERATIONAL" in result.stdout'
retries: 10
delay: 6
no_log: true
when:
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
listen: Bootstrap MariaDB cluster
# NOTE(mgoddard): In Rocky the MariaDB image had an issue where it would not
# stop on demand, and would result in Docker forcibly killing the container.
# This could lead to a failed upgrade if the new image is unable to recover
# from the crash. See https://bugs.launchpad.net/kolla-ansible/+bug/1820325.
# We need to remove the restart policy from the MariaDB container to ensure
# that it does not start back up after being shut down.
# TODO(mgoddard): Remove this task in Train.
- name: remove restart policy from slave mariadb
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
command: "docker update --restart no {{ service.container_name }}"
when:
- kolla_action != "config"
- has_cluster | bool
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: shutdown slave mariadb
vars:
service_name: "mariadb"
@@ -73,6 +107,61 @@
- kolla_action != "config"
- has_cluster | bool
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: wait for slave mariadb shutdown
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_container_facts:
name:
- "{{ service.container_name }}"
when:
- kolla_action != "config"
- has_cluster | bool
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
register: mariadb_container_facts
# Don't fail if the container is still up - we will stop it in the next task.
failed_when: false
until: service.container_name not in mariadb_container_facts
retries: 30
delay: 2
# Sometimes the mariadb service can get into a partially shutdown
# state following a 'mysqladmin shutdown'. Stop the container.
- name: stop mariadb slave
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "stop_container"
common_options: "{{ docker_common_options }}"
name: "{{ mariadb_service.container_name }}"
when:
- kolla_action != "config"
- service.container_name in mariadb_container_facts
- has_cluster | bool
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: replace restart policy on slave mariadb
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
restart_policy: "{{ docker_restart_policy }}{% if docker_restart_policy == 'on-failure' %}{{ docker_restart_policy_retry }}{% endif %}"
become: true
command: "docker update --restart {{ restart_policy }} {{ service.container_name }}"
when:
- kolla_action != "config"
- has_cluster | bool
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: restart slave mariadb
@@ -90,6 +179,7 @@
when:
- kolla_action != "config"
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
@@ -108,6 +198,7 @@
when:
- kolla_action != "config"
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: run upgrade on slave
@@ -142,7 +233,22 @@
# stop on demand, and would result in Docker forcibly killing the container.
# This could lead to a failed upgrade if the new image is unable to recover
# from the crash. See https://bugs.launchpad.net/kolla-ansible/+bug/1820325.
# We need to remove the restart policy from the MariaDB container to ensure
# that it does not start back up after being shut down.
# TODO(mgoddard): Remove this task in Train.
- name: remove restart policy from master mariadb
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
command: "docker update --restart no {{ service.container_name }}"
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: shutdown master mariadb
vars:
service_name: "mariadb"
@@ -168,6 +274,57 @@
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: wait for master mariadb shutdown
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_container_facts:
name:
- "{{ service.container_name }}"
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
register: mariadb_container_facts
# Don't fail if the container is still up - we will stop it in the next task.
failed_when: false
until: service.container_name not in mariadb_container_facts
retries: 30
delay: 2
# Sometimes the mariadb service can get into a partially shutdown
# state following a 'mysqladmin shutdown'. Stop the container.
- name: stop mariadb master
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "stop_container"
common_options: "{{ docker_common_options }}"
name: "{{ mariadb_service.container_name }}"
listen: restart mariadb
when:
- service.container_name in mariadb_container_facts
- inventory_hostname == master_host
- not mariadb_recover | default(false)
- name: replace restart policy on master mariadb
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
restart_policy: "{{ docker_restart_policy }}{% if docker_restart_policy == 'on-failure' %}{{ docker_restart_policy_retry }}{% endif %}"
become: true
command: "docker update --restart {{ restart_policy }} {{ service.container_name }}"
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: restart master mariadb
@@ -185,6 +342,7 @@
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
@@ -203,6 +361,7 @@
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: run upgrade on master

View File

@@ -20,40 +20,53 @@
run_once: true
- block:
- name: Stop MariaDB containers
# NOTE(mgoddard): In Rocky the MariaDB image had an issue where it would
# not stop on demand, and would result in Docker forcibly killing the
# container. This could lead to a failed upgrade if the new image is
# unable to recover from the crash. See
# https://bugs.launchpad.net/kolla-ansible/+bug/1820325.
# TODO(mgoddard): Remove this task in Train.
- name: Check if MariaDB is running
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
name: "{{ mariadb_service.container_name }}"
action: "stop_container"
kolla_container_facts:
name:
- "{{ service.container_name }}"
register: mariadb_container_facts
# NOTE(mgoddard): Not using an Ansible block here since rescue is broken
# with nested blocks.
- import_tasks: stop.yml
when: "'mariadb' in mariadb_container_facts"
- name: Run MariaDB wsrep recovery
become: true
kolla_docker:
action: "start_container"
common_options: "{{ docker_common_options }}"
detach: false
environment:
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
BOOTSTRAP_ARGS: "--wsrep-recover"
image: "{{ mariadb_service.image }}"
labels:
BOOTSTRAP:
name: "{{ mariadb_service.container_name }}"
name: mariadb_wsrep_recovery
restart_policy: "never"
volumes: "{{ mariadb_service.volumes }}"
- name: Stop MariaDB containers
become: true
kolla_docker:
name: "{{ mariadb_service.container_name }}"
action: "stop_container"
- name: Copying MariaDB log file to /tmp
become: true
shell: "docker cp {{ mariadb_service.container_name }}:/var/log/kolla/mariadb/mariadb.log /tmp/mariadb_tmp.log"
# Look for sequence number in logs. Format is:
# WSREP: Recovered position: <UUID>:<seqno>.
- name: Get MariaDB wsrep recovery seqno
become: true
shell: tail -n 200 /tmp/mariadb_tmp.log | awk -F" " '$0~/Recovered position/{print $NF;exit;}' | awk -F":" '{print $1}'
shell: tail -n 200 /tmp/mariadb_tmp.log | awk -F" " '$0~/Recovered position/{print $NF;exit;}' | awk -F":" '{print $2}'
register: wsrep_recovery_seqno
- name: Removing MariaDB log file from /tmp
@@ -165,6 +178,23 @@
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
- name: Wait for MariaDB to become operational
become: true
command: >-
docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }}
--silent --skip-column-names
-e 'SHOW STATUS LIKE "wsrep_evs_state"'
changed_when: false
register: result
until: '"OPERATIONAL" in result.stdout'
retries: 10
delay: 6
no_log: true
when:
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
- name: Restart slave MariaDB container
become: true
kolla_docker:

View File

@@ -1,6 +1,66 @@
---
- import_role:
role: service-stop
- name: Remove restart policy from MariaDB
vars:
project_services: "{{ mariadb_services }}"
service_name: "{{ project_name }}"
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
command: "docker update --restart no {{ service.container_name }}"
- name: Shutdown MariaDB
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "start_container"
command: >-
bash -c '
sudo -E kolla_set_configs &&
mysqladmin shutdown --host={{ api_interface_address }} --user=root --password={{ database_password }}
'
common_options: "{{ docker_common_options }}"
detach: False
name: "mariadb_shutdown"
image: "{{ service.image }}"
volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}"
labels:
UPGRADE:
restart_policy: "never"
no_log: true
- name: Wait for MariaDB shutdown
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_container_facts:
name:
- "{{ service.container_name }}"
register: mariadb_container_facts
# Don't fail if the container is still up - we will stop it in the next task.
failed_when: false
until: service.container_name not in mariadb_container_facts
retries: 30
delay: 2
# Sometimes the mariadb service can get into a partially shutdown
# state following a 'mysqladmin shutdown'. Stop the container.
- name: Stop MariaDB container
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "stop_container"
common_options: "{{ docker_common_options }}"
name: "{{ mariadb_service.container_name }}"
when: service.container_name in mariadb_container_facts
- name: Replace restart policy on MariaDB
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
restart_policy: "{{ docker_restart_policy }}{% if docker_restart_policy == 'on-failure' %}{{ docker_restart_policy_retry }}{% endif %}"
become: true
command: "docker update --restart {{ restart_policy }} {{ service.container_name }}"

View File

@@ -1,2 +1,30 @@
---
# NOTE(mgoddard): In Rocky, xtrabackup was used for galera state sync. In Stein
# this was switched to mariabackup. This means that Rocky and Stein containers
# are not compatible, and we need to shut down the entire cluster and perform
# an offline upgrade. We do this using the existing cluster recovery code.
- block:
- name: Check for use of xtrabackup
become: true
command: grep xtrabackup {{ node_config_directory }}/mariadb/galera.cnf
changed_when: false
failed_when: false
register: grep_result
- name: Set a fact to trigger shutdown and recovery of MariaDB cluster for xtrabackup to mariabackup migration
set_fact:
mariadb_recover: true
when: grep_result.rc == 0
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
when:
# Only required for multinode galera clusters on CentOS.
- groups[service.group] | length > 1
- kolla_base_distro in ['centos', 'oraclelinux', 'rhel']
- inventory_hostname in groups[service.group]
- service.enabled | bool
- include_tasks: deploy.yml

View File

@@ -2,7 +2,7 @@
kolla_base_distro: "{{ base_distro }}"
kolla_install_type: "{{ install_type }}"
network_interface: "{{ api_interface_name }}"
docker_restart_policy: "never"
docker_restart_policy: "no"
# Use a random router id, otherwise it may result in the same router id
# in the CI gate.