Merge "MariaDB: Galera cluster refactor"

Zuul 2018-10-25 16:32:56 +00:00 committed by Gerrit Code Review
commit 4835aa637a
7 changed files with 825 additions and 218 deletions


@@ -0,0 +1,693 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright 2018 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import re
import logging
import select
import signal
import subprocess
import socket
import tempfile
import time
import threading
from datetime import datetime, timedelta
import configparser
import iso8601
import kubernetes.client
import kubernetes.config
# Create logger, console handler and formatter
logger = logging.getLogger('OpenStack-Helm Mariadb')
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Set the formatter and add the handler
ch.setFormatter(formatter)
logger.addHandler(ch)
# Get the local hostname
local_hostname = socket.gethostname()
logger.info("This instance hostname: {0}".format(local_hostname))
# Setup k8s client credentials and check api version
kubernetes.config.load_incluster_config()
kubernetes_version = kubernetes.client.VersionApi().get_code().git_version
logger.info("Kubernetes API Version: {0}".format(kubernetes_version))
k8s_api_instance = kubernetes.client.CoreV1Api()
def check_env_var(env_var):
"""Check if an env var exists.
Keyword arguments:
    env_var -- the env var to check for the existence of
"""
if env_var in os.environ:
return True
else:
logger.critical("environment variable \"{0}\" not set".format(env_var))
sys.exit(1)
# Set some variables from env vars injected into the container
if check_env_var("STATE_CONFIGMAP"):
state_configmap_name = os.environ['STATE_CONFIGMAP']
logger.info("Will use \"{0}\" configmap for cluster state info".format(
state_configmap_name))
if check_env_var("POD_NAMESPACE"):
pod_namespace = os.environ['POD_NAMESPACE']
if check_env_var("DIRECT_SVC_NAME"):
direct_svc_name = os.environ['DIRECT_SVC_NAME']
if check_env_var("MARIADB_REPLICAS"):
mariadb_replicas = os.environ['MARIADB_REPLICAS']
if check_env_var("POD_NAME_PREFIX"):
pod_name_prefix = os.environ['POD_NAME_PREFIX']
if check_env_var("DISCOVERY_DOMAIN"):
discovery_domain = os.environ['DISCOVERY_DOMAIN']
if check_env_var("WSREP_PORT"):
wsrep_port = os.environ['WSREP_PORT']
if check_env_var("MYSQL_ROOT_PASSWORD"):
mysql_root_password = os.environ['MYSQL_ROOT_PASSWORD']
# Set some tuneables (all values are in seconds)
cluster_leader_ttl = 120
state_configmap_update_period = 10
default_sleep = 20
def ensure_state_configmap(pod_namespace, configmap_name, configmap_body):
"""Ensure the state configmap exists.
Keyword arguments:
pod_namespace -- the namespace to house the configmap
configmap_name -- the configmap name
configmap_body -- the configmap body
"""
try:
k8s_api_instance.read_namespaced_config_map(
name=configmap_name, namespace=pod_namespace)
return True
    except kubernetes.client.rest.ApiException:
k8s_api_instance.create_namespaced_config_map(
namespace=pod_namespace, body=configmap_body)
return False
def run_cmd_with_logging(popenargs,
logger,
stdout_log_level=logging.INFO,
stderr_log_level=logging.INFO,
**kwargs):
"""Run subprocesses and stream output to logger."""
child = subprocess.Popen(
popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
log_level = {
child.stdout: stdout_log_level,
child.stderr: stderr_log_level
}
def check_io():
ready_to_read = select.select([child.stdout, child.stderr], [], [],
1000)[0]
for io in ready_to_read:
line = io.readline()
logger.log(log_level[io], line[:-1])
    while child.poll() is None:  # keep checking stdout/stderr until the child exits
check_io()
check_io() # check again to catch anything after the process exits
return child.wait()
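# Example usage (illustrative only): stream the output of a one-off mysqld
# recovery run into our logger at INFO level:
#   run_cmd_with_logging(['mysqld', '--wsrep-recover'], logger)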
def stop_mysqld():
"""Stop mysqld, assuming pid file in default location."""
logger.info("Shutting down any mysqld instance if required")
mysqld_pidfile_path = "/var/lib/mysql/{0}.pid".format(local_hostname)
def is_pid_running(pid):
if os.path.isdir('/proc/{0}'.format(pid)):
return True
return False
def is_pid_mysqld(pid):
with open('/proc/{0}/comm'.format(pid), "r") as mysqld_pidfile:
comm = mysqld_pidfile.readlines()[0].rstrip('\n')
if comm.startswith('mysqld'):
return True
else:
return False
if os.path.isfile(mysqld_pidfile_path):
logger.info(
"Previous pid file found for mysqld, attempting to shut it down")
with open(mysqld_pidfile_path, "r") as mysqld_pidfile:
mysqld_pid = int(mysqld_pidfile.readlines()[0].rstrip('\n'))
if is_pid_running(mysqld_pid):
if is_pid_mysqld(mysqld_pid):
logger.info("pid from pidfile is mysqld")
os.kill(mysqld_pid, 15)
pid, status = os.waitpid(mysqld_pid, 0)
logger.info("Mysqld stopped: pid = {0}, "
"exit status = {1}".format(pid, status))
else:
logger.error(
"pidfile process is not mysqld, removing pidfile and panic"
)
os.remove(mysqld_pidfile_path)
sys.exit(1)
else:
logger.info(
"Mysqld was not running with pid {0}, going to remove stale "
"file".format(mysqld_pid))
os.remove(mysqld_pidfile_path)
else:
logger.debug("No previous pid file found for mysqld")
def mysqld_write_cluster_conf(mode='run'):
"""Write out dynamic cluster config.
Keyword arguments:
mode -- whether we are writing the cluster config for the cluster to 'run'
or 'bootstrap' (default 'run')
"""
logger.info("Setting up cluster config")
cluster_config = configparser.ConfigParser()
cluster_config['mysqld'] = {}
cluster_config_params = cluster_config['mysqld']
wsrep_cluster_members = []
for node in range(int(mariadb_replicas)):
node_hostname = "{0}-{1}".format(pod_name_prefix, node)
if local_hostname == node_hostname:
wsrep_node_address = "{0}.{1}:{2}".format(
node_hostname, discovery_domain, wsrep_port)
cluster_config_params['wsrep_node_address'] = wsrep_node_address
wsrep_node_name = "{0}.{1}".format(node_hostname, discovery_domain)
cluster_config_params['wsrep_node_name'] = wsrep_node_name
else:
addr = "{0}.{1}:{2}".format(node_hostname, discovery_domain,
wsrep_port)
wsrep_cluster_members.append(addr)
if wsrep_cluster_members and mode == 'run':
cluster_config_params['wsrep_cluster_address'] = "gcomm://{0}".format(
",".join(wsrep_cluster_members))
else:
cluster_config_params['wsrep_cluster_address'] = "gcomm://"
cluster_config_file = '/etc/mysql/conf.d/10-cluster-config.cnf'
logger.info(
"Writing out cluster config to: {0}".format(cluster_config_file))
with open(cluster_config_file, 'w') as configfile:
cluster_config.write(configfile)
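# For illustration only: with a hypothetical 3-replica cluster using pod name
# prefix "mariadb-server", discovery domain
# "mariadb-discovery.openstack.svc.cluster.local" and wsrep port 4567, running
# on pod "mariadb-server-0" in mode='run', the file written above would look
# roughly like:
#   [mysqld]
#   wsrep_node_address = mariadb-server-0.mariadb-discovery.openstack.svc.cluster.local:4567
#   wsrep_node_name = mariadb-server-0.mariadb-discovery.openstack.svc.cluster.local
#   wsrep_cluster_address = gcomm://mariadb-server-1.mariadb-discovery.openstack.svc.cluster.local:4567,mariadb-server-2.mariadb-discovery.openstack.svc.cluster.local:4567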
# Function to setup mysqld
def mysqld_bootstrap():
"""Boostrap the db if no data found in the 'bootstrap_test_dir'"""
logger.info("Boostrapping Mariadb")
mysql_data_dir = '/var/lib/mysql'
bootstrap_test_dir = "{0}/mysql".format(mysql_data_dir)
if not os.path.isdir(bootstrap_test_dir):
stop_mysqld()
mysqld_write_cluster_conf(mode='bootstrap')
run_cmd_with_logging([
'mysql_install_db', '--user=mysql',
"--datadir={0}".format(mysql_data_dir)
], logger)
template = (
"DELETE FROM mysql.user ;\n"
"CREATE OR REPLACE USER 'root'@'%' IDENTIFIED BY \'{0}\' ;\n"
"GRANT ALL ON *.* TO 'root'@'%' WITH GRANT OPTION ;\n"
"DROP DATABASE IF EXISTS test ;\n"
"FLUSH PRIVILEGES ;\n"
"SHUTDOWN ;".format(mysql_root_password))
        bootstrap_sql_file = tempfile.NamedTemporaryFile(suffix='.sql').name
        with open(bootstrap_sql_file, 'w') as f:
            f.write(template)
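        # Run mysqld bound to localhost only; the init file above creates the
        # root user and then issues SHUTDOWN, leaving an initialised datadir
        # behind for the clustered start that follows.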
run_cmd_with_logging([
'mysqld', '--bind-address=127.0.0.1',
"--init-file={0}".format(bootstrap_sql_file)
], logger)
os.remove(bootstrap_sql_file)
else:
logger.info("Skipping bootstrap as {0} directory is present".format(
bootstrap_test_dir))
def safe_update_configmap(configmap_dict, configmap_patch):
"""Update a configmap with locking.
Keyword arguments:
configmap_dict -- a dict representing the configmap to be patched
configmap_patch -- a dict containign the patch
"""
logger.debug("Safe Patching configmap")
#NOTE(portdirect): Explictly set the resource version we are patching to
# ensure nothing else has modified the confimap since we read it.
configmap_patch['metadata']['resourceVersion'] = configmap_dict[
'metadata']['resource_version']
try:
api_response = k8s_api_instance.patch_namespaced_config_map(
name=state_configmap_name,
namespace=pod_namespace,
body=configmap_patch)
return True
except kubernetes.client.rest.ApiException as error:
logger.error("Failed to set configmap: {0}".format(error))
return error
def set_configmap_annotation(key, value):
"""Update a configmap's annotations via patching.
Keyword arguments:
key -- the key to be patched
value -- the value to give the key
"""
logger.debug("Setting configmap annotation key={0} value={1}".format(
key, value))
configmap_dict = k8s_api_instance.read_namespaced_config_map(
name=state_configmap_name, namespace=pod_namespace).to_dict()
configmap_patch = {'metadata': {'annotations': {}}}
configmap_patch['metadata']['annotations'][key] = value
return safe_update_configmap(
configmap_dict=configmap_dict, configmap_patch=configmap_patch)
def set_configmap_data(key, value):
"""Update a configmap's data via patching.
Keyword arguments:
key -- the key to be patched
value -- the value to give the key
"""
logger.debug("Setting configmap data key={0} value={1}".format(key, value))
configmap_dict = k8s_api_instance.read_namespaced_config_map(
name=state_configmap_name, namespace=pod_namespace).to_dict()
configmap_patch = {'data': {}, 'metadata': {}}
configmap_patch['data'][key] = value
return safe_update_configmap(
configmap_dict=configmap_dict, configmap_patch=configmap_patch)
def get_configmap_value(key, type='data'):
"""Get a configmap's key's value.
Keyword arguments:
    key -- the key to retrieve the data from
    type -- the type of data to retrieve from the configmap; either 'data'
            or 'annotation' (default 'data')
"""
state_configmap = k8s_api_instance.read_namespaced_config_map(
name=state_configmap_name, namespace=pod_namespace)
state_configmap_dict = state_configmap.to_dict()
if type == 'data':
state_configmap_data = state_configmap_dict['data']
elif type == 'annotation':
state_configmap_data = state_configmap_dict['metadata']['annotations']
else:
logger.error(
"Unknown data type \"{0}\" reqested for retrival".format(type))
return False
if state_configmap_data and key in state_configmap_data:
return state_configmap_data[key]
else:
return None
def get_cluster_state():
"""Get the current cluster state from a configmap, creating the configmap
if it does not already exist.
"""
logger.info("Getting cluster state")
state = None
while state is None:
try:
state = get_configmap_value(
type='annotation',
key='openstackhelm.openstack.org/cluster.state')
logger.info(
"The cluster is currently in \"{0}\" state.".format(state))
        except kubernetes.client.rest.ApiException:
logger.info("The cluster configmap \"{0}\" does not exist.".format(
state_configmap_name))
initial_configmap_body = {
"apiVersion": "v1",
"kind": "ConfigMap",
"metadata": {
"name": state_configmap_name,
"annotations": {
"openstackhelm.openstack.org/cluster.state": "new"
}
},
"data": {}
}
ensure_state_configmap(
pod_namespace=pod_namespace,
configmap_name=state_configmap_name,
configmap_body=initial_configmap_body)
return state
def declare_myself_cluster_leader(ttl):
"""Declare the current pod as the cluster leader.
Keyword arguments:
ttl -- the ttl for the leader period
"""
logger.info("Declaring myself current cluster leader")
    leader_expiry_raw = datetime.utcnow() + timedelta(seconds=ttl)
leader_expiry = "{0}Z".format(leader_expiry_raw.isoformat("T"))
set_configmap_annotation(
key='openstackhelm.openstack.org/leader.node', value=local_hostname)
set_configmap_annotation(
key='openstackhelm.openstack.org/leader.expiry', value=leader_expiry)
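# Illustrative result, assuming this pod is "mariadb-server-0": the state
# configmap ends up annotated with something like:
#   openstackhelm.openstack.org/leader.node: mariadb-server-0
#   openstackhelm.openstack.org/leader.expiry: 2018-10-25T16:34:56.000000Z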
def deadmans_leader_election():
"""Run a simplisic deadmans leader election."""
leader_node = get_configmap_value(
type='annotation', key='openstackhelm.openstack.org/leader.node')
leader_expiry = get_configmap_value(
type='annotation', key='openstackhelm.openstack.org/leader.expiry')
if iso8601.parse_date(leader_expiry).replace(
tzinfo=None) < datetime.utcnow().replace(tzinfo=None):
logger.info("Current cluster leader has expired")
        declare_myself_cluster_leader(ttl=cluster_leader_ttl)
elif local_hostname == leader_node:
logger.info("Renewing cluster leader lease")
        declare_myself_cluster_leader(ttl=cluster_leader_ttl)
def get_grastate_val(key):
"""Extract data from grastate.dat.
Keyword arguments:
key -- the key to extract the value of
"""
logger.debug("Reading grastate.dat key={0}".format(key))
with open("/var/lib/mysql/grastate.dat", "r") as myfile:
grastate_raw = map(lambda s: s.strip(), myfile.readlines())
return [i for i in grastate_raw
if i.startswith("{0}:".format(key))][0].split(':')[1].strip()
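# For reference, a typical grastate.dat looks like this (values illustrative):
#   # GALERA saved state
#   version: 2.1
#   uuid:    3b1d7b3e-0000-0000-0000-000000000000
#   seqno:   42
#   safe_to_bootstrap: 1
# so get_grastate_val(key='seqno') would return '42' for the file above.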
def set_grastate_val(key, value):
"""Set values in grastate.dat.
Keyword arguments:
key -- the key to set the value of
value -- the value to set the key to
"""
logger.debug("Updating grastate.dat key={0} value={1}".format(key, value))
with open("/var/lib/mysql/grastate.dat", "r") as sources:
lines = sources.readlines()
for line_num, line_content in enumerate(lines):
if line_content.startswith("{0}:".format(key)):
line_content = "{0}: {1}\n".format(key, value)
lines[line_num] = line_content
with open("/var/lib/mysql/grastate.dat", "w") as sources:
for line in lines:
sources.write(line)
def update_grastate_configmap():
"""Update state configmap with grastate.dat info."""
while not os.path.exists('/var/lib/mysql/grastate.dat'):
time.sleep(1)
logger.info("Updating grastate configmap")
grastate = dict()
grastate['version'] = get_grastate_val(key='version')
grastate['uuid'] = get_grastate_val(key='uuid')
grastate['seqno'] = get_grastate_val(key='seqno')
grastate['safe_to_bootstrap'] = get_grastate_val(key='safe_to_bootstrap')
grastate['sample_time'] = "{0}Z".format(datetime.utcnow().isoformat("T"))
    for grastate_key, grastate_value in grastate.items():
configmap_key = "{0}.{1}".format(grastate_key, local_hostname)
if get_configmap_value(
type='data', key=configmap_key) != grastate_value:
set_configmap_data(key=configmap_key, value=grastate_value)
def update_grastate_on_restart():
"""Update the grastate.dat on node restart."""
logger.info("Updating grastate info for node")
if get_grastate_val(key='seqno') == '-1':
logger.info(
"Node shutdown was not clean, getting position via wsrep-recover")
def recover_wsrep_position():
"""Extract recoved wsrep position from uncleanly exited node."""
wsrep_recover = subprocess.Popen(
[
'mysqld', '--bind-address=127.0.0.1',
'--wsrep_cluster_address=gcomm://', '--wsrep-recover'
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out, err = wsrep_recover.communicate()
for item in err.split("\n"):
if "WSREP: Recovered position:" in item:
line = item.strip().split()
wsrep_rec_pos = line[-1].split(':')[-1]
return wsrep_rec_pos
set_grastate_val(key='seqno', value=recover_wsrep_position())
else:
logger.info("Node shutdown was clean, using grastate.dat")
update_grastate_configmap()
def check_for_active_nodes(endpoints_name=direct_svc_name,
namespace=pod_namespace):
"""Check K8s endpoints to see if there are active Mariadb Instances.
Keyword arguments:
endpoints_name -- endpoints to check for active backends
(default direct_svc_name)
namespace -- namespace to check for endpoints (default pod_namespace)
"""
logger.info("Checking for active nodes")
    endpoints = k8s_api_instance.read_namespaced_endpoints(
        name=endpoints_name, namespace=namespace)
endpoints_dict = endpoints.to_dict()
addresses_index = [
i for i, s in enumerate(endpoints_dict['subsets']) if 'addresses' in s
][0]
active_endpoints = endpoints_dict['subsets'][addresses_index]['addresses']
if active_endpoints and len(active_endpoints) >= 1:
return True
else:
return False
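# The endpoints object inspected above looks roughly like this once converted
# to a dict (illustrative, trimmed to the fields we use):
#   {'subsets': [{'addresses': [{'ip': '192.168.0.5'}],
#                 'ports': [{'port': 3306}]}]}
# i.e. we return True as soon as any subset carries at least one ready address.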
def check_if_cluster_data_is_fresh():
"""Check if the state_configmap is both current and reasonably stable."""
logger.info("Checking to see if cluster data is fresh")
state_configmap = k8s_api_instance.read_namespaced_config_map(
name=state_configmap_name, namespace=pod_namespace)
state_configmap_dict = state_configmap.to_dict()
sample_times = dict()
    for key, value in state_configmap_dict['data'].items():
keyitems = key.split('.')
key = keyitems[0]
node = keyitems[1]
if key == 'sample_time':
sample_times[node] = value
sample_time_ok = True
    for key, value in sample_times.items():
sample_time = iso8601.parse_date(value).replace(tzinfo=None)
sample_cutoff_time = datetime.utcnow().replace(
tzinfo=None) - timedelta(seconds=20)
if not sample_time >= sample_cutoff_time:
logger.info(
"The data we have from the cluster is too old to make a "
"decision for node {0}".format(key))
sample_time_ok = False
else:
logger.info(
"The data we have from the cluster is ok for node {0}".format(
key))
return sample_time_ok
def check_if_i_lead():
"""Check on full restart of cluster if this node should lead the cluster
reformation."""
logger.info("Checking to see if I lead the cluster for reboot")
    # As we sample on the update period, we sample for a full cluster
    # leader election period as a simplistic way of ensuring nodes are
    # reliably checking in following a full restart of the cluster.
count = cluster_leader_ttl / state_configmap_update_period
counter = 0
while counter <= count:
if check_if_cluster_data_is_fresh():
counter += 1
else:
counter = 0
time.sleep(state_configmap_update_period)
    logger.info(
        "Cluster info has been up to date {0} times out of the required "
        "{1}".format(counter, count))
state_configmap = k8s_api_instance.read_namespaced_config_map(
name=state_configmap_name, namespace=pod_namespace)
state_configmap_dict = state_configmap.to_dict()
seqnos = dict()
    for key, value in state_configmap_dict['data'].items():
keyitems = key.split('.')
key = keyitems[0]
node = keyitems[1]
if key == 'seqno':
seqnos[node] = value
    # Compare seqnos numerically; a plain string max() would misorder
    # multi-digit values.
    max_seqno = max(seqnos.values(), key=int)
    max_seqno_node = sorted(
        [k for k, v in seqnos.items() if v == max_seqno])[0]
if local_hostname == max_seqno_node:
logger.info("I lead the cluster")
return True
else:
logger.info("{0} leads the cluster".format(max_seqno_node))
return False
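# Worked example with hypothetical data: given configmap entries
#   seqno.mariadb-server-0 = '42'
#   seqno.mariadb-server-1 = '45'
#   seqno.mariadb-server-2 = '45'
# the highest seqno is '45'; the sorted() tie-break picks the alphabetically
# first of the tied nodes, so only "mariadb-server-1" returns True here.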
def monitor_cluster():
"""Function to kick off grastate configmap updating thread"""
while True:
update_grastate_configmap()
time.sleep(state_configmap_update_period)
# Setup the thread for the cluster monitor
monitor_cluster_thread = threading.Thread(target=monitor_cluster, args=())
monitor_cluster_thread.daemon = True
def launch_cluster_monitor():
"""Launch grastate configmap updating thread"""
    if not monitor_cluster_thread.is_alive():
monitor_cluster_thread.start()
def leader_election():
"""Function to kick off leader election thread"""
while True:
deadmans_leader_election()
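        # Sleep for half the TTL so the lease is renewed well before it can
        # expire while this node still holds it.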
time.sleep(cluster_leader_ttl / 2)
# Setup the thread for the leader election
leader_election_thread = threading.Thread(target=leader_election, args=())
leader_election_thread.daemon = True
def launch_leader_election():
"""Launch leader election thread"""
    if not leader_election_thread.is_alive():
leader_election_thread.start()
def run_mysqld(cluster='existing'):
"""Launch the mysqld instance for the pod.
Keyword arguments:
    cluster -- whether we are going to form a 'new' cluster or join an
               'existing' one (default 'existing')
"""
stop_mysqld()
mysqld_write_cluster_conf(mode='run')
launch_leader_election()
launch_cluster_monitor()
mysqld_cmd = ['mysqld']
if cluster == 'new':
mysqld_cmd.append('--wsrep-new-cluster')
run_cmd_with_logging(mysqld_cmd, logger)
def mysqld_reboot():
"""Reboot a mysqld cluster."""
    declare_myself_cluster_leader(ttl=cluster_leader_ttl)
set_grastate_val(key='safe_to_bootstrap', value='1')
run_mysqld(cluster='new')
def sigterm_shutdown(signum, frame):
    """Shutdown the instance of mysqld on receiving SIGTERM."""
logger.info("Got a sigterm from the container runtime, time to go.")
stop_mysqld()
# Register the signal to the handler
signal.signal(signal.SIGTERM, sigterm_shutdown)
# Main logic: pick how to launch mysqld from the recorded cluster state
# ('new' = first ever boot, 'init' = cluster is still forming,
# 'live' = the cluster has run before and we are restarting or rejoining).
if get_cluster_state() == 'new':
set_configmap_annotation(
key='openstackhelm.openstack.org/cluster.state', value='init')
    declare_myself_cluster_leader(ttl=cluster_leader_ttl)
launch_leader_election()
mysqld_bootstrap()
update_grastate_configmap()
set_configmap_annotation(
key='openstackhelm.openstack.org/cluster.state', value='live')
run_mysqld(cluster='new')
elif get_cluster_state() == 'init':
logger.info("Waiting for cluster to start running")
while not get_cluster_state() == 'live':
time.sleep(default_sleep)
while not check_for_active_nodes():
time.sleep(default_sleep)
launch_leader_election()
run_mysqld()
elif get_cluster_state() == 'live':
logger.info("Cluster has been running starting restore/rejoin")
if not mariadb_replicas > 1:
logger.info(
"There is only a single node in this cluster, we are good to go")
update_grastate_on_restart()
mysqld_reboot()
else:
if check_for_active_nodes():
logger.info(
"There are currently running nodes in the cluster, we can "
"join them")
run_mysqld()
else:
logger.info("This cluster has lost all running nodes, we need to "
"determine the new lead node")
update_grastate_on_restart()
launch_leader_election()
launch_cluster_monitor()
if check_if_i_lead():
logger.info("I won the ability to reboot the cluster")
mysqld_reboot()
else:
logger.info(
"Waiting for the lead node to come online before joining "
"it")
while not check_for_active_nodes():
time.sleep(default_sleep)
run_mysqld()
else:
logger.critical("Dont understand cluster state, exiting with error status")
sys.exit(1)
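For operators, the cluster state this script maintains can also be inspected
out-of-band with the same Kubernetes client API the script itself uses. A
minimal sketch, assuming in-cluster credentials and a release named "mariadb"
in the "openstack" namespace (both hypothetical; the chart names the state
configmap "<release>-mariadb-state"):

import kubernetes.client
import kubernetes.config

kubernetes.config.load_incluster_config()
api = kubernetes.client.CoreV1Api()
cm = api.read_namespaced_config_map(
    name='mariadb-mariadb-state', namespace='openstack')
# The script records cluster state and leadership as annotations.
print(cm.metadata.annotations.get('openstackhelm.openstack.org/cluster.state'))
print(cm.metadata.annotations.get('openstackhelm.openstack.org/leader.node'))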


@@ -1,190 +0,0 @@
#!/bin/bash
{{/*
Copyright 2017 The Openstack-Helm Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
set -xe
# MariaDB 10.2.13 has a regression which breaks clustering, patch
# around this for now
if /usr/sbin/mysqld --version | grep --silent 10.2.13 ; then
sed -i 's^LSOF_OUT=.*^LSOF_OUT=$(lsof -sTCP:LISTEN -i TCP:${PORT} -a -c nc -c socat -F c 2> /dev/null || :)^' /usr/bin/wsrep_sst_xtrabackup-v2
fi
# Bootstrap database
CLUSTER_INIT_ARGS=""
CLUSTER_CONFIG_PATH=/etc/mysql/conf.d/10-cluster-config.cnf
function exitWithManualRecovery() {
UUID=$(sed -e 's/^.*uuid:[\ ,\t]*//' -e 'tx' -e 'd' -e ':x' /var/lib/mysql/grastate.dat)
SEQNO=$(sed -e 's/^.*seqno:[\ ,\t]*//' -e 'tx' -e 'd' -e ':x' /var/lib/mysql/grastate.dat)
cat >/dev/stderr <<EOF
**********************************************************
* MANUAL RECOVERY ACTION REQUIRED *
**********************************************************
All cluster members are down and grastate.dat indicates that it's not
safe to start the cluster from this node. If you see this message on
all nodes, you have to do a manual recovery by following these steps:
a) Find the node with the highest WSREP seq#:
POD ${PODNAME} uuid: ${UUID} seq: ${SEQNO}
If you see uuid 00000000-0000-0000-0000-000000000000 with
seq -1, the node crashed during DDL.
If seq is -1 you will find a DETECTED CRASH message
on your log. Check the output from InnoDB for the last
transaction id available.
b) Set environment variable FORCE_RECOVERY=<NAME OF POD>
to force bootstrapping from the specified node.
Remember to remove FORCE_RECOVERY after your nodes
are fully recovered! You may lose data otherwise.
You can ignore this message and wait for the next restart if at
least one node started without errors.
EOF
exit 1
}
# Construct cluster config
MEMBERS=""
for i in $(seq 1 ${MARIADB_REPLICAS}); do
if [ "$i" -eq "1" ]; then
NUM="0"
else
NUM="$(expr $i - 1)"
fi
CANDIDATE_POD="${POD_NAME_PREFIX}-$NUM.$(hostname -d)"
if [ "x${CANDIDATE_POD}" != "x${POD_NAME}.$(hostname -d)" ]; then
if [ -n "${MEMBERS}" ]; then
MEMBERS+=,
fi
MEMBERS+="${CANDIDATE_POD}:${WSREP_PORT}"
fi
done
echo "Writing cluster config for ${POD_NAME} to ${CLUSTER_CONFIG_PATH}"
cat > ${CLUSTER_CONFIG_PATH} <<EOF
[mysqld]
wsrep_cluster_address="gcomm://${MEMBERS}"
wsrep_node_address=${POD_IP}
wsrep_node_name=${POD_NAME}.$(hostname -d)
EOF
if [ ! -z "${FORCE_RECOVERY// }" ]; then
cat >/dev/stderr <<EOF
**********************************************************
* !!! FORCE_RECOVERY WARNING !!! *
**********************************************************
POD is starting with FORCE_RECOVERY defined. Remember to unset this
variable after recovery! You may end up in recovering from a node
with old data on a crash!
You have been warned ;-)
**********************************************************
* FORCE_RECOVERY WARNING *
**********************************************************
EOF
fi
if [ -d /var/lib/mysql/mysql -a -f /var/lib/mysql/grastate.dat ]; then
# Node already initialized
if [ "$(sed -e 's/^.*seqno:[\ ,\t]*//' -e 'tx' -e 'd' -e ':x' /var/lib/mysql/grastate.dat)" = "-1" ]; then
cat >/dev/stderr <<EOF
**********************************************************
* DETECTED CRASH *
**********************************************************
Trying to recover from a previous crash by running with wsrep-recover...
EOF
mysqld --wsrep_cluster_address=gcomm:// --wsrep-recover
fi
echo "Check if we can find a cluster memeber."
if ! mysql --defaults-file=/etc/mysql/admin_user.cnf \
--connect-timeout 2 \
-e 'select 1'; then
# No other nodes are running
if [ -z "${FORCE_RECOVERY// }" -a "$(sed -e 's/^.*safe_to_bootstrap:[\ ,\t]*//' -e 'tx' -e 'd' -e ':x' /var/lib/mysql/grastate.dat)" = "1" ]; then
echo 'Bootstrapping from this node.'
CLUSTER_INIT_ARGS=--wsrep-new-cluster
elif [ "x${FORCE_RECOVERY}x" = "x${POD_NAME}x" ]; then
echo 'Forced recovery bootstrap from this node.'
CLUSTER_INIT_ARGS=--wsrep-new-cluster
cp -f /var/lib/mysql/grastate.dat /var/lib/mysql/grastate.bak
cat >/var/lib/mysql/grastate.dat <<EOF
`grep -v 'safe_to_bootstrap:' /var/lib/mysql/grastate.bak`
safe_to_bootstrap: 1
EOF
chown -R mysql:mysql /var/lib/mysql/grastate.dat
else
exitWithManualRecovery
fi
fi
elif [ ! -d /var/lib/mysql/mysql -o "x${FORCE_BOOTSTRAP}" = "xtrue" ]; then
if [ "x${POD_NAME}" = "x${POD_NAME_PREFIX}-0" ]; then
echo No data found for pod 0
if [ "x${FORCE_BOOTSTRAP}" = "xtrue" ]; then
echo 'force_bootstrap set, so will force-initialize node 0.'
CLUSTER_INIT_ARGS=--wsrep-new-cluster
CLUSTER_BOOTSTRAP=true
elif ! mysql --defaults-file=/etc/mysql/admin_user.cnf \
--connect-timeout 2 \
-e 'select 1'; then
echo 'No other nodes found, so will initialize cluster.'
CLUSTER_INIT_ARGS=--wsrep-new-cluster
CLUSTER_BOOTSTRAP=true
else
echo 'Found other live nodes, will attempt to join them.'
mkdir /var/lib/mysql/mysql
fi
else
echo 'Not pod 0, so will avoid upstream database initialization.'
mkdir /var/lib/mysql/mysql
fi
chown -R mysql:mysql /var/lib/mysql
fi
if [ "x${CLUSTER_BOOTSTRAP}" = "xtrue" ]; then
mysql_install_db --user=mysql --datadir=/var/lib/mysql
cat > "${BOOTSTRAP_FILE}" << EOF
DELETE FROM mysql.user ;
CREATE OR REPLACE USER 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}' ;
GRANT ALL ON *.* TO 'root'@'%' WITH GRANT OPTION ;
DROP DATABASE IF EXISTS test ;
FLUSH PRIVILEGES ;
SHUTDOWN ;
EOF
mysqld ${CLUSTER_INIT_ARGS} --bind-address=127.0.0.1 --init-file=${BOOTSTRAP_FILE}
rm -f "${BOOTSTRAP_FILE}"
fi
exec mysqld ${CLUSTER_INIT_ARGS}


@@ -32,8 +32,8 @@ data:
{{ tuple "bin/_mariadb-ingress-error-pages.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
readiness.sh: |
{{ tuple "bin/_readiness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
start.sh: |
{{ tuple "bin/_start.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
start.py: |
{{ tuple "bin/_start.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
stop.sh: |
{{ tuple "bin/_stop.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
{{- end }}


@@ -17,9 +17,53 @@ limitations under the License.
{{- if .Values.manifests.statefulset }}
{{- $envAll := . }}
{{- $serviceAccountName := "mariadb" }}
{{- $serviceAccountName := printf "%s-%s" .Release.Name "mariadb" }}
{{ tuple $envAll "mariadb" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
name: {{ $serviceAccountName }}
namespace: {{ $envAll.Release.Namespace }}
rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- create
- apiGroups:
- ""
resourceNames:
- {{ printf "%s-%s" .Release.Name "mariadb-state" | quote }}
resources:
- configmaps
verbs:
- get
- patch
- apiGroups:
- ""
resourceNames:
- {{ tuple "oslo_db" "direct" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
resources:
- endpoints
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
name: {{ $serviceAccountName }}
namespace: {{ $envAll.Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ $serviceAccountName }}
subjects:
- kind: ServiceAccount
name: {{ $serviceAccountName }}
namespace: {{ $envAll.Release.Namespace }}
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
@@ -27,6 +71,8 @@ metadata:
name: {{ tuple "oslo_db" "direct" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
annotations:
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
labels:
{{ tuple $envAll "mariadb" "server" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
spec:
@@ -40,6 +86,10 @@ spec:
metadata:
labels:
{{ tuple $envAll "mariadb" "server" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
annotations:
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
spec:
serviceAccountName: {{ $serviceAccountName }}
affinity:
@@ -68,26 +118,22 @@ spec:
{{ tuple $envAll "mariadb" | include "helm-toolkit.snippets.image" | indent 10 }}
{{ tuple $envAll $envAll.Values.pod.resources.server | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
env:
- name: POD_IP
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: FORCE_BOOTSTRAP
value: {{ .Values.force_bootstrap | quote }}
- name: FORCE_RECOVERY
value: {{ .Values.force_recovey | quote }}
- name: BOOTSTRAP_FILE
value: "/tmp/bootstrap.sql"
fieldPath: metadata.namespace
- name: MARIADB_REPLICAS
value: {{ .Values.pod.replicas.server | quote }}
- name: WSREP_PORT
value: {{ tuple "oslo_db" "direct" "wsrep" . | include "helm-toolkit.endpoints.endpoint_port_lookup" | quote }}
- name: POD_NAME_PREFIX
value: {{ tuple "oslo_db" "direct" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
- name: DISCOVERY_DOMAIN
value: {{ tuple "oslo_db" "discovery" . | include "helm-toolkit.endpoints.hostname_fqdn_endpoint_lookup" }}
- name: DIRECT_SVC_NAME
value: {{ tuple "oslo_db" "direct" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
- name: WSREP_PORT
value: {{ tuple "oslo_db" "direct" "wsrep" . | include "helm-toolkit.endpoints.endpoint_port_lookup" | quote }}
- name: STATE_CONFIGMAP
value: {{ printf "%s-%s" .Release.Name "mariadb-state" | quote }}
- name: MYSQL_ROOT_PASSWORD
valueFrom:
secretKeyRef:
@@ -101,7 +147,7 @@ spec:
protocol: TCP
containerPort: {{ tuple "oslo_db" "direct" "wsrep" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
command:
- /tmp/start.sh
- /tmp/start.py
lifecycle:
preStop:
exec:
@@ -118,8 +164,8 @@ spec:
- name: mycnfd
mountPath: /etc/mysql/conf.d
- name: mariadb-bin
mountPath: /tmp/start.sh
subPath: start.sh
mountPath: /tmp/start.py
subPath: start.py
readOnly: true
- name: mariadb-bin
mountPath: /tmp/stop.sh


@@ -21,10 +21,7 @@ release_group: null
images:
tags:
# NOTE: if you update from 10.2.13 please look at
# https://review.openstack.org/#/q/Ifd09d7effe7d382074ca9e6678df36bdd4bce0af
# and check whether it's still needed
mariadb: docker.io/mariadb:10.2.18
mariadb: docker.io/openstackhelm/mariadb:10.2.18
ingress: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.9.0
error_pages: gcr.io/google_containers/defaultbackend:1.0
prometheus_create_mysql_user: docker.io/mariadb:10.2.13
@@ -61,8 +58,8 @@ pod:
topologyKey:
default: kubernetes.io/hostname
replicas:
server: 1
ingress: 1
server: 3
ingress: 2
error_page: 1
prometheus_mysql_exporter: 1
lifecycle:
@@ -169,8 +166,6 @@ volume:
class_name: general
size: 5Gi
conf:
ingress: null


@@ -0,0 +1,22 @@
FROM docker.io/mariadb@sha256:d4cf9fbdf33a2940ca35c653bf2b702cbaed0bff87ade8c3e3ee9eab81b38b27
#FROM docker.io/mariadb:10.2.18
RUN set -ex ;\
apt-get update ;\
apt-get upgrade -y ;\
apt-get install -y --no-install-recommends \
python-pip ;\
pip --no-cache-dir install --upgrade pip ;\
hash -r ;\
pip --no-cache-dir install --upgrade setuptools ;\
pip --no-cache-dir install --upgrade \
configparser \
iso8601 \
kubernetes ;\
apt-get clean -y ;\
rm -rf \
/var/cache/debconf/* \
/var/lib/apt/lists/* \
/var/log/* \
/tmp/* \
/var/tmp/*


@@ -0,0 +1,41 @@
MariaDB Container
=================

This container builds an image with MariaDB for use with OpenStack-Helm.

Instructions
------------

OS Specific Host setup:
~~~~~~~~~~~~~~~~~~~~~~~

Ubuntu:
^^^^^^^

From a freshly provisioned Ubuntu 16.04 LTS host run:

.. code:: bash

    sudo apt-get update -y
    sudo apt-get install -y \
      docker.io \
      git

Build the MariaDB Image
~~~~~~~~~~~~~~~~~~~~~~~

A known good image is published to Docker Hub on a fairly regular basis, but
if you wish to build your own image, run the following from the root of the
OpenStack-Helm repo:

.. code:: bash

    sudo docker build \
      --network=host \
      --force-rm \
      --pull \
      --no-cache \
      --file=./tools/images/mariadb/Dockerfile \
      -t docker.io/openstackhelm/mariadb:10.2.18 \
      tools/images/mariadb
    sudo docker push docker.io/openstackhelm/mariadb:10.2.18