From 42e57118c345bcf9372e4f898b341032242ac62a Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Fri, 5 Apr 2019 09:32:55 -0400 Subject: [PATCH] Add dbmon ocf script for containerized mariadb on AIO-DX On a two-node system the openstack-helm chart for mariadb has issues. If you run with a single replica then your failover times are very long due to internal timeouts in kubernetes which prevent accessing the backing volume on the newly-active node. At the same time, you can't run a "garbd" pod the way we do on a full lab configuration because there is no third node to run it on. The only viable option we've found is to trigger something to explicitly tell the mariadb pod on the active node to bootstrap a new primary cluster if it loses quorum due to the other mariadb pod going away unexpectedly. Accordingly, this commit creates a new "dbmon" OCF script which behaves basically as follows: start -- return $OCF_SUCCESS stop -- return $OCF_NOT_RUNNING standby -- return $OCF_SUCCESS or $OCF_NOT_RUNNING depending on whether mariadb on this node is a member of the primary cluster active -- if mariadb on this node is not a member of the primary cluster then tell it to bootstrap a new primary cluster. Then check again and return $OCF_SUCCESS or $OCF_NOT_RUNNING depending on whether mariadb on this node is a member of the primary cluster monitor -- If mariadb on this node is a member of the primary cluster then return $OCF_RUNNING_MASTER on the active controller and $OCF_SUCCESS on the standby controller. If mariadb is not a member of the primary cluster return $OCF_NOT_RUNNING. There are a few complicating factors. If openstack application or mariadb chart not installed then treat it like being a member of the primary cluster. If the mariadb pod is still initializing treat it like not being a member of the primary cluster. If we're in a standard lab (with garbd running on a compute node) then don't actually tell mariadb to bootstrap a new primary cluster but just report whether it's a member of the primary cluster or not. Story: 2004712 Task: 30410 Change-Id: I8d91dc13cb9a9adb7f7a7a95faadad4339ddb466 Signed-off-by: Chris Friesen --- openstack/stx-ocf-scripts/src/ocf/dbmon | 339 ++++++++++++++++++++++++ 1 file changed, 339 insertions(+) create mode 100644 openstack/stx-ocf-scripts/src/ocf/dbmon diff --git a/openstack/stx-ocf-scripts/src/ocf/dbmon b/openstack/stx-ocf-scripts/src/ocf/dbmon new file mode 100644 index 00000000..0a95a07d --- /dev/null +++ b/openstack/stx-ocf-scripts/src/ocf/dbmon @@ -0,0 +1,339 @@ +#!/bin/sh +# +# +# OpenStack mariadb monitor (dbmon) +# +# Description: Monitors local OpenStack mariadb pod as an HA resource +# +# Authors: Chris Friesen +# +# Support: starlingx-discuss@lists.starlingx.io +# License: Apache Software License (ASL) 2.0 +# +# Copyright (c) 2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +# +# See usage() function below for more details ... +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + + +export KUBECONFIG=/etc/kubernetes/admin.conf + +####################################################################### + +usage() { + cat < + + +1.0 + + +Resource agent for the OpenStack mariadb monitor (dbmon) +This tries to ensure that the active controller node always has access +to a functional OpenStack database. + +Monitors the OpenStack mariadb pod + + + + + + + + + + + + + +END +} + +####################################################################### +# Functions invoked by resource manager actions + +dbmon_validate() { + local rc + + check_binary base64 + check_binary grep + check_binary cut + check_binary head + check_binary kubectl + + true +} + +active_controller() { + # "facter" will return "true" or "false" + eval $(FACTERLIB=/usr/share/puppet/modules/platform/lib/facter/ facter is_controller_active) +} + +check_has_garbd_chart() { + helm list osh-openstack-garbd|grep -q osh-openstack-garbd +} + + +debuginfo() { + # Log some information on what's preventing us from getting the DB status + + APP_STATUS='uninstalled' + + # Check whether kubectl is working + kubectl get node ${HOSTNAME} &> /dev/null + if [ $? -ne 0 ]; then + ocf_log info "kubectl isn't working." + STATUS="Primary" + return + fi + + # Check whether the openstack namespace exists. This is a proxy + # for whether the application is installed. + kubectl get ns openstack &> /dev/null + if [ $? -ne 0 ]; then + ocf_log info "Openstack namespace doesn't exist." + # TODO: this is a hack until we can dynamically provision services in sm. + # Once that's available this should just return. + STATUS="Primary" + return + fi + + # TODO: this is also a hack until we can dynamically provision services in sm. + # Check whether the helm chart is deployed. If not, we don't want to + # cause an alarm to be raised. The helm list command returns 0 even if the + # specified chart isn't found. + helm list osh-openstack-mariadb|grep -qe "osh-openstack-mariadb.*DEPLOYED" + if [ $? -ne 0 ]; then + ocf_log info "osh-openstack-mariadb helm chart not installed." + STATUS="Primary" + return + fi + + APP_STATUS='initializing' + + # Check that mariadb is running in the pod. If it's not, we could be still + # initializing. + INFO=`kubectl exec -n openstack -it ${PODNAME} -- ps -C mysqld` + if [ $? -ne 0 ]; then + ocf_log info "Mysqld not running in mariadb container." + # Setting STATUS so the audit will return $OCF_NOT_RUNNING instead + # of $OCF_ERR_GENERIC + STATUS="non-Primary" + fi +} + +get_status() { + STATUS=`kubectl exec -n openstack -it ${PODNAME} -- mysql -sN -u root \ + --password=${DBPASSWD} -e " select VARIABLE_VALUE from \ + information_schema.GLOBAL_STATUS where VARIABLE_NAME='wsrep_cluster_status';"` + + if [ $? -ne 0 ]; then + # something is wrong, can't query status + ocf_log info "Unknown error trying to query mariadb status" + return $OCF_ERR_GENERIC + fi + + # In my testing there is a CR/LF at the end of STATUS, remove it. + STATUS=${STATUS%%[[:space:]]} +} + +get_pod_and_status() { + + # Get name of local mariadb pod + PODNAME=`kubectl -n openstack get pod --field-selector spec.nodeName=${HOSTNAME} \ + -l application=mariadb,component=server -o=jsonpath='{.items[0].metadata.name'}` + if [ $? -ne 0 ]; then + ocf_log info "Error getting mariadb server pod name on this node." + return $OCF_ERR_GENERIC + fi + + # If this doesn't exist, the return code will still be zero + # but the variable will be blank. + DBPASSWD_ENCODED=`kubectl -n openstack get secret mariadb-db-root-password \ + -o=jsonpath='{.data.MYSQL_ROOT_PASSWORD'}` + if [ "${DBPASSWD_ENCODED}" = '' ]; then + ocf_log info "Unable to get mariadb password. Exiting." + return $OCF_ERR_GENERIC + fi + + DBPASSWD=`echo ${DBPASSWD_ENCODED}|base64 -d` + if [ $? -ne 0 ]; then + ocf_log info "Unable to decode mariadb password. Exiting." + return $OCF_ERR_GENERIC + fi + + get_status +} + +dbmon_status() { + get_pod_and_status + if [ $? -ne 0 ]; then + # The call to debuginfo may write to STATUS, once sysinv can dynamically + # add services that shouldn't be necessary any more. + debuginfo + fi + + if [ "${STATUS}" == "Primary" ]; then + if active_controller ; then + return $OCF_RUNNING_MASTER + else + return $OCF_SUCCESS + fi + elif [ "${STATUS}" == "non-Primary" ]; then + ocf_log info "mariadb server is running but not primary." + return $OCF_NOT_RUNNING + else + ocf_log info "unexpected mariadb status of ${STATUS}" + return $OCF_ERR_GENERIC + fi +} + +dbmon_monitor() { + dbmon_status +} + +dbmon_standby() { + get_pod_and_status + if [ $? -ne 0 ]; then + # something is wrong, can't query status + debuginfo + ocf_log info "unable to query status, exiting" + # If the app isn't installed yet then there's no point in complaining. + if [ $APP_STATUS == 'uninstalled' ]; then + return $OCF_SUCCESS + else + return $OCF_NOT_RUNNING + fi + fi + + if [ "${STATUS}" == "non-Primary" ]; then + return $OCF_NOT_RUNNING + elif [ "${STATUS}" == "Primary" ]; then + return $OCF_SUCCESS + else + ocf_log info "unexpected mariadb status of ${STATUS}" + return $OCF_ERR_GENERIC + fi +} + +dbmon_active() { + + get_pod_and_status + if [ $? -ne 0 ]; then + # something is wrong, can't query status + debuginfo + ocf_log info "unable to query status, exiting" + # If the app isn't installed yet we don't want to hold up the + # controller going active. + if [ $APP_STATUS == 'uninstalled' ]; then + return $OCF_SUCCESS + else + return $OCF_NOT_RUNNING + fi + fi + + if [ "${STATUS}" == "non-Primary" ]; then + ocf_log info "mariadb server is running but not primary" + + # If we're on a standard lab, exit before telling the server to become primary + check_has_garbd_chart + if [ $? -eq 0 ]; then + ocf_log info "in standard lab, exiting" + return $OCF_NOT_RUNNING + fi + + # tell mysql to become primary + kubectl exec -n openstack -it ${PODNAME} -- mysql -u root --password=${DBPASSWD} \ + -e "SET GLOBAL wsrep_provider_options='pc.bootstrap=YES';" + if [ $? -ne 0 ]; then + ocf_log info "Error telling mysql to become primary." + return $OCF_NOT_RUNNING + fi + + # wait a bit then check the status + sleep 1 + ocf_log info "checking status after bootstrapping" + get_status + if [ $? -ne 0 ]; then + # something is wrong, can't query status + debuginfo + ocf_log info "unable to query status, exiting" + return $OCF_NOT_RUNNING + fi + if [ "${STATUS}" == "non-Primary" ]; then + ocf_log info "mariadb server is running but still not primary" + return $OCF_NOT_RUNNING + fi + fi + + # At this point status is either Primary or something unexpected + if [ "${STATUS}" == "Primary" ]; then + return $OCF_SUCCESS + else + ocf_log info "unexpected mariadb status of ${STATUS}" + return $OCF_ERR_GENERIC + fi +} + +dbmon_start() { + return $OCF_SUCCESS +} + +dbmon_stop() { + return $OCF_NOT_RUNNING +} + +####################################################################### + +case "$1" in + meta-data) meta_data + exit $OCF_SUCCESS;; + usage|help) usage + exit $OCF_SUCCESS;; +esac + +# Anything except meta-data and help must pass validation +dbmon_validate || exit $? + +# What kind of method was invoked? +case "$1" in + start) dbmon_start;; + stop) dbmon_stop;; + monitor) dbmon_monitor;; + active) dbmon_active;; + standby) dbmon_standby;; + validate-all) ;; + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +