#!/bin/bash
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# See usage() function below for more details ...
#
# OCF instance parameters:
#   OCF_RESKEY_service_name
#   OCF_RESKEY_binary
#   OCF_RESKEY_config
#   OCF_RESKEY_log_file
#   OCF_RESKEY_user
#   OCF_RESKEY_watchdog_file
#   OCF_RESKEY_watchdog_timeout
#######################################################################
# Initialization:

# Locate the OCF shell function library and load it. An OCF_FUNCTIONS_DIR
# that is already set (even to an empty value) is left untouched; otherwise
# it is derived from OCF_ROOT.
if [ -z "${OCF_FUNCTIONS_DIR+set}" ]; then
  OCF_FUNCTIONS_DIR="${OCF_ROOT}/lib/heartbeat"
fi
source "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"

#######################################################################

# Fill in some defaults if no values are specified

# Built-in defaults for the optional resource parameters. They are kept in
# separate *_default variables because meta_data() advertises them as well.
OCF_RESKEY_binary_default="/usr/bin/hekad"
OCF_RESKEY_user_default="root"
OCF_RESKEY_watchdog_file_default=
OCF_RESKEY_watchdog_timeout_default=20

# "${var=default}" assigns only when the variable is unset, so a parameter
# that was explicitly passed as an empty string stays empty.
# The expansions are double-quoted so that the resulting value is never
# subjected to word splitting or pathname expansion by the ":" no-op.
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"
: "${OCF_RESKEY_user=${OCF_RESKEY_user_default}}"
: "${OCF_RESKEY_watchdog_file=${OCF_RESKEY_watchdog_file_default}}"
: "${OCF_RESKEY_watchdog_timeout=${OCF_RESKEY_watchdog_timeout_default}}"

#######################################################################

#######################################
# Print a short help text listing the supported actions.
# Outputs:
#   Writes the help text to stdout.
#######################################
usage() {
  local script_name=$0

  cat <<USAGE_TEXT
usage: ${script_name} (start|stop|validate-all|meta-data|status|monitor)

${script_name} manages the collector process as an HA resource

The 'start' operation starts the collector
The 'stop' operation stops the collector
The 'validate-all' operation reports whether the parameters are valid
The 'meta-data' operation reports this RA's meta-data information
The 'status' operation reports whether the collector is running
The 'monitor' operation reports whether the collector is running

USAGE_TEXT
}

#######################################
# Print the resource agent meta-data (XML) describing the supported
# parameters and actions.
# Globals:
#   OCF_RESKEY_binary_default, OCF_RESKEY_user_default,
#   OCF_RESKEY_watchdog_file_default, OCF_RESKEY_watchdog_timeout_default
#   (read) - expanded into the "default" attributes below.
# Outputs:
#   Writes the XML document to stdout.
#######################################
meta_data() {
  cat <<META_DATA_XML
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="lma_collector">
<version>1.0</version>

<longdesc lang="en">
Manages the LMA collector daemon as a Pacemaker Resource.
</longdesc>
<shortdesc lang="en">Manages Log or Metric collector</shortdesc>
<parameters>

<parameter name="service_name" unique="0" required="1">
<longdesc lang="en">
Name of the collector service.
</longdesc>
<shortdesc lang="en">Collector service name</shortdesc>
<content type="string" />
</parameter>

<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Path of the LMA collector binary file that will be run.
</longdesc>
<shortdesc lang="en">LMA collector binary file</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>

<parameter name="config" unique="0" required="1">
<longdesc lang="en">
Path to the LMA collector configuration file or directory
</longdesc>
<shortdesc lang="en">LMA collector configuration</shortdesc>
<content type="string" />
</parameter>

<parameter name="log_file" unique="0" required="1">
<longdesc lang="en">
Path to the LMA collector log file
</longdesc>
<shortdesc lang="en">LMA collector log file</shortdesc>
<content type="string" />
</parameter>

<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running the LMA collector process
</longdesc>
<shortdesc lang="en">LMA collector user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>

<parameter name="watchdog_file" unique="0" required="0">
<longdesc lang="en">
The file to monitor that the process is up and running
</longdesc>
<shortdesc lang="en">LMA collector watchdog file</shortdesc>
<content type="string" default="${OCF_RESKEY_watchdog_file_default}" />
</parameter>

<parameter name="watchdog_timeout" unique="0" required="0">
<longdesc lang="en">
How much time the watchdog file can be left unmodified before claiming that
the process is unresponsive.
</longdesc>
<shortdesc lang="en">LMA collector watchdog timeout</shortdesc>
<content type="string" default="${OCF_RESKEY_watchdog_timeout_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="30" interval="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
META_DATA_XML
}

#######################################################################
# Functions invoked by resource manager actions

#######################################
# Validate the resource configuration and derive the PID file location.
# Globals:
#   PID_FILE (written) - set to
#     ${HA_RSCTMP}/${__SCRIPT_NAME}/${OCF_RESKEY_service_name}.pid
#   HA_RSCTMP, __SCRIPT_NAME (read) - not defined in this file; presumably
#     provided by the sourced ocf-shellfuncs library (verify).
#   OCF_RESKEY_binary, OCF_RESKEY_config, OCF_RESKEY_user,
#   OCF_RESKEY_watchdog_file, OCF_RESKEY_watchdog_timeout (read)
# Returns:
#   OCF_SUCCESS        when every check passes
#   OCF_ERR_GENERIC    when the config path or the user does not exist
#   OCF_ERR_CONFIGURED when a watchdog file is configured but
#                      watchdog_timeout is not a non-negative integer
#######################################
service_validate() {
  PID_FILE="${HA_RSCTMP}/${__SCRIPT_NAME}/${OCF_RESKEY_service_name}.pid"

  # check_binary comes from ocf-shellfuncs; its return status is not
  # inspected here (presumably it aborts by itself when the binary is
  # missing - confirm against the library).
  check_binary "$OCF_RESKEY_binary"

  # The configuration may be either a single file or a directory.
  if [[ ! -f $OCF_RESKEY_config && ! -d $OCF_RESKEY_config ]]; then
    ocf_log err "Config $OCF_RESKEY_config doesn't exist"
    return "$OCF_ERR_GENERIC"
  fi

  if ! getent passwd "$OCF_RESKEY_user" >/dev/null 2>&1; then
    ocf_log err "User $OCF_RESKEY_user doesn't exist"
    return "$OCF_ERR_GENERIC"
  fi

  # The timeout is compared numerically by the status check. A non-numeric
  # value would make that comparison error out and the watchdog would
  # silently never trigger, so reject it up front.
  if [[ -n $OCF_RESKEY_watchdog_file ]]; then
    case "$OCF_RESKEY_watchdog_timeout" in
      '' | *[!0-9]*)
        ocf_log err "watchdog_timeout '${OCF_RESKEY_watchdog_timeout}' is not a non-negative integer"
        return "$OCF_ERR_CONFIGURED"
        ;;
    esac
  fi

  return "$OCF_SUCCESS"
}

#######################################
# Report whether the collector is running and, when a watchdog file is
# configured, whether that file has been modified recently enough.
# Globals:
#   PID_FILE (read) - file holding the collector PID
#   OCF_RESKEY_user (read) - owner given to a freshly created PID directory
#   OCF_RESKEY_watchdog_file, OCF_RESKEY_watchdog_timeout (read)
# Returns:
#   OCF_SUCCESS     the PID is alive and the watchdog (if any) is fresh
#   OCF_NOT_RUNNING no PID file, dead PID, or missing/stale watchdog file
#                   (note: the process may still exist in the watchdog cases)
#   OCF_ERR_GENERIC the PID file exists but is empty
#######################################
service_status() {
  local pid
  local pid_dir
  local now
  local mtime

  # check and make PID file dir
  pid_dir=$(dirname "${PID_FILE}")
  if [ ! -d "${pid_dir}" ]; then
    ocf_log debug "Create pid file dir: ${pid_dir} and chown to ${OCF_RESKEY_user}"
    mkdir -p "${pid_dir}"
    chown -R "${OCF_RESKEY_user}" "${pid_dir}"
    chmod 755 "${pid_dir}"
  fi

  if [ ! -f "$PID_FILE" ]; then
    ocf_log info "LMA collector is not running"
    return "$OCF_NOT_RUNNING"
  fi

  pid=$(cat "$PID_FILE")
  if [ -z "${pid}" ]; then
    ocf_log err "PID file ${PID_FILE} is empty!"
    return "$OCF_ERR_GENERIC"
  fi

  # Signal 0 only probes for the existence of the process.
  if ! ocf_run -warn kill -s 0 "$pid"; then
    ocf_log info "Old PID file found, but LMA collector process isn't running"
    return "$OCF_NOT_RUNNING"
  fi

  if [ -z "$OCF_RESKEY_watchdog_file" ]; then
    ocf_log debug "Skip watchdog check since watchdog_file parameter is not set"
    return "$OCF_SUCCESS"
  fi

  if [ ! -f "$OCF_RESKEY_watchdog_file" ]; then
    ocf_log info "${OCF_RESKEY_watchdog_file} is missing"
    return "$OCF_NOT_RUNNING"
  fi

  now=$(date '+%s')
  # %Y is the modification time in seconds since the epoch. The file can
  # vanish between the -f test above and this call; treat that as missing
  # rather than computing an age from an empty value.
  if ! mtime=$(stat -c %Y "${OCF_RESKEY_watchdog_file}"); then
    ocf_log info "${OCF_RESKEY_watchdog_file} is missing"
    return "$OCF_NOT_RUNNING"
  fi

  if [ $(( now - mtime )) -gt "${OCF_RESKEY_watchdog_timeout}" ]; then
    ocf_log err "File ${OCF_RESKEY_watchdog_file} not modified since ${OCF_RESKEY_watchdog_timeout} seconds"
    return "$OCF_NOT_RUNNING"
  fi

  return "$OCF_SUCCESS"
}

#######################################
# Monitor action: delegates to service_status.
# Returns:
#   The exit status of service_status, unchanged.
#######################################
service_monitor() {
  # The status of the last command becomes the function's return status.
  service_status
}

#######################################
# Start action: launch the collector as OCF_RESKEY_user, record its PID in
# PID_FILE and wait until service_monitor reports success.
# Globals:
#   OCF_RESKEY_service_name, OCF_RESKEY_user, OCF_RESKEY_binary,
#   OCF_RESKEY_config, OCF_RESKEY_log_file, PID_FILE (read)
# Returns:
#   OCF_SUCCESS once the collector is (or already was) running.
#   Exits the script with OCF_ERR_GENERIC when the monitor reports anything
#   other than success or "not running" while waiting.
#######################################
service_start() {
  local status
  local launch_cmd

  service_monitor
  status=$?
  if [ "$status" -eq "$OCF_SUCCESS" ]; then
    ocf_log info "${OCF_RESKEY_service_name} is already running"
    return "$OCF_SUCCESS"
  fi

  # See https://bugs.launchpad.net/lma-toolchain/+bug/1543289
  ulimit -n 102400

  # The trailing part is single-quoted so that "$!" is expanded by the
  # shell started by su (yielding the PID of the backgrounded collector),
  # not by this script. That PID is what ends up in PID_FILE.
  launch_cmd="${OCF_RESKEY_binary} -config=${OCF_RESKEY_config} >> $OCF_RESKEY_log_file 2>&1"' & echo $!'
  su "${OCF_RESKEY_user}" -s /bin/sh -c "$launch_cmd" > "$PID_FILE"

  # Spin waiting for the server to come up. There is no local upper bound
  # on the number of iterations.
  while :; do
    service_monitor
    status=$?
    case "$status" in
      "$OCF_SUCCESS")
        break
        ;;
      "$OCF_NOT_RUNNING")
        sleep 3
        ;;
      *)
        ocf_log err "${OCF_RESKEY_service_name} start failed"
        exit "$OCF_ERR_GENERIC"
        ;;
    esac
  done

  ocf_log info "${OCF_RESKEY_service_name} started"
  return "$OCF_SUCCESS"
}

#######################################
# Succeed when the given PID refers to an existing process.
# Arguments:
#   $1 - PID to probe (an empty value counts as "not alive")
#######################################
_service_pid_alive() {
  [ -n "$1" ] && kill -s 0 "$1" 2>/dev/null
}

#######################################
# Stop action: terminate the collector recorded in PID_FILE, first with
# SIGTERM and, if it is still there after the grace period, with SIGKILL.
#
# Liveness is decided by probing the PID directly. The monitor result is
# deliberately not used here because it also reports "not running" when
# the watchdog file is stale or missing, i.e. exactly when a hung collector
# still exists and has to be killed.
#
# Globals:
#   PID_FILE, OCF_RESKEY_service_name (read)
#   OCF_RESKEY_CRM_meta_timeout (read) - operation timeout in milliseconds;
#     when set, the SIGTERM grace period is that timeout minus 5 seconds.
# Returns:
#   OCF_SUCCESS when the process is gone (or was not running).
#   Exits the script with OCF_ERR_GENERIC when the process cannot be
#   signalled or survives SIGKILL.
#######################################
service_stop() {
  local pid
  local shutdown_timeout=15
  local count=0

  if [ ! -f "$PID_FILE" ]; then
    ocf_log info "${OCF_RESKEY_service_name} is already stopped"
    return "$OCF_SUCCESS"
  fi

  pid=$(cat "$PID_FILE")
  if [ -z "$pid" ]; then
    # Without a PID there is no way to tell whether a collector is running.
    ocf_log err "PID file ${PID_FILE} is empty!"
    ocf_log err "${OCF_RESKEY_service_name} couldn't be stopped"
    exit "$OCF_ERR_GENERIC"
  fi

  if ! _service_pid_alive "$pid"; then
    ocf_log info "${OCF_RESKEY_service_name} is already stopped"
    return "$OCF_SUCCESS"
  fi

  # Try SIGTERM. A failing kill is only an error if the process is still
  # there; it may simply have exited between the probe and the signal.
  if ! ocf_run kill -s TERM "$pid" && _service_pid_alive "$pid"; then
    ocf_log err "${OCF_RESKEY_service_name} couldn't be stopped"
    exit "$OCF_ERR_GENERIC"
  fi

  # stop waiting
  if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
    shutdown_timeout=$(( (OCF_RESKEY_CRM_meta_timeout / 1000) - 5 ))
    # Short operation timeouts would otherwise yield a zero or negative
    # grace period and skip straight to SIGKILL.
    [ "$shutdown_timeout" -ge 1 ] || shutdown_timeout=1
  fi

  while [ "$count" -lt "$shutdown_timeout" ]; do
    _service_pid_alive "$pid" || break
    count=$(( count + 1 ))
    sleep 1
    ocf_log debug "${OCF_RESKEY_service_name} still hasn't stopped yet. Waiting ..."
  done

  if _service_pid_alive "$pid"; then
    # SIGTERM didn't help either, try SIGKILL
    ocf_log info "${OCF_RESKEY_service_name} failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL ..."
    ocf_run kill -s KILL "${pid}"

    # Give the process a few seconds to disappear before reporting the
    # outcome, instead of claiming success unconditionally.
    count=0
    while _service_pid_alive "$pid" && [ "$count" -lt 3 ]; do
      count=$(( count + 1 ))
      sleep 1
    done
    if _service_pid_alive "$pid"; then
      ocf_log err "${OCF_RESKEY_service_name} couldn't be stopped"
      exit "$OCF_ERR_GENERIC"
    fi
  fi

  ocf_log info "${OCF_RESKEY_service_name} stopped"

  ocf_log debug "Delete pid file: ${PID_FILE} with content ${pid}"
  rm -f "${PID_FILE}"

  return "${OCF_SUCCESS}"
}

#######################################################################

# meta-data and help are answered before any validation so that they work
# even when the resource is not (yet) configured correctly.
if [[ $1 == "meta-data" ]]; then
  meta_data
  exit "$OCF_SUCCESS"
elif [[ $1 == "usage" || $1 == "help" ]]; then
  usage
  exit "$OCF_SUCCESS"
fi

# Anything except meta-data and help must pass validation
service_validate
validation_status=$?
if [ "$validation_status" -ne 0 ]; then
  exit "$validation_status"
fi

# What kind of method was invoked?
# The status of the selected action is the last command's status.
case "$1" in
  start)
    service_start
    ;;
  stop)
    service_stop
    ;;
  status)
    service_status
    ;;
  monitor)
    service_monitor
    ;;
  validate-all)
    # Validation already succeeded above; nothing more to do.
    ;;
  *)
    usage
    exit "$OCF_ERR_UNIMPLEMENTED"
    ;;
esac