diff --git a/README.rst b/README.rst index fcb3403..3c5968b 100644 --- a/README.rst +++ b/README.rst @@ -52,32 +52,6 @@ Configure masakari-monitors $ masakari-hostmonitor $ masakari-instancemonitor -If you are intend to use bash scripts of masakari-processmonitor and -masakari-hostmonitor, use following steps to install them. -However, those bash shell scripts are deprecated as of the Ocata release and -will be removed in the Queens release. -Use above masakari-hostmonitors implemented in python instead. - -#. Clone masakari using:: - - $ git clone https://github.com/openstack/masakari-monitors.git - -#. Create masakarimonitors directory in /etc/. - -#. Remove '.sample' from files hostmonitor.conf.sample, - processmonitor.conf.sample and proc.list.sample which exist at - masakari-monitors/etc/. - -#. Copy hostmonitor.conf, processmonitor.conf and proc.list files from - masakari-monitors/etc/ to /etc/masakarimonitors folder and make necessary - changes to the hostmonitor.conf, processmonitor.conf and proc.list files. - -#. To run bash scripts of masakari-processmonitor and masakari-hostmonitor - simply use following binary:: - - $ masakari-processmonitor.sh /etc/masakarimonitors/processmonitor.conf /etc/masakarimonitors/proc.list - $ masakari-hostmonitor.sh /etc/masakarimonitors/hostmonitor.conf - Features -------- diff --git a/masakarimonitors/cmd/masakari-hostmonitor.sh b/masakarimonitors/cmd/masakari-hostmonitor.sh deleted file mode 100755 index 2d8a712..0000000 --- a/masakarimonitors/cmd/masakari-hostmonitor.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -SCRIPT_DIR=/usr/local/lib/python2.7/dist-packages/masakarimonitors/hostmonitor -SCRIPT_FILE=${SCRIPT_DIR}/hostmonitor.sh - -# Argument check -if [ $# -ne 1 ]; then - echo "Usage: $0 " - exit 1 -else - SCRIPT_CONF_FILE=$1 -fi - -sudo bash ${SCRIPT_FILE} ${SCRIPT_CONF_FILE} diff --git a/masakarimonitors/cmd/masakari-processmonitor.sh b/masakarimonitors/cmd/masakari-processmonitor.sh deleted file mode 100755 index 9f0850f..0000000 --- a/masakarimonitors/cmd/masakari-processmonitor.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -SCRIPT_DIR=/usr/local/lib/python2.7/dist-packages/masakarimonitors/processmonitor -SCRIPT_FILE=${SCRIPT_DIR}/processmonitor.sh - -# Argument check -if [ $# -ne 2 ]; then - echo "Usage: $0 " - exit 1 -else - SCRIPT_CONF_FILE=$1 - PROC_LIST=$2 -fi - -sudo bash ${SCRIPT_FILE} ${SCRIPT_CONF_FILE} ${PROC_LIST} diff --git a/masakarimonitors/hostmonitor/hostmonitor.sh b/masakarimonitors/hostmonitor/hostmonitor.sh deleted file mode 100755 index 39e35a1..0000000 --- a/masakarimonitors/hostmonitor/hostmonitor.sh +++ /dev/null @@ -1,964 +0,0 @@ -#!/bin/bash - -# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Define variables. -BASE_NAME=`basename $0` -HOST_NAME=`hostname` -MY_NODE_NAME=${HOST_NAME,,} -LOGTAG=`basename $0` -TMP_DIR="/var/tmp" -TMP_CRM_MON_FILE="$TMP_DIR/crm_mon.tmp" -STATUS_FILE="$TMP_DIR/node_status.tmp" -TMP_CRMADM_FILE="$TMP_DIR/crmadmin.tmp" -TMP_IFCONFIG_FILE="$TMP_DIR/ifconfig.tmp" -NOTICE_OUTPUT="$TMP_DIR/${BASE_NAME}_resp.out" -NOTICE_PROGRAM="curl" -RA_COUNT=0 -LOGDIR="/var/log/masakari" -LOGFILE="${LOGDIR}/masakari-hostmonitor.log" -CLUSTER_STATUS="ONLINE" -HOST_STATUS="NORMAL" - -# Define the node state. -NODE_STATUS_STARTED="Started" -NODE_STATUS_STOPPED="Stopped" -NODE_STATUS_STARTING="Starting" -NODE_STATUS_STOPPING="Stopping" -NODE_STATUS_UNKNOWN="Unknown" - -# This function outputs the debug log -# Argument -# $1 : Message -log_debug () { - if [ ! -e ${LOGDIR} ]; then - mkdir -p ${LOGDIR} - fi - - if [ "${LOG_LEVEL}" == "debug" ]; then - log_output "$1" - fi -} - -# This function outputs the info log -# Argument -# $1 : Message -log_info () { - if [ ! -e ${LOGDIR} ]; then - mkdir -p ${LOGDIR} - fi - - log_output "$1" -} - -# This function outputs the log -# Argument -# $1 : Message -log_output () { - echo "`date +'%Y-%m-%d %H:%M:%S'` ${HOST_NAME} ${LOGTAG}: $1" >> $LOGFILE -} - -# This function locks a file -# Argument -# $1 : Message -file_lock () { - exec 9>>$1 - flock -x 9 -} - -# This function unlocks a file -file_unlock () { - exec 9>&- -} - -# Initialization function -script_initialize () { - ID=`uuidgen` - log_debug "begin loop ID:$ID" - if [ -f $TMP_CRM_MON_FILE ]; then - sudo rm -f $TMP_CRM_MON_FILE - fi - if [ -f $NOTICE_OUTPUT ]; then - sudo rm -f $NOTICE_OUTPUT - fi - if [ -e $TMP_CRMADM_FILE ]; then - sudo rm -rf $TMP_CRMADM_FILE - fi - if [ -e $TMP_IFCONFIG_FILE ]; then - sudo rm -rf $TMP_IFCONFIG_FILE - fi - return 0 -} - -# Finalization function -# Argument -# $1 : The flag indicating whether delete the node state file. -# 0 -> The node state file is deleted. -# 1 -> The node state file is not deleted. -script_finalize () { - if [ $1 -eq 0 ]; then - if [ -f $STATUS_FILE ]; then - sudo rm -f $STATUS_FILE - fi - fi - if [ -f $TMP_CRM_MON_FILE ]; then - sudo rm -f $TMP_CRM_MON_FILE - fi - if [ -f $NOTICE_OUTPUT ]; then - sudo rm -f $NOTICE_OUTPUT - fi - if [ -e $TMP_CRMADM_FILE ]; then - sudo rm -rf $TMP_CRMADM_FILE - fi - if [ -e $TMP_IFCONFIG_FILE ]; then - sudo rm -rf $TMP_IFCONFIG_FILE - fi - log_debug "end loop ID:$ID" - return 0 -} - -# Check the value is correct type -# Argument -# $1: Type -# $2: Parameter Name -# $3: Value -# Return -# 0: The value is correct type -# 1: The value is not correct type -check_config_type() { - expected_type=$1 - parameter_name=$2 - value=$3 - - ret=0 - case $expected_type in - int) - expr $value + 1 > /dev/null 2>&1 - if [ $? -ge 2 ]; then ret=1; fi - ;; - string) - if [ -z $value ] ; then ret=1; fi - ;; - *) - ret=1 - ;; - esac - - if [ $ret -eq 1 ] ; then - log_info "config file parameter error. [${SCRIPT_CONF_FILE}:${parameter_name}]" - exit 1 - fi - - log_info "config file parameter : ${parameter_name}=${value}" - return 0 -} - -# This function reads the configuration file and set the value. -# If the value is omitted, set the default value. -# If invalid value is set, return 1. -# Note) The default value for each item are as follows. -# MONITOR_INTERVAL (defualt : 60) -# NOTICE_TIMEOUT (defualt : 10) -# NOTICE_RETRY_COUNT (default : 12) -# NOTICE_RETRY_INTERVAL (default : 10) -# STONITH_WAIT (default : 30) -# MAX_CHILD_PROCESS (default : 3) -# TCPDUMP_TIMEOUT (default : 10) -# IPMI_TIMEOUT (default : 5) -# IPMI_RETRY_MAX (default : 3) -# IPMI_RETRY_INTERVAL (default : 10) -# HA_CONF (default : "/etc/corosync/corosync.conf") -# LOG_LEVEL (default : "info") -# DOMAIN (default : "") -# ADMIN_USER (default : "") -# ADMIN_PASS (default : "") -# PROJECT (default : "") -# AUTH_URL (default : "") -# REGION (default : "") -# IGNORE_RESOURCE_GROUP_NAME_PATTERN (default : "stonith") -# -# Return value -# 0 : Setting completion -# 1 : Reading failure of the configuration or invalid setting value -set_conf_value () { - # Read the configuration file - source $SCRIPT_CONF_FILE > /dev/null 2>&1 - if [ $? -ne 0 ]; then - log_info "config file read error. [$SCRIPT_CONF_FILE]" - return 1 - fi - - MONITOR_INTERVAL=${MONITOR_INTERVAL:-60} - check_config_type 'int' MONITOR_INTERVAL $MONITOR_INTERVAL - - NOTICE_TIMEOUT=${NOTICE_TIMEOUT:-10} - check_config_type 'int' NOTICE_TIMEOUT $NOTICE_TIMEOUT - - NOTICE_RETRY_COUNT=${NOTICE_RETRY_COUNT:-12} - check_config_type 'int' NOTICE_RETRY_COUNT $NOTICE_RETRY_COUNT - - NOTICE_RETRY_INTERVAL=${NOTICE_RETRY_INTERVAL:-10} - check_config_type 'int' NOTICE_RETRY_INTERVAL $NOTICE_RETRY_INTERVAL - - STONITH_WAIT=${STONITH_WAIT:-30} - check_config_type 'int' STONITH_WAIT $STONITH_WAIT - - MAX_CHILD_PROCESS=${MAX_CHILD_PROCESS:-3} - check_config_type 'int' MAX_CHILD_PROCESS $MAX_CHILD_PROCESS - - TCPDUMP_TIMEOUT=${TCPDUMP_TIMEOUT:-10} - check_config_type 'int' TCPDUMP_TIMEOUT $TCPDUMP_TIMEOUT - - IPMI_TIMEOUT=${IPMI_TIMEOUT:-5} - check_config_type 'int' IPMI_TIMEOUT $IPMI_TIMEOUT - - IPMI_RETRY_MAX=${IPMI_RETRY_MAX:-3} - check_config_type 'int' IPMI_RETRY_MAX $IPMI_RETRY_MAX - - IPMI_RETRY_INTERVAL=${IPMI_RETRY_INTERVAL:-10} - check_config_type 'int' IPMI_RETRY_INTERVAL $IPMI_RETRY_INTERVAL - - HA_CONF=${HA_CONF:-"/etc/corosync/corosync.conf"} - check_config_type 'string' HA_CONF $HA_CONF - - LOG_LEVEL=${LOG_LEVEL:-"info"} - check_config_type 'string' LOG_LEVEL $LOG_LEVEL - - DOMAIN=${DOMAIN:-""} - check_config_type 'string' DOMAIN $DOMAIN - - ADMIN_USER=${ADMIN_USER:-""} - check_config_type 'string' ADMIN_USER $ADMIN_USER - - ADMIN_PASS=${ADMIN_PASS:-""} - check_config_type 'string' ADMIN_PASS $ADMIN_PASS - - PROJECT=${PROJECT:-""} - check_config_type 'string' PROJECT $PROJECT - - AUTH_URL=${AUTH_URL:-""} - check_config_type 'string' AUTH_URL $AUTH_URL - - REGION=${REGION:-""} - check_config_type 'string' REGION $REGION - - IGNORE_RESOURCE_GROUP_NAME_PATTERN=${IGNORE_RESOURCE_GROUP_NAME_PATTERN:-""} - check_config_type 'string' IGNORE_RESOURCE_GROUP_NAME_PATTERN $IGNORE_RESOURCE_GROUP_NAME_PATTERN - - return 0 -} - -# This function gets the NIC that is used for intercommunication of corosync based on -# the contents of /etc/corosync/corosync.conf. -# -# Argument -# $1 : Value of bindnetabbr is set in /etc/corosync/corosync.conf -# Return value -# 0 : Success to get -# 1 : Fail to get(Detect /etc/corosync/corosync.conf of invalid setting value) -get_mcast_nic () { - BIND_NET_ADDR=$1 - BIND_NET_ADDR=`echo ${BIND_NET_ADDR} | sed -e 's/\.0$//g'` - sudo ifconfig > ${TMP_IFCONFIG_FILE} - - if [ `grep "${BIND_NET_ADDR}" ${TMP_IFCONFIG_FILE} | wc -l` -eq 0 ]; then - return 1 - fi - - S_LINES=`cat ${TMP_IFCONFIG_FILE} | grep -n -e "^[a-z]" -e "^[0-9]" | cut -d":" -f1` - E_LINE_DEFAULT=`cat -n ${TMP_IFCONFIG_FILE} | tail -n 1 | awk '{print $1}'` - for S_LINE in ${S_LINES} - do - S_LINE=`expr ${S_LINE} + 1` - E_LINE=`cat ${TMP_IFCONFIG_FILE} | tail -n +${S_LINE} | egrep -n -m 1 -e "^[a-z]" -e "^[0-9]" | cut -d":" -f1` - - if [ -z "${E_LINE}" ]; then - E_LINE=${E_LINE_DEFAULT} - else - E_LINE=`expr ${S_LINE} + ${E_LINE} - 1 - 1` - fi - - if [ `cat ${TMP_IFCONFIG_FILE} | sed -n "${S_LINE},${E_LINE}p" | grep "${BIND_NET_ADDR}" | wc -l` -ne 0 ]; then - break - fi - done - - S_LINE=`expr ${S_LINE} - 1` - MCAST_NIC=`cat -n ${TMP_IFCONFIG_FILE} | grep " ${S_LINE}" | awk '{print $2}'` - - return 0 -} - -# Check whether masakari-hostmoitor works on pacemaker-remote -# Return value -# 0 : works on pacemaker-remote -# 1 : doesn't work on pacemaker-remote -is_pacemaker_remote() { - sudo service pacemaker_remote status > /dev/null 2>&1 - return $? -} - -# This function checks whether the HB line is alive -# Return value -# 0 : The HB line is alive. -# 1 : The HB line is not alive. -# 2 : Detect /etc/corosync/corosync.conf of invalic setting value -check_hb_line () { - # If the heartbeat is not starting, it is not required to execute tcpdump command. - sudo service corosync status > /dev/null 2>&1 - RET_CORO=$? - sudo service pacemaker status > /dev/null 2>&1 - RET_PACE=$? - is_pacemaker_remote - RET_REMOTE=$? - if [ ${RET_CORO} -ne 0 -o ${RET_PACE} -ne 0 ]; then - if [ ${RET_REMOTE} -ne 0 ]; then - log_debug "neither pacemaker nor pacemaker-remote is running." - return 1 - else - log_debug "works on pacemaker-remote." - return 0 - fi - fi - - # Get all the setting of mcastport and bindnetaddr. - MCAST_PORTS=`grep "mcastport:" ${HA_CONF} | awk '{print $2}'` - BIND_NET_ADDRS=`grep "bindnetaddr:" ${HA_CONF} | awk '{print $2}'` - - array_mcast_ports=(`echo ${MCAST_PORTS}`) - array_bind_net_addrs=(`echo ${BIND_NET_ADDRS}`) - - if [ -z "${MCAST_PORTS}" ] || - [ -z "${BIND_NET_ADDRS}" ] || - [ ${#array_bind_net_addrs[*]} -ne ${#array_mcast_ports[*]} ]; then - log_debug "${HA_CONF} has incorrect parameters." - return 2 - fi - - NIC_SUCCESS_FLG=0 - results="" - loop_count=0 - while [ ${loop_count} -lt ${#array_bind_net_addrs[*]} ] - do - MCAST_PORT=${array_mcast_ports[${loop_count}]} - MCAST_NIC="" - # Get the NIC that is used for multicast from the values set in bindnetaddr. - get_mcast_nic ${array_bind_net_addrs[$loop_count]} - if [ $? -ne 0 ]; then - log_debug "${HA_CONF} has incorrect parameters." - return 2 - fi - - log_debug "read mcast port from ${HA_CONF} -> ${MCAST_PORT}" - log_debug "read mcast nic from ${HA_CONF} -> ${MCAST_NIC}" - - timeout $TCPDUMP_TIMEOUT sudo tcpdump -c 1 -p -i ${MCAST_NIC} port ${MCAST_PORT} > /dev/null 2>&1 - result=$? - if [ $result -eq 0 ]; then - NIC_SUCCESS_FLG=1 - log_debug "tcpdump hb line (${MCAST_NIC}) ok." - break - else - log_debug "tcpdump hb line (${MCAST_NIC}) fail. [exit-code: $result]" - results+="$result " - fi - loop_count=`expr $loop_count + 1` - done - - if [ ${NIC_SUCCESS_FLG} -eq 0 ]; then - log_info "tcpdump hb line fail. [exit-code: $results]" - return 1 - fi - return 0 -} - -# This function checks the heartbeat state of the own node -# Return value -# 0 : Stable state -# 1 : The heartbeat is stopped state -# 2 : Unstable state (during state transitions) -check_hb_status() -{ - OWN_NODE=`uname -n` - - sudo crmadmin -S ${OWN_NODE,,} 1> $TMP_CRMADM_FILE 2>/dev/null - if [ $? -ne 0 ]; then - # The heartbeat is not running (or during get state). - log_debug "Heartbeat in the own node doesn't run." - rm -f $TMP_CRMADM_FILE - return 1 - fi - - grep -v -e S_IDLE -e S_NOT_DC $TMP_CRMADM_FILE 1>/dev/null 2>&1 - if [ $? -eq 0 ]; then - # The heartbeat is unstable state (or during state transitions). - log_debug "Heartbeat is in an unstable state." - rm -f $TMP_CRMADM_FILE - return 2 - fi - - rm -f $TMP_CRMADM_FILE - log_debug "Heartbeat is in a stable state." - return 0 -} - -# This function executes the crm_mon command and hold result -# Return value -# 0 : Normal termination -# 1 : Fail to execute the crm_command -run_crm_mon () { - sudo crm_mon -A -1 >$TMP_CRM_MON_FILE - result=$? - if [ $result -ne 0 ]; then - log_debug "crm_mon fail. [exit-code: $result]" - return 1 - else - # Count the number of RA. - if [ $RA_COUNT -eq 0 ]; then - group_define=`sudo crm configure show | grep "^group " | grep -vi "$IGNORE_RESOURCE_GROUP_NAME_PATTERN" | sed -n '$p' | cut -d" " -f3-` - result=$? - if [ ! -n "$group_define" ] || ! [ "$result" -eq 0 ] ; then - log_debug "cib is not configured." - return 1 - fi - tmp_array=(`echo $group_define`) - ln=`echo $((${#group_define}))` - last_word=`echo ${group_define} | cut -c ${ln}` - if [[ $last_word != "\\" ]]; then - RA_COUNT=${#tmp_array[*]} - else - RA_COUNT=`expr ${#tmp_array[*]} - 1` - fi - fi - fi - log_debug "`cat $TMP_CRM_MON_FILE`" - - # Check whether there is the quorum. - grep "partition WITHOUT quorum" $TMP_CRM_MON_FILE > /dev/null 2>&1 - result=$? - if [ $result -eq 0 ]; then - log_info "$MY_NODE_NAME is no-quorum." - fi - - return 0 -} - -# This function creates the node state file -make_status_file () { - touch $STATUS_FILE - count_cluster_nodes - work_count=$? - n=0 - while [ $n -lt $work_count ] - do - check_node_status ${nodes_array[$n]} - result=$? - append_status_file ${nodes_array[$n]} $result - n=`expr $n + 1` - done -} - -# This function analyzes the output of crm_mon and count the number of cluster node. -# And it stores node name in array in this function. -# Return value -# The number of cluster node -count_cluster_nodes () { - # Initialize the array - nodes_array=() - - # Count the number of Online node. - online_nodes=`cat $TMP_CRM_MON_FILE | grep '^Online\|^RemoteOnline' | sed -e 's/\s\{1,\}/ /g' | sed -e 's/ \]$//g' | cut -d" " -f3- | tr '\n' ' '` - log_debug "online nodes : $online_nodes" - if [ -n "$online_nodes" ]; then - nodes_array+=(`echo $online_nodes`) - fi - - # Count the number of OFFLINE node. - offline_nodes=`cat $TMP_CRM_MON_FILE | grep '^OFFLINE\|^RemoteOFFLINE' | sed -e 's/\s\{1,\}/ /g' | sed -e 's/ \]$//g' | cut -d" " -f3- | tr '\n' ' '` - log_debug "offline nodes : $offline_nodes" - if [ -n "$offline_nodes" ]; then - nodes_array+=(`echo $offline_nodes`) - fi - - # Count the number of except for Online, OFFLINE node. - other_nodes=`cat $TMP_CRM_MON_FILE | grep ^Node | grep -v Attributes | sed -e 's/\s\{1,\}/ /g' | cut -d" " -f2` - log_debug "other nodes : $other_nodes" - if [ -n "$other_nodes" ]; then - nodes_array+=(`echo $other_nodes`) - fi - - return ${#nodes_array[*]} -} - -# This function checks startup state of node's RA. -# Argument -# $1 : Node name -# Return value -# 0 : Started state -# Node is online, and state of all RA is "Started" -# 1 : Stopped state -# UNCLEAN, OFFLINE, pending, standby -# 2 : Starting or Stopping state -# Node is online, and mixed "RA of Started" and "RA of Stopped" -check_node_status () { - online_nodes=`cat $TMP_CRM_MON_FILE | grep '^Online\|^RemoteOnline' | sed -e 's/\s\{1,\}/ /g' | sed -e 's/ \]$//g' | cut -d" " -f3-` - # Check whether the node of argument is "Online". - if [ "`echo $online_nodes | grep -e "$1 " -e "$1$"`" ]; then - # Check whether the node of state of all RA is "Started". - # In some cases "unmanaged" may not exist. - START_RA_COUNT=`egrep -e "Started\s+$1\s*(\(unmanaged\))*\s*$" $TMP_CRM_MON_FILE | grep -v stonith | wc -l` - if [ $START_RA_COUNT -eq $RA_COUNT ] || [ $RA_COUNT -eq -1 ] ; then - # Node is online and state of all RA is "Started"(startup state) - return 0 - else - # There is "Stopped" even one(Starting or Stopping). - return 2 - fi - else - # In spite of "UNCLEAN" or "OFFLINE" or "pending" or "standby", - # if RA of "Started" exists, consider state as starting state or stopping state. - other_node_ra=`grep "Started $1 " $TMP_CRM_MON_FILE | grep -v stonith | wc -l` - if [ $other_node_ra -ne 0 ] ; then - return 2 - # "UNCLEAN" or "OFFLINE" or "pending" or "standby"(stopped) - else - return 1 - fi - fi -} - -# This function writes in the node state file -# Argument -# $1 : node name -# $2 : node state(0:Started, 1:Stopped, 2:Starting or Stopping) -append_status_file () { - if [ $2 -eq 0 ]; then - node_status="$NODE_STATUS_STARTED" - elif [ $2 -eq 1 ]; then - node_status="$NODE_STATUS_STOPPED" - else - node_status="$NODE_STATUS_UNKNOWN" - fi - - file_lock $STATUS_FILE - echo "$1 $node_status" >> $STATUS_FILE - file_unlock -} - -# This function analyzes the state of the node specified by the argument from the result of crm_mon, -# and if the nodes state are different from the last state, notify to the resource management. -# Argument -# $1 : Node name(1) -# $2 : Node name(2) -# ... -# $n : node name(n) -# Node name that are passed by arguments is multiple. -# If nothing is passed to the argument, immediate return. -parse_node_status () { - if [ $# -eq 0 ]; then - return 0 - fi - - work_count=$# - n=0 - while [ $n -lt $work_count ] - do - check_node_status $1 - result1=$? - if [ $result1 -eq 0 ]; then - EVENT="STARTED" - elif [ $result1 -eq 1 ]; then - EVENT="STOPPED" - fi - TIME=`date -u +'%Y-%m-%d %H:%M:%S'` - compare_status_file $1 $result1 - result2=$? - if [ $result2 -eq 1 ]; then - make_notice_data $1 - send_notification $1 - fi - shift - n=`expr $n + 1` - done - - return 0 -} - -# This function compares state of last node with state of this time node, -# and if they are different, rewrite the state file. -# It is called from child process. -# -# Arguments -# $1 : Node name -# $2 : Node state(0:Started, 1:Stopped, 2:Starting or Stopping) -# return value -# 0 : There is not change from the last state and notification to the resource is not required. -# 1 : There is change from the last state and notification to the resource is required. -# 2 : There is change from the last state and notification to the resource is not required. -compare_status_file () { - # Check whether state of this time node changed from state of last time node. - last_node_status=`grep "$1 " $STATUS_FILE | cut -d" " -f2` - - # If node name that does not exist in the node state file, add it's node name to the file. - if [ ! -n "$last_node_status" ]; then - append_status_file $1 $2 - return 2 - fi - - if [ $2 -eq 0 ]; then - # If state of this time node is "Started" and state of last time node is "Started", - if [[ $last_node_status = $NODE_STATUS_STARTED ]]; then - return 0 - # If state of this time node is "Started" and - # state of last time node is "Started" or "Stopping" or "Starting" or "Unknown", - else - change_status_file $1 $2 $last_node_status - return $? - fi - elif [ $2 -eq 1 ]; then - # If state of this time node is "Stopped" and state of last time node is "Stopped", - if [[ $last_node_status = $NODE_STATUS_STOPPED ]]; then - return 0 - # If state of this time node is "Stopped" and - # state of last time node is "Started" or "Stopping" or "Starting" or "Unknown", - else - change_status_file $1 $2 $last_node_status - return $? - fi - # If state of this time node is "Stopping" or "Starting" or "Unknown", - else - change_status_file $1 $2 $last_node_status - return $? - fi -} - -# This function rewrites the state file. -# Return the necessity of notification return code -# -# Argument -# $1 : Node name -# $2 : Node state(0:Started, 1:Stopped, 2:Starting or Stopping) -# $3 : State of the last node is specified in the node state file -# Return value -# 1 : Notification to the resource management is required -# 2 : Notification to the resource management is not required -change_status_file () { - # If state of this time node is "Started", - if [ $2 -eq 0 ]; then - node_status="$NODE_STATUS_STARTED" - # If state of this time node is "Stopping" or "Unknown", notification is not sent. - if [[ $3 = $NODE_STATUS_STOPPING ]] || - [[ $3 = $NODE_STATUS_UNKNOWN ]]; then - retval=2 - else - retval=1 - fi - # If state of this time node is "Stopped", - elif [ $2 -eq 1 ]; then - node_status="$NODE_STATUS_STOPPED" - # If state of this time node is "Starting" or "Unknown", notification is not sent. - if [[ $3 = $NODE_STATUS_STARTING ]] || - [[ $3 = $NODE_STATUS_UNKNOWN ]]; then - retval=2 - else - retval=1 - fi - # If state of this time node is "Starting" or "Stopping" or "Unknown", - else - if [[ $3 = $NODE_STATUS_STARTED ]]; then - node_status="$NODE_STATUS_STOPPING" - elif [[ $3 = $NODE_STATUS_STOPPED ]]; then - node_status="$NODE_STATUS_STARTING" - else - node_status="$3" - fi - # Notification is not sent. - retval=2 - fi - - file_lock $STATUS_FILE - sed -i "s/$1 $last_node_status/$1 $node_status/g" $STATUS_FILE - file_unlock - - return $retval -} - - -# This function creates data to be notified to the resource management. -# It is called from the child process. -# -# Argument -# $1 : Node name -make_notice_data () { - TMP_RULE=`sudo crm configure show | grep "rule" | grep -i -e "100: #uname eq $1 " -e "100: #uname eq $1$" | grep -vi "stonith"` - P_HOST=`echo ${TMP_RULE} | awk '{print $6}'` - if [[ ${STONITH_TYPE} = "ssh" ]] ; then - P_HOST=$1 - fi - - # Usually, the route which shuldn't pass - # (Abnormal states such as resource group name is "_grp", or physical host name is ""(empty string). - if [ ! -n "${P_HOST}" ]; then P_HOST="UnknownPhysicalHost"; fi - - CLUSTER_STATUS="ONLINE" - HOST_STATUS="NORMAL" - - # In the case of stop notification, check whether the opposing node has stopped securety. - if [[ ${EVENT} = "STOPPED" ]] ; then - CLUSTER_STATUS="OFFLINE" - HOST_STATUS="NORMAL" - - # adhoc setting for test - if [[ ${STONITH_TYPE} = "ipmi" ]] ; then - - # Get the value which is required for ipmitool command execution. - IPMI_RAS=`sudo crm configure show | grep "^primitive.*stonith:external/ipmi" | awk '{print $2}'` - for IPMI_RA in ${IPMI_RAS} - do - IPMI_HOST=`sudo crm resource param ${IPMI_RA} show hostname` - if [[ ${IPMI_HOST} = ${P_HOST} ]]; then - break - fi - done - userid=`sudo crm resource param ${IPMI_RA} show userid` - passwd=`sudo crm resource param ${IPMI_RA} show passwd` - interface=`sudo crm resource param ${IPMI_RA} show interface` - ipaddr=`sudo crm resource param ${IPMI_RA} show ipaddr` - - LOOP_COUNT=0 - while [ ${LOOP_COUNT} -lt `expr ${IPMI_RETRY_MAX} + 1` ] - do - POWER_STATUS=`timeout ${IPMI_TIMEOUT} sudo ipmitool -U ${userid} -P ${passwd} -I ${interface} -H ${ipaddr} power status 2>&1` - RET1=$? - echo ${POWER_STATUS} | grep "Power is off" > /dev/null 2>&1 - RET2=$? - # If the opposing node has stopped securely, pass route of the notification. - if [ ${RET1} -eq 0 ] && [ ${RET2} -eq 0 ]; then - log_debug "Node $1 power is off." - break - fi - # If the opposing node has stopped securely, recheck after sleep. - log_debug "Sleep to get power status of node $1" - sleep ${IPMI_RETRY_INTERVAL} - LOOP_COUNT=`expr ${LOOP_COUNT} + 1` - done - - if [ ${LOOP_COUNT} -eq `expr ${IPMI_RETRY_MAX} + 1` ]; then - HOST_STATUS="UNKNOWN" - # If get the state of "Power is on" at the final, the HOST_STATUS is "UNKNOWN". - if [ ${RET1} -eq 0 ]; then - log_info "$1 info : Node $1 power is still on." - # If get the state of "Unknown", HOST_STATUS is "UNKNOWN". - else - log_info "$1 info : Couldn't get power status of node $1." - fi - fi - fi - fi - - # Consider the port number - # that is used for intercommunication of Pacemaker+corosync as the cluster identifier. - - PAYLOAD="{\"event\": \"${EVENT}\",\"host_status\": \"${HOST_STATUS}\",\"cluster_status\": \"${CLUSTER_STATUS}\"}" - -} - - -# This function notifies to the resource management. -# It is called masakari_cli post_event method. -# -# Argument -# $1 : Node name -send_notification () { - TYPE="COMPUTE_HOST" - TARGET="post_event" - AUTH_INFO="--os-domain-name ${DOMAIN} --os-project-name ${PROJECT} --os-region-name ${REGION} --os-auth-url ${AUTH_URL} --os-username ${ADMIN_USER} --os-password ${ADMIN_PASS}" - - log_info "$1 info : Send a notification." - log_info "$1 info : openstack ${AUTH_INFO} notification create ${TYPE} ${P_HOST} \"${TIME}\" \"${PAYLOAD}\"" - - RESP=`openstack ${AUTH_INFO} notification create ${TYPE} ${P_HOST} "${TIME}" "${PAYLOAD}"` - result=$? - - if [ $result -eq 0 ]; then - log_info "$1 info : Succeeded in sending a notification." - log_info "$1 info : $RESP" - else - log_info "$1 info : Failed to send a notification. [exit-code: $result]" - log_info "$1 info : $RESP" - fi - - return -} - -# Argument check -if [ $# -ne 1 ]; then - echo "Usage: $0 " - exit 1 -else - SCRIPT_CONF_FILE=$1 -fi - -# Output warning message. -log_info "WARNING : $0 is deprecated as of the Ocata release and will be removed in the Queens release. Use masakari-hostmonitor implemented in python instead of $0." - -# main route -log_info "begin" - -# If node state file exists at the initial startup, delete the file. -if [ -f $STATUS_FILE ]; then - sudo rm -f $STATUS_FILE -fi - -while true -do - # If invalid value is set in the configuration file, set the default value. - set_conf_value - if [ $? -ne 0 ]; then - break - fi - - # Initialize - script_initialize - - # Check whether HB line is normal. - check_hb_line - ret=$? - if [ $ret -ne 0 ]; then - case $ret in - 1) - sleep $STONITH_WAIT - ;; - 2) - script_finalize 1 - sleep $MONITOR_INTERVAL - continue - ;; - esac - fi - - # Check the heartbeat state of the own node. - # It only checks hb status when this process runs on the full - # cluster stack of corosync. - if ! is_pacemaker_remote ; then - check_hb_status - ret=$? - if [ $ret -ne 0 ]; then - case $ret in - 1) - script_finalize 0 - ;; - 2) - script_finalize 1 - ;; - esac - sleep $MONITOR_INTERVAL - continue - fi - fi - - # Get output result of crm_mon. - run_crm_mon - ret=$? - if [ $ret -ne 0 ]; then - script_finalize 0 - sleep $MONITOR_INTERVAL - continue - fi - - # If state file of last node is not exsits, create state file, - # and write current state to state file. - if [ ! -e $STATUS_FILE ]; then - make_status_file - log_debug "`cat $STATUS_FILE`" - sleep $MONITOR_INTERVAL - continue - fi - - # Count the number of cluster node. - count_cluster_nodes - result=$? - if [ $result -eq 0 ]; then - script_finalize 0 - sleep $MONITOR_INTERVAL - continue - fi - - # If the number of nodes is fewer than the maximum number of child process, - # Child process should start only the number of the node. - if [ $result -le $MAX_CHILD_PROCESS ]; then - MAX_CHILD_PROCESS=$result - fi - - # Get the minimum number of nodes that are taken care of by the child process. - child_min_work=`expr $result / $MAX_CHILD_PROCESS` - # Get the maximum number of nodes that are taken care of by the child process. - child_max_work=`expr $child_min_work + 1` - # Get the number of the child process - # that takes care of the number of child_max_work nodes. - max_work_count=`expr $result % $MAX_CHILD_PROCESS` - - # Get the node name(multiple) that is processed by the child process, - # pass its node name to child process - jobsrunning=0 - n=0 - m=0 - # Loop processing is executed only by the MAX_CHILD_PROCESS. - while [ $jobsrunning -lt $MAX_CHILD_PROCESS ] - do - work=0 - param="" - # If the child process take care of only the "max_work_count" nodes, - if [ $m -lt $max_work_count ]; then - # Loop processing is executed only by the maximun number of nodes - # that are taken care of by the child process. - while [ $work -lt $child_max_work ] - do - # Only if node name is not empty string - # and it is not own node name, pass it to child process. - if [ -n "${nodes_array[$n]}" ] && [[ ${nodes_array[$n]} != $MY_NODE_NAME ]]; then - param+="${nodes_array[$n]} " - fi - work=`expr $work + 1` - n=`expr $n + 1` - done - # If the child process take care of only the "min_work_count" nodes, - else - # Loop processing is executed only by the maximun number of nodes - # that are taken care of by the child process. - while [ $work -lt `expr $child_min_work` ] - do - # Only if node name is not empty string - # and it is not own node name, pass it child process. - if [ -n "${nodes_array[$n]}" ] && [[ ${nodes_array[$n]} != $MY_NODE_NAME ]]; then - param+="${nodes_array[$n]} " - fi - work=`expr $work + 1` - n=`expr $n + 1` - done - fi - parse_node_status $param & - jobsrunning=`expr $jobsrunning + 1` - done - wait - - log_debug "`cat $STATUS_FILE`" - - script_finalize 1 - sleep $MONITOR_INTERVAL -done - -log_info "end" - diff --git a/masakarimonitors/processmonitor/common.sh b/masakarimonitors/processmonitor/common.sh deleted file mode 100755 index 0d99e0a..0000000 --- a/masakarimonitors/processmonitor/common.sh +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -LOGTAG=`basename $0` -HOST_NAME=`hostname` -LOGDIR="/var/log/masakari" -LOGFILE="${LOGDIR}/masakari-processmonitor.log" - -# Debug log output function -# Argument -# $1 : Message -log_debug () { - if [ ! -e ${LOGDIR} ]; then - mkdir -p ${LOGDIR} - fi - - if [ "${LOG_LEVEL}" == "debug" ]; then - log_output "$1" - fi -} - -# Info log output function -# Argument -# $1 : Message -log_info () { - if [ ! -e ${LOGDIR} ]; then - mkdir -p ${LOGDIR} - fi - - log_output "$1" -} - -# This function outputs the log -# Argument -# $1 : Message -log_output () { - echo "`date +'%Y-%m-%d %H:%M:%S'` ${HOST_NAME} ${LOGTAG}: $1" >> $LOGFILE -} - -# Some sanity checks on the check target processing list. -# Format of the proc.list(Each columns must be separated by a comma.) -# The first column : Process ID (two digits of leading zeros) : cannot be omitted. -# The second column : The keyword when check exists in processing list(empty is NG.). : cannot be omitted -# The third column : The initial startup command (it's required to include word of "start". ) -# The fourth column : Rebooting command (it's required to include word of "start".) -# The fifth column : Shell file name for special processing at the initial startup(before the startup) -# The sixth column : Shell file name for special processing at the initial startup(after the startup) -# The seventh column : Shell file name for special processing at the initial restart(before the startup) -# The eighth column : Shell file name for special processing at the initial restart(after the startup) -# -# When abonormal condition is detected about proc.list, exits by "exit 2". -column_num=8 -check_proc_file_common (){ - - # Check the existence and validity of the proc.list. - if [ ! -e $PROC_LIST ]; then - log_info "$PROC_LIST(proc_list) is not exists." - exit 2 - fi - - if [ ! -s $PROC_LIST ]; then - log_info "$PROC_LIST(proc_list) is empty file." - exit 2 - fi - - if [ ! -r "$PROC_LIST" ]; then - log_info "$PROC_LIST(proc_list) is not readable." - exit 2 - fi - - OLD_IFS=$IFS - IFS=$'\n' - proc_list=(`cat $PROC_LIST`) - IFS=$OLD_IFS - - LINE_NO=1 - - for line in "${proc_list[@]}" - do - num=`echo "$line" | tr -dc ',' | wc -c` - # The number of required column are incomplete. - check_num=`expr $column_num - 1` - if [ $num -ne $check_num ]; then - log_info "$PROC_LIST format error (column_num) line $LINE_NO" - exit 2 - fi - - PROC_ID=`echo $line | cut -d"," -f 1` - if [ ! -z "$PROC_ID" ]; then - expr "$PROC_ID" + 1 >/dev/null 2>&1 - # If PROC ID is not a numeric, - if [ 1 -lt $? ]; then - log_info "$PROC_LIST format error (PROC_ID) not number. line $LINE_NO" - exit 2 - fi - else - log_info "$PROC_LIST format error (PROC_ID) empty. line $LINE_NO" - exit 2 - fi - - KEY_WORD=`echo $line | cut -d"," -f 2` - if [ -z "$KEY_WORD" ]; then - log_info "$PROC_LIST format error (KEY_WORD) empty. line $LINE_NO" - exit 2 - fi - - - START_CMD=`echo $line | cut -d"," -f 3` - if [ ! -z "$START_CMD" ]; then - check=`echo $START_CMD | grep -c start` - # If words of "start" are not included in initial startup processing., - if [ $check -ne 1 ]; then - log_info "$PROC_LIST format error (START_CMD) line $LINE_NO" - exit 2 - fi - fi - - RESTART_CMD=`echo $line | cut -d"," -f 4` - if [ ! -z "$RESTART_CMD" ]; then - check=`echo $RESTART_CMD | grep -c start` - # If words of "start" are not included in restart processing, - if [ $check -ne 1 ]; then - log_info "$PROC_LIST format error (RESTART_CMD) line $LINE_NO" - exit 2 - fi - fi - - # Check the existence and validity of special processing shell file to be executed before and after start processing. - START_SP_CMDFILE_BEFORE=`echo $line | cut -d"," -f 5` - if [ ! -z "$START_SP_CMDFILE_BEFORE" ]; then - # The starting (before executing) special processing shell file does not exist. - if [ ! -e $START_SP_CMDFILE_BEFORE ]; then - log_info "$PROC_LIST format error (START_SP_CMDFILE_BEFORE) not exists. line $LINE_NO" - exit 2 - fi - if [ ! -x $START_SP_CMDFILE_BEFORE ]; then - log_info "$PROC_LIST format error (START_SP_CMDFILE_BEFORE) not exeutable. line $LINE_NO" - exit 2 - fi - fi - - START_SP_CMDFILE_AFTER=`echo $line | cut -d"," -f 6` - if [ ! -z "$START_SP_CMDFILE_AFTER" ]; then - # The restarting (before executing) special processing shell file does not exist. - if [ ! -e $START_SP_CMDFILE_AFTER ]; then - log_info "$PROC_LIST format error (START_SP_CMDFILE_AFTER) not exists. line $LINE_NO" - exit 2 - fi - if [ ! -x $START_SP_CMDFILE_AFTER ]; then - log_info "$PROC_LIST format error (START_SP_CMDFILE_AFTER) not exeutable. line $LINE_NO" - exit 2 - fi - fi - - # Check the existence and validity of special processing shell file to be executed before and after restart processing. - RESTART_SP_CMDFILE_BEFORE=`echo $line | cut -d"," -f 7` - if [ ! -z "$RESTART_SP_CMDFILE_BEFORE" ]; then - # The restarting (before executing) special processing shell file does not exist. - if [ ! -e $RESTART_SP_CMDFILE_BEFORE ]; then - log_info "$PROC_LIST format error (RESTART_SP_CMDFILE_BEFORE) not exists. line $LINE_NO" - exit 2 - fi - if [ ! -x $RESTART_SP_CMDFILE_BEFORE ]; then - log_info "$PROC_LIST format error (RESTART_SP_CMDFILE_BEFORE) not exeutable. line $LINE_NO" - exit 2 - fi - fi - - RESTART_SP_CMDFILE_AFTER=`echo $line | cut -d"," -f 8` - if [ ! -z "$RESTART_SP_CMDFILE_AFTER" ]; then - # The restarting (before executing) special processing shell file does not exist. - if [ ! -e $RESTART_SP_CMDFILE_AFTER ]; then - log_info "$PROC_LIST format error (RESTART_SP_CMDFILE_AFTER) not exists. line $LINE_NO" - exit 2 - fi - if [ ! -x $RESTART_SP_CMDFILE_AFTER ]; then - log_info "$PROC_LIST format error (RESTART_SP_CMDFILE_AFTER) not exeutable. line $LINE_NO" - exit 2 - fi - fi - - LINE_NO=`expr $LINE_NO + 1` - done -} - diff --git a/masakarimonitors/processmonitor/nova_compute_reboot_before.sh b/masakarimonitors/processmonitor/nova_compute_reboot_before.sh deleted file mode 100755 index 35878cd..0000000 --- a/masakarimonitors/processmonitor/nova_compute_reboot_before.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Delete the child process as the required steps to restart of nova_compute process. - -KILL_PS_LIST=(`ps -ef | grep nova-compute | grep -v grep | awk '{ print $2; }'`) - -for PS_ID in ${KILL_PS_LIST[@]} -do - sudo kill -9 ${PS_ID} -done diff --git a/masakarimonitors/processmonitor/process_status_checker.sh b/masakarimonitors/processmonitor/process_status_checker.sh deleted file mode 100755 index 5751016..0000000 --- a/masakarimonitors/processmonitor/process_status_checker.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Define constants -SCRIPT_DIR=$(cd $(dirname $0);pwd) -SCRIPT_COMMON_SH="$SCRIPT_DIR/common.sh" - -TMP_DIR="/var/tmp" -PROC_LIST=$1 -BAD_CODE_LIST_FILE="$TMP_DIR/badproc.list" - -# Common processing (check of proc.list) -. $SCRIPT_COMMON_SH -check_proc_file_common - -# Get the process list. -ps_result=`ps -ef` - -# Initialize abnormal condition list -cat /dev/null > ${BAD_CODE_LIST_FILE} - -# Process check main processing -while read line -do - PROC_NO=`echo $line | cut -d"," -f 1` - PROC_NAME=`echo $line | cut -d"," -f 2` - PROC_CHECK=`echo $ps_result |grep -c "${PROC_NAME}"` - # If process was not detect, register ID in the abnormality process. - if [ ${PROC_CHECK} -eq 0 ]; then - log_info "down process id_no : ${PROC_NO}" - echo ${PROC_NO} >> ${BAD_CODE_LIST_FILE} - fi -done < ${PROC_LIST} - -# If failing process ID was detected, decide state as abnormal termination(exit code:1). -if [ -s ${BAD_CODE_LIST_FILE} ]; then - exit 1 -fi - -exit 0 diff --git a/masakarimonitors/processmonitor/processmonitor.sh b/masakarimonitors/processmonitor/processmonitor.sh deleted file mode 100755 index fa57b4e..0000000 --- a/masakarimonitors/processmonitor/processmonitor.sh +++ /dev/null @@ -1,497 +0,0 @@ -#!/bin/bash - -# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Define constants. -BASE_NAME=`basename $0` -TMP_DIR="/var/tmp" -TMP_CRM_MON_FILE="$TMP_DIR/crm_mon.tmp" -STATUS_FILE="$TMP_DIR/node_status.tmp" -TMP_CRMADM_FILE="$TMP_DIR/crmadmin.tmp" -NOTICE_OUTPUT="$TMP_DIR/${BASE_NAME}_resp.out" - -SCRIPT_DIR=$(cd $(dirname $0);pwd) -SCRIPT_CHECK_PROCESS="$SCRIPT_DIR/process_status_checker.sh" -SCRIPT_COMMON_SH="$SCRIPT_DIR/common.sh" - -DOWN_PROCESS_LIST="$TMP_DIR/badproc.list" - -MASAKARI_API_SEND_PROGRAM=curl -MASAKARI_API_SEND_FAIL_FLG="off" - -ALREADY_SEND_ID_LIST=() -LOGTAG=`basename $0` -P_HOST=`uname -n` - -# Define the default setting. -DEFAULT_PROCESS_CHECK_INTERVAL=5 -DEFAULT_PROCESS_REBOOT_RETRY=3 -DEFAULT_REBOOT_INTERVAL=10 -DEFAULT_MASAKARI_API_SEND_TIMEOUT=10 -DEFAULT_MASAKARI_API_SEND_RETRY=12 -DEFAULT_MASAKARI_API_SEND_DELAY=10 - - -# This function locks a file -# Argument: -# $1 : File name -file_lock () { - exec 9>>$1 - flock -x 9 -} - -# This function unlocks a file -file_unlock () { - exec 9>&- -} - -# This function reads the configuration file and setting value. -# If the value is omitted, set the default value. -# If invalid value is set, return "1". -# Note) The default value for each item are as follows. -# PROCESS_CHECK_INTERVAL (defualt : 60) -# PROCESS_REBOOT_RETRY (default : 10) -# REBOOT_INTERVAL (default : 3) -# MASAKARI_API_SEND_TIMEOUT (defualt : 10) -# MASAKARI_API_SEND_RETRY (default : 3) -# MASAKARI_API_SEND_DELAY (default : 1) -# -# Return value: -# 0 : Setting completion -# 1 : Reading failure of the configuration or invalid setting value -# 2 : Omission of the required item -set_conf_value () { - # Initialize setting - unset PROCESS_CHECK_INTERVAL - unset PROCESS_REBOOT_RETRY - unset REBOOT_INTERVAL - unset MASAKARI_API_SEND_TIMEOUT - unset MASAKARI_API_SEND_RETRY - unset MASAKARI_API_SEND_DELAY - unset DOMAIN - unset PROJECT - unset ADMIN_USER - unset ADMIN_PASS - unset AUTH_URL - unset REGION - - # Read configuration file - source $SCRIPT_CONF_FILE > /dev/null 2>&1 - if [ $? -ne 0 ]; then - log_info "config file read error. [$SCRIPT_CONF_FILE]" - return 1 - fi - - # Empty string is permitted. If there is no key itself, consider it as an error. - - # If the PROCESS_CHECK_INTERVAL is omitted, set the default value. - # If invalid is set, return 1. - expect_empty=`echo -n $PROCESS_CHECK_INTERVAL | sed 's/[0-9]//g'` - if [ "x" = "x${PROCESS_CHECK_INTERVAL}" ]; then - PROCESS_CHECK_INTERVAL=$DEFAULT_PROCESS_CHECK_INTERVAL - elif [ "x" != "x${expect_empty}" ]; then - log_info "config file parameter error. [$SCRIPT_CONF_FILE:PROCESS_CHECK_INTERVAL]" - return 1 - fi - log_debug "config file parameter : PROCESS_CHECK_INTERVAL=$PROCESS_CHECK_INTERVAL" - - # If the PROCESS_REBOOT_RETRY is omitted, set the default value. - # If invalid is set, return 1. - expect_empty=`echo -n $PROCESS_REBOOT_RETRY | sed 's/[0-9]//g'` - if [ "x" = "x${PROCESS_REBOOT_RETRY}" ]; then - PROCESS_REBOOT_RETRY=$DEFAULT_PROCESS_REBOOT_RETRY - elif [ "x" != "x${expect_empty}" ]; then - log_info "config file parameter error. [$SCRIPT_CONF_FILE:PROCESS_REBOOT_RETRY]" - return 1 - fi - log_debug "config file parameter : PROCESS_REBOOT_RETRY=$PROCESS_REBOOT_RETRY" - - # If the REBOOT_INTERVAL is omitted, set the default value. - # If invalid is set, return 1. - expect_empty=`echo -n $REBOOT_INTERVAL | sed 's/[0-9]//g'` - if [ "x" = "x${REBOOT_INTERVAL}" ]; then - REBOOT_INTERVAL=$DEFAULT_REBOOT_INTERVAL - elif [ "x" != "x${expect_empty}" ]; then - log_info "config file parameter error. [$SCRIPT_CONF_FILE:REBOOT_INTERVAL]" - return 1 - fi - log_debug "config file parameter : REBOOT_INTERVAL=$REBOOT_INTERVAL" - - # If the MASAKARI_API_SEND_TIMEOUT is omitted, set the default value. - # If invalid is set, return 1. - expect_empty=`echo -n $MASAKARI_API_SEND_TIMEOUT | sed 's/[0-9]//g'` - if [ "x" = "x${MASAKARI_API_SEND_TIMEOUT}" ]; then - MASAKARI_API_SEND_TIMEOUT=$DEFAULT_MASAKARI_API_SEND_TIMEOUT - elif [ "x" != "x${expect_empty}" ]; then - log_info "config file parameter error. [$SCRIPT_CONF_FILE:MASAKARI_API_SEND_TIMEOUT]" - return 1 - fi - log_debug "config file parameter : MASAKARI_API_SEND_TIMEOUT=$MASAKARI_API_SEND_TIMEOUT" - - # If the MASAKARI_API_SEND_RETRY is omitted, set the default value. - # If invalid is set, return 1. - expect_empty=`echo -n $MASAKARI_API_SEND_RETRY | sed 's/[0-9]//g'` - if [ "x" = "x${MASAKARI_API_SEND_RETRY}" ]; then - MASAKARI_API_SEND_RETRY=$DEFAULT_MASAKARI_API_SEND_RETRY - elif [ "x" != "x${expect_empty}" ]; then - log_info "config file parameter error. [$SCRIPT_CONF_FILE:MASAKARI_API_SEND_RETRY]" - return 1 - fi - log_debug "config file parameter : MASAKARI_API_SEND_RETRY=$MASAKARI_API_SEND_RETRY" - - # If the MASAKARI_API_SEND_DELAY is omitted, set the default value. - # If invalid is set, return 1. - expect_empty=`echo -n $MASAKARI_API_SEND_DELAY | sed 's/[0-9]//g'` - if [ "x" = "x${MASAKARI_API_SEND_DELAY}" ]; then - MASAKARI_API_SEND_DELAY=$DEFAULT_MASAKARI_API_SEND_DELAY - elif [ "x" != "x${expect_empty}" ]; then - log_info "config file parameter error. [$SCRIPT_CONF_FILE:MASAKARI_API_SEND_DELAY]" - return 1 - fi - log_debug "config file parameter : MASAKARI_API_SEND_DELAY=$MASAKARI_API_SEND_DELAY" - - # If the DOMAIN is omitted, return 1. - if [ "x" = "x${DOMAIN}" ]; then - log_info "config file parameter error. [$DOMAIN:DOMAIN]" - return 1 - else - log_debug "config file parameter : DOMAIN=$DOMAIN" - fi - - # If the PROJECT is omitted, return 1. - if [ "x" = "x${PROJECT}" ]; then - log_info "config file parameter error. [$PROJECT:PROJECT]" - return 1 - else - log_debug "config file parameter : PROJECT=$PROJECT" - fi - - # If the ADMIN_USER is omitted, return 1. - if [ "x" = "x${ADMIN_USER}" ]; then - log_info "config file parameter error. [$ADMIN_USER:ADMIN_USER]" - return 1 - else - log_debug "config file parameter : ADMIN_USER=$ADMIN_USER" - fi - - # If the ADMIN_PASS is omitted, return 1. - if [ "x" = "x${ADMIN_PASS}" ]; then - log_info "config file parameter error. [$ADMIN_PASS:ADMIN_PASS]" - return 1 - else - log_debug "config file parameter : ADMIN_PASS=$ADMIN_PASS" - fi - - # If the AUTH_URL is omitted, return 1. - if [ "x" = "x${AUTH_URL}" ]; then - log_info "config file parameter error. [$AUTH_URL:AUTH_URL]" - return 1 - else - log_debug "config file parameter : AUTH_URL=$AUTH_URL" - fi - - # If the REGION is omitted, return 1. - if [ "x" = "x${REGION}" ]; then - log_info "config file parameter error. [$REGION:REGION]" - return 1 - else - log_debug "config file parameter : REGION=$REGION" - fi - - return 0 -} - -# Initial startup command execution method: -# This method does not execute same command as startup command that executed once. - -init_boot() { - log_debug "init_boot start" - CMD_LIST=() - for line in "${proc_list[@]}" - do - ALREADY_FLG="off" - CMD=`echo ${line} | cut -d"," -f 3` - SPECIAL_BEFORE=`echo $line | cut -d"," -f 5` - SPECIAL_AFTER=`echo $line | cut -d"," -f 6` - - # If there is no startup command, can proceed to the next command. - if [ -z "$CMD" ]; then - continue - fi - - # Check whether already is executed. - for CHECK_CMD in "${CMD_LIST[@]}" - do - if [ "$CHECK_CMD" = "$CMD" ]; then - ALREADY_FLG="on" - break - fi - done - - # Execute special processing before the initial startup. - if [ ! -z "$SPECIAL_BEFORE" ]; then - $SPECIAL_BEFORE - fi - - # If not be executed, execute start command. - if [ "$ALREADY_FLG" = "off" ]; then - OLD_IFS=$IFS - IFS=';' - set -- $CMD - CMD_SPLIT_LIST=("$@") - IFS=$OLD_IFS - for SPLIT_CMD in "${CMD_SPLIT_LIST[@]}" - do - $SPLIT_CMD > /dev/null 2>&1 - done - - CMD_LIST=("$CMD_LIST" "$CMD") - fi - - # Execute special processing after the initial startup. - if [ ! -z "$SPECIAL_AFTER" ]; then - $SPECIAL_AFTER - fi - done - log_debug "init_boot end" -} - -# This function creates data that is notified to the masakari api. -# It is called from the child process. -# -make_notice_data () { - TIME=`date -u +'%Y-%m-%d %H:%M:%S'` - - PAYLOAD="{\"event\": \"STOPPED\", \"process_name\": \"${PROCESS_NAME}\"}" - -} - - - -# This function notifies to the masakari api. -# It is called masakari_cli post_event method. -send_notification () { - TYPE="PROCESS" - TARGET="post_event" - AUTH_INFO="--os-domain-name ${DOMAIN} --os-project-name ${PROJECT} --os-region-name ${REGION} --os-auth-url ${AUTH_URL} --os-username ${ADMIN_USER} --os-password ${ADMIN_PASS}" - - log_info "info : Send a notification." - log_info "info : openstack ${AUTH_INFO} notification create ${TYPE} ${P_HOST} \"${TIME}\" \"${PAYLOAD}\"" - - RESP=`openstack ${AUTH_INFO} notification create ${TYPE} ${P_HOST} "${TIME}" "${PAYLOAD}"` - result=$? - - if [ $result -eq 0 ]; then - log_info "info : Succeeded in sending a notification." - log_info "info : $RESP" - else - log_info "info : Failed to send a notification. [exit-code: $result]" - log_info "info : $RESP" - MASAKARI_API_SEND_FAIL_FLG="on" - fi - - return - -} - -# Attempt to restart the failer process. -# If failure to number of retries, notify to the masakari api. - -down_process_reboot(){ - ALREADY_REBOOT_CMD_LIST=() - while read line - do - ALREADY_FLG="off" - # No processing is executed about process id included in the send list. - for already_id in "${ALREADY_SEND_ID_LIST[@]}" - do - if [ "$line" = "$already_id" ]; then - ALREADY_FLG="on" - break - fi - done - - if [ "$ALREADY_FLG" = "on" ]; then - continue - fi - - for proc in "${proc_list[@]}" - do - PROC_ID=`echo $proc | cut -d"," -f 1` - if [ "$line" = "$PROC_ID" ] ; then - CMD=`echo $proc | cut -d"," -f 4` - PROCESS_NAME=`echo $proc | cut -d"," -f 2` - SPECIAL_BEFORE=`echo $proc | cut -d"," -f 7` - SPECIAL_AFTER=`echo $proc | cut -d"," -f 8` - break - fi - done - - if [ ! -z "$SPECIAL_BEFORE" ]; then - $SPECIAL_BEFORE - fi - - # If there is not restart command, can proceed to the next command. - if [ -z "$CMD" ]; then - continue - fi - - RESULT_FLG=1 - # Decomposes multiple processing be joined by ";" and execute them. (restart execution part) - OLD_IFS=$IFS - IFS=';' - set -- $CMD - CMD_SPLIT_LIST=("$@") - IFS=$OLD_IFS - for SPLIT_CMD in "${CMD_SPLIT_LIST[@]}" - do - ALREADY_FLG="off" - # Check whether already is executed. - for CHECK_CMD in "${ALREADY_REBOOT_CMD_LIST[@]}" - do - if [ "$CHECK_CMD" = "$SPLIT_CMD" ]; then - ALREADY_FLG="on" - break - fi - done - # If is already executed, skip. - if [ "$ALREADY_FLG" = "on" ]; then - continue - fi - - log_debug "reboot cmd:$SPLIT_CMD" - $SPLIT_CMD > /dev/null 2>&1 - if [ $? -ne 0 ]; then - RESULT_FLG=0 - break - else - ALREADY_REBOOT_CMD_LIST=("$ALREADY_REBOOT_CMD_LIST" "$SPLIT_CMD") - fi - done - - # If fail to restart, executes retry restart. - if [ $RESULT_FLG -ne 1 ]; then - result=0 - for retry in `seq $PROCESS_REBOOT_RETRY` - do - sleep $REBOOT_INTERVAL - # Retry the restart processing. - RESULT_FLG=1 - for SPLIT_CMD in "${CMD_SPLIT_LIST[@]}" - do - ALREADY_FLG="off" - # Check whether already is executed. - for CHECK_CMD in "${ALREADY_REBOOT_CMD_LIST[@]}" - do - if [ "$CHECK_CMD" = "$SPLIT_CMD" ]; then - ALREADY_FLG="on" - break - fi - done - # If is already executed, skip. - if [ "$ALREADY_FLG" = "on" ]; then - continue - fi - log_debug "reboot cmd:$SPLIT_CMD" - $SPLIT_CMD > /dev/null 2>&1 - if [ $? -ne 0 ]; then - RESULT_FLG=0 - break - else - ALREADY_REBOOT_CMD_LIST=("$ALREADY_REBOOT_CMD_LIST" "$SPLIT_CMD") - fi - done - if [ $RESULT_FLG -eq 1 ]; then - break - elif [ $retry -eq $PROCESS_REBOOT_RETRY ]; then - # If number of retries is exceeded, notify to the masakari api. - make_notice_data - if [ $result -eq 0 ]&& - [ "$MASAKARI_API_SEND_FAIL_FLG" = "off" ]; then - send_notification - fi - # Add the sent list. - ALREADY_SEND_ID_LIST=("${ALREADY_SEND_ID_LIST[@]}" "${line}") - fi - done - fi - - # Special processes after restart. - if [ ! -z "$SPECIAL_AFTER" ]; then - $SPECIAL_AFTER - fi - - - done < $DOWN_PROCESS_LIST -} - - -# Argument check -if [ $# -ne 2 ]; then - echo "Usage: $0 " - exit 1 -else - SCRIPT_CONF_FILE=$1 - PROC_LIST=$2 -fi - -# Initial processing (check proc.list and read conf file) -. $SCRIPT_COMMON_SH - -# Output warning message. -log_info "WARNING : $0 is deprecated as of the Ocata release and will be removed in the Queens release. Use masakari-processmonitor implemented in python instead of $0." - -log_debug "processmonitor start!!" -check_proc_file_common -set_conf_value -if [ $? -ne 0 ]; then - exit 1 -fi - -if [ -e $NOTICE_OUTPUT ]; then - sudo rm -rf $NOTICE_OUTPUT -fi - -# Initial startup -init_boot - -while true -do - # Recheck and reload of the proc.list. - check_proc_file_common - # If invalid value is set to configuration file, set default value. - set_conf_value - - if [ $? -ne 0 ]; then - exit 1 - fi - - # Execute process check processing. - ${SCRIPT_CHECK_PROCESS} ${PROC_LIST} - RESULT_CODE=$? - - # If the return code is 2, because can't continue functionally, stop. - if [ $RESULT_CODE -eq 2 ]; then - log_debug "process_status_checker down!" - exit 1 - fi - - # If the failing process is detected by shell check, retry restart. - if [ $RESULT_CODE -ne 0 ]; then - down_process_reboot - fi - - sleep ${PROCESS_CHECK_INTERVAL} -done diff --git a/setup.cfg b/setup.cfg index 441629d..ed6a83f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,10 +22,6 @@ classifier = packages = masakarimonitors -scripts = - masakarimonitors/cmd/masakari-hostmonitor.sh - masakarimonitors/cmd/masakari-processmonitor.sh - [entry_points] oslo.config.opts = masakarimonitors.conf = masakarimonitors.conf.opts:list_opts