metal/mtce/src/maintenance/mtcCmdHdlr.cpp

979 lines
41 KiB
C++

/*
* Copyright (c) 2013-2017, 2023 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/****************************************************************************
* @file
* Wind River Titanium Cloud Maintenance Command Handler FSM Implementation
*
* nodeLinkClass::cmd_handler
*
****************************************************************************/
using namespace std;
#define __AREA__ "cmd"
#include "nodeClass.h" /* for ... nodeLinkClass */
#include "nodeUtil.h" /* for ... clean_bm_response_files */
#include "nodeTimers.h" /* for ... mtcTimer_start/stop */
#include "mtcNodeMsg.h" /* for ... send_mtc_cmd */
#include "nodeCmds.h" /* for ... Cmd hdl'ing stages & struct */
extern void mtcTimer_handler ( int sig, siginfo_t *si, void *uc);
/* Return a human readable label string for the specified
 * maintenance operation command code ; "_unknown_" when the
 * code is not one of the supported mtce operations. */
string _get_cmd_str( int this_cmd )
{
    switch (this_cmd)
    {
        case MTC_OPER__MODIFY_HOSTNAME:   return ("Modify Hostname");
        case MTC_OPER__RESET_PROGRESSION: return ("Reset Progression");
        case MTC_OPER__HOST_SERVICES_CMD: return ("Host Services");
        case MTC_OPER__RUN_IPMI_COMMAND:  return ("IPMI Command");
        default:                          return ("_unknown_");
    }
}
/* Dump every command element currently sitting in this
 * host's mtcCmd work queue to stdout ; no-op when empty. */
void nodeLinkClass::mtcCmd_workQ_dump ( struct nodeLinkClass::node * node_ptr )
{
    /* nothing to report for an empty work queue */
    if ( node_ptr->mtcCmd_work_fifo.empty() )
        return ;

    for ( node_ptr->mtcCmd_work_fifo_ptr = node_ptr->mtcCmd_work_fifo.begin() ;
          node_ptr->mtcCmd_work_fifo_ptr != node_ptr->mtcCmd_work_fifo.end() ;
          ++node_ptr->mtcCmd_work_fifo_ptr )
    {
        printf ( "%15s mtceCmd_workQ:%10s seq:%d stage:%d status [%d:%s]\n",
                 node_ptr->hostname.c_str(),
                 _get_cmd_str(node_ptr->mtcCmd_work_fifo_ptr->cmd).c_str(),
                 node_ptr->mtcCmd_work_fifo_ptr->seq,
                 node_ptr->mtcCmd_work_fifo_ptr->stage,
                 node_ptr->mtcCmd_work_fifo_ptr->status,
                 node_ptr->mtcCmd_work_fifo_ptr->status_string.c_str());
    }
}
/* Dump every command element currently sitting in this
 * host's mtcCmd done queue to stdout ; no-op when empty. */
void nodeLinkClass::mtcCmd_doneQ_dump ( struct nodeLinkClass::node * node_ptr )
{
    if ( node_ptr->mtcCmd_done_fifo.size() != 0 )
    {
        for ( node_ptr->mtcCmd_done_fifo_ptr = node_ptr->mtcCmd_done_fifo.begin() ;
              node_ptr->mtcCmd_done_fifo_ptr != node_ptr->mtcCmd_done_fifo.end();
              node_ptr->mtcCmd_done_fifo_ptr++ )
        {
            printf ( "%15s mtceCmd_doneQ:%10s seq:%d stage:%d status [%d:%s]\n",
                     node_ptr->hostname.c_str(),
                     _get_cmd_str(node_ptr->mtcCmd_done_fifo_ptr->cmd).c_str(),
                     node_ptr->mtcCmd_done_fifo_ptr->seq,
                     node_ptr->mtcCmd_done_fifo_ptr->stage,
                     node_ptr->mtcCmd_done_fifo_ptr->status,
                     /* Fix: was dereferencing the WORK fifo iterator here
                      * while iterating the DONE fifo ; that iterator can be
                      * stale or invalid and in any case prints the wrong
                      * element's status string. */
                     node_ptr->mtcCmd_done_fifo_ptr->status_string.c_str());
        }
    }
}
/* Dump and then purge the mtcCmd done queue of every
 * node in the inventory linked list. */
void nodeLinkClass::mtcCmd_doneQ_dump_all ( void )
{
    /* walk the node linked list ; loop body never runs on an empty list */
    for ( struct node * node = head ; node != NULL ; node = node->next )
    {
        mtcCmd_doneQ_dump  ( node );
        mtcCmd_doneQ_purge ( node );
    }
}
/* Dump the mtcCmd work queue of every node in the
 * inventory linked list. */
void nodeLinkClass::mtcCmd_workQ_dump_all ( void )
{
    /* walk the node linked list ; loop body never runs on an empty list */
    for ( struct node * node = head ; node != NULL ; node = node->next )
    {
        mtcCmd_workQ_dump ( node );
    }
}
/****************************************************************************
 *
 * Name       : nodeLinkClass::cmd_handler
 *
 * Purpose    : Per-host maintenance command handler FSM.
 *
 *              Services the command element at the head of this host's
 *              mtcCmd work queue by stepping it through its command
 *              specific stages. Supported commands (see _get_cmd_str)
 *              are Host Services, Reset Progression, Modify Hostname
 *              and IPMI Command.
 *
 *              When a command reaches MTC_CMD_STAGE__DONE it is popped
 *              off the work queue and pushed onto the done queue with
 *              its completion status recorded.
 *
 * Parameters : node_ptr - the host whose work queue is to be serviced
 *
 * Returns    : PASS in all cases except the hostname modify delete
 *              wait stage which returns RETRY so the caller reloads
 *              its inventory iteration after the hostname change.
 *
 ***************************************************************************/
int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
{
    int rc = PASS ;

    /* Should not be called empty but check just in case */
    if ( node_ptr->mtcCmd_work_fifo.size() == 0 )
        return (rc);

    /* always service the command at the head of the work queue */
    node_ptr->mtcCmd_work_fifo_ptr = node_ptr->mtcCmd_work_fifo.begin ();
    switch ( node_ptr->mtcCmd_work_fifo_ptr->stage )
    {
        /* Dispatch the queued command to its first command-specific stage */
        case MTC_CMD_STAGE__START:
        {
            dlog ("%s mtcCmd: %d:%d.%d\n",
                      node_ptr->hostname.c_str(),
                      node_ptr->mtcCmd_work_fifo_ptr->cmd,
                      node_ptr->mtcCmd_work_fifo_ptr->parm1,
                      node_ptr->mtcCmd_work_fifo_ptr->parm2);

            if ( node_ptr->mtcCmd_work_fifo_ptr->cmd == MTC_OPER__RESET_PROGRESSION )
            {
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_PROGRESSION_START ;
            }
            else if ( node_ptr->mtcCmd_work_fifo_ptr->cmd == MTC_OPER__HOST_SERVICES_CMD )
            {
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__HOST_SERVICES_SEND_CMD ;
            }
            else if ( node_ptr->mtcCmd_work_fifo_ptr->cmd == MTC_OPER__MODIFY_HOSTNAME )
            {
                /* drop this host from the heartbeat and guest services
                 * under its old name before the rename begins */
                send_hbs_command   ( node_ptr->hostname, MTC_CMD_DEL_HOST );
                send_guest_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__MODIFY_HOSTNAME_START ;
            }
            else
            {
                /* unknown command ; fail it out of the queue */
                slog ("%s Unsupported Mtce Command (%d)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->mtcCmd_work_fifo_ptr->cmd );
                node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_BAD_PARM ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }

        /***********************************************************************
         *
         * 'Host Services' Command Stages
         *
         **********************************************************************/
        case MTC_CMD_STAGE__HOST_SERVICES_SEND_CMD:
        {
            send_mtc_cmd ( node_ptr->hostname, node_ptr->host_services_req.cmd, MGMNT_INTERFACE );

            /* Start timer that waits for the initial command received response
             * There is no point in waiting for the longer host services
             * execution timeout if the far end is not even able to ACK the
             * initial test request. Bear in mind that the execution of the
             * host services command can take a while so its timeout is much
             * longer and polled for in the 3rd phase of this fsm but only
             * if we get an initial command ACK. */
            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );

            /* change state to waiting for that initial ACK */
            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__HOST_SERVICES_RECV_ACK ;
            break ;
        }
        case MTC_CMD_STAGE__HOST_SERVICES_RECV_ACK:
        {
            /* no ACK before the command response timer rang ; fail */
            if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) )
            {
                node_ptr->mtcCmd_work_fifo_ptr->status =
                node_ptr->host_services_req.status = FAIL_NO_CMD_ACK ;
                node_ptr->host_services_req.status_string =
                node_ptr->host_services_req.name ;
                node_ptr->host_services_req.status_string.append (" ack timeout") ;
                dlog ("%s %s (rc:%d)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.status_string.c_str(),
                          node_ptr->host_services_req.status );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            /* 'monitor mode' ACK ; the mtcClient will run the scripts and
             * send a separate result message later */
            else if ( node_ptr->host_services_req.ack )
            {
                /* get the host services timeout and add MTC_AGENT_TIMEOUT_EXTENSION
                 * seconds so that it is a bit longer than the mtcClient timeout */
                int timeout = daemon_get_cfg_ptr()->host_services_timeout ;
                    timeout += MTC_AGENT_TIMEOUT_EXTENSION ;

                dlog ("%s %s request ack (monitor mode)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.name.c_str());

                /* now expect the final result message */
                node_ptr->host_services_req.cmd = MTC_CMD_HOST_SVCS_RESULT ;
                node_ptr->mtcCmd_work_fifo_ptr->stage =
                MTC_CMD_STAGE__HOST_SERVICES_WAIT_FOR_RESULT ;
                mtcTimer_reset ( node_ptr->mtcCmd_timer );
                mtcTimer_start ( node_ptr->mtcCmd_timer,
                                 mtcTimer_handler,
                                 timeout );
            }
            /* 'legacy mode' ACK ; response echoes the command and there is
             * no follow-up result message */
            else if ( node_ptr->host_services_req.cmd == node_ptr->host_services_req.rsp )
            {
                dlog ("%s %s request ack (legacy mode)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.name.c_str());

                // Upgrades that lock storage nodes can
                // lead to storage corruption if ceph isn't given
                // enough time to shut down.
                //
                // The following special case for storage node
                // lock forces a 90 sec holdoff for pre-upgrade storage
                // hosts ; i.e. legacy mode.
                //
                if ( is_storage(node_ptr) )
                {
                    ilog ("%s waiting for ceph OSD shutdown\n", node_ptr->hostname.c_str());
                    mtcTimer_reset ( node_ptr->mtcCmd_timer );
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_LOCK_CEPH_DELAY );
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__STORAGE_LOCK_DELAY ;
                }
                else
                {
                    node_ptr->mtcCmd_work_fifo_ptr->status =
                    node_ptr->host_services_req.status = PASS ;
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
                }
            }
            break ;
        }
        case MTC_CMD_STAGE__STORAGE_LOCK_DELAY:
        {
            /* wait for the timer to expire before moving on */
            if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) )
            {
                ilog ("%s ceph OSD shutdown wait complete\n",
                          node_ptr->hostname.c_str());
                node_ptr->mtcCmd_work_fifo_ptr->status =
                node_ptr->host_services_req.status = PASS ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }
        case MTC_CMD_STAGE__HOST_SERVICES_WAIT_FOR_RESULT:
        {
            /* host services scripts did not complete in time */
            if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) )
            {
                node_ptr->mtcCmd_work_fifo_ptr->status =
                node_ptr->host_services_req.status = FAIL_TIMEOUT ;
                node_ptr->host_services_req.status_string =
                node_ptr->host_services_req.name ;
                node_ptr->host_services_req.status_string.append (" execution timeout") ;
                dlog ("%s %s (rc:%d)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.status_string.c_str(),
                          node_ptr->host_services_req.status );
            }
            else if ( node_ptr->host_services_req.rsp != MTC_CMD_HOST_SVCS_RESULT )
            {
                /* waiting for result response ... */
                break ;
            }
            else if ( node_ptr->host_services_req.status == PASS )
            {
                dlog ("%s %s completed\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.name.c_str());
                node_ptr->mtcCmd_work_fifo_ptr->status = PASS ;
            }
            else
            {
                /* host services reported a failure ; propagate its status */
                node_ptr->mtcCmd_work_fifo_ptr->status =
                node_ptr->host_services_req.status ;
                if ( ! node_ptr->host_services_req.status_string.empty() )
                {
                    wlog ("%s %s\n",
                              node_ptr->hostname.c_str(),
                              node_ptr->host_services_req.status_string.c_str());
                }
                node_ptr->host_services_req.status_string =
                node_ptr->host_services_req.name ;
                node_ptr->host_services_req.status_string.append (" execution failed") ;
                dlog ("%s %s ; rc:%d\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.status_string.c_str(),
                          node_ptr->host_services_req.status);
            }
            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            break ;
        }

        /***************************************************************************
         *
         * 'Reset Progression' Command Stages
         *
         * This target handler FSM is responsible for resetting a host through
         * progression escalation of interfaces. First a reboot by command is
         * attempted over the management network. If that fails the same operation
         * is tried over the cluster-host network. If both reboot command
         * attempts fail and the board management network for this host is
         * provisioned then reset through it is attempted.
         * Number of reset retries is specified in the command parameter 1
         * where a value of -1 means infinitely and a value of zero means no
         * retries ; only attempt up to all provisioned interfaces only once.
         *
         * *************************************************************************/
        case MTC_CMD_STAGE__RESET_PROGRESSION_START:
        {
            if ( node_ptr->cmd.task == true )
            {
                /* publish 'Reboot Request' task to the inventory database */
                mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_REQUEST );
            }
            /* monitor for the host going offline in parallel */
            start_offline_handler ( node_ptr );
            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__REBOOT ;
            break ;
        }
        case MTC_CMD_STAGE__REBOOT:
        {
            int rc = PASS ; /* local ; intentionally shadows the function level rc */
            bool send_reboot_ok = false ;

            node_ptr->reboot_cmd_ack_mgmnt = false ;
            node_ptr->reboot_cmd_ack_clstr = false ;

            /* send reboot command */
            node_ptr->cmdReq = MTC_CMD_REBOOT ;
            node_ptr->cmdRsp = MTC_CMD_NONE ;
            if (( rc = send_mtc_cmd ( node_ptr->hostname,
                                      MTC_CMD_REBOOT,
                                      MGMNT_INTERFACE )) != PASS )
            {
                wlog ("%s reboot request failed (%s) (rc:%d)\n",
                          node_ptr->hostname.c_str(),
                          get_iface_name_str(MGMNT_INTERFACE), rc);
            }
            else
            {
                send_reboot_ok = true ;
            }
            /* also try the cluster-host network when provisioned ;
             * success on either interface counts as a send success */
            if ( clstr_network_provisioned == true )
            {
                if (( rc = send_mtc_cmd ( node_ptr->hostname,
                                          MTC_CMD_REBOOT,
                                          CLSTR_INTERFACE )) != PASS )
                {
                    wlog ("%s reboot request failed (%s) (rc:%d)\n",
                              node_ptr->hostname.c_str(),
                              get_iface_name_str(CLSTR_INTERFACE), rc);
                }
                else
                {
                    send_reboot_ok = true ;
                }
            }
            if ( send_reboot_ok == true )
            {
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__REBOOT_ACK ;
                mtcTimer_reset ( node_ptr->mtcCmd_timer );
                mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );
                ilog ("%s waiting for reboot ACK\n", node_ptr->hostname.c_str() );
            }
            else
            {
                /* This means that the mtcAgent can't send commands.
                 * Very unlikely case. Fail the operation.
                 */
                if ( node_ptr->cmd.task == true )
                {
                    /* Reboot Failed */
                    mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_FAIL );
                }
                node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_SOCKET_SENDTO ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }
        case MTC_CMD_STAGE__REBOOT_ACK:
        {
            /* can come in from either interface */
            if ( node_ptr->cmdRsp != MTC_CMD_REBOOT )
            {
                /* no ACK yet ; only act once the response timer rings */
                if ( node_ptr->mtcCmd_timer.ring == true )
                {
                    if (( node_ptr->cmd.task == true ) && ( node_ptr->cmd_retries == 0 ))
                    {
                        /* no need to repost task on retries */
                        mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_FAIL );
                    }

                    node_ptr->mtcCmd_timer.ring = false ;

                    /* progress to RESET if we have tried
                     * RESET_PROG_MAX_REBOOTS_B4_RESET times already */
                    if ( ++node_ptr->cmd_retries >= RESET_PROG_MAX_REBOOTS_B4_RESET )
                    {
                        wlog ("%s reboot ACK timeout ; max reboot retries reached",
                                  node_ptr->hostname.c_str());
                        if ( node_ptr->bmc_provisioned )
                        {
                            /* time already spent waiting on reboot ACKs is
                             * subtracted from the configured bmc reset delay */
                            int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT);
                            node_ptr->bmc_reset_pending_log_throttle = 0 ;
                            gettime ( node_ptr->reset_delay_start_time );

                            /* Clear the counts so we can tell if we have been getting mtcAlive
                             * messages from the remote host during the reset delay window */
                            node_ptr->mtcAlive_mgmnt_count = 0 ;
                            node_ptr->mtcAlive_clstr_count = 0 ;

                            wlog ("%s ... bmc reset in %d secs", node_ptr->hostname.c_str(), reset_delay);
                            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, reset_delay );
                            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET ;
                        }
                        else
                        {
                            ilog ("%s bmc not provisioned ; search for offline", node_ptr->hostname.c_str());
                            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs());
                            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
                        }
                    }
                    else
                    {
                        /* retry the reboot after another response timeout period */
                        int retry_delay = MTC_CMD_RSP_TIMEOUT ;
                        wlog ("%s reboot ACK timeout ; reboot retry (%d of %d) in %d secs",
                                  node_ptr->hostname.c_str(),
                                  node_ptr->cmd_retries,
                                  RESET_PROG_MAX_REBOOTS_B4_RESET-1,
                                  retry_delay);
                        mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, retry_delay );
                    }
                }
            }
            else
            {
                /* declare successful reboot */
                plog ("%s reboot request succeeded (%s %s)",
                          node_ptr->hostname.c_str(),
                          node_ptr->reboot_cmd_ack_mgmnt ? get_iface_name_str(MGMNT_INTERFACE) : "",
                          node_ptr->reboot_cmd_ack_clstr ? get_iface_name_str(CLSTR_INTERFACE) : "");

                if ( node_ptr->cmd.task == true )
                {
                    /* publish 'Rebooting' task to the inventory database */
                    mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOTING );
                }

                set_uptime ( node_ptr, 0 , false );

                /* start timer that verifies board has reset */
                mtcTimer_reset ( node_ptr->mtcCmd_timer );

                /* progress to RESET if we have tried 5 times already */
                if ( ++node_ptr->cmd_retries >= RESET_PROG_MAX_REBOOTS_B4_RESET )
                {
                    int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT) ;
                    node_ptr->bmc_reset_pending_log_throttle = 0 ;
                    gettime ( node_ptr->reset_delay_start_time );

                    /* Clear the counts so we can tell if we have been getting mtcAlive
                     * messages from the remote host during the reset delay window */
                    node_ptr->mtcAlive_mgmnt_count = 0 ;
                    node_ptr->mtcAlive_clstr_count = 0 ;

                    wlog ("%s max reboot retries reached ; still not offline ; reset in %3d secs",
                              node_ptr->hostname.c_str(), reset_delay);
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, reset_delay );
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET ;
                }
                else
                {
                    ilog ("%s searching for offline ; next reboot attempt in %d seconds\n",
                              node_ptr->hostname.c_str(), offline_timeout_secs());

                    /* After the host is reset we need to wait for it to stop sending mtcAlive messages
                     * Delay the time for the offline handler to run to completion at least once before
                     * timing out and retrying the reset again */
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs());

                    /* Wait for the host to go offline */
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
                }
            }
            break ;
        }
        case MTC_CMD_STAGE__RESET:
        {
            if ( node_ptr->bmc_provisioned == true )
            {
                /* timer ring means the bmc reset delay has elapsed */
                if ( node_ptr->mtcCmd_timer.ring == true )
                {
                    if ( node_ptr->bmc_accessible == true )
                    {
                        plog ("%s issuing reset over bmc", node_ptr->hostname.c_str());
                        if ( node_ptr->cmd.task == true )
                        {
                            mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_REQUEST);
                        }

                        /* bmc power control reset by bmc */
                        rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
                        if ( rc == PASS )
                        {
                            ilog ("%s bmc reset requested", node_ptr->hostname.c_str());
                            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY );
                            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_ACK;
                            break ;
                        }
                        else
                        {
                            /* fall through to the offline check below */
                            node_ptr->mtcCmd_work_fifo_ptr->status = rc ;
                            wlog ("%s bmc reset command request failed (%d)", node_ptr->hostname.c_str(), rc );
                        }
                    }
                    else
                    {
                        wlog ("%s bmc not accessible ; unable to reset", node_ptr->hostname.c_str());
                    }
                }
                else
                {
                    /* To handle potentially large bmc_reset_delay values that could
                     * be longer than a boot time this check cancels the reset once the
                     * node goes online. Maybe the reset did get through or the node
                     * rebooted quite fast.
                     *
                     * However, don't allow momentary heartbeat loss recovery handling
                     * or the failure of just one (mgmnt or clstr) networks to mistakenly
                     * cancel the reset. Prevent the cancel if
                     *  - the node uptime is high and
                     *  - not receiving mtcAlive both mgmnt and clstr networks.
                     *
                     * Note: online does not mean both networks are receiving mtcAlive,
                     *       Currently just mgmnt needs to see mtcAlive for the node to
                     *       go online.
                     *       TODO: Fix this in the future so both are required.
                     *             It came from the days when the cluster-host was named the
                     *             infrastructure network where at that time it was optional.
                     *             Cluster-host is no longer optional. */
                    if (( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) &&
                        ( node_ptr->uptime < MTC_MINS_5 ) &&
                        ( node_ptr->mtcAlive_mgmnt_count ) &&
                        ( node_ptr->mtcAlive_clstr_count ))
                    {
                        mtcTimer_reset ( node_ptr->mtcCmd_timer );
                        ilog ("%s cancelling reset ; host is online ; delay:%d uptime:%d mtcAlive:%d:%d ",
                                  node_ptr->hostname.c_str(),
                                  bmc_reset_delay,
                                  node_ptr->uptime,
                                  node_ptr->mtcAlive_mgmnt_count,
                                  node_ptr->mtcAlive_clstr_count);
                        node_ptr->mtcCmd_work_fifo_ptr->status = PASS ;
                        node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
                    }
                    else
                    {
                        /* still waiting out the reset delay ; emit a throttled
                         * countdown log while the timer runs */
                        time_debug_type now_time  ;
                        time_delta_type diff_time ;
                        int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT) ;
                        gettime ( now_time );
                        timedelta ( node_ptr->reset_delay_start_time, now_time, diff_time );
                        if ( reset_delay > diff_time.secs )
                        {
#define BMC_RESET_PENDING_LOG_THROTTLE (1000)
                            wlog_throttled ( node_ptr->bmc_reset_pending_log_throttle,
                                             BMC_RESET_PENDING_LOG_THROTTLE,
                                             "%s reset in %3ld secs ; delay:%d uptime:%d mtcAlive:%d:%d",
                                             node_ptr->hostname.c_str(),
                                             reset_delay-diff_time.secs,
                                             bmc_reset_delay,
                                             node_ptr->uptime,
                                             node_ptr->mtcAlive_mgmnt_count,
                                             node_ptr->mtcAlive_clstr_count);
                        }
                    }
                    break ; /* waiting path */
                }
            }
            else if ( node_ptr->bmc_provisioned == false )
            {
                wlog ("%s bmc not provisioned", node_ptr->hostname.c_str());
            }
            /* if we get here then either
             * - the bmc is not provisioned,
             * - the bmc is not accessible after the bmc_reset_delay
             * - the reset send command failed
             * So we need to just jump to the offline check which will
             * retry the reboot/reset if the host still does not go
             * offline after calculated delay
             */
            mtcTimer_reset ( node_ptr->mtcCmd_timer );
            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs());
            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
            break ;
        }
        case MTC_CMD_STAGE__RESET_ACK:
        {
            /* poll for the bmc reset command completion on each timer ring */
            if ( node_ptr->mtcCmd_timer.ring == true )
            {
                /* bmc power control reset by bmc */
                rc = bmc_command_recv ( node_ptr );
                if ( rc == RETRY )
                {
                    /* response not in yet ; re-arm and poll again */
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY );
                    break ;
                }
                else if ( rc )
                {
                    elog ("%s bmc reset request failed [rc:%d]\n", node_ptr->hostname.c_str(), rc);
                    if ( node_ptr->cmd.task == true )
                    {
                        mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_FAIL);
                    }
                    node_ptr->mtcCmd_work_fifo_ptr->status = rc ;
                }
                else
                {
                    plog ("%s bmc reset request succeeded\n", node_ptr->hostname.c_str());

                    /* log an auto-reset event unless an admin reset/reboot
                     * action is what drove this reset */
                    if (( node_ptr->adminAction != MTC_ADMIN_ACTION__RESET ) &&
                        ( node_ptr->adminAction != MTC_ADMIN_ACTION__REBOOT ))
                    {
                        mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_RESET );
                    }
                    set_uptime ( node_ptr, 0 , false );

                    if ( node_ptr->cmd.task == true )
                    {
                        mtcInvApi_update_task ( node_ptr, MTC_TASK_RESETTING );
                    }
                }
                /* pass or fail the bmc reset ; go verify the host went offline */
                ilog ("%s waiting for host to go offline ; %d secs before retrying reboot/reset",
                          node_ptr->hostname.c_str(), offline_timeout_secs());
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
                mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs());
            }
            break ;
        }
        case MTC_CMD_STAGE__OFFLINE_CHECK:
        {
            if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
            {
                /* host is offline ; reset progression succeeded */
                mtcTimer_reset ( node_ptr->mtcCmd_timer );
                clear_service_readies ( node_ptr );
                qlog ("%s reset progression complete ; host is offline (after %d retries)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->cmd_retries );
                node_ptr->mtcCmd_work_fifo_ptr->status = PASS ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            else if ( node_ptr->mtcCmd_timer.ring == true )
            {
                /* still not offline ; either reboot again or escalate the
                 * whole progression through the retry stage */
                if ( ++node_ptr->cmd_retries < RESET_PROG_MAX_REBOOTS_B4_RETRY )
                {
                    ilog ("%s reboot (retry %d of %d)\n",
                              node_ptr->hostname.c_str(),
                              node_ptr->cmd_retries,
                              RESET_PROG_MAX_REBOOTS_B4_RETRY );
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__REBOOT ;
                }
                else
                {
                    ilog ("%s still not offline\n", node_ptr->hostname.c_str());
                    node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_RETRY ;
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_PROGRESSION_RETRY ;
                }
            }
            break ;
        }
        case MTC_CMD_STAGE__RESET_PROGRESSION_RETRY:
        {
            /* Complete command if we reach max retries.
             * parm1 holds the requested retry count ; parm2 counts the
             * progression attempts made so far. */
            if ( ++node_ptr->mtcCmd_work_fifo_ptr->parm2 > node_ptr->mtcCmd_work_fifo_ptr->parm1 )
            {
                plog ("%s reset progression done\n", node_ptr->hostname.c_str());
                node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_RETRY ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            else
            {
                wlog ("%s reset progression retry\n", node_ptr->hostname.c_str());
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_PROGRESSION_START ;
            }
            stop_offline_handler ( node_ptr );
            break ;
        }

        /***************************************************************************
         *
         * 'IPMI Command' Stages
         *
         * *************************************************************************/
        case MTC_CMD_STAGE__IPMI_COMMAND_SEND:
        {
            if ( bmc_command_send ( node_ptr, node_ptr->cmdReq ) != PASS )
            {
                elog ("%s IPMI %s Send Failed\n",
                          node_ptr->hostname.c_str(),
                          bmcUtil_getCmd_str(node_ptr->cmdReq).c_str());
                node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_RETRY ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            else
            {
                plog ("%s IPMI %s Requested\n",
                          node_ptr->hostname.c_str(),
                          bmcUtil_getCmd_str(node_ptr->cmdReq).c_str());
                mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__IPMI_COMMAND_RECV ;
            }
            break ;
        }
        case MTC_CMD_STAGE__IPMI_COMMAND_RECV:
        {
            if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) )
            {
                rc = bmc_command_recv ( node_ptr );
                if ( rc == RETRY )
                {
                    /* response not in yet ; poll again in a few seconds */
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_SECS_5 ) ;
                    break ;
                }
                else if ( rc == PASS )
                {
                    plog ("%s IPMI %s Successful\n", node_ptr->hostname.c_str(),
                              bmcUtil_getCmd_str(node_ptr->cmdReq).c_str());
                }
                else
                {
                    /* NOTE(review): this failure branch logs 'Requested' via
                     * plog ; looks like a copy/paste of the send stage log -
                     * an error level 'Failed' log seems intended ; confirm. */
                    plog ("%s IPMI %s Requested\n", node_ptr->hostname.c_str(),
                              bmcUtil_getCmd_str(node_ptr->cmdReq).c_str());
                }
                /* record the command result then verify host offline state */
                node_ptr->mtcCmd_work_fifo_ptr->status = rc ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
            }
            break ;
        }

        /***************************************************************************
         *
         * 'Modify Hostname' Command Stages
         *
         * *************************************************************************/
        case MTC_CMD_STAGE__MODIFY_HOSTNAME_START:
        {
            /* remove the host, under its current name, from the heartbeat,
             * hardware monitor and guest services */
            send_hbs_command   ( node_ptr->hostname, MTC_CMD_DEL_HOST );
            send_hwmon_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
            send_guest_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, work_queue_timeout );
            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__MODIFY_HOSTNAME_DELETE_WAIT ;
            break ;
        }
        case MTC_CMD_STAGE__MODIFY_HOSTNAME_DELETE_WAIT:
        {
            /* We still doing enable work ? */
            if ( node_ptr->libEvent_work_fifo.size () == 0 )
            {
                /* the new hostname was queued in the command's name field */
                string name = node_ptr->mtcCmd_work_fifo_ptr->name ;

                if ( node_ptr->mtcCmd_timer.tid )
                    mtcTimer_stop ( node_ptr->mtcCmd_timer );

                /* make the change */
                hostname_inventory.remove ( node_ptr->hostname );
                node_ptr->hostname = name ;
                hostname_inventory.push_back ( node_ptr->hostname );

                /* update the timer hostname */
                node_ptr->mtcTimer.hostname       = name ;
                node_ptr->mtcAlive_timer.hostname = name ;
                node_ptr->mtcSwact_timer.hostname = name ;
                node_ptr->mtcCmd_timer.hostname   = name ;
                node_ptr->oosTestTimer.hostname   = name ;
                node_ptr->insvTestTimer.hostname  = name ;
                node_ptr->mtcConfig_timer.hostname = name ;

                mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, work_queue_timeout );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__MODIFY_HOSTNAME_CREATE_WAIT ;

                /* return RETRY so that the FSM reloads the inventory loop */
                return (RETRY);
            }
            if ( node_ptr->mtcCmd_timer.ring == true )
            {
                elog ("%s mtcCmd timeout ; purging host's work queue\n", node_ptr->hostname.c_str());
                workQueue_purge ( node_ptr );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }
        case MTC_CMD_STAGE__MODIFY_HOSTNAME_CREATE_WAIT:
        {
            /* We still doing create work ? */
            if ( node_ptr->libEvent_work_fifo.size() == 0 )
            {
                if ( node_ptr->mtcCmd_timer.tid )
                    mtcTimer_stop ( node_ptr->mtcCmd_timer );

                /* re-add the host, under its new name, to the heartbeat,
                 * hardware monitor and guest services */
                send_hbs_command   ( node_ptr->hostname, MTC_CMD_ADD_HOST );
                send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
                send_guest_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
                if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
                {
                    send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
                }
                node_ptr->mtcCmd_work_fifo_ptr->status = PASS ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            if ( node_ptr->mtcCmd_timer.ring == true )
            {
                elog ("%s hostname change failed\n", node_ptr->hostname.c_str());
                elog ("... workQueue empty timeout ; purging host's work queue\n");
                workQueue_purge ( node_ptr );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }

        /* Command complete ; log the result and move the command element
         * from the work queue to the done queue */
        case MTC_CMD_STAGE__DONE:
        case MTC_CMD_STAGE__STAGES:
        default:
        {
            int size ;

            mtcTimer_reset ( node_ptr->mtcCmd_timer );

            if ( node_ptr->mtcCmd_work_fifo_ptr->status != PASS )
            {
                qlog ("%s Command '%s' (%d) Failed (Status:%d)\n",
                          node_ptr->hostname.c_str(),
                          _get_cmd_str(node_ptr->mtcCmd_work_fifo_ptr->cmd).c_str(),
                          node_ptr->mtcCmd_work_fifo_ptr->cmd,
                          node_ptr->mtcCmd_work_fifo_ptr->status );
            }
            else
            {
                qlog ("%s Command '%s' Completed\n", node_ptr->hostname.c_str(),
                          _get_cmd_str(node_ptr->mtcCmd_work_fifo_ptr->cmd).c_str());
            }

            /* the done queue is expected to have been consumed by now ;
             * purge any leftovers before adding this command's result */
            if ( ( size = node_ptr->mtcCmd_done_fifo.size()) != 0 )
            {
                wlog ( "%s mtcCmd doneQ not empty (contains %d elements)\n",
                           node_ptr->hostname.c_str(), size );
                mtcCmd_doneQ_purge ( node_ptr );
            }
            node_ptr->mtcCmd_done_fifo.push_front(node_ptr->mtcCmd_work_fifo.front());
            node_ptr->mtcCmd_work_fifo.pop_front();
            break ;
        }
    }
    return (PASS);
}
/* ***********************************************************************
*
* Name : nodeLinkClass::mtcCmd_workQ_purge
*
* Description: Removes all items from the work queue.
*
*/
int nodeLinkClass::mtcCmd_workQ_purge ( struct nodeLinkClass::node * node_ptr )
{
    int items = node_ptr->mtcCmd_work_fifo.size() ;

    /* nothing queued means nothing to purge */
    if ( ! items )
    {
        qlog ("%s all work done\n", node_ptr->hostname.c_str());
        return (PASS);
    }

    wlog ("%s purging %d items from work queue\n", node_ptr->hostname.c_str(), items );

    /* log each queued command before dropping the lot */
    for ( node_ptr->mtcCmd_work_fifo_ptr = node_ptr->mtcCmd_work_fifo.begin();
          node_ptr->mtcCmd_work_fifo_ptr != node_ptr->mtcCmd_work_fifo.end();
          ++node_ptr->mtcCmd_work_fifo_ptr )
    {
        wlog ("%s purging mtcCmd '%s' in stage %d from work queue\n",
                  node_ptr->hostname.c_str(),
                  _get_cmd_str(node_ptr->mtcCmd_work_fifo_ptr->cmd).c_str(),
                  node_ptr->mtcCmd_work_fifo_ptr->stage);
    }
    node_ptr->mtcCmd_work_fifo.clear();
    return (PASS);
}
/* ***********************************************************************
*
* Name : nodeLinkClass::mtcCmd_doneQ_purge
*
* Description: Removes all items from the mtcCmd done queue.
*
* Returns a failure, the sequence number of the first command
* in the done queue that did not PASS.
*
*/
int nodeLinkClass::mtcCmd_doneQ_purge ( struct nodeLinkClass::node * node_ptr )
{
    int first_failed_seq = PASS ;
    int items = node_ptr->mtcCmd_done_fifo.size() ;

    /* empty done queue ; nothing to report or purge */
    if ( ! items )
        return (first_failed_seq);

    int position = 0 ;
    for ( node_ptr->mtcCmd_done_fifo_ptr = node_ptr->mtcCmd_done_fifo.begin();
          node_ptr->mtcCmd_done_fifo_ptr != node_ptr->mtcCmd_done_fifo.end();
          ++node_ptr->mtcCmd_done_fifo_ptr )
    {
        position++ ;
        if ( node_ptr->mtcCmd_done_fifo_ptr->status )
        {
            dlog ("%s mtcCmd:%d failed (status:%d) (%d of %d)\n",
                      node_ptr->hostname.c_str(),
                      node_ptr->mtcCmd_done_fifo_ptr->cmd,
                      node_ptr->mtcCmd_done_fifo_ptr->status,
                      position, items);

            /* Save sequence of first failed command */
            if ( first_failed_seq == PASS )
            {
                first_failed_seq = node_ptr->mtcCmd_done_fifo_ptr->seq ;
            }
        }
    }
    if ( first_failed_seq == PASS )
    {
        dlog ("%s all (%d) mtcCmd operations passed\n", node_ptr->hostname.c_str(), items );
    }
    qlog ("%s purging %d items from done queue\n", node_ptr->hostname.c_str(), items );
    node_ptr->mtcCmd_done_fifo.clear();
    return (first_failed_seq);
}