Merge "Improve maintenance power/reset control command retry handling"

This commit is contained in:
Zuul 2024-01-26 14:43:11 +00:00 committed by Gerrit Code Review
commit 25bb6a1dbf
7 changed files with 164 additions and 169 deletions

View File

@ -187,6 +187,7 @@ typedef enum
#define DEFAULT_GOENABLE_TIMEOUT (300) #define DEFAULT_GOENABLE_TIMEOUT (300)
#define DEFAULT_DOR_MODE_TIMEOUT (20) #define DEFAULT_DOR_MODE_TIMEOUT (20)
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600) #define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
#define DEFAULT_POWER_OFF_RETRY_WAIT (30)
/** TODO: Convert names to omit JSON part */ /** TODO: Convert names to omit JSON part */
#define MTC_JSON_INV_LABEL "ihosts" #define MTC_JSON_INV_LABEL "ihosts"
@ -323,9 +324,14 @@ typedef enum
#define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */ #define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */
#define COMMAND_DELAY (2) /* from sshUtil.h */ #define COMMAND_DELAY (2) /* from sshUtil.h */
/* Define Reset and Power Action retry controls ; delay, count and switch threshold */
#define MTC_POWER_ACTION_QUERY_WAIT (30)
#define MTC_POWER_ACTION_RETRY_DELAY (20) #define MTC_POWER_ACTION_RETRY_DELAY (20)
#define MTC_POWER_ACTION_RETRY_COUNT (10) #define MTC_POWER_ACTION_RETRY_COUNT (10)
#define MTC_RESET_ACTION_RETRY_COUNT (5) #define MTC_POWER_ACTION_SWITCH_THRESHOLD (MTC_POWER_ACTION_RETRY_COUNT/2)
#define MTC_RESET_ACTION_RETRY_DELAY (20)
#define MTC_RESET_ACTION_RETRY_COUNT (10)
#define MTC_RESET_ACTION_SWITCH_THRESHOLD (MTC_RESET_ACTION_RETRY_COUNT/2)
/* number of calls to the bmc_handler while bm_access is not confirmed */ /* number of calls to the bmc_handler while bm_access is not confirmed */
#define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5) #define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5)

View File

@ -2,7 +2,7 @@
#define __INCLUDE_NODETIMERS_HH__ #define __INCLUDE_NODETIMERS_HH__
/* /*
* Copyright (c) 2013-2016 Wind River Systems, Inc. * Copyright (c) 2013-2023 Wind River Systems, Inc.
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
* *
@ -94,6 +94,9 @@
#define MTC_AGENT_TIMEOUT_EXTENSION (5) #define MTC_AGENT_TIMEOUT_EXTENSION (5)
#define MTC_LOCK_CEPH_DELAY (90) #define MTC_LOCK_CEPH_DELAY (90)
#define MTC_RECV_RETRY_WAIT (MTC_RETRY_WAIT)
#define MTC_RECV_WAIT (MTC_RETRY_WAIT)
/** Host must stay enabled for this long for the /** Host must stay enabled for this long for the
* failed_recovery_counter to get cleared */ * failed_recovery_counter to get cleared */
#define MTC_ENABLED_TIMER (5) #define MTC_ENABLED_TIMER (5)

View File

@ -244,6 +244,7 @@ nodeLinkClass::nodeLinkClass()
memory_used = 0 ; memory_used = 0 ;
hosts = 0 ; hosts = 0 ;
host_deleted = false ; host_deleted = false ;
power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ;
/* Init the base level pulse info and pointers for all interfaces */ /* Init the base level pulse info and pointers for all interfaces */
pulse_ptr = NULL ; pulse_ptr = NULL ;

View File

@ -1508,6 +1508,9 @@ public:
/** Host has been deleted */ /** Host has been deleted */
bool host_deleted ; bool host_deleted ;
/** seconds to wait between power-off retries */
int power_off_retry_wait ;
/** Host Administrative State Change public member function */ /** Host Administrative State Change public member function */
int admin_state_change ( string hostname, int admin_state_change ( string hostname,
string newAdminState ); string newAdminState );

View File

@ -100,15 +100,17 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
} }
case BMC_THREAD_CMD__POWER_RESET: case BMC_THREAD_CMD__POWER_RESET:
{ {
/* use immediate for all retries if server supports an immediate command */ /* Use graceful for the first half of the retry countdown
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty() )) * and immediate for the remaining retries. */
if ((!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) &&
( node_ptr->power_action_retries < MTC_RESET_ACTION_SWITCH_THRESHOLD))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
/* unfaulted graceful if it exists */ /* Unfaulted graceful if it exists */
else if (!node_ptr->bmc_info.power_ctrl.reset.graceful.empty()) else if (!node_ptr->bmc_info.power_ctrl.reset.graceful.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful);
/* unfaulted immediate if graceful does not exist */ /* Unfaulted immediate if graceful does not exist */
else if (!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) else if (!node_ptr->bmc_info.power_ctrl.reset.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
else else
@ -120,18 +122,19 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
} }
case BMC_THREAD_CMD__POWER_ON: case BMC_THREAD_CMD__POWER_ON:
{ {
/* use immediate for all retries if server supports an immediate command */ /* Use graceful for the first half of the retry countdown
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT) && ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty() )) * and immediate for the remaining retries. */
if ((!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) &&
( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
/* unfaulted graceful if it exists */ /* Unfaulted graceful if it exists */
else if (!node_ptr->bmc_info.power_ctrl.poweron.graceful.empty()) else if (!node_ptr->bmc_info.power_ctrl.poweron.graceful.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful);
/* unfaulted immediate if graceful does not exist */ /* Unfaulted immediate if graceful does not exist */
else if (!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) else if (!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
else else
{ {
elog("%s offers no supported poweron commands", node_ptr->hostname.c_str()); elog("%s offers no supported poweron commands", node_ptr->hostname.c_str());
@ -141,15 +144,17 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
} }
case BMC_THREAD_CMD__POWER_OFF: case BMC_THREAD_CMD__POWER_OFF:
{ {
/* use immediate for all retries if server supports an immediate command */ /* Use graceful for the first half of the retry countdown
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() )) * and immediate for the remaining retries. */
if ((!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ) &&
( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
/* unfaulted graceful if it exists */ /* Unfaulted graceful if it exists */
else if (!node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() ) else if (!node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() )
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful);
/* unfaulted immediate if graceful does not exist */ /* Unfaulted immediate if graceful does not exist */
else if (!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty()) else if (!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
else else
@ -193,11 +198,23 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
{ {
want_fit = true ; want_fit = true ;
} }
else if (( command == BMC_THREAD_CMD__POWER_ON ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true ))
{
/* Just change the command to query status */
command = BMC_THREAD_CMD__POWER_STATUS ;
}
else if (( command == BMC_THREAD_CMD__POWER_OFF ) && else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true )) ( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true ))
{ {
want_fit = true ; want_fit = true ;
} }
else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true ))
{
/* Just change the command to query status */
command = BMC_THREAD_CMD__POWER_STATUS ;
}
else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) && else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true )) ( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true ))
{ {

View File

@ -4007,7 +4007,6 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
} }
case MTC_RESET__REQ_SEND: case MTC_RESET__REQ_SEND:
{ {
node_ptr->power_action_retries--;
/* Handle loss of connectivity over retries */ /* Handle loss of connectivity over retries */
if ( node_ptr->bmc_provisioned == false ) if ( node_ptr->bmc_provisioned == false )
@ -4022,10 +4021,10 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
{ {
wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n", wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_DELAY); MTC_RESET_ACTION_RETRY_DELAY);
mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
resetStageChange ( node_ptr , MTC_RESET__QUEUE ); resetStageChange ( node_ptr , MTC_RESET__QUEUE );
break ; break ;
} }
@ -4033,7 +4032,6 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
else else
{ {
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
if ( rc ) if ( rc )
{ {
wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc ); wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc );
@ -4044,7 +4042,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
blog ("%s Reset requested\n", node_ptr->hostname.c_str()); blog ("%s Reset requested\n", node_ptr->hostname.c_str());
resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT ); resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT );
} }
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
} }
break ; break ;
} }
@ -4059,11 +4057,10 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ; break ;
} }
else if ( rc )
if ( rc )
{ {
elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc ); elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
resetStageChange ( node_ptr, MTC_RESET__QUEUE ); resetStageChange ( node_ptr, MTC_RESET__QUEUE );
} }
else else
@ -4082,7 +4079,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
node_ptr->mtcTimer.ring = false ; node_ptr->mtcTimer.ring = false ;
if ( node_ptr->power_action_retries > 0 ) if ( --node_ptr->power_action_retries >= 0 )
{ {
char buffer[64] ; char buffer[64] ;
int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
@ -4455,6 +4452,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_REINSTALL__POWEROFF: case MTC_REINSTALL__POWEROFF:
{ {
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
mtcTimer_reset ( node_ptr->mtcTimer ) ;
powerStageChange ( node_ptr, MTC_POWEROFF__REQ_SEND ); powerStageChange ( node_ptr, MTC_POWEROFF__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT ); reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
break ; break ;
@ -4975,18 +4973,24 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->bm_ip.c_str(), node_ptr->bm_ip.c_str(),
rc ); rc );
} }
else
{
;
}
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
//the fall through to MTC_POWEROFF__REQ_SEND is intentional
MTCE_FALLTHROUGH; /* don't allow a timeout of zero to be passed in */
if ( power_off_retry_wait == 0 )
power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ;
ilog ("%s power off retry wait is %d seconds",
node_ptr->hostname.c_str(), power_off_retry_wait);
mtcTimer_reset ( node_ptr->mtcTimer ) ;
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
break ;
} }
case MTC_POWEROFF__REQ_SEND: case MTC_POWEROFF__REQ_SEND:
{ {
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
/* Handle loss of connectivity over retries */ /* Handle loss of connectivity over retries */
if ( node_ptr->bmc_provisioned == false ) if ( node_ptr->bmc_provisioned == false )
{ {
@ -4998,12 +5002,8 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->bmc_accessible == false ) if ( node_ptr->bmc_accessible == false )
{ {
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds\n", wlog ("%s Power Off request rejected ; BMC not accessible",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str());
MTC_POWER_ACTION_RETRY_DELAY);
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
break ; break ;
} }
@ -5013,16 +5013,16 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF ); rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
if ( rc ) if ( rc )
{ {
node_ptr->power_action_retries--; wlog ("%s Power-Off request failed (%d)", node_ptr->hostname.c_str(), rc );
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
} }
else else
{ {
ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str()); ilog ("%s Power-Off requested", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT ); powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
} }
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT );
}
} }
break ; break ;
} }
@ -5034,41 +5034,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_recv ( node_ptr ); rc = bmc_command_recv ( node_ptr );
if ( rc == RETRY ) if ( rc == RETRY )
{ {
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT );
break ; break ;
} }
else if ( rc ) else if ( rc )
{ {
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str()); elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
// Need to handle retries in this case since we don't
// go through the QUEUE stage.
if ( --node_ptr->power_action_retries > 0 )
{
char buffer[255] ;
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
mtcInvApi_update_task ( node_ptr, buffer);
// The power off command can fail due to connectivity
// issue or if the server is now already powered off.
// The latter could occur if the previous power off
// command failed 'in response' but actually did end up
// powering off. In that case, if we continue to just
// retry the power off when the power is already off
// then that will just fail again since most redfish
// implementations fail rather than wave-on a power off
// request while the power is already off. In this case
// its better to switch to power query power status
// again and allow that result to put this power off
// FSM into the correct state to continue/retry the
// quest for power off.
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
} mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT );
else
{
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
}
} }
else else
{ {
@ -5091,6 +5064,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
plog ("%s is now offline\n", node_ptr->hostname.c_str()); plog ("%s is now offline\n", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT );
} }
else if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) else if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
@ -5100,13 +5074,16 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
break ; break ;
} }
case MTC_POWEROFF__POWERQRY: case MTC_POWEROFF__POWERQRY:
{
/* give the power off action some time to complete */
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
if ( node_ptr->bmc_thread_ctrl.done ) if ( node_ptr->bmc_thread_ctrl.done )
{ {
/* Query Host Power Status */ /* Query Host Power Status */
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS ) if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
{ {
elog ("%s '%s' send failed\n", elog ("%s '%s' send failed",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
bmcUtil_getCmd_str( bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str()); node_ptr->bmc_thread_info.command).c_str());
@ -5117,12 +5094,13 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
{ {
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT ); powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT );
} }
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT );
} }
else else
{ {
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
} }
}
break ; break ;
} }
case MTC_POWEROFF__POWERQRY_WAIT: case MTC_POWEROFF__POWERQRY_WAIT:
@ -5132,7 +5110,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
int rc = bmc_command_recv ( node_ptr ) ; int rc = bmc_command_recv ( node_ptr ) ;
if ( rc == RETRY ) if ( rc == RETRY )
{ {
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT );
break ; break ;
} }
else if ( rc != PASS ) else if ( rc != PASS )
@ -5183,15 +5161,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
} }
} }
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
} }
break ; break ;
} }
case MTC_POWEROFF__QUEUE: case MTC_POWEROFF__QUEUE:
{ {
if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) if ( --node_ptr->power_action_retries >= 0 )
{
if ( --node_ptr->power_action_retries > 0 )
{ {
char buffer[255] ; char buffer[255] ;
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
@ -5204,17 +5179,19 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if (( node_ptr->bmc_thread_info.status ) && if (( node_ptr->bmc_thread_info.status ) &&
( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL)) ( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL))
{ {
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(), wlog ("%s ... %s (rc:%d)", node_ptr->hostname.c_str(),
node_ptr->bmc_thread_info.status_string.c_str(), node_ptr->bmc_thread_info.status_string.c_str(),
node_ptr->bmc_thread_info.status ); node_ptr->bmc_thread_info.status );
} }
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
ilog ("%s waiting %d seconds before next power off retry",
node_ptr->hostname.c_str(), power_off_retry_wait);
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, power_off_retry_wait );
} }
else else
{ {
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
} }
}
break ; break ;
} }
case MTC_POWEROFF__DONE: case MTC_POWEROFF__DONE:
@ -5294,7 +5271,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_DELAY); MTC_POWER_ACTION_RETRY_DELAY);
node_ptr->power_action_retries-- ;
mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
@ -5304,7 +5280,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ; rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ;
if ( rc ) if ( rc )
{ {
node_ptr->power_action_retries-- ;
powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
} }
else else
@ -5349,18 +5324,11 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
} }
} }
/* failure path handling */
else if ( node_ptr->power_action_retries <= 0 )
{
wlog ("%s current power state query failed ; "
"proceeding with power-on",
node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
}
else else
{ {
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS ); wlog ("%s power state query failed",
node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
} }
} }
break ; break ;
@ -5383,7 +5351,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->bmc_accessible == false ) if ( node_ptr->bmc_accessible == false )
{ {
node_ptr->power_action_retries--;
wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n", wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n",
node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY); node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY);
@ -5397,7 +5364,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON ); rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON );
if ( rc ) if ( rc )
{ {
node_ptr->power_action_retries--;
wlog ("%s Power-On request failed (%d)\n", wlog ("%s Power-On request failed (%d)\n",
node_ptr->hostname.c_str(), rc ); node_ptr->hostname.c_str(), rc );
@ -5429,7 +5395,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( rc ) if ( rc )
{ {
node_ptr->power_action_retries--;
elog ("%s Power-On command failed\n", node_ptr->hostname.c_str()); elog ("%s Power-On command failed\n", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
@ -5452,7 +5417,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
node_ptr->mtcTimer.ring = false ; node_ptr->mtcTimer.ring = false ;
if ( node_ptr->power_action_retries > 0 ) if ( --node_ptr->power_action_retries >= 0 )
{ {
char buffer[64] ; char buffer[64] ;
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;

View File

@ -1,6 +1,6 @@
/* /*
* Copyright (c) 2016-2017 Wind River Systems, Inc. * Copyright (c) 2016-2023 Wind River Systems, Inc.
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
* *
@ -316,7 +316,7 @@ void * mtcThread_bmc ( void * arg )
{ {
string chopped_request = bmcUtil_chop_system_req(request); string chopped_request = bmcUtil_chop_system_req(request);
daemon_remove_file ( datafile.data() ) ; daemon_remove_file ( datafile.data() ) ;
blog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str()); ilog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str());
/****** Make the system call ******/ /****** Make the system call ******/
rc = rc =