/*
 * Copyright (c) 2013-2017, 2023 Wind River Systems, Inc.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 */

/****************************************************************************
 * @file
 * Wind River Titanium Cloud Maintenance Command Handler FSM Implementation
 *
 * nodeLinkClass::cmd_handler
 *
 ****************************************************************************/

using namespace std;

#define __AREA__ "cmd"

#include "nodeClass.h"    /* for ... nodeLinkClass               */
#include "nodeUtil.h"     /* for ... clean_bm_response_files     */
#include "nodeTimers.h"   /* for ... mtcTimer_start/stop         */
#include "mtcNodeMsg.h"   /* for ... send_mtc_cmd                */
#include "nodeCmds.h"     /* for ... Cmd hdl'ing stages & struct */

extern void mtcTimer_handler ( int sig, siginfo_t *si, void *uc);

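/* Return a human readable name for the supplied maintenance operation code ;
 * used by the logs and by the queue dump utilities below. */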
string _get_cmd_str( int this_cmd )
{
    string temp ;
    switch (this_cmd)
    {
        case MTC_OPER__MODIFY_HOSTNAME:
        {
            temp = "Modify Hostname";
            break ;
        }
        case MTC_OPER__RESET_PROGRESSION:
        {
            temp = "Reset Progression";
            break ;
        }
        case MTC_OPER__HOST_SERVICES_CMD:
        {
            temp = "Host Services";
            break ;
        }
        case MTC_OPER__RUN_IPMI_COMMAND:
        {
            temp = "IPMI Command";
            break ;
        }
        default:
        {
            temp = "_unknown_" ;
        }
    }
    return(temp);
}

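/* Debug utilities : print the contents of a node's mtcCmd work or done queue ;
 * one line per queued command showing its operation name, sequence number,
 * current stage and completion status. */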
void nodeLinkClass::mtcCmd_workQ_dump ( struct nodeLinkClass::node * node_ptr )
{
    if ( node_ptr->mtcCmd_work_fifo.size() != 0 )
    {
        for ( node_ptr->mtcCmd_work_fifo_ptr = node_ptr->mtcCmd_work_fifo.begin() ;
              node_ptr->mtcCmd_work_fifo_ptr != node_ptr->mtcCmd_work_fifo.end();
              node_ptr->mtcCmd_work_fifo_ptr++ )
        {
            printf ( "%15s mtceCmd_workQ:%10s seq:%d stage:%d status [%d:%s]\n",
                     node_ptr->hostname.c_str(),
                     _get_cmd_str(node_ptr->mtcCmd_work_fifo_ptr->cmd).c_str(),
                     node_ptr->mtcCmd_work_fifo_ptr->seq,
                     node_ptr->mtcCmd_work_fifo_ptr->stage,
                     node_ptr->mtcCmd_work_fifo_ptr->status,
                     node_ptr->mtcCmd_work_fifo_ptr->status_string.c_str());
        }
    }
}

void nodeLinkClass::mtcCmd_doneQ_dump ( struct nodeLinkClass::node * node_ptr )
{
    if ( node_ptr->mtcCmd_done_fifo.size() != 0 )
    {
        for ( node_ptr->mtcCmd_done_fifo_ptr = node_ptr->mtcCmd_done_fifo.begin() ;
              node_ptr->mtcCmd_done_fifo_ptr != node_ptr->mtcCmd_done_fifo.end();
              node_ptr->mtcCmd_done_fifo_ptr++ )
        {
            printf ( "%15s mtceCmd_doneQ:%10s seq:%d stage:%d status [%d:%s]\n",
                     node_ptr->hostname.c_str(),
                     _get_cmd_str(node_ptr->mtcCmd_done_fifo_ptr->cmd).c_str(),
                     node_ptr->mtcCmd_done_fifo_ptr->seq,
                     node_ptr->mtcCmd_done_fifo_ptr->stage,
                     node_ptr->mtcCmd_done_fifo_ptr->status,
                     node_ptr->mtcCmd_done_fifo_ptr->status_string.c_str());
        }
    }
}

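/* Dump every node's done queue. Note that this utility also purges each
 * node's done queue as it goes. */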
void nodeLinkClass::mtcCmd_doneQ_dump_all ( void )
{
    struct node * ptr = static_cast<struct node *>(NULL) ;

    /* check for empty list condition */
    if ( head != NULL )
    {
        /* Now search the node list */
        for ( ptr = head ; ptr != NULL ; ptr = ptr->next )
        {
            mtcCmd_doneQ_dump  ( ptr );
            mtcCmd_doneQ_purge ( ptr );
        }
    }
}

void nodeLinkClass::mtcCmd_workQ_dump_all ( void )
{
    struct node * ptr = static_cast<struct node *>(NULL) ;

    /* check for empty list condition */
    if ( head != NULL )
    {
        /* Now search the node list */
        for ( ptr = head ; ptr != NULL ; ptr = ptr->next )
        {
            mtcCmd_workQ_dump ( ptr );
        }
    }
}

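/* ***********************************************************************
 *
 * Name : nodeLinkClass::cmd_handler
 *
 * Description: Per-node maintenance command FSM. Each call services the
 *              command at the head of the node's mtcCmd work queue and
 *              advances its 'stage'. When a command reaches the DONE
 *              stage it is moved to the node's done queue along with its
 *              completion status.
 *
 *              Illustrative sketch only (not taken from this file) of how
 *              a caller typically queues work ; the exact command struct
 *              and any init helper live in nodeCmds.h and may differ :
 *
 *                  mtcCmd cmd ;                               // hypothetical local
 *                  mtcCmd_init ( cmd );                       // hypothetical init helper
 *                  cmd.cmd   = MTC_OPER__RESET_PROGRESSION ;  // operation to run
 *                  cmd.parm1 = 2 ;                            // max retries (see below)
 *                  node_ptr->mtcCmd_work_fifo.push_back ( cmd ) ;
 *
 * ***********************************************************************/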
int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
{
    int rc = PASS ;

    /* Should not be called empty but check just in case */
    if ( node_ptr->mtcCmd_work_fifo.size() == 0 )
        return (rc);

    node_ptr->mtcCmd_work_fifo_ptr = node_ptr->mtcCmd_work_fifo.begin ();
    switch ( node_ptr->mtcCmd_work_fifo_ptr->stage )
    {
        case MTC_CMD_STAGE__START:
        {
            dlog ("%s mtcCmd: %d:%d.%d\n",
                      node_ptr->hostname.c_str(),
                      node_ptr->mtcCmd_work_fifo_ptr->cmd,
                      node_ptr->mtcCmd_work_fifo_ptr->parm1,
                      node_ptr->mtcCmd_work_fifo_ptr->parm2);

            if ( node_ptr->mtcCmd_work_fifo_ptr->cmd == MTC_OPER__RESET_PROGRESSION )
            {
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_PROGRESSION_START ;
            }
            else if ( node_ptr->mtcCmd_work_fifo_ptr->cmd == MTC_OPER__HOST_SERVICES_CMD )
            {
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__HOST_SERVICES_SEND_CMD ;
            }
            else if ( node_ptr->mtcCmd_work_fifo_ptr->cmd == MTC_OPER__MODIFY_HOSTNAME )
            {
                send_hbs_command   ( node_ptr->hostname, MTC_CMD_DEL_HOST );
                send_guest_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );

                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__MODIFY_HOSTNAME_START ;
            }
            else
            {
                slog ("%s Unsupported Mtce Command (%d)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->mtcCmd_work_fifo_ptr->cmd );

                node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_BAD_PARM ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }
        case MTC_CMD_STAGE__HOST_SERVICES_SEND_CMD:
        {
            send_mtc_cmd ( node_ptr->hostname, node_ptr->host_services_req.cmd, MGMNT_INTERFACE );

            /* Start a timer that waits for the initial command-received response.
             * There is no point in waiting for the longer host services
             * execution timeout if the far end is not even able to ACK the
             * initial request. Bear in mind that the execution of the
             * host services command can take a while, so its timeout is much
             * longer and is polled for in the 3rd phase of this fsm, but only
             * if we get an initial command ACK. */
            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );

            /* change state to waiting for that initial ACK */
            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__HOST_SERVICES_RECV_ACK ;
            break ;
        }
        case MTC_CMD_STAGE__HOST_SERVICES_RECV_ACK:
        {
            if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) )
            {
                node_ptr->mtcCmd_work_fifo_ptr->status =
                node_ptr->host_services_req.status = FAIL_NO_CMD_ACK ;
                node_ptr->host_services_req.status_string =
                node_ptr->host_services_req.name ;
                node_ptr->host_services_req.status_string.append (" ack timeout") ;

                dlog ("%s %s (rc:%d)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.status_string.c_str(),
                          node_ptr->host_services_req.status );

                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            else if ( node_ptr->host_services_req.ack )
            {
                /* get the host services timeout and add MTC_AGENT_TIMEOUT_EXTENSION
                 * seconds so that it is a bit longer than the mtcClient timeout */
                int timeout = daemon_get_cfg_ptr()->host_services_timeout ;
                timeout += MTC_AGENT_TIMEOUT_EXTENSION ;

                dlog ("%s %s request ack (monitor mode)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.name.c_str());

                node_ptr->host_services_req.cmd = MTC_CMD_HOST_SVCS_RESULT ;
                node_ptr->mtcCmd_work_fifo_ptr->stage =
                    MTC_CMD_STAGE__HOST_SERVICES_WAIT_FOR_RESULT ;
                mtcTimer_reset ( node_ptr->mtcCmd_timer );
                mtcTimer_start ( node_ptr->mtcCmd_timer,
                                 mtcTimer_handler,
                                 timeout );
            }
            else if ( node_ptr->host_services_req.cmd == node_ptr->host_services_req.rsp )
            {
                dlog ("%s %s request ack (legacy mode)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.name.c_str());

                // Upgrades that lock storage nodes can
                // lead to storage corruption if ceph isn't given
                // enough time to shut down.
                //
                // The following special case for storage node
                // lock forces a 90 sec holdoff for pre-upgrade storage
                // hosts ; i.e. legacy mode.
                //
                if ( is_storage(node_ptr) )
                {
                    ilog ("%s waiting for ceph OSD shutdown\n", node_ptr->hostname.c_str());
                    mtcTimer_reset ( node_ptr->mtcCmd_timer );
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_LOCK_CEPH_DELAY );
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__STORAGE_LOCK_DELAY ;
                }
                else
                {
                    node_ptr->mtcCmd_work_fifo_ptr->status =
                    node_ptr->host_services_req.status = PASS ;

                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
                }
            }
            break ;
        }
        case MTC_CMD_STAGE__STORAGE_LOCK_DELAY:
        {
            /* wait for the timer to expire before moving on */
            if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) )
            {
                ilog ("%s ceph OSD shutdown wait complete\n",
                          node_ptr->hostname.c_str());

                node_ptr->mtcCmd_work_fifo_ptr->status =
                node_ptr->host_services_req.status = PASS ;

                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }
        case MTC_CMD_STAGE__HOST_SERVICES_WAIT_FOR_RESULT:
        {
            if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) )
            {
                node_ptr->mtcCmd_work_fifo_ptr->status =
                node_ptr->host_services_req.status = FAIL_TIMEOUT ;

                node_ptr->host_services_req.status_string =
                node_ptr->host_services_req.name ;
                node_ptr->host_services_req.status_string.append (" execution timeout") ;

                dlog ("%s %s (rc:%d)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.status_string.c_str(),
                          node_ptr->host_services_req.status );
            }
            else if ( node_ptr->host_services_req.rsp != MTC_CMD_HOST_SVCS_RESULT )
            {
                /* waiting for result response ... */
                break ;
            }
            else if ( node_ptr->host_services_req.status == PASS )
            {
                dlog ("%s %s completed\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.name.c_str());

                node_ptr->mtcCmd_work_fifo_ptr->status = PASS ;
            }
            else
            {
                node_ptr->mtcCmd_work_fifo_ptr->status =
                node_ptr->host_services_req.status ;

                if ( ! node_ptr->host_services_req.status_string.empty() )
                {
                    wlog ("%s %s\n",
                              node_ptr->hostname.c_str(),
                              node_ptr->host_services_req.status_string.c_str());
                }

                node_ptr->host_services_req.status_string =
                node_ptr->host_services_req.name ;
                node_ptr->host_services_req.status_string.append (" execution failed") ;

                dlog ("%s %s ; rc:%d\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->host_services_req.status_string.c_str(),
                          node_ptr->host_services_req.status);
            }
            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            break ;
        }
        /***************************************************************************
         *
         * 'Reset Progression' Command Stages
         *
         * This target handler FSM is responsible for resetting a host through
         * progressive escalation of interfaces. First a reboot by command is
         * attempted over the management network. If that fails the same operation
         * is tried over the cluster-host network. If both reboot command
         * attempts fail and the board management network for this host is
         * provisioned then a reset through it is attempted.
         *
         * The number of reset retries is specified in command parameter 1,
         * where a value of -1 means retry forever and a value of zero means
         * no retries ; i.e. attempt each provisioned interface only once.
         *
         * *************************************************************************/
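        /* Illustrative note (not from the original file) : the retry budget is
         * carried in the queued command itself. parm1 holds the maximum number
         * of full progression retries and parm2 is used below as the running
         * retry counter ; see MTC_CMD_STAGE__RESET_PROGRESSION_RETRY. A caller
         * wanting two retries would, for example, queue the command with
         * parm1 = 2 and parm2 left at 0. */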
        case MTC_CMD_STAGE__RESET_PROGRESSION_START:
        {
            if ( node_ptr->cmd.task == true )
            {
                /* post the 'Reboot Request' task banner */
                mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_REQUEST );
            }

            start_offline_handler ( node_ptr );

            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__REBOOT ;
            break ;
        }
        case MTC_CMD_STAGE__REBOOT:
        {
            int rc = PASS ;
            bool send_reboot_ok = false ;

            node_ptr->reboot_cmd_ack_mgmnt = false ;
            node_ptr->reboot_cmd_ack_clstr = false ;

            /* send reboot command */
            node_ptr->cmdReq = MTC_CMD_REBOOT ;
            node_ptr->cmdRsp = MTC_CMD_NONE ;
            if (( rc = send_mtc_cmd ( node_ptr->hostname,
                                      MTC_CMD_REBOOT,
                                      MGMNT_INTERFACE )) != PASS )
            {
                wlog ("%s reboot request failed (%s) (rc:%d)\n",
                          node_ptr->hostname.c_str(),
                          get_iface_name_str(MGMNT_INTERFACE), rc);
            }
            else
            {
                send_reboot_ok = true ;
            }

            if ( clstr_network_provisioned == true )
            {
                if (( rc = send_mtc_cmd ( node_ptr->hostname,
                                          MTC_CMD_REBOOT,
                                          CLSTR_INTERFACE )) != PASS )
                {
                    wlog ("%s reboot request failed (%s) (rc:%d)\n",
                              node_ptr->hostname.c_str(),
                              get_iface_name_str(CLSTR_INTERFACE), rc);
                }
                else
                {
                    send_reboot_ok = true ;
                }
            }

            if ( send_reboot_ok == true )
            {
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__REBOOT_ACK ;
                mtcTimer_reset ( node_ptr->mtcCmd_timer );
                mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );

                ilog ("%s waiting for reboot ACK\n", node_ptr->hostname.c_str() );
            }
            else
            {
                /* This means that the mtcAgent can't send commands.
                 * Very unlikely case. Fail the operation.
                 */
                if ( node_ptr->cmd.task == true )
                {
                    /* Reboot Failed */
                    mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_FAIL );
                }
                node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_SOCKET_SENDTO ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }
        case MTC_CMD_STAGE__REBOOT_ACK:
        {
            /* the reboot ACK can come in from either interface */
            if ( node_ptr->cmdRsp != MTC_CMD_REBOOT )
            {
                if ( node_ptr->mtcCmd_timer.ring == true )
                {
                    if (( node_ptr->cmd.task == true ) && ( node_ptr->cmd_retries == 0 ))
                    {
                        /* no need to repost task on retries */
                        mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_FAIL );
                    }
                    node_ptr->mtcCmd_timer.ring = false ;

                    /* progress to RESET if we have tried
                     * RESET_PROG_MAX_REBOOTS_B4_RESET times already */
                    if ( ++node_ptr->cmd_retries >= RESET_PROG_MAX_REBOOTS_B4_RESET )
                    {
                        wlog ("%s reboot ACK timeout ; max reboot retries reached",
                                  node_ptr->hostname.c_str());
                        if ( node_ptr->bmc_provisioned )
                        {
                            int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT);
                            node_ptr->bmc_reset_pending_log_throttle = 0 ;
                            gettime ( node_ptr->reset_delay_start_time );

                            /* Clear the counts so we can tell if we have been getting mtcAlive
                             * messages from the remote host during the reset delay window */
                            node_ptr->mtcAlive_mgmnt_count = 0 ;
                            node_ptr->mtcAlive_clstr_count = 0 ;

                            wlog ("%s ... bmc reset in %d secs", node_ptr->hostname.c_str(), reset_delay);
                            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, reset_delay );
                            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET ;
                        }
                        else
                        {
                            ilog ("%s bmc not provisioned ; search for offline", node_ptr->hostname.c_str());
                            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs());
                            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
                        }
                    }
                    else
                    {
                        int retry_delay = MTC_CMD_RSP_TIMEOUT ;
                        wlog ("%s reboot ACK timeout ; reboot retry (%d of %d) in %d secs",
                                  node_ptr->hostname.c_str(),
                                  node_ptr->cmd_retries,
                                  RESET_PROG_MAX_REBOOTS_B4_RESET-1,
                                  retry_delay);
                        mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, retry_delay );
                    }
                }
            }
            else
            {
                /* declare successful reboot */
                plog ("%s reboot request succeeded (%s %s)",
                          node_ptr->hostname.c_str(),
                          node_ptr->reboot_cmd_ack_mgmnt ? get_iface_name_str(MGMNT_INTERFACE) : "",
                          node_ptr->reboot_cmd_ack_clstr ? get_iface_name_str(CLSTR_INTERFACE) : "");

                if ( node_ptr->cmd.task == true )
                {
                    /* post the 'Rebooting' task banner */
                    mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOTING );
                }
                set_uptime ( node_ptr, 0 , false );

                /* start timer that verifies board has reset */
                mtcTimer_reset ( node_ptr->mtcCmd_timer );

                /* progress to RESET if we have tried
                 * RESET_PROG_MAX_REBOOTS_B4_RESET times already */
                if ( ++node_ptr->cmd_retries >= RESET_PROG_MAX_REBOOTS_B4_RESET )
                {
                    int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT) ;
                    node_ptr->bmc_reset_pending_log_throttle = 0 ;
                    gettime ( node_ptr->reset_delay_start_time );

                    /* Clear the counts so we can tell if we have been getting mtcAlive
                     * messages from the remote host during the reset delay window */
                    node_ptr->mtcAlive_mgmnt_count = 0 ;
                    node_ptr->mtcAlive_clstr_count = 0 ;

                    wlog ("%s max reboot retries reached ; still not offline ; reset in %3d secs",
                              node_ptr->hostname.c_str(), reset_delay);
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, reset_delay );
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET ;
                }
                else
                {
                    ilog ("%s searching for offline ; next reboot attempt in %d seconds\n",
                              node_ptr->hostname.c_str(), offline_timeout_secs());

                    /* After the host is reset we need to wait for it to stop sending
                     * mtcAlive messages. Delay long enough for the offline handler to
                     * run to completion at least once before timing out and retrying
                     * the reboot again */
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs());

                    /* Wait for the host to go offline */
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
                }
            }
            break ;
        }
        case MTC_CMD_STAGE__RESET:
        {
            if ( node_ptr->bmc_provisioned == true )
            {
                if ( node_ptr->mtcCmd_timer.ring == true )
                {
                    if ( node_ptr->bmc_accessible == true )
                    {
                        plog ("%s issuing reset over bmc", node_ptr->hostname.c_str());
                        if ( node_ptr->cmd.task == true )
                        {
                            mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_REQUEST);
                        }

                        /* bmc power control reset by bmc */
                        rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
                        if ( rc == PASS )
                        {
                            ilog ("%s bmc reset requested", node_ptr->hostname.c_str());
                            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY );
                            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_ACK;
                            break ;
                        }
                        else
                        {
                            node_ptr->mtcCmd_work_fifo_ptr->status = rc ;
                            wlog ("%s bmc reset command request failed (%d)", node_ptr->hostname.c_str(), rc );
                        }
                    }
                    else
                    {
                        wlog ("%s bmc not accessible ; unable to reset", node_ptr->hostname.c_str());
                    }
                }
                else
                {
                    /* To handle potentially large bmc_reset_delay values that could
                     * be longer than a boot time, this check cancels the reset once
                     * the node goes online. Maybe the reset did get through, or the
                     * node rebooted quite fast.
                     *
                     * However, don't allow momentary heartbeat loss recovery handling
                     * or the failure of just one (mgmnt or clstr) network to mistakenly
                     * cancel the reset. Prevent the cancel if
                     *   - the node uptime is high, or
                     *   - mtcAlive is not being received on both the mgmnt and clstr
                     *     networks.
                     *
                     * Note: online does not mean both networks are receiving mtcAlive.
                     *       Currently only mgmnt needs to see mtcAlive for the node to
                     *       go online.
                     * TODO: Fix this in the future so both are required.
                     *       This dates from the days when the cluster-host network was
                     *       named the infrastructure network and was optional.
                     *       Cluster-host is no longer optional. */
                    if (( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) &&
                        ( node_ptr->uptime < MTC_MINS_5 ) &&
                        ( node_ptr->mtcAlive_mgmnt_count ) &&
                        ( node_ptr->mtcAlive_clstr_count ))
                    {
                        mtcTimer_reset ( node_ptr->mtcCmd_timer );
                        ilog ("%s cancelling reset ; host is online ; delay:%d uptime:%d mtcAlive:%d:%d ",
                                  node_ptr->hostname.c_str(),
                                  bmc_reset_delay,
                                  node_ptr->uptime,
                                  node_ptr->mtcAlive_mgmnt_count,
                                  node_ptr->mtcAlive_clstr_count);
                        node_ptr->mtcCmd_work_fifo_ptr->status = PASS ;
                        node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
                    }
                    else
                    {
                        time_debug_type now_time ;
                        time_delta_type diff_time ;
                        int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT) ;
                        gettime ( now_time );
                        timedelta ( node_ptr->reset_delay_start_time, now_time, diff_time );
                        if ( reset_delay > diff_time.secs )
                        {
                            #define BMC_RESET_PENDING_LOG_THROTTLE (1000)
                            wlog_throttled ( node_ptr->bmc_reset_pending_log_throttle,
                                             BMC_RESET_PENDING_LOG_THROTTLE,
                                             "%s reset in %3ld secs ; delay:%d uptime:%d mtcAlive:%d:%d",
                                             node_ptr->hostname.c_str(),
                                             reset_delay-diff_time.secs,
                                             bmc_reset_delay,
                                             node_ptr->uptime,
                                             node_ptr->mtcAlive_mgmnt_count,
                                             node_ptr->mtcAlive_clstr_count);
                        }
                    }
                    break ; /* waiting path */
                }
            }
            else if ( node_ptr->bmc_provisioned == false )
            {
                wlog ("%s bmc not provisioned", node_ptr->hostname.c_str());
            }

            /* If we get here then either
             *  - the bmc is not provisioned,
             *  - the bmc is not accessible after the bmc_reset_delay, or
             *  - the reset send command failed.
             * So just jump to the offline check, which will retry the
             * reboot/reset if the host still does not go offline after the
             * calculated delay.
             */
            mtcTimer_reset ( node_ptr->mtcCmd_timer );
            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs());
            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
            break ;
        }
        case MTC_CMD_STAGE__RESET_ACK:
        {
            if ( node_ptr->mtcCmd_timer.ring == true )
            {
                /* bmc power control reset by bmc */
                rc = bmc_command_recv ( node_ptr );
                if ( rc == RETRY )
                {
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY );
                    break ;
                }
                else if ( rc )
                {
                    elog ("%s bmc reset request failed [rc:%d]\n", node_ptr->hostname.c_str(), rc);
                    if ( node_ptr->cmd.task == true )
                    {
                        mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_FAIL);
                    }
                    node_ptr->mtcCmd_work_fifo_ptr->status = rc ;
                }
                else
                {
                    plog ("%s bmc reset request succeeded\n", node_ptr->hostname.c_str());

                    if (( node_ptr->adminAction != MTC_ADMIN_ACTION__RESET ) &&
                        ( node_ptr->adminAction != MTC_ADMIN_ACTION__REBOOT ))
                    {
                        mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_RESET );
                    }

                    set_uptime ( node_ptr, 0 , false );
                    if ( node_ptr->cmd.task == true )
                    {
                        mtcInvApi_update_task ( node_ptr, MTC_TASK_RESETTING );
                    }
                }
                ilog ("%s waiting for host to go offline ; %d secs before retrying reboot/reset",
                          node_ptr->hostname.c_str(), offline_timeout_secs());
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
                mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs());
            }
            break ;
        }
        case MTC_CMD_STAGE__OFFLINE_CHECK:
        {
            if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
            {
                mtcTimer_reset ( node_ptr->mtcCmd_timer );

                clear_service_readies ( node_ptr );

                qlog ("%s reset progression complete ; host is offline (after %d retries)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->cmd_retries );
                node_ptr->mtcCmd_work_fifo_ptr->status = PASS ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }

            else if ( node_ptr->mtcCmd_timer.ring == true )
            {
                if ( ++node_ptr->cmd_retries < RESET_PROG_MAX_REBOOTS_B4_RETRY )
                {
                    ilog ("%s reboot (retry %d of %d)\n",
                              node_ptr->hostname.c_str(),
                              node_ptr->cmd_retries,
                              RESET_PROG_MAX_REBOOTS_B4_RETRY );

                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__REBOOT ;
                }
                else
                {
                    ilog ("%s still not offline\n", node_ptr->hostname.c_str());
                    node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_RETRY ;
                    node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_PROGRESSION_RETRY ;
                }
            }
            break ;
        }
        case MTC_CMD_STAGE__RESET_PROGRESSION_RETRY:
        {
            /* Complete command if we reach max retries */
            if ( ++node_ptr->mtcCmd_work_fifo_ptr->parm2 > node_ptr->mtcCmd_work_fifo_ptr->parm1 )
            {
                plog ("%s reset progression done\n", node_ptr->hostname.c_str());
                node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_RETRY ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            else
            {
                wlog ("%s reset progression retry\n", node_ptr->hostname.c_str());
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_PROGRESSION_START ;
            }

            stop_offline_handler ( node_ptr );
            break ;
        }

        case MTC_CMD_STAGE__IPMI_COMMAND_SEND:
        {
            if ( bmc_command_send ( node_ptr, node_ptr->cmdReq ) != PASS )
            {
                elog ("%s IPMI %s Send Failed\n",
                          node_ptr->hostname.c_str(),
                          bmcUtil_getCmd_str(node_ptr->cmdReq).c_str());

                node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_RETRY ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            else
            {
                plog ("%s IPMI %s Requested\n",
                          node_ptr->hostname.c_str(),
                          bmcUtil_getCmd_str(node_ptr->cmdReq).c_str());

                mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__IPMI_COMMAND_RECV ;
            }
            break ;
        }

        case MTC_CMD_STAGE__IPMI_COMMAND_RECV:
        {
            if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) )
            {
                rc = bmc_command_recv ( node_ptr );
                if ( rc == RETRY )
                {
                    mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_SECS_5 ) ;
                    break ;
                }
                else if ( rc == PASS )
                {
                    plog ("%s IPMI %s Successful\n", node_ptr->hostname.c_str(),
                              bmcUtil_getCmd_str(node_ptr->cmdReq).c_str());
                }
                else
                {
                    elog ("%s IPMI %s Failed\n", node_ptr->hostname.c_str(),
                              bmcUtil_getCmd_str(node_ptr->cmdReq).c_str());
                }
                node_ptr->mtcCmd_work_fifo_ptr->status = rc ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
            }
            break ;
        }

        /***************************************************************************
         *
         * 'Modify Hostname' Command Stages
         *
         * *************************************************************************/
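        /* Illustrative note (not from the original file) : for a hostname
         * modify, the new hostname is expected to arrive in the queued
         * command's 'name' field ; the DELETE_WAIT stage below reads
         * mtcCmd_work_fifo_ptr->name and swaps it into node_ptr->hostname
         * once the delete-side work queue has drained. */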
        case MTC_CMD_STAGE__MODIFY_HOSTNAME_START:
        {
            send_hbs_command   ( node_ptr->hostname, MTC_CMD_DEL_HOST );
            send_hwmon_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
            send_guest_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );

            mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, work_queue_timeout );

            node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__MODIFY_HOSTNAME_DELETE_WAIT ;

            break ;
        }
        case MTC_CMD_STAGE__MODIFY_HOSTNAME_DELETE_WAIT:
        {
            /* Are we still doing the delete work ? */
            if ( node_ptr->libEvent_work_fifo.size () == 0 )
            {
                string name = node_ptr->mtcCmd_work_fifo_ptr->name ;

                if ( node_ptr->mtcCmd_timer.tid )
                    mtcTimer_stop ( node_ptr->mtcCmd_timer );

                /* make the change */
                hostname_inventory.remove ( node_ptr->hostname );
                node_ptr->hostname = name ;
                hostname_inventory.push_back ( node_ptr->hostname );

                /* update the timer hostnames */
                node_ptr->mtcTimer.hostname = name ;
                node_ptr->mtcAlive_timer.hostname = name ;
                node_ptr->mtcSwact_timer.hostname = name ;
                node_ptr->mtcCmd_timer.hostname = name ;
                node_ptr->oosTestTimer.hostname = name ;
                node_ptr->insvTestTimer.hostname = name ;
                node_ptr->mtcConfig_timer.hostname = name ;

                mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, work_queue_timeout );

                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__MODIFY_HOSTNAME_CREATE_WAIT ;

                /* return RETRY so that the FSM reloads the inventory loop */
                return (RETRY);
            }
            if ( node_ptr->mtcCmd_timer.ring == true )
            {
                elog ("%s mtcCmd timeout ; purging host's work queue\n", node_ptr->hostname.c_str());
                workQueue_purge ( node_ptr );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }
        case MTC_CMD_STAGE__MODIFY_HOSTNAME_CREATE_WAIT:
        {
            /* Are we still doing the create work ? */
            if ( node_ptr->libEvent_work_fifo.size() == 0 )
            {
                if ( node_ptr->mtcCmd_timer.tid )
                    mtcTimer_stop ( node_ptr->mtcCmd_timer );

                send_hbs_command   ( node_ptr->hostname, MTC_CMD_ADD_HOST );
                send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
                send_guest_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );

                if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
                {
                    send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
                }
                node_ptr->mtcCmd_work_fifo_ptr->status = PASS ;
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            if ( node_ptr->mtcCmd_timer.ring == true )
            {
                elog ("%s hostname change failed\n", node_ptr->hostname.c_str());
                elog ("... workQueue empty timeout ; purging host's work queue\n");
                workQueue_purge ( node_ptr );
                node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ;
            }
            break ;
        }
        case MTC_CMD_STAGE__DONE:
        case MTC_CMD_STAGE__STAGES:
        default:
        {
            int size ;

            mtcTimer_reset ( node_ptr->mtcCmd_timer );

            if ( node_ptr->mtcCmd_work_fifo_ptr->status != PASS )
            {
                qlog ("%s Command '%s' (%d) Failed (Status:%d)\n",
                          node_ptr->hostname.c_str(),
                          _get_cmd_str(node_ptr->mtcCmd_work_fifo_ptr->cmd).c_str(),
                          node_ptr->mtcCmd_work_fifo_ptr->cmd,
                          node_ptr->mtcCmd_work_fifo_ptr->status );
            }
            else
            {
                qlog ("%s Command '%s' Completed\n", node_ptr->hostname.c_str(),
                          _get_cmd_str(node_ptr->mtcCmd_work_fifo_ptr->cmd).c_str());
            }
            if ( ( size = node_ptr->mtcCmd_done_fifo.size()) != 0 )
            {
                wlog ( "%s mtcCmd doneQ not empty (contains %d elements)\n",
                           node_ptr->hostname.c_str(), size );
                mtcCmd_doneQ_purge ( node_ptr );
            }
            node_ptr->mtcCmd_done_fifo.push_front(node_ptr->mtcCmd_work_fifo.front());
            node_ptr->mtcCmd_work_fifo.pop_front();
            break ;
        }
    }
    return (PASS);
}

/* ***********************************************************************
 *
 * Name : nodeLinkClass::mtcCmd_workQ_purge
 *
 * Description: Removes all items from the work queue.
 *
 */
int nodeLinkClass::mtcCmd_workQ_purge ( struct nodeLinkClass::node * node_ptr )
{
    int size = node_ptr->mtcCmd_work_fifo.size() ;
    if ( size )
    {
        wlog ("%s purging %d items from work queue\n", node_ptr->hostname.c_str(), size );
        for ( node_ptr->mtcCmd_work_fifo_ptr = node_ptr->mtcCmd_work_fifo.begin();
              node_ptr->mtcCmd_work_fifo_ptr != node_ptr->mtcCmd_work_fifo.end();
              node_ptr->mtcCmd_work_fifo_ptr++ )
        {
            wlog ("%s purging mtcCmd '%s' in stage %d from work queue\n",
                      node_ptr->hostname.c_str(),
                      _get_cmd_str(node_ptr->mtcCmd_work_fifo_ptr->cmd).c_str(),
                      node_ptr->mtcCmd_work_fifo_ptr->stage);
        }
        node_ptr->mtcCmd_work_fifo.clear();
    }
    else
    {
        qlog ("%s all work done\n", node_ptr->hostname.c_str());
    }
    return (PASS);
}

/* ***********************************************************************
 *
 * Name : nodeLinkClass::mtcCmd_doneQ_purge
 *
 * Description: Removes all items from the mtcCmd done queue.
 *
 *              Returns PASS if all completed commands passed ; otherwise
 *              returns the sequence number of the first command in the
 *              done queue that did not PASS.
 *
 */
int nodeLinkClass::mtcCmd_doneQ_purge ( struct nodeLinkClass::node * node_ptr )
{
    int rc = PASS ;
    int size = node_ptr->mtcCmd_done_fifo.size() ;
    if ( size )
    {
        int index = 0 ;
        for ( node_ptr->mtcCmd_done_fifo_ptr = node_ptr->mtcCmd_done_fifo.begin();
              node_ptr->mtcCmd_done_fifo_ptr != node_ptr->mtcCmd_done_fifo.end();
              node_ptr->mtcCmd_done_fifo_ptr++ )
        {
            index++ ;
            if ( node_ptr->mtcCmd_done_fifo_ptr->status )
            {
                dlog ("%s mtcCmd:%d failed (status:%d) (%d of %d)\n",
                          node_ptr->hostname.c_str(),
                          node_ptr->mtcCmd_done_fifo_ptr->cmd,
                          node_ptr->mtcCmd_done_fifo_ptr->status,
                          index, size);
                /* Save the sequence number of the first failed command */
                if ( rc == PASS )
                {
                    rc = node_ptr->mtcCmd_done_fifo_ptr->seq ;
                }
            }
        }
        if ( rc == PASS )
        {
            dlog ("%s all (%d) mtcCmd operations passed\n", node_ptr->hostname.c_str(), size );
        }

        qlog ("%s purging %d items from done queue\n", node_ptr->hostname.c_str(), size );
        node_ptr->mtcCmd_done_fifo.clear();
    }
    return (rc);
}