Support custom mcagent timeout
* support Timeout::Error for case when agent off by timeout; * add data about the relationship between the timeout limits specified in the DDL and in particular running agent; * increase limit for execute_shell_command magent; * refactoring. Change-Id: Ib259ec5c59fd8f903839e8ca487480c7ea29cb38 Closes-Bug: #1268701
This commit is contained in:
parent
34cf0019cb
commit
f967bf7d64
|
@ -61,6 +61,8 @@ module Astute
|
|||
conf[:PUPPET_FADE_INTERVAL] = 10 # retry every ## seconds to check puppet state if it was running
|
||||
conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision
|
||||
conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot
|
||||
conf[:DUMP_TIMEOUT] = 3600 # maximum time it waits for the dump (meaningles to be larger
|
||||
# than the specified in timeout of execute_shell_command mcagent
|
||||
|
||||
conf[:REDHAT_CHECK_CREDENTIALS_TIMEOUT] = 30 # checking redhat credentials througs mcollective
|
||||
conf[:REDHAT_GET_LICENSES_POOL_TIMEOUT] = 60 # getting redhat licenses through mcollective
|
||||
|
|
|
@ -16,28 +16,26 @@
|
|||
module Astute
|
||||
module Dump
|
||||
def self.dump_environment(ctx, lastdump)
|
||||
timeout = 500
|
||||
shell = MClient.new(ctx, 'execute_shell_command', ['master'])
|
||||
timeout = Astute.config.DUMP_TIMEOUT
|
||||
shell = MClient.new(ctx, 'execute_shell_command', ['master'], check_result=true, timeout=timeout, retries=1)
|
||||
begin
|
||||
Timeout.timeout(timeout) do
|
||||
result = shell.execute(
|
||||
:cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first
|
||||
Astute.logger.debug("#{ctx.task_id}: \
|
||||
result = shell.execute(
|
||||
:cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first
|
||||
Astute.logger.debug("#{ctx.task_id}: \
|
||||
stdout: #{result[:data][:stdout]} stderr: #{result[:data][:stderr]} \
|
||||
exit code: #{result[:data][:exit_code]}")
|
||||
if result[:data][:exit_code] == 0
|
||||
Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}")
|
||||
report_success(ctx, result[:data][:stdout].rstrip)
|
||||
else
|
||||
Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code")
|
||||
report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}")
|
||||
end
|
||||
if result[:data][:exit_code] == 0
|
||||
Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}")
|
||||
report_success(ctx, result[:data][:stdout].rstrip)
|
||||
else
|
||||
Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code")
|
||||
report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}")
|
||||
end
|
||||
rescue Timeout::Error
|
||||
msg = "Dump is timed out"
|
||||
Astute.logger.error("#{ctx.task_id}: #{msg}")
|
||||
report_error(ctx, msg)
|
||||
rescue Exception => e
|
||||
rescue => e
|
||||
msg = "Exception occured during dump task: message: #{e.message} \
|
||||
trace: #{e.backtrace.inspect}"
|
||||
Astute.logger.error("#{ctx.task_id}: #{msg}")
|
||||
|
|
|
@ -25,5 +25,7 @@ module Astute
|
|||
class FailedToRebootNodesError < AstuteError; end
|
||||
# Deployment engine error
|
||||
class DeploymentEngineError < AstuteError; end
|
||||
# MClient errors
|
||||
class MClientError < AstuteError; end
|
||||
|
||||
end
|
||||
|
|
|
@ -14,20 +14,29 @@
|
|||
|
||||
|
||||
require 'mcollective'
|
||||
require 'timeout'
|
||||
|
||||
module Astute
|
||||
|
||||
class MClientTimeout < Timeout::Error; end
|
||||
|
||||
class MClient
|
||||
include MCollective::RPC
|
||||
|
||||
attr_accessor :retries
|
||||
|
||||
def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil)
|
||||
def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil, retries=Astute.config.MC_RETRIES)
|
||||
@task_id = ctx.task_id
|
||||
@agent = agent
|
||||
@nodes = nodes.map { |n| n.to_s } if nodes
|
||||
@check_result = check_result
|
||||
@retries = Astute.config.MC_RETRIES
|
||||
#FIXME: this timeout does not work
|
||||
# Will be used a minimum of two things: the specified parameter(timeout)
|
||||
# and timeout from DDL (10 sec by default if not explicitly specified in DDL)
|
||||
# If timeout here is nil will be used value from DDL.
|
||||
# Examples:
|
||||
# timeout - 10 sec, DDL - 20 sec. Result — 10 sec.
|
||||
# timeout - 30 sec, DDL - 20 sec. Result — 20 sec.
|
||||
# timeout - 20 sec, DDL - not set. Result — 10 sec.
|
||||
@timeout = timeout
|
||||
initialize_mclient
|
||||
end
|
||||
|
@ -57,6 +66,7 @@ module Astute
|
|||
|
||||
def check_results_with_retries(method, args)
|
||||
err_msg = ''
|
||||
timeout_nodes_count = 0
|
||||
# Following error might happen because of misconfiguration, ex. direct_addressing = 1 only on client
|
||||
# or.. could be just some hang? Let's retry if @retries is set
|
||||
if @mc_res.length < @nodes.length
|
||||
|
@ -81,11 +91,12 @@ module Astute
|
|||
if @on_respond_timeout
|
||||
@on_respond_timeout.call not_responded
|
||||
else
|
||||
err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond. \n"
|
||||
err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond within the allotted time.\n"
|
||||
timeout_nodes_count += not_responded.size
|
||||
end
|
||||
end
|
||||
end
|
||||
failed = @mc_res.select{|x| x.results[:statuscode] != 0 }
|
||||
failed = @mc_res.select { |x| x.results[:statuscode] != 0 }
|
||||
if failed.any?
|
||||
err_msg += "MCollective call failed in agent '#{@agent}', "\
|
||||
"method '#{method}', failed nodes: \n"
|
||||
|
@ -93,9 +104,15 @@ module Astute
|
|||
err_msg += "ID: #{n.results[:sender]} - Reason: #{n.results[:statusmsg]}\n"
|
||||
end
|
||||
end
|
||||
unless err_msg.empty?
|
||||
if err_msg.present?
|
||||
Astute.logger.error err_msg
|
||||
raise "#{@task_id}: #{err_msg}"
|
||||
expired_size = failed.count { |n| n.results[:statusmsg] == 'execution expired' }
|
||||
# Detect TimeOut: 1 condition - fail because of DDL timeout, 2 - fail because of custom timeout
|
||||
if (failed.present? && failed.size == expired_size) || (timeout_nodes_count > 0 && failed.empty?)
|
||||
raise MClientTimeout, "#{@task_id}: #{err_msg}"
|
||||
else
|
||||
raise MClientError, "#{@task_id}: #{err_msg}"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -306,7 +306,12 @@ module Astute
|
|||
end
|
||||
|
||||
def run_shell_command(context, node_uids, cmd)
|
||||
shell = MClient.new(context, 'execute_shell_command', node_uids)
|
||||
shell = MClient.new(context,
|
||||
'execute_shell_command',
|
||||
node_uids,
|
||||
check_result=true,
|
||||
timeout=60,
|
||||
retries=1)
|
||||
response = shell.execute(:cmd => cmd).first
|
||||
Astute.logger.debug("#{context.task_id}: cmd: #{cmd}
|
||||
stdout: #{response[:data][:stdout]}
|
||||
|
|
|
@ -149,13 +149,11 @@ module Astute
|
|||
end
|
||||
|
||||
def exec_cmd_with_timeout(cmd, timeout, timeout_expired_msg)
|
||||
shell = MClient.new(@ctx, 'execute_shell_command', ['master'])
|
||||
shell = MClient.new(@ctx, 'execute_shell_command', ['master'], check_result=true, timeout, retries=1)
|
||||
begin
|
||||
Timeout.timeout(timeout) do
|
||||
response = shell.execute(:cmd => cmd).first
|
||||
report_error(timeout_expired_msg) unless response
|
||||
return response
|
||||
end
|
||||
response = shell.execute(:cmd => cmd).first
|
||||
report_error(timeout_expired_msg) unless response
|
||||
return response
|
||||
rescue Timeout::Error
|
||||
Astute.logger.warn("Time out error for shell command '#{cmd}'")
|
||||
report_error(timeout_expired_msg)
|
||||
|
|
|
@ -4,7 +4,7 @@ metadata :name => "Execute shell command",
|
|||
:license => "Apache License 2.0",
|
||||
:version => "0.0.1",
|
||||
:url => "http://mirantis.com",
|
||||
:timeout => 600
|
||||
:timeout => 3600
|
||||
|
||||
action "execute", :description => "Execute shell command" do
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ describe 'dump_environment' do
|
|||
end
|
||||
|
||||
Astute::MClient = mock() do
|
||||
expects(:new).with(ctx, 'execute_shell_command', ['master']).returns(agent)
|
||||
expects(:new).with(ctx, 'execute_shell_command', ['master'], true, Astute.config.DUMP_TIMEOUT, 1).returns(agent)
|
||||
end
|
||||
Astute::Dump.dump_environment(ctx, lastdump)
|
||||
end
|
||||
|
@ -73,7 +73,7 @@ describe 'dump_environment' do
|
|||
|
||||
it "should report error if any other exception occured" do
|
||||
agent = mock() do
|
||||
expects(:execute).raises(Exception, "MESSAGE")
|
||||
expects(:execute).raises(StandardError , "MESSAGE")
|
||||
end
|
||||
Astute::MClient = mock() do
|
||||
stubs(:new).returns(agent)
|
||||
|
|
|
@ -65,11 +65,12 @@ describe MClient do
|
|||
mc_valid_result2 = mock_mc_result({:sender => '2'})
|
||||
|
||||
rpcclient.stubs(:echo).returns([mc_valid_result]).then.
|
||||
returns([mc_valid_result2]).then
|
||||
returns([mc_valid_result2])
|
||||
|
||||
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
|
||||
mclient.retries = 1
|
||||
expect { mclient.echo(:msg => 'hello world') }.to raise_error(/MCollective agents '3' didn't respond./)
|
||||
expect { mclient.echo(:msg => 'hello world') }.to \
|
||||
raise_error(Astute::MClientTimeout, /MCollective agents '3' didn't respond./)
|
||||
end
|
||||
|
||||
it "should raise error if agent returns statuscode != 0" do
|
||||
|
@ -85,11 +86,12 @@ describe MClient do
|
|||
mc_failed_result = mock_mc_result({:sender => '2', :statuscode => 1})
|
||||
|
||||
rpcclient.stubs(:echo).returns([mc_valid_result]).then.
|
||||
returns([mc_failed_result]).then
|
||||
returns([mc_failed_result]).then.
|
||||
returns([mc_failed_result])
|
||||
|
||||
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
|
||||
mclient.retries = 1
|
||||
expect { mclient.echo(:msg => 'hello world') }.to \
|
||||
raise_error(/MCollective agents '3' didn't respond./)
|
||||
raise_error(Astute::MClientError, /ID: 2 - Reason:/)
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue