Support custom mcagent timeout

* support Timeout::Error for case when agent off by timeout;
* add data about the relationship between the timeout limits
specified in the DDL and in particular running agent;
* increase limit for execute_shell_command magent;
* refactoring.

Change-Id: Ib259ec5c59fd8f903839e8ca487480c7ea29cb38
Closes-Bug: #1268701
This commit is contained in:
Vladimir 2014-01-15 16:30:06 +04:00
parent 34cf0019cb
commit f967bf7d64
9 changed files with 59 additions and 35 deletions

View File

@ -61,6 +61,8 @@ module Astute
conf[:PUPPET_FADE_INTERVAL] = 10 # retry every ## seconds to check puppet state if it was running
conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision
conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot
conf[:DUMP_TIMEOUT] = 3600 # maximum time it waits for the dump (meaningles to be larger
# than the specified in timeout of execute_shell_command mcagent
conf[:REDHAT_CHECK_CREDENTIALS_TIMEOUT] = 30 # checking redhat credentials througs mcollective
conf[:REDHAT_GET_LICENSES_POOL_TIMEOUT] = 60 # getting redhat licenses through mcollective

View File

@ -16,28 +16,26 @@
module Astute
module Dump
def self.dump_environment(ctx, lastdump)
timeout = 500
shell = MClient.new(ctx, 'execute_shell_command', ['master'])
timeout = Astute.config.DUMP_TIMEOUT
shell = MClient.new(ctx, 'execute_shell_command', ['master'], check_result=true, timeout=timeout, retries=1)
begin
Timeout.timeout(timeout) do
result = shell.execute(
:cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first
Astute.logger.debug("#{ctx.task_id}: \
result = shell.execute(
:cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first
Astute.logger.debug("#{ctx.task_id}: \
stdout: #{result[:data][:stdout]} stderr: #{result[:data][:stderr]} \
exit code: #{result[:data][:exit_code]}")
if result[:data][:exit_code] == 0
Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}")
report_success(ctx, result[:data][:stdout].rstrip)
else
Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code")
report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}")
end
if result[:data][:exit_code] == 0
Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}")
report_success(ctx, result[:data][:stdout].rstrip)
else
Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code")
report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}")
end
rescue Timeout::Error
msg = "Dump is timed out"
Astute.logger.error("#{ctx.task_id}: #{msg}")
report_error(ctx, msg)
rescue Exception => e
rescue => e
msg = "Exception occured during dump task: message: #{e.message} \
trace: #{e.backtrace.inspect}"
Astute.logger.error("#{ctx.task_id}: #{msg}")

View File

@ -25,5 +25,7 @@ module Astute
class FailedToRebootNodesError < AstuteError; end
# Deployment engine error
class DeploymentEngineError < AstuteError; end
# MClient errors
class MClientError < AstuteError; end
end

View File

@ -14,20 +14,29 @@
require 'mcollective'
require 'timeout'
module Astute
class MClientTimeout < Timeout::Error; end
class MClient
include MCollective::RPC
attr_accessor :retries
def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil)
def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil, retries=Astute.config.MC_RETRIES)
@task_id = ctx.task_id
@agent = agent
@nodes = nodes.map { |n| n.to_s } if nodes
@check_result = check_result
@retries = Astute.config.MC_RETRIES
#FIXME: this timeout does not work
# Will be used a minimum of two things: the specified parameter(timeout)
# and timeout from DDL (10 sec by default if not explicitly specified in DDL)
# If timeout here is nil will be used value from DDL.
# Examples:
# timeout - 10 sec, DDL - 20 sec. Result — 10 sec.
# timeout - 30 sec, DDL - 20 sec. Result — 20 sec.
# timeout - 20 sec, DDL - not set. Result — 10 sec.
@timeout = timeout
initialize_mclient
end
@ -57,6 +66,7 @@ module Astute
def check_results_with_retries(method, args)
err_msg = ''
timeout_nodes_count = 0
# Following error might happen because of misconfiguration, ex. direct_addressing = 1 only on client
# or.. could be just some hang? Let's retry if @retries is set
if @mc_res.length < @nodes.length
@ -81,11 +91,12 @@ module Astute
if @on_respond_timeout
@on_respond_timeout.call not_responded
else
err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond. \n"
err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond within the allotted time.\n"
timeout_nodes_count += not_responded.size
end
end
end
failed = @mc_res.select{|x| x.results[:statuscode] != 0 }
failed = @mc_res.select { |x| x.results[:statuscode] != 0 }
if failed.any?
err_msg += "MCollective call failed in agent '#{@agent}', "\
"method '#{method}', failed nodes: \n"
@ -93,9 +104,15 @@ module Astute
err_msg += "ID: #{n.results[:sender]} - Reason: #{n.results[:statusmsg]}\n"
end
end
unless err_msg.empty?
if err_msg.present?
Astute.logger.error err_msg
raise "#{@task_id}: #{err_msg}"
expired_size = failed.count { |n| n.results[:statusmsg] == 'execution expired' }
# Detect TimeOut: 1 condition - fail because of DDL timeout, 2 - fail because of custom timeout
if (failed.present? && failed.size == expired_size) || (timeout_nodes_count > 0 && failed.empty?)
raise MClientTimeout, "#{@task_id}: #{err_msg}"
else
raise MClientError, "#{@task_id}: #{err_msg}"
end
end
end

View File

@ -306,7 +306,12 @@ module Astute
end
def run_shell_command(context, node_uids, cmd)
shell = MClient.new(context, 'execute_shell_command', node_uids)
shell = MClient.new(context,
'execute_shell_command',
node_uids,
check_result=true,
timeout=60,
retries=1)
response = shell.execute(:cmd => cmd).first
Astute.logger.debug("#{context.task_id}: cmd: #{cmd}
stdout: #{response[:data][:stdout]}

View File

@ -149,13 +149,11 @@ module Astute
end
def exec_cmd_with_timeout(cmd, timeout, timeout_expired_msg)
shell = MClient.new(@ctx, 'execute_shell_command', ['master'])
shell = MClient.new(@ctx, 'execute_shell_command', ['master'], check_result=true, timeout, retries=1)
begin
Timeout.timeout(timeout) do
response = shell.execute(:cmd => cmd).first
report_error(timeout_expired_msg) unless response
return response
end
response = shell.execute(:cmd => cmd).first
report_error(timeout_expired_msg) unless response
return response
rescue Timeout::Error
Astute.logger.warn("Time out error for shell command '#{cmd}'")
report_error(timeout_expired_msg)

View File

@ -4,7 +4,7 @@ metadata :name => "Execute shell command",
:license => "Apache License 2.0",
:version => "0.0.1",
:url => "http://mirantis.com",
:timeout => 600
:timeout => 3600
action "execute", :description => "Execute shell command" do

View File

@ -33,7 +33,7 @@ describe 'dump_environment' do
end
Astute::MClient = mock() do
expects(:new).with(ctx, 'execute_shell_command', ['master']).returns(agent)
expects(:new).with(ctx, 'execute_shell_command', ['master'], true, Astute.config.DUMP_TIMEOUT, 1).returns(agent)
end
Astute::Dump.dump_environment(ctx, lastdump)
end
@ -73,7 +73,7 @@ describe 'dump_environment' do
it "should report error if any other exception occured" do
agent = mock() do
expects(:execute).raises(Exception, "MESSAGE")
expects(:execute).raises(StandardError , "MESSAGE")
end
Astute::MClient = mock() do
stubs(:new).returns(agent)

View File

@ -65,11 +65,12 @@ describe MClient do
mc_valid_result2 = mock_mc_result({:sender => '2'})
rpcclient.stubs(:echo).returns([mc_valid_result]).then.
returns([mc_valid_result2]).then
returns([mc_valid_result2])
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
mclient.retries = 1
expect { mclient.echo(:msg => 'hello world') }.to raise_error(/MCollective agents '3' didn't respond./)
expect { mclient.echo(:msg => 'hello world') }.to \
raise_error(Astute::MClientTimeout, /MCollective agents '3' didn't respond./)
end
it "should raise error if agent returns statuscode != 0" do
@ -85,11 +86,12 @@ describe MClient do
mc_failed_result = mock_mc_result({:sender => '2', :statuscode => 1})
rpcclient.stubs(:echo).returns([mc_valid_result]).then.
returns([mc_failed_result]).then
returns([mc_failed_result]).then.
returns([mc_failed_result])
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
mclient.retries = 1
expect { mclient.echo(:msg => 'hello world') }.to \
raise_error(/MCollective agents '3' didn't respond./)
raise_error(Astute::MClientError, /ID: 2 - Reason:/)
end
end