Support custom mcagent timeout

* support Timeout::Error for case when agent off by timeout;
* add data about the relationship between the timeout limits
specified in the DDL and in particular running agent;
* increase limit for execute_shell_command magent;
* refactoring.

Change-Id: Ib259ec5c59fd8f903839e8ca487480c7ea29cb38
Closes-Bug: #1268701
This commit is contained in:
Vladimir 2014-01-15 16:30:06 +04:00
parent 34cf0019cb
commit f967bf7d64
9 changed files with 59 additions and 35 deletions

View File

@ -61,6 +61,8 @@ module Astute
conf[:PUPPET_FADE_INTERVAL] = 10 # retry every ## seconds to check puppet state if it was running conf[:PUPPET_FADE_INTERVAL] = 10 # retry every ## seconds to check puppet state if it was running
conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision
conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot
conf[:DUMP_TIMEOUT] = 3600 # maximum time it waits for the dump (meaningles to be larger
# than the specified in timeout of execute_shell_command mcagent
conf[:REDHAT_CHECK_CREDENTIALS_TIMEOUT] = 30 # checking redhat credentials througs mcollective conf[:REDHAT_CHECK_CREDENTIALS_TIMEOUT] = 30 # checking redhat credentials througs mcollective
conf[:REDHAT_GET_LICENSES_POOL_TIMEOUT] = 60 # getting redhat licenses through mcollective conf[:REDHAT_GET_LICENSES_POOL_TIMEOUT] = 60 # getting redhat licenses through mcollective

View File

@ -16,28 +16,26 @@
module Astute module Astute
module Dump module Dump
def self.dump_environment(ctx, lastdump) def self.dump_environment(ctx, lastdump)
timeout = 500 timeout = Astute.config.DUMP_TIMEOUT
shell = MClient.new(ctx, 'execute_shell_command', ['master']) shell = MClient.new(ctx, 'execute_shell_command', ['master'], check_result=true, timeout=timeout, retries=1)
begin begin
Timeout.timeout(timeout) do result = shell.execute(
result = shell.execute( :cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first
:cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first Astute.logger.debug("#{ctx.task_id}: \
Astute.logger.debug("#{ctx.task_id}: \
stdout: #{result[:data][:stdout]} stderr: #{result[:data][:stderr]} \ stdout: #{result[:data][:stdout]} stderr: #{result[:data][:stderr]} \
exit code: #{result[:data][:exit_code]}") exit code: #{result[:data][:exit_code]}")
if result[:data][:exit_code] == 0 if result[:data][:exit_code] == 0
Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}") Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}")
report_success(ctx, result[:data][:stdout].rstrip) report_success(ctx, result[:data][:stdout].rstrip)
else else
Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code") Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code")
report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}") report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}")
end
end end
rescue Timeout::Error rescue Timeout::Error
msg = "Dump is timed out" msg = "Dump is timed out"
Astute.logger.error("#{ctx.task_id}: #{msg}") Astute.logger.error("#{ctx.task_id}: #{msg}")
report_error(ctx, msg) report_error(ctx, msg)
rescue Exception => e rescue => e
msg = "Exception occured during dump task: message: #{e.message} \ msg = "Exception occured during dump task: message: #{e.message} \
trace: #{e.backtrace.inspect}" trace: #{e.backtrace.inspect}"
Astute.logger.error("#{ctx.task_id}: #{msg}") Astute.logger.error("#{ctx.task_id}: #{msg}")

View File

@ -25,5 +25,7 @@ module Astute
class FailedToRebootNodesError < AstuteError; end class FailedToRebootNodesError < AstuteError; end
# Deployment engine error # Deployment engine error
class DeploymentEngineError < AstuteError; end class DeploymentEngineError < AstuteError; end
# MClient errors
class MClientError < AstuteError; end
end end

View File

@ -14,20 +14,29 @@
require 'mcollective' require 'mcollective'
require 'timeout'
module Astute module Astute
class MClientTimeout < Timeout::Error; end
class MClient class MClient
include MCollective::RPC include MCollective::RPC
attr_accessor :retries attr_accessor :retries
def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil) def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil, retries=Astute.config.MC_RETRIES)
@task_id = ctx.task_id @task_id = ctx.task_id
@agent = agent @agent = agent
@nodes = nodes.map { |n| n.to_s } if nodes @nodes = nodes.map { |n| n.to_s } if nodes
@check_result = check_result @check_result = check_result
@retries = Astute.config.MC_RETRIES # Will be used a minimum of two things: the specified parameter(timeout)
#FIXME: this timeout does not work # and timeout from DDL (10 sec by default if not explicitly specified in DDL)
# If timeout here is nil will be used value from DDL.
# Examples:
# timeout - 10 sec, DDL - 20 sec. Result — 10 sec.
# timeout - 30 sec, DDL - 20 sec. Result — 20 sec.
# timeout - 20 sec, DDL - not set. Result — 10 sec.
@timeout = timeout @timeout = timeout
initialize_mclient initialize_mclient
end end
@ -57,6 +66,7 @@ module Astute
def check_results_with_retries(method, args) def check_results_with_retries(method, args)
err_msg = '' err_msg = ''
timeout_nodes_count = 0
# Following error might happen because of misconfiguration, ex. direct_addressing = 1 only on client # Following error might happen because of misconfiguration, ex. direct_addressing = 1 only on client
# or.. could be just some hang? Let's retry if @retries is set # or.. could be just some hang? Let's retry if @retries is set
if @mc_res.length < @nodes.length if @mc_res.length < @nodes.length
@ -81,11 +91,12 @@ module Astute
if @on_respond_timeout if @on_respond_timeout
@on_respond_timeout.call not_responded @on_respond_timeout.call not_responded
else else
err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond. \n" err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond within the allotted time.\n"
timeout_nodes_count += not_responded.size
end end
end end
end end
failed = @mc_res.select{|x| x.results[:statuscode] != 0 } failed = @mc_res.select { |x| x.results[:statuscode] != 0 }
if failed.any? if failed.any?
err_msg += "MCollective call failed in agent '#{@agent}', "\ err_msg += "MCollective call failed in agent '#{@agent}', "\
"method '#{method}', failed nodes: \n" "method '#{method}', failed nodes: \n"
@ -93,9 +104,15 @@ module Astute
err_msg += "ID: #{n.results[:sender]} - Reason: #{n.results[:statusmsg]}\n" err_msg += "ID: #{n.results[:sender]} - Reason: #{n.results[:statusmsg]}\n"
end end
end end
unless err_msg.empty? if err_msg.present?
Astute.logger.error err_msg Astute.logger.error err_msg
raise "#{@task_id}: #{err_msg}" expired_size = failed.count { |n| n.results[:statusmsg] == 'execution expired' }
# Detect TimeOut: 1 condition - fail because of DDL timeout, 2 - fail because of custom timeout
if (failed.present? && failed.size == expired_size) || (timeout_nodes_count > 0 && failed.empty?)
raise MClientTimeout, "#{@task_id}: #{err_msg}"
else
raise MClientError, "#{@task_id}: #{err_msg}"
end
end end
end end

View File

@ -306,7 +306,12 @@ module Astute
end end
def run_shell_command(context, node_uids, cmd) def run_shell_command(context, node_uids, cmd)
shell = MClient.new(context, 'execute_shell_command', node_uids) shell = MClient.new(context,
'execute_shell_command',
node_uids,
check_result=true,
timeout=60,
retries=1)
response = shell.execute(:cmd => cmd).first response = shell.execute(:cmd => cmd).first
Astute.logger.debug("#{context.task_id}: cmd: #{cmd} Astute.logger.debug("#{context.task_id}: cmd: #{cmd}
stdout: #{response[:data][:stdout]} stdout: #{response[:data][:stdout]}

View File

@ -149,13 +149,11 @@ module Astute
end end
def exec_cmd_with_timeout(cmd, timeout, timeout_expired_msg) def exec_cmd_with_timeout(cmd, timeout, timeout_expired_msg)
shell = MClient.new(@ctx, 'execute_shell_command', ['master']) shell = MClient.new(@ctx, 'execute_shell_command', ['master'], check_result=true, timeout, retries=1)
begin begin
Timeout.timeout(timeout) do response = shell.execute(:cmd => cmd).first
response = shell.execute(:cmd => cmd).first report_error(timeout_expired_msg) unless response
report_error(timeout_expired_msg) unless response return response
return response
end
rescue Timeout::Error rescue Timeout::Error
Astute.logger.warn("Time out error for shell command '#{cmd}'") Astute.logger.warn("Time out error for shell command '#{cmd}'")
report_error(timeout_expired_msg) report_error(timeout_expired_msg)

View File

@ -4,7 +4,7 @@ metadata :name => "Execute shell command",
:license => "Apache License 2.0", :license => "Apache License 2.0",
:version => "0.0.1", :version => "0.0.1",
:url => "http://mirantis.com", :url => "http://mirantis.com",
:timeout => 600 :timeout => 3600
action "execute", :description => "Execute shell command" do action "execute", :description => "Execute shell command" do

View File

@ -33,7 +33,7 @@ describe 'dump_environment' do
end end
Astute::MClient = mock() do Astute::MClient = mock() do
expects(:new).with(ctx, 'execute_shell_command', ['master']).returns(agent) expects(:new).with(ctx, 'execute_shell_command', ['master'], true, Astute.config.DUMP_TIMEOUT, 1).returns(agent)
end end
Astute::Dump.dump_environment(ctx, lastdump) Astute::Dump.dump_environment(ctx, lastdump)
end end
@ -73,7 +73,7 @@ describe 'dump_environment' do
it "should report error if any other exception occured" do it "should report error if any other exception occured" do
agent = mock() do agent = mock() do
expects(:execute).raises(Exception, "MESSAGE") expects(:execute).raises(StandardError , "MESSAGE")
end end
Astute::MClient = mock() do Astute::MClient = mock() do
stubs(:new).returns(agent) stubs(:new).returns(agent)

View File

@ -65,11 +65,12 @@ describe MClient do
mc_valid_result2 = mock_mc_result({:sender => '2'}) mc_valid_result2 = mock_mc_result({:sender => '2'})
rpcclient.stubs(:echo).returns([mc_valid_result]).then. rpcclient.stubs(:echo).returns([mc_valid_result]).then.
returns([mc_valid_result2]).then returns([mc_valid_result2])
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']}) mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
mclient.retries = 1 mclient.retries = 1
expect { mclient.echo(:msg => 'hello world') }.to raise_error(/MCollective agents '3' didn't respond./) expect { mclient.echo(:msg => 'hello world') }.to \
raise_error(Astute::MClientTimeout, /MCollective agents '3' didn't respond./)
end end
it "should raise error if agent returns statuscode != 0" do it "should raise error if agent returns statuscode != 0" do
@ -85,11 +86,12 @@ describe MClient do
mc_failed_result = mock_mc_result({:sender => '2', :statuscode => 1}) mc_failed_result = mock_mc_result({:sender => '2', :statuscode => 1})
rpcclient.stubs(:echo).returns([mc_valid_result]).then. rpcclient.stubs(:echo).returns([mc_valid_result]).then.
returns([mc_failed_result]).then returns([mc_failed_result]).then.
returns([mc_failed_result])
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']}) mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
mclient.retries = 1 mclient.retries = 1
expect { mclient.echo(:msg => 'hello world') }.to \ expect { mclient.echo(:msg => 'hello world') }.to \
raise_error(/MCollective agents '3' didn't respond./) raise_error(Astute::MClientError, /ID: 2 - Reason:/)
end end
end end