Support custom mcagent timeout
* support Timeout::Error for case when agent off by timeout; * add data about the relationship between the timeout limits specified in the DDL and in particular running agent; * increase limit for execute_shell_command magent; * refactoring. Change-Id: Ib259ec5c59fd8f903839e8ca487480c7ea29cb38 Closes-Bug: #1268701
This commit is contained in:
parent
34cf0019cb
commit
f967bf7d64
|
@ -61,6 +61,8 @@ module Astute
|
||||||
conf[:PUPPET_FADE_INTERVAL] = 10 # retry every ## seconds to check puppet state if it was running
|
conf[:PUPPET_FADE_INTERVAL] = 10 # retry every ## seconds to check puppet state if it was running
|
||||||
conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision
|
conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision
|
||||||
conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot
|
conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot
|
||||||
|
conf[:DUMP_TIMEOUT] = 3600 # maximum time it waits for the dump (meaningles to be larger
|
||||||
|
# than the specified in timeout of execute_shell_command mcagent
|
||||||
|
|
||||||
conf[:REDHAT_CHECK_CREDENTIALS_TIMEOUT] = 30 # checking redhat credentials througs mcollective
|
conf[:REDHAT_CHECK_CREDENTIALS_TIMEOUT] = 30 # checking redhat credentials througs mcollective
|
||||||
conf[:REDHAT_GET_LICENSES_POOL_TIMEOUT] = 60 # getting redhat licenses through mcollective
|
conf[:REDHAT_GET_LICENSES_POOL_TIMEOUT] = 60 # getting redhat licenses through mcollective
|
||||||
|
|
|
@ -16,28 +16,26 @@
|
||||||
module Astute
|
module Astute
|
||||||
module Dump
|
module Dump
|
||||||
def self.dump_environment(ctx, lastdump)
|
def self.dump_environment(ctx, lastdump)
|
||||||
timeout = 500
|
timeout = Astute.config.DUMP_TIMEOUT
|
||||||
shell = MClient.new(ctx, 'execute_shell_command', ['master'])
|
shell = MClient.new(ctx, 'execute_shell_command', ['master'], check_result=true, timeout=timeout, retries=1)
|
||||||
begin
|
begin
|
||||||
Timeout.timeout(timeout) do
|
result = shell.execute(
|
||||||
result = shell.execute(
|
:cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first
|
||||||
:cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first
|
Astute.logger.debug("#{ctx.task_id}: \
|
||||||
Astute.logger.debug("#{ctx.task_id}: \
|
|
||||||
stdout: #{result[:data][:stdout]} stderr: #{result[:data][:stderr]} \
|
stdout: #{result[:data][:stdout]} stderr: #{result[:data][:stderr]} \
|
||||||
exit code: #{result[:data][:exit_code]}")
|
exit code: #{result[:data][:exit_code]}")
|
||||||
if result[:data][:exit_code] == 0
|
if result[:data][:exit_code] == 0
|
||||||
Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}")
|
Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}")
|
||||||
report_success(ctx, result[:data][:stdout].rstrip)
|
report_success(ctx, result[:data][:stdout].rstrip)
|
||||||
else
|
else
|
||||||
Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code")
|
Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code")
|
||||||
report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}")
|
report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}")
|
||||||
end
|
|
||||||
end
|
end
|
||||||
rescue Timeout::Error
|
rescue Timeout::Error
|
||||||
msg = "Dump is timed out"
|
msg = "Dump is timed out"
|
||||||
Astute.logger.error("#{ctx.task_id}: #{msg}")
|
Astute.logger.error("#{ctx.task_id}: #{msg}")
|
||||||
report_error(ctx, msg)
|
report_error(ctx, msg)
|
||||||
rescue Exception => e
|
rescue => e
|
||||||
msg = "Exception occured during dump task: message: #{e.message} \
|
msg = "Exception occured during dump task: message: #{e.message} \
|
||||||
trace: #{e.backtrace.inspect}"
|
trace: #{e.backtrace.inspect}"
|
||||||
Astute.logger.error("#{ctx.task_id}: #{msg}")
|
Astute.logger.error("#{ctx.task_id}: #{msg}")
|
||||||
|
|
|
@ -25,5 +25,7 @@ module Astute
|
||||||
class FailedToRebootNodesError < AstuteError; end
|
class FailedToRebootNodesError < AstuteError; end
|
||||||
# Deployment engine error
|
# Deployment engine error
|
||||||
class DeploymentEngineError < AstuteError; end
|
class DeploymentEngineError < AstuteError; end
|
||||||
|
# MClient errors
|
||||||
|
class MClientError < AstuteError; end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -14,20 +14,29 @@
|
||||||
|
|
||||||
|
|
||||||
require 'mcollective'
|
require 'mcollective'
|
||||||
|
require 'timeout'
|
||||||
|
|
||||||
module Astute
|
module Astute
|
||||||
|
|
||||||
|
class MClientTimeout < Timeout::Error; end
|
||||||
|
|
||||||
class MClient
|
class MClient
|
||||||
include MCollective::RPC
|
include MCollective::RPC
|
||||||
|
|
||||||
attr_accessor :retries
|
attr_accessor :retries
|
||||||
|
|
||||||
def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil)
|
def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil, retries=Astute.config.MC_RETRIES)
|
||||||
@task_id = ctx.task_id
|
@task_id = ctx.task_id
|
||||||
@agent = agent
|
@agent = agent
|
||||||
@nodes = nodes.map { |n| n.to_s } if nodes
|
@nodes = nodes.map { |n| n.to_s } if nodes
|
||||||
@check_result = check_result
|
@check_result = check_result
|
||||||
@retries = Astute.config.MC_RETRIES
|
# Will be used a minimum of two things: the specified parameter(timeout)
|
||||||
#FIXME: this timeout does not work
|
# and timeout from DDL (10 sec by default if not explicitly specified in DDL)
|
||||||
|
# If timeout here is nil will be used value from DDL.
|
||||||
|
# Examples:
|
||||||
|
# timeout - 10 sec, DDL - 20 sec. Result — 10 sec.
|
||||||
|
# timeout - 30 sec, DDL - 20 sec. Result — 20 sec.
|
||||||
|
# timeout - 20 sec, DDL - not set. Result — 10 sec.
|
||||||
@timeout = timeout
|
@timeout = timeout
|
||||||
initialize_mclient
|
initialize_mclient
|
||||||
end
|
end
|
||||||
|
@ -57,6 +66,7 @@ module Astute
|
||||||
|
|
||||||
def check_results_with_retries(method, args)
|
def check_results_with_retries(method, args)
|
||||||
err_msg = ''
|
err_msg = ''
|
||||||
|
timeout_nodes_count = 0
|
||||||
# Following error might happen because of misconfiguration, ex. direct_addressing = 1 only on client
|
# Following error might happen because of misconfiguration, ex. direct_addressing = 1 only on client
|
||||||
# or.. could be just some hang? Let's retry if @retries is set
|
# or.. could be just some hang? Let's retry if @retries is set
|
||||||
if @mc_res.length < @nodes.length
|
if @mc_res.length < @nodes.length
|
||||||
|
@ -81,11 +91,12 @@ module Astute
|
||||||
if @on_respond_timeout
|
if @on_respond_timeout
|
||||||
@on_respond_timeout.call not_responded
|
@on_respond_timeout.call not_responded
|
||||||
else
|
else
|
||||||
err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond. \n"
|
err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond within the allotted time.\n"
|
||||||
|
timeout_nodes_count += not_responded.size
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
failed = @mc_res.select{|x| x.results[:statuscode] != 0 }
|
failed = @mc_res.select { |x| x.results[:statuscode] != 0 }
|
||||||
if failed.any?
|
if failed.any?
|
||||||
err_msg += "MCollective call failed in agent '#{@agent}', "\
|
err_msg += "MCollective call failed in agent '#{@agent}', "\
|
||||||
"method '#{method}', failed nodes: \n"
|
"method '#{method}', failed nodes: \n"
|
||||||
|
@ -93,9 +104,15 @@ module Astute
|
||||||
err_msg += "ID: #{n.results[:sender]} - Reason: #{n.results[:statusmsg]}\n"
|
err_msg += "ID: #{n.results[:sender]} - Reason: #{n.results[:statusmsg]}\n"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
unless err_msg.empty?
|
if err_msg.present?
|
||||||
Astute.logger.error err_msg
|
Astute.logger.error err_msg
|
||||||
raise "#{@task_id}: #{err_msg}"
|
expired_size = failed.count { |n| n.results[:statusmsg] == 'execution expired' }
|
||||||
|
# Detect TimeOut: 1 condition - fail because of DDL timeout, 2 - fail because of custom timeout
|
||||||
|
if (failed.present? && failed.size == expired_size) || (timeout_nodes_count > 0 && failed.empty?)
|
||||||
|
raise MClientTimeout, "#{@task_id}: #{err_msg}"
|
||||||
|
else
|
||||||
|
raise MClientError, "#{@task_id}: #{err_msg}"
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -306,7 +306,12 @@ module Astute
|
||||||
end
|
end
|
||||||
|
|
||||||
def run_shell_command(context, node_uids, cmd)
|
def run_shell_command(context, node_uids, cmd)
|
||||||
shell = MClient.new(context, 'execute_shell_command', node_uids)
|
shell = MClient.new(context,
|
||||||
|
'execute_shell_command',
|
||||||
|
node_uids,
|
||||||
|
check_result=true,
|
||||||
|
timeout=60,
|
||||||
|
retries=1)
|
||||||
response = shell.execute(:cmd => cmd).first
|
response = shell.execute(:cmd => cmd).first
|
||||||
Astute.logger.debug("#{context.task_id}: cmd: #{cmd}
|
Astute.logger.debug("#{context.task_id}: cmd: #{cmd}
|
||||||
stdout: #{response[:data][:stdout]}
|
stdout: #{response[:data][:stdout]}
|
||||||
|
|
|
@ -149,13 +149,11 @@ module Astute
|
||||||
end
|
end
|
||||||
|
|
||||||
def exec_cmd_with_timeout(cmd, timeout, timeout_expired_msg)
|
def exec_cmd_with_timeout(cmd, timeout, timeout_expired_msg)
|
||||||
shell = MClient.new(@ctx, 'execute_shell_command', ['master'])
|
shell = MClient.new(@ctx, 'execute_shell_command', ['master'], check_result=true, timeout, retries=1)
|
||||||
begin
|
begin
|
||||||
Timeout.timeout(timeout) do
|
response = shell.execute(:cmd => cmd).first
|
||||||
response = shell.execute(:cmd => cmd).first
|
report_error(timeout_expired_msg) unless response
|
||||||
report_error(timeout_expired_msg) unless response
|
return response
|
||||||
return response
|
|
||||||
end
|
|
||||||
rescue Timeout::Error
|
rescue Timeout::Error
|
||||||
Astute.logger.warn("Time out error for shell command '#{cmd}'")
|
Astute.logger.warn("Time out error for shell command '#{cmd}'")
|
||||||
report_error(timeout_expired_msg)
|
report_error(timeout_expired_msg)
|
||||||
|
|
|
@ -4,7 +4,7 @@ metadata :name => "Execute shell command",
|
||||||
:license => "Apache License 2.0",
|
:license => "Apache License 2.0",
|
||||||
:version => "0.0.1",
|
:version => "0.0.1",
|
||||||
:url => "http://mirantis.com",
|
:url => "http://mirantis.com",
|
||||||
:timeout => 600
|
:timeout => 3600
|
||||||
|
|
||||||
action "execute", :description => "Execute shell command" do
|
action "execute", :description => "Execute shell command" do
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ describe 'dump_environment' do
|
||||||
end
|
end
|
||||||
|
|
||||||
Astute::MClient = mock() do
|
Astute::MClient = mock() do
|
||||||
expects(:new).with(ctx, 'execute_shell_command', ['master']).returns(agent)
|
expects(:new).with(ctx, 'execute_shell_command', ['master'], true, Astute.config.DUMP_TIMEOUT, 1).returns(agent)
|
||||||
end
|
end
|
||||||
Astute::Dump.dump_environment(ctx, lastdump)
|
Astute::Dump.dump_environment(ctx, lastdump)
|
||||||
end
|
end
|
||||||
|
@ -73,7 +73,7 @@ describe 'dump_environment' do
|
||||||
|
|
||||||
it "should report error if any other exception occured" do
|
it "should report error if any other exception occured" do
|
||||||
agent = mock() do
|
agent = mock() do
|
||||||
expects(:execute).raises(Exception, "MESSAGE")
|
expects(:execute).raises(StandardError , "MESSAGE")
|
||||||
end
|
end
|
||||||
Astute::MClient = mock() do
|
Astute::MClient = mock() do
|
||||||
stubs(:new).returns(agent)
|
stubs(:new).returns(agent)
|
||||||
|
|
|
@ -65,11 +65,12 @@ describe MClient do
|
||||||
mc_valid_result2 = mock_mc_result({:sender => '2'})
|
mc_valid_result2 = mock_mc_result({:sender => '2'})
|
||||||
|
|
||||||
rpcclient.stubs(:echo).returns([mc_valid_result]).then.
|
rpcclient.stubs(:echo).returns([mc_valid_result]).then.
|
||||||
returns([mc_valid_result2]).then
|
returns([mc_valid_result2])
|
||||||
|
|
||||||
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
|
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
|
||||||
mclient.retries = 1
|
mclient.retries = 1
|
||||||
expect { mclient.echo(:msg => 'hello world') }.to raise_error(/MCollective agents '3' didn't respond./)
|
expect { mclient.echo(:msg => 'hello world') }.to \
|
||||||
|
raise_error(Astute::MClientTimeout, /MCollective agents '3' didn't respond./)
|
||||||
end
|
end
|
||||||
|
|
||||||
it "should raise error if agent returns statuscode != 0" do
|
it "should raise error if agent returns statuscode != 0" do
|
||||||
|
@ -85,11 +86,12 @@ describe MClient do
|
||||||
mc_failed_result = mock_mc_result({:sender => '2', :statuscode => 1})
|
mc_failed_result = mock_mc_result({:sender => '2', :statuscode => 1})
|
||||||
|
|
||||||
rpcclient.stubs(:echo).returns([mc_valid_result]).then.
|
rpcclient.stubs(:echo).returns([mc_valid_result]).then.
|
||||||
returns([mc_failed_result]).then
|
returns([mc_failed_result]).then.
|
||||||
|
returns([mc_failed_result])
|
||||||
|
|
||||||
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
|
mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']})
|
||||||
mclient.retries = 1
|
mclient.retries = 1
|
||||||
expect { mclient.echo(:msg => 'hello world') }.to \
|
expect { mclient.echo(:msg => 'hello world') }.to \
|
||||||
raise_error(/MCollective agents '3' didn't respond./)
|
raise_error(Astute::MClientError, /ID: 2 - Reason:/)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue