From f967bf7d643c35e9c3bf8d8ad6cd6a13caa6e4ac Mon Sep 17 00:00:00 2001 From: Vladimir Date: Wed, 15 Jan 2014 16:30:06 +0400 Subject: [PATCH] Support custom mcagent timeout * support Timeout::Error for case when agent off by timeout; * add data about the relationship between the timeout limits specified in the DDL and in particular running agent; * increase limit for execute_shell_command magent; * refactoring. Change-Id: Ib259ec5c59fd8f903839e8ca487480c7ea29cb38 Closes-Bug: #1268701 --- lib/astute/config.rb | 2 ++ lib/astute/dump.rb | 26 ++++++++++++------------- lib/astute/exceptions.rb | 2 ++ lib/astute/mclient.rb | 31 +++++++++++++++++++++++------- lib/astute/orchestrator.rb | 7 ++++++- lib/astute/redhat_checker.rb | 10 ++++------ mcagents/execute_shell_command.ddl | 2 +- spec/unit/dump_spec.rb | 4 ++-- spec/unit/mclient_spec.rb | 10 ++++++---- 9 files changed, 59 insertions(+), 35 deletions(-) diff --git a/lib/astute/config.rb b/lib/astute/config.rb index 70a0a33b..27d571ff 100644 --- a/lib/astute/config.rb +++ b/lib/astute/config.rb @@ -61,6 +61,8 @@ module Astute conf[:PUPPET_FADE_INTERVAL] = 10 # retry every ## seconds to check puppet state if it was running conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot + conf[:DUMP_TIMEOUT] = 3600 # maximum time it waits for the dump (meaningles to be larger + # than the specified in timeout of execute_shell_command mcagent conf[:REDHAT_CHECK_CREDENTIALS_TIMEOUT] = 30 # checking redhat credentials througs mcollective conf[:REDHAT_GET_LICENSES_POOL_TIMEOUT] = 60 # getting redhat licenses through mcollective diff --git a/lib/astute/dump.rb b/lib/astute/dump.rb index b0fecfe2..2b127738 100644 --- a/lib/astute/dump.rb +++ b/lib/astute/dump.rb @@ -16,28 +16,26 @@ module Astute module Dump def self.dump_environment(ctx, lastdump) - timeout = 500 - shell = MClient.new(ctx, 'execute_shell_command', ['master']) + timeout = Astute.config.DUMP_TIMEOUT + shell = MClient.new(ctx, 'execute_shell_command', ['master'], check_result=true, timeout=timeout, retries=1) begin - Timeout.timeout(timeout) do - result = shell.execute( - :cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first - Astute.logger.debug("#{ctx.task_id}: \ + result = shell.execute( + :cmd => "/opt/nailgun/bin/nailgun_dump >>/var/log/dump.log 2>&1 && cat #{lastdump}").first + Astute.logger.debug("#{ctx.task_id}: \ stdout: #{result[:data][:stdout]} stderr: #{result[:data][:stderr]} \ exit code: #{result[:data][:exit_code]}") - if result[:data][:exit_code] == 0 - Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}") - report_success(ctx, result[:data][:stdout].rstrip) - else - Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code") - report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}") - end + if result[:data][:exit_code] == 0 + Astute.logger.info("#{ctx.task_id}: Snapshot is done. Result: #{result[:data][:stdout]}") + report_success(ctx, result[:data][:stdout].rstrip) + else + Astute.logger.error("#{ctx.task_id}: Dump command returned non zero exit code") + report_error(ctx, "exit code: #{result[:data][:exit_code]} stderr: #{result[:data][:stderr]}") end rescue Timeout::Error msg = "Dump is timed out" Astute.logger.error("#{ctx.task_id}: #{msg}") report_error(ctx, msg) - rescue Exception => e + rescue => e msg = "Exception occured during dump task: message: #{e.message} \ trace: #{e.backtrace.inspect}" Astute.logger.error("#{ctx.task_id}: #{msg}") diff --git a/lib/astute/exceptions.rb b/lib/astute/exceptions.rb index 1f887b69..4a5a68b6 100644 --- a/lib/astute/exceptions.rb +++ b/lib/astute/exceptions.rb @@ -25,5 +25,7 @@ module Astute class FailedToRebootNodesError < AstuteError; end # Deployment engine error class DeploymentEngineError < AstuteError; end + # MClient errors + class MClientError < AstuteError; end end diff --git a/lib/astute/mclient.rb b/lib/astute/mclient.rb index 8b90e62f..1da7acfb 100644 --- a/lib/astute/mclient.rb +++ b/lib/astute/mclient.rb @@ -14,20 +14,29 @@ require 'mcollective' +require 'timeout' module Astute + + class MClientTimeout < Timeout::Error; end + class MClient include MCollective::RPC attr_accessor :retries - def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil) + def initialize(ctx, agent, nodes=nil, check_result=true, timeout=nil, retries=Astute.config.MC_RETRIES) @task_id = ctx.task_id @agent = agent @nodes = nodes.map { |n| n.to_s } if nodes @check_result = check_result - @retries = Astute.config.MC_RETRIES - #FIXME: this timeout does not work + # Will be used a minimum of two things: the specified parameter(timeout) + # and timeout from DDL (10 sec by default if not explicitly specified in DDL) + # If timeout here is nil will be used value from DDL. + # Examples: + # timeout - 10 sec, DDL - 20 sec. Result — 10 sec. + # timeout - 30 sec, DDL - 20 sec. Result — 20 sec. + # timeout - 20 sec, DDL - not set. Result — 10 sec. @timeout = timeout initialize_mclient end @@ -57,6 +66,7 @@ module Astute def check_results_with_retries(method, args) err_msg = '' + timeout_nodes_count = 0 # Following error might happen because of misconfiguration, ex. direct_addressing = 1 only on client # or.. could be just some hang? Let's retry if @retries is set if @mc_res.length < @nodes.length @@ -81,11 +91,12 @@ module Astute if @on_respond_timeout @on_respond_timeout.call not_responded else - err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond. \n" + err_msg += "MCollective agents '#{not_responded.join(',')}' didn't respond within the allotted time.\n" + timeout_nodes_count += not_responded.size end end end - failed = @mc_res.select{|x| x.results[:statuscode] != 0 } + failed = @mc_res.select { |x| x.results[:statuscode] != 0 } if failed.any? err_msg += "MCollective call failed in agent '#{@agent}', "\ "method '#{method}', failed nodes: \n" @@ -93,9 +104,15 @@ module Astute err_msg += "ID: #{n.results[:sender]} - Reason: #{n.results[:statusmsg]}\n" end end - unless err_msg.empty? + if err_msg.present? Astute.logger.error err_msg - raise "#{@task_id}: #{err_msg}" + expired_size = failed.count { |n| n.results[:statusmsg] == 'execution expired' } + # Detect TimeOut: 1 condition - fail because of DDL timeout, 2 - fail because of custom timeout + if (failed.present? && failed.size == expired_size) || (timeout_nodes_count > 0 && failed.empty?) + raise MClientTimeout, "#{@task_id}: #{err_msg}" + else + raise MClientError, "#{@task_id}: #{err_msg}" + end end end diff --git a/lib/astute/orchestrator.rb b/lib/astute/orchestrator.rb index 467476de..143ffcca 100644 --- a/lib/astute/orchestrator.rb +++ b/lib/astute/orchestrator.rb @@ -306,7 +306,12 @@ module Astute end def run_shell_command(context, node_uids, cmd) - shell = MClient.new(context, 'execute_shell_command', node_uids) + shell = MClient.new(context, + 'execute_shell_command', + node_uids, + check_result=true, + timeout=60, + retries=1) response = shell.execute(:cmd => cmd).first Astute.logger.debug("#{context.task_id}: cmd: #{cmd} stdout: #{response[:data][:stdout]} diff --git a/lib/astute/redhat_checker.rb b/lib/astute/redhat_checker.rb index 75f5c6ee..96068070 100644 --- a/lib/astute/redhat_checker.rb +++ b/lib/astute/redhat_checker.rb @@ -149,13 +149,11 @@ module Astute end def exec_cmd_with_timeout(cmd, timeout, timeout_expired_msg) - shell = MClient.new(@ctx, 'execute_shell_command', ['master']) + shell = MClient.new(@ctx, 'execute_shell_command', ['master'], check_result=true, timeout, retries=1) begin - Timeout.timeout(timeout) do - response = shell.execute(:cmd => cmd).first - report_error(timeout_expired_msg) unless response - return response - end + response = shell.execute(:cmd => cmd).first + report_error(timeout_expired_msg) unless response + return response rescue Timeout::Error Astute.logger.warn("Time out error for shell command '#{cmd}'") report_error(timeout_expired_msg) diff --git a/mcagents/execute_shell_command.ddl b/mcagents/execute_shell_command.ddl index ed4ffa4e..f93ed0d0 100644 --- a/mcagents/execute_shell_command.ddl +++ b/mcagents/execute_shell_command.ddl @@ -4,7 +4,7 @@ metadata :name => "Execute shell command", :license => "Apache License 2.0", :version => "0.0.1", :url => "http://mirantis.com", - :timeout => 600 + :timeout => 3600 action "execute", :description => "Execute shell command" do diff --git a/spec/unit/dump_spec.rb b/spec/unit/dump_spec.rb index 490b606a..53e40c88 100644 --- a/spec/unit/dump_spec.rb +++ b/spec/unit/dump_spec.rb @@ -33,7 +33,7 @@ describe 'dump_environment' do end Astute::MClient = mock() do - expects(:new).with(ctx, 'execute_shell_command', ['master']).returns(agent) + expects(:new).with(ctx, 'execute_shell_command', ['master'], true, Astute.config.DUMP_TIMEOUT, 1).returns(agent) end Astute::Dump.dump_environment(ctx, lastdump) end @@ -73,7 +73,7 @@ describe 'dump_environment' do it "should report error if any other exception occured" do agent = mock() do - expects(:execute).raises(Exception, "MESSAGE") + expects(:execute).raises(StandardError , "MESSAGE") end Astute::MClient = mock() do stubs(:new).returns(agent) diff --git a/spec/unit/mclient_spec.rb b/spec/unit/mclient_spec.rb index c9194a58..20050982 100644 --- a/spec/unit/mclient_spec.rb +++ b/spec/unit/mclient_spec.rb @@ -65,11 +65,12 @@ describe MClient do mc_valid_result2 = mock_mc_result({:sender => '2'}) rpcclient.stubs(:echo).returns([mc_valid_result]).then. - returns([mc_valid_result2]).then + returns([mc_valid_result2]) mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']}) mclient.retries = 1 - expect { mclient.echo(:msg => 'hello world') }.to raise_error(/MCollective agents '3' didn't respond./) + expect { mclient.echo(:msg => 'hello world') }.to \ + raise_error(Astute::MClientTimeout, /MCollective agents '3' didn't respond./) end it "should raise error if agent returns statuscode != 0" do @@ -85,11 +86,12 @@ describe MClient do mc_failed_result = mock_mc_result({:sender => '2', :statuscode => 1}) rpcclient.stubs(:echo).returns([mc_valid_result]).then. - returns([mc_failed_result]).then + returns([mc_failed_result]).then. + returns([mc_failed_result]) mclient = MClient.new(@ctx, "faketest", nodes.map {|x| x['uid']}) mclient.retries = 1 expect { mclient.echo(:msg => 'hello world') }.to \ - raise_error(/MCollective agents '3' didn't respond./) + raise_error(Astute::MClientError, /ID: 2 - Reason:/) end end