From 7c0485eb1a8c75a57936eaecce935091659429e9 Mon Sep 17 00:00:00 2001 From: "Vladimir Sharshov (warpc)" Date: Fri, 30 Dec 2016 14:11:36 +0300 Subject: [PATCH] Network problem tolerance puppet status check Connection between node and Astute can be lost some times, so we need more tries to get info about task status on node. Two changes: - instead of 1 try Astute will run 6 tries with 10 timeout for every attempt; - it will process such behavior for puppet using separately retries: puppet_undefined_retries Instead of full puppet retry status retry is safety because it is idempotent. Puppet undefined retries can be setup using Astute config or sending undefined_retries in puppet task parameters same way as for usual retries. Most important thing: it will refresh to original value every time when Astute get defined answer. Change-Id: Ie86576a3400be5a6b11041c8e6acf89abf3bbd51 Related-Bug: #1653210 Closes-Bug: #1653737 --- lib/astute/config.rb | 1 + lib/astute/mclients/puppet_mclient.rb | 8 +-- lib/astute/puppet_job.rb | 45 +++++++++++-- lib/astute/tasks/puppet.rb | 2 + spec/unit/puppet_job_spec.rb | 97 +++++++++++++++++++++++++-- 5 files changed, 138 insertions(+), 15 deletions(-) diff --git a/lib/astute/config.rb b/lib/astute/config.rb index 431a0f18..2984c13b 100644 --- a/lib/astute/config.rb +++ b/lib/astute/config.rb @@ -62,6 +62,7 @@ module Astute conf[:puppet_start_interval] = 2 # interval between attemps to start puppet conf[:puppet_retries] = 2 # how many times astute will try to run puppet conf[:puppet_succeed_retries] = 0 # use this to rerun a puppet task again if it was successful (idempotency) + conf[:puppet_undefined_retries] = 3 # how many times astute will try to get actual status of node before fail conf[:puppet_module_path] = '/etc/puppet/modules' # where we should find basic modules for puppet conf[:puppet_noop_run] = false # enable Puppet noop run conf[:mc_retries] = 10 # MClient tries to call mcagent before failure diff --git a/lib/astute/mclients/puppet_mclient.rb b/lib/astute/mclients/puppet_mclient.rb index ca1975f0..d50b9309 100644 --- a/lib/astute/mclients/puppet_mclient.rb +++ b/lib/astute/mclients/puppet_mclient.rb @@ -56,14 +56,14 @@ module Astute # Create configured puppet mcollective agent # @return [Astute::MClient] - def puppetd + def puppetd(timeout=nil, retries=1) puppetd = MClient.new( @ctx, "puppetd", [@node_id], _check_result=true, - _timeout=nil, - _retries=1, + _timeout=timeout, + _retries=retries, _enable_result_logging=false ) puppetd.on_respond_timeout do |uids| @@ -77,7 +77,7 @@ module Astute # Run last_run_summary action using mcollective puppet agent # @return [Hash] return hash with status and resources def last_run_summary - @summary = puppetd.last_run_summary( + @summary = puppetd(_timeout=10, _retries=6).last_run_summary( :puppet_noop_run => @options['puppet_noop_run'], :raw_report => @options['raw_report'] ).first[:data] diff --git a/lib/astute/puppet_job.rb b/lib/astute/puppet_job.rb index 5ab94f24..1334ffea 100644 --- a/lib/astute/puppet_job.rb +++ b/lib/astute/puppet_job.rb @@ -39,8 +39,6 @@ module Astute 'stopped', 'disabled' ] - FAILED_STATUSES = UNDEFINED_STATUSES + STOPED_STATUSES - def initialize(task, puppet_mclient, options) @task = task @retries = options['retries'] @@ -48,6 +46,8 @@ module Astute @puppet_start_interval = options['puppet_start_interval'] @time_observer = TimeObserver.new(options['timeout']) @succeed_retries = options['succeed_retries'] + @undefined_retries = options['undefined_retries'] + @original_undefined_retries = options['undefined_retries'] @puppet_mclient = puppet_mclient end @@ -76,6 +76,8 @@ module Astute processing_running_task when 'failed' processing_error_task + when 'undefined' + processing_undefined_task end time_is_up! if should_stop? @@ -169,8 +171,10 @@ module Astute 'successful' when BUSY_STATUSES.include?(mco_puppet_status) 'running' - when FAILED_STATUSES.include?(mco_puppet_status) + when STOPED_STATUSES.include?(mco_puppet_status) 'failed' + when UNDEFINED_STATUSES.include?(mco_puppet_status) + 'undefined' else raise StatusValidationError, "Unknow puppet status: #{mco_puppet_status}" @@ -189,7 +193,7 @@ module Astute # @return [void] def log_current_status(status) message = "#{task_details_for_log}, status: #{status}" - if FAILED_STATUSES.include?(status) + if (UNDEFINED_STATUSES + STOPED_STATUSES).include?(status) Astute.logger.error message else Astute.logger.debug message @@ -197,8 +201,9 @@ module Astute end # Process additional action in case of puppet succeed - # @return [String] Task status: successful, failed or running + # @return [String] Task status: successful or running def processing_succeed_task + reset_undefined_retries! Astute.logger.debug "Puppet completed within "\ "#{@time_observer.since_start} seconds" if @succeed_retries > 0 @@ -218,8 +223,9 @@ module Astute end # Process additional action in case of puppet failed - # @return [String] Task status: successful, failed or running + # @return [String] Task status: failed or running def processing_error_task + reset_undefined_retries! if @retries > 0 @retries -= 1 Astute.logger.debug "Puppet on node will be "\ @@ -236,12 +242,39 @@ module Astute end end + # Process additional action in case of undefined puppet status + # @return [String] Task status: failed or running + def processing_undefined_task + if @undefined_retries > 0 + @undefined_retries -= 1 + Astute.logger.debug "Puppet on node has undefined status. "\ + "#{@undefined_retries} retries remained. "\ + "#{task_details_for_log}" + Astute.logger.info "Retrying to check status for following "\ + "nodes: #{@puppet_mclient.node_id}" + 'running' + else + Astute.logger.error "Node has failed to get status. There is"\ + " no more retries for status check. #{task_details_for_log}" + 'failed' + end + end + # Process additional action in case of puppet running # @return [String]: Task status: successful, failed or running def processing_running_task + reset_undefined_retries! 'running' end + # Reset undefined retries to original value + # @return [void] + def reset_undefined_retries! + Astute.logger.debug "Reset undefined retries to original "\ + "value: #{@original_undefined_retries}" + @undefined_retries = @original_undefined_retries + end + end #PuppetJob end diff --git a/lib/astute/tasks/puppet.rb b/lib/astute/tasks/puppet.rb index 5f9fe9bc..69fc7237 100644 --- a/lib/astute/tasks/puppet.rb +++ b/lib/astute/tasks/puppet.rb @@ -47,6 +47,7 @@ module Astute 'timeout' => Astute.config.puppet_timeout, 'puppet_debug' => false, 'succeed_retries' => Astute.config.puppet_succeed_retries, + 'undefined_retries' => Astute.config.puppet_undefined_retries, 'raw_report' => Astute.config.puppet_raw_report, 'puppet_noop_run' => Astute.config.puppet_noop_run, 'puppet_start_timeout' => Astute.config.puppet_start_timeout, @@ -67,6 +68,7 @@ module Astute { 'retries' => @task['parameters']['retries'], 'succeed_retries' => @task['parameters']['succeed_retries'], + 'undefined_retries' => @task['parameters']['undefined_retries'], 'timeout' => @task['parameters']['timeout'], 'puppet_start_timeout' => @task['parameters'][ 'puppet_start_timeout'], diff --git a/spec/unit/puppet_job_spec.rb b/spec/unit/puppet_job_spec.rb index cd18e2bd..ed6f8b08 100644 --- a/spec/unit/puppet_job_spec.rb +++ b/spec/unit/puppet_job_spec.rb @@ -31,6 +31,7 @@ describe Astute::PuppetJob do { 'retries' => 1, 'succeed_retries' => 0, + 'undefined_retries' => 1, 'timeout' => 1, 'puppet_start_timeout' => 1, 'puppet_start_interval' => 0 @@ -71,7 +72,7 @@ describe Astute::PuppetJob do describe '#task_status=' do it 'should raise error if status do not support' do expect {subject.send(:task_status=, 'unknow_status')}.to \ - raise_error(StatusValidationError, /unknow_status/) + raise_error(Astute::StatusValidationError, /unknow_status/) end end @@ -84,7 +85,7 @@ describe Astute::PuppetJob do puppet_mclient.expects(:status).returns('unknow_status') expect {subject.status}.to raise_error( - StatusValidationError, /unknow_status/ + Astute::StatusValidationError, /unknow_status/ ) end end @@ -124,7 +125,7 @@ describe Astute::PuppetJob do end it 'should return runing when magent failed but can retry' do - puppet_mclient.expects(:run).twice.returns(true) + puppet_mclient.expects(:run).once.returns(true) subject.run puppet_mclient.stubs(:status) @@ -149,13 +150,45 @@ describe Astute::PuppetJob do expect(subject.status).to eq('successful') end + it 'should successful if undefined/failed but retry succeed' do + puppet_mclient.stubs(:run).returns(true) + options['undefined_retries'] = 1 + options['retries'] = 1 + subject.run + + puppet_mclient.stubs(:status) + .then.returns('undefined') + .returns('stopped') + .then.returns('undefined') + .then.returns('succeed') + + 3.times { expect(subject.status).to eq('running') } + expect(subject.status). to eq('successful') + end + it 'should successful if failed but retry succeed' do puppet_mclient.stubs(:run).returns(true) + options['undefined_retries'] = 0 options['retries'] = 2 subject.run puppet_mclient.stubs(:status) .returns('stopped') + .then.returns('stopped') + .then.returns('succeed') + + 2.times { expect(subject.status).to eq('running') } + expect(subject.status). to eq('successful') + end + + it 'should successful if undefined but retry succeed' do + puppet_mclient.stubs(:run).returns(true) + options['undefined_retries'] = 2 + options['retries'] = 0 + subject.run + + puppet_mclient.stubs(:status) + .returns('undefined') .then.returns('undefined') .then.returns('succeed') @@ -188,7 +221,7 @@ describe Astute::PuppetJob do subject.run puppet_mclient.stubs(:status) - .returns('undefined') + .returns('stopped') .then.returns('stopped') expect(subject.status).to eq('running') @@ -210,13 +243,67 @@ describe Astute::PuppetJob do subject.run puppet_mclient.stubs(:status) - .returns('undefined') + .returns('stopped') .then.returns('stopped') expect(subject.status).to eq('running') 3.times { expect(subject.status). to eq('failed') } end end + + context 'undefined' do + it 'should return failed if undefined and no more retries' do + puppet_mclient.stubs(:run).returns(true) + subject.run + + puppet_mclient.stubs(:status) + .returns('undefined') + .then.returns('undefined') + + expect(subject.status).to eq('running') + expect(subject.status).to eq('failed') + end + + it 'should return failed if time is over and no result' do + puppet_mclient.stubs(:run).returns(true) + options['timeout'] = 0 + subject.run + + puppet_mclient.stubs(:status).returns('undefined') + expect(subject.status).to eq('failed') + end + + it 'should do nothing if final status set and retries end' do + puppet_mclient.stubs(:run).returns(true) + options['undefined'] = 0 + subject.run + + puppet_mclient.stubs(:status) + .returns('undefined') + .then.returns('undefined') + + expect(subject.status).to eq('running') + 3.times { expect(subject.status). to eq('failed') } + end + + it 'should reset retries if answer was received' do + puppet_mclient.stubs(:run).returns(true) + options['undefined_retries'] = 1 + options['retries'] = 0 + subject.run + + puppet_mclient.stubs(:status) + .then.returns('undefined') + .returns('running') + .then.returns('undefined') + .then.returns('succeed') + + 3.times { expect(subject.status).to eq('running') } + expect(subject.status). to eq('successful') + end + end + + end end