Network problem tolerance puppet status check

Connection between node and Astute can be lost some
times, so we need more tries to get info about task
status on node.

Two changes:

- instead of 1 try Astute will run 6 tries with 10
timeout for every attempt;
- it will process such behavior for puppet using separately
  retries: puppet_undefined_retries

Instead of full puppet retry status retry is safety because
it is idempotent.

Puppet undefined retries can be setup using Astute config
or sending undefined_retries in puppet task parameters same
way as for usual retries. Most important thing: it will refresh
to original value every time when Astute get defined answer.

Change-Id: Ie86576a3400be5a6b11041c8e6acf89abf3bbd51
Related-Bug: #1653210
Closes-Bug: #1653737
This commit is contained in:
Vladimir Sharshov (warpc) 2016-12-30 14:11:36 +03:00
parent 06fe7856f6
commit 7c0485eb1a
5 changed files with 138 additions and 15 deletions

View File

@ -62,6 +62,7 @@ module Astute
conf[:puppet_start_interval] = 2 # interval between attemps to start puppet
conf[:puppet_retries] = 2 # how many times astute will try to run puppet
conf[:puppet_succeed_retries] = 0 # use this to rerun a puppet task again if it was successful (idempotency)
conf[:puppet_undefined_retries] = 3 # how many times astute will try to get actual status of node before fail
conf[:puppet_module_path] = '/etc/puppet/modules' # where we should find basic modules for puppet
conf[:puppet_noop_run] = false # enable Puppet noop run
conf[:mc_retries] = 10 # MClient tries to call mcagent before failure

View File

@ -56,14 +56,14 @@ module Astute
# Create configured puppet mcollective agent
# @return [Astute::MClient]
def puppetd
def puppetd(timeout=nil, retries=1)
puppetd = MClient.new(
@ctx,
"puppetd",
[@node_id],
_check_result=true,
_timeout=nil,
_retries=1,
_timeout=timeout,
_retries=retries,
_enable_result_logging=false
)
puppetd.on_respond_timeout do |uids|
@ -77,7 +77,7 @@ module Astute
# Run last_run_summary action using mcollective puppet agent
# @return [Hash] return hash with status and resources
def last_run_summary
@summary = puppetd.last_run_summary(
@summary = puppetd(_timeout=10, _retries=6).last_run_summary(
:puppet_noop_run => @options['puppet_noop_run'],
:raw_report => @options['raw_report']
).first[:data]

View File

@ -39,8 +39,6 @@ module Astute
'stopped', 'disabled'
]
FAILED_STATUSES = UNDEFINED_STATUSES + STOPED_STATUSES
def initialize(task, puppet_mclient, options)
@task = task
@retries = options['retries']
@ -48,6 +46,8 @@ module Astute
@puppet_start_interval = options['puppet_start_interval']
@time_observer = TimeObserver.new(options['timeout'])
@succeed_retries = options['succeed_retries']
@undefined_retries = options['undefined_retries']
@original_undefined_retries = options['undefined_retries']
@puppet_mclient = puppet_mclient
end
@ -76,6 +76,8 @@ module Astute
processing_running_task
when 'failed'
processing_error_task
when 'undefined'
processing_undefined_task
end
time_is_up! if should_stop?
@ -169,8 +171,10 @@ module Astute
'successful'
when BUSY_STATUSES.include?(mco_puppet_status)
'running'
when FAILED_STATUSES.include?(mco_puppet_status)
when STOPED_STATUSES.include?(mco_puppet_status)
'failed'
when UNDEFINED_STATUSES.include?(mco_puppet_status)
'undefined'
else
raise StatusValidationError,
"Unknow puppet status: #{mco_puppet_status}"
@ -189,7 +193,7 @@ module Astute
# @return [void]
def log_current_status(status)
message = "#{task_details_for_log}, status: #{status}"
if FAILED_STATUSES.include?(status)
if (UNDEFINED_STATUSES + STOPED_STATUSES).include?(status)
Astute.logger.error message
else
Astute.logger.debug message
@ -197,8 +201,9 @@ module Astute
end
# Process additional action in case of puppet succeed
# @return [String] Task status: successful, failed or running
# @return [String] Task status: successful or running
def processing_succeed_task
reset_undefined_retries!
Astute.logger.debug "Puppet completed within "\
"#{@time_observer.since_start} seconds"
if @succeed_retries > 0
@ -218,8 +223,9 @@ module Astute
end
# Process additional action in case of puppet failed
# @return [String] Task status: successful, failed or running
# @return [String] Task status: failed or running
def processing_error_task
reset_undefined_retries!
if @retries > 0
@retries -= 1
Astute.logger.debug "Puppet on node will be "\
@ -236,12 +242,39 @@ module Astute
end
end
# Process additional action in case of undefined puppet status
# @return [String] Task status: failed or running
def processing_undefined_task
if @undefined_retries > 0
@undefined_retries -= 1
Astute.logger.debug "Puppet on node has undefined status. "\
"#{@undefined_retries} retries remained. "\
"#{task_details_for_log}"
Astute.logger.info "Retrying to check status for following "\
"nodes: #{@puppet_mclient.node_id}"
'running'
else
Astute.logger.error "Node has failed to get status. There is"\
" no more retries for status check. #{task_details_for_log}"
'failed'
end
end
# Process additional action in case of puppet running
# @return [String]: Task status: successful, failed or running
def processing_running_task
reset_undefined_retries!
'running'
end
# Reset undefined retries to original value
# @return [void]
def reset_undefined_retries!
Astute.logger.debug "Reset undefined retries to original "\
"value: #{@original_undefined_retries}"
@undefined_retries = @original_undefined_retries
end
end #PuppetJob
end

View File

@ -47,6 +47,7 @@ module Astute
'timeout' => Astute.config.puppet_timeout,
'puppet_debug' => false,
'succeed_retries' => Astute.config.puppet_succeed_retries,
'undefined_retries' => Astute.config.puppet_undefined_retries,
'raw_report' => Astute.config.puppet_raw_report,
'puppet_noop_run' => Astute.config.puppet_noop_run,
'puppet_start_timeout' => Astute.config.puppet_start_timeout,
@ -67,6 +68,7 @@ module Astute
{
'retries' => @task['parameters']['retries'],
'succeed_retries' => @task['parameters']['succeed_retries'],
'undefined_retries' => @task['parameters']['undefined_retries'],
'timeout' => @task['parameters']['timeout'],
'puppet_start_timeout' => @task['parameters'][
'puppet_start_timeout'],

View File

@ -31,6 +31,7 @@ describe Astute::PuppetJob do
{
'retries' => 1,
'succeed_retries' => 0,
'undefined_retries' => 1,
'timeout' => 1,
'puppet_start_timeout' => 1,
'puppet_start_interval' => 0
@ -71,7 +72,7 @@ describe Astute::PuppetJob do
describe '#task_status=' do
it 'should raise error if status do not support' do
expect {subject.send(:task_status=, 'unknow_status')}.to \
raise_error(StatusValidationError, /unknow_status/)
raise_error(Astute::StatusValidationError, /unknow_status/)
end
end
@ -84,7 +85,7 @@ describe Astute::PuppetJob do
puppet_mclient.expects(:status).returns('unknow_status')
expect {subject.status}.to raise_error(
StatusValidationError, /unknow_status/
Astute::StatusValidationError, /unknow_status/
)
end
end
@ -124,7 +125,7 @@ describe Astute::PuppetJob do
end
it 'should return runing when magent failed but can retry' do
puppet_mclient.expects(:run).twice.returns(true)
puppet_mclient.expects(:run).once.returns(true)
subject.run
puppet_mclient.stubs(:status)
@ -149,13 +150,45 @@ describe Astute::PuppetJob do
expect(subject.status).to eq('successful')
end
it 'should successful if undefined/failed but retry succeed' do
puppet_mclient.stubs(:run).returns(true)
options['undefined_retries'] = 1
options['retries'] = 1
subject.run
puppet_mclient.stubs(:status)
.then.returns('undefined')
.returns('stopped')
.then.returns('undefined')
.then.returns('succeed')
3.times { expect(subject.status).to eq('running') }
expect(subject.status). to eq('successful')
end
it 'should successful if failed but retry succeed' do
puppet_mclient.stubs(:run).returns(true)
options['undefined_retries'] = 0
options['retries'] = 2
subject.run
puppet_mclient.stubs(:status)
.returns('stopped')
.then.returns('stopped')
.then.returns('succeed')
2.times { expect(subject.status).to eq('running') }
expect(subject.status). to eq('successful')
end
it 'should successful if undefined but retry succeed' do
puppet_mclient.stubs(:run).returns(true)
options['undefined_retries'] = 2
options['retries'] = 0
subject.run
puppet_mclient.stubs(:status)
.returns('undefined')
.then.returns('undefined')
.then.returns('succeed')
@ -188,7 +221,7 @@ describe Astute::PuppetJob do
subject.run
puppet_mclient.stubs(:status)
.returns('undefined')
.returns('stopped')
.then.returns('stopped')
expect(subject.status).to eq('running')
@ -210,13 +243,67 @@ describe Astute::PuppetJob do
subject.run
puppet_mclient.stubs(:status)
.returns('undefined')
.returns('stopped')
.then.returns('stopped')
expect(subject.status).to eq('running')
3.times { expect(subject.status). to eq('failed') }
end
end
context 'undefined' do
it 'should return failed if undefined and no more retries' do
puppet_mclient.stubs(:run).returns(true)
subject.run
puppet_mclient.stubs(:status)
.returns('undefined')
.then.returns('undefined')
expect(subject.status).to eq('running')
expect(subject.status).to eq('failed')
end
it 'should return failed if time is over and no result' do
puppet_mclient.stubs(:run).returns(true)
options['timeout'] = 0
subject.run
puppet_mclient.stubs(:status).returns('undefined')
expect(subject.status).to eq('failed')
end
it 'should do nothing if final status set and retries end' do
puppet_mclient.stubs(:run).returns(true)
options['undefined'] = 0
subject.run
puppet_mclient.stubs(:status)
.returns('undefined')
.then.returns('undefined')
expect(subject.status).to eq('running')
3.times { expect(subject.status). to eq('failed') }
end
it 'should reset retries if answer was received' do
puppet_mclient.stubs(:run).returns(true)
options['undefined_retries'] = 1
options['retries'] = 0
subject.run
puppet_mclient.stubs(:status)
.then.returns('undefined')
.returns('running')
.then.returns('undefined')
.then.returns('succeed')
3.times { expect(subject.status).to eq('running') }
expect(subject.status). to eq('successful')
end
end
end
end