Reset env reboots nodes twice sometimes

During environment reset a node could be rebooted a second time.
The root cause is a kernel panic on the node, triggered by wiping
its partitions and data (the dd command): the panicking node can
no longer report its removal status, so Astute retries the removal
and reboot requests against a node that has already been erased.
The fix detects such nodes by comparing boot times (taken from the
mtime of /proc/1, which changes on every reboot): if a node's boot
time has changed since the removal request was sent, the node has
already been erased and rebooted, and no further removal or reboot
request is issued.

Change-Id: I5e54b9f741cdc762ffdcf46781e2a62dd7057a6c
Closes-Bug: #1478020
Signed-off-by: Ruslan Aliev <raliev@mirantis.com>
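
A minimal sketch of the detection idea, for orientation before the diff.
The Node struct and already_removed? helper below are illustrative
stand-ins, not part of the commit; the real logic is the
get_already_removed_nodes method added in the diff.

# Illustrative only: condenses the boot-time comparison performed by
# get_already_removed_nodes. `Node` here is a simple stand-in struct,
# not Astute's real Node class.
Node = Struct.new(:uid, :boot_time)

# control_time mirrors what get_boot_time returns:
# { uid => epoch string from `stat --printf='%Y' /proc/1` }, i.e. roughly
# the moment PID 1 started on the node.
def already_removed?(node, control_time)
  boot_time = control_time[node.uid].to_i
  return false if boot_time.zero?   # no answer from the node, cannot decide
  if node.boot_time
    boot_time != node.boot_time     # boot time changed => node already rebooted
  else
    node.boot_time = boot_time      # first pass: remember the current boot time
    false
  end
end

node = Node.new('1', nil)
already_removed?(node, '1' => '100')  # => false, boot time 100 is recorded
already_removed?(node, '1' => '150')  # => true, the node rebooted in between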
Author:    Ruslan Aliev <raliev@mirantis.com>  2016-01-27 20:41:29 +03:00
Committer: Vladimir Sharshov
Parent:    1bb6db2c4b
Commit:    c14a4ddf69
2 changed files with 85 additions and 8 deletions


@@ -77,6 +77,26 @@ module Astute
[mclient_skipped_nodes, mclient_nodes]
end
def get_already_removed_nodes(nodes)
removed_nodes = []
control_time = {}
nodes.uids.sort.each_slice(Astute.config[:max_nodes_per_call]) do |part|
control_time.merge!(get_boot_time(part))
end
nodes.each do |uid, node|
boot_time = control_time[uid].to_i
next if boot_time.zero?
if node.boot_time
removed_nodes << uid if boot_time != node.boot_time
else
node.boot_time = boot_time
end
end
removed_nodes
end
def remove_nodes(nodes)
if nodes.empty?
Astute.logger.info "#{@ctx.task_id}: Nodes to remove are not provided. Do nothing."
@@ -84,10 +104,19 @@ module Astute
end
erased_nodes, mclient_nodes = skipped_unskipped_mclient_nodes(nodes)
removed_nodes = get_already_removed_nodes(mclient_nodes)
removed_nodes.each do |uid|
erased_node = Node.new('uid' => uid)
erased_nodes << erased_node
mclient_nodes.delete(uid)
Astute.logger.info "#{@ctx.task_id}: Node #{uid} is removed already, skipping"
end
responses = mclient_remove_nodes(mclient_nodes)
inaccessible_uids = mclient_nodes.uids - responses.map { |response| response[:sender] }
inaccessible_nodes = NodesHash.build(inaccessible_uids.map do |uid|
- {'uid' => uid, 'error' => 'Node not answered by RPC.'}
+ {'uid' => uid, 'error' => 'Node not answered by RPC.', 'boot_time' => mclient_nodes[uid][:boot_time]}
end)
error_nodes = NodesHash.new
@@ -136,5 +165,33 @@ module Astute
responses.map(&:results)
end
def run_shell_without_check(context, node_uids, cmd, timeout=10)
shell = MClient.new(
context,
'execute_shell_command',
node_uids,
check_result=false,
timeout=timeout
)
results = shell.execute(:cmd => cmd)
results.inject({}) do |h, res|
Astute.logger.debug(
"#{context.task_id}: cmd: #{cmd}\n" \
"stdout: #{res.results[:data][:stdout]}\n" \
"stderr: #{res.results[:data][:stderr]}\n" \
"exit code: #{res.results[:data][:exit_code]}")
h.merge({res.results[:sender] => res.results[:data][:stdout].chomp})
end
end
def get_boot_time(node_uids)
run_shell_without_check(
@ctx,
node_uids,
"stat --printf='%Y' /proc/1",
timeout=10
)
end
end
end
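
A rough usage sketch of the batched boot-time fetch above. The
max_nodes_per_call value and the lambda standing in for
run_shell_without_check are assumptions for the example, not the
commit's real MClient plumbing.

# Illustrative only: uids are queried in slices so one MCollective call
# never addresses more than Astute.config[:max_nodes_per_call] nodes at a time.
max_nodes_per_call = 2                      # assumed value for the example
uids = %w[1 2 3 4 5]

# Stand-in for get_boot_time: returns { uid => epoch string }, as produced by
# `stat --printf='%Y' /proc/1` on each node.
fetch_boot_times = ->(part) { part.map { |uid| [uid, '100'] }.to_h }

control_time = {}
uids.sort.each_slice(max_nodes_per_call) do |part|
  control_time.merge!(fetch_boot_times.call(part))
end
# control_time => {"1"=>"100", "2"=>"100", "3"=>"100", "4"=>"100", "5"=>"100"}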


@@ -20,6 +20,7 @@ describe Astute::NodesRemover do
let(:nodes) { [{'uid' => '1'}, {'uid' => '2'}] }
let(:ctx) { mock_ctx }
let(:ctl_time) { {'1' => '100', '2' => '200'} }
let(:mcollective_answer) do
[
@@ -30,6 +31,7 @@ describe Astute::NodesRemover do
before(:each) do
Astute::NodesRemover.any_instance.stubs(:mclient_remove_piece_nodes).returns(mcollective_answer)
Astute::NodesRemover.any_instance.stubs(:run_shell_without_check).returns(ctl_time)
end
it 'should erase nodes (mbr) and reboot nodes(default)' do
@@ -54,7 +56,7 @@ describe Astute::NodesRemover do
{'uid' => '3', 'mclient_remove' => false},
{'uid' => '2'},
],
"inaccessible_nodes" => [{"uid"=>"1", "error"=>"Node not answered by RPC."}]
"inaccessible_nodes" => [{"uid"=>"1", "error"=>"Node not answered by RPC.", "boot_time"=>100}]
}
)
end
@@ -78,8 +80,8 @@ describe Astute::NodesRemover do
nr = Astute::NodesRemover.new(ctx, nodes)
nr.stubs(:mclient_remove_nodes).with(
Astute::NodesHash.build([
- {'uid' => '1'},
- {'uid' => '2', 'mclient_remove' => true}
+ {'uid' => '1', 'boot_time' => 100},
+ {'uid' => '2', 'mclient_remove' => true, 'boot_time' => 200}
])
).returns(mcollective_answer).once
nr.remove
@@ -106,8 +108,8 @@ describe Astute::NodesRemover do
{ "nodes"=>[],
"status" => "error",
"error_nodes" => [
{"uid"=>"1", "error"=>"RPC agent 'erase_node' failed. Result:\n{:sender=>\"1\", :statuscode=>1, :data=>{:rebooted=>false}}\n"},
{"uid"=>"2", "error"=>"RPC agent 'erase_node' failed. Result:\n{:sender=>\"2\", :statuscode=>1, :data=>{:rebooted=>false}}\n"}
{"uid"=>"1", "error"=>"RPC agent 'erase_node' failed. Result:\n{:sender=>\"1\", :statuscode=>1, :data=>{:rebooted=>false}}\n", "boot_time"=>100},
{"uid"=>"2", "error"=>"RPC agent 'erase_node' failed. Result:\n{:sender=>\"2\", :statuscode=>1, :data=>{:rebooted=>false}}\n", "boot_time"=>200}
]
}
)
@@ -159,14 +161,32 @@ describe Astute::NodesRemover do
{ "nodes"=>[],
"status" => "error",
"error_nodes" => [
{"uid"=>"1", "error"=>"RPC method 'erase_node' failed with message: Could not reboot"},
{"uid"=>"2", "error"=>"RPC method 'erase_node' failed with message: Could not reboot"}
{"uid"=>"1", "error"=>"RPC method 'erase_node' failed with message: Could not reboot", "boot_time"=>100},
{"uid"=>"2", "error"=>"RPC method 'erase_node' failed with message: Could not reboot", "boot_time"=>200}
]
}
)
end
end
context 'nodes fail to send status, but erased and rebooted' do
let(:mcollective_answer) do
[]
end
let(:ctl_time2) { {} }
let(:ctl_time3) { {'1' => '150', '2' => '250'} }
it 'should process rebooted nodes as erased' do
Astute::NodesRemover.any_instance.stubs(:mclient_remove_piece_nodes).returns(mcollective_answer)
Astute::NodesRemover.any_instance.stubs(:run_shell_without_check).returns(ctl_time)
.then.returns(ctl_time2).then.returns(ctl_time3)
expect(Astute::NodesRemover.new(ctx, nodes, reboot=true).remove).to eq(
{ "nodes"=>[{"uid"=>"1"}, {"uid"=>"2"}] }
)
end
end
context 'erase node when change node status from bootstrap to provisioning' do
let(:mcollective_answer) do
[