Use async shell call for provision

This change allows provision commands to be run through
the Puppet-based async shell task.

It is a transitional change between the old way of running
image provisioning and provisioning as a graph, which will
also use the async shell task to run.

It is a more fault-tolerant way to provision, because
temporary connection problems between the master node and
a node being provisioned no longer block or fail provisioning.

Important notice: this is allowed only if the bootstrap image
has the puppet and daemonize packages, which is true for
releases 9.2 and higher.

Change-Id: Ie634fae9b63bf0c103ec8926647af75b57cefe23
Related-Bug: #1644618
This commit is contained in:
Vladimir Sharshov (warpc) 2016-12-27 15:38:26 +03:00
parent 1dc4d754b7
commit dc47550460
5 changed files with 58 additions and 38 deletions

View File

@ -108,6 +108,7 @@ module Astute
conf[:graph_dot_dir] = "/var/lib/astute/graphs" # default dir patch for debug graph file
conf[:enable_graph_file] = true # enable debug graph records to file
conf[:puppet_raw_report] = false # enable puppet detailed report
conf[:task_poll_delay] = 1 # sleeps for ## sec between task status calls
# Server settings
conf[:broker_host] = 'localhost'

View File

@ -75,19 +75,13 @@ module Astute
Astute.logger.debug("#{ctx.task_id}: running provision script: " \
"#{uids.join(', ')}")
results = run_shell_command(
failed_uids |= run_shell_task(
ctx,
uids,
'flock -n /var/lock/provision.lock provision',
Astute.config.provisioning_timeout
)
results.select{ |_node_id, result| !result }.keys.each do |node_id|
failed_uids << node_id
Astute.logger.error("#{ctx.task_id}: Provision command returned " \
"non zero exit code on node: #{node_id}")
end
failed_uids
end
@ -122,24 +116,42 @@ module Astute
).process
end
def self.run_shell_command(context, node_uids, cmd, timeout=3600)
shell = MClient.new(
context,
'execute_shell_command',
node_uids,
check_result=true,
timeout=timeout,
retries=1
)
results = shell.execute(:cmd => cmd)
results.inject({}) do |summary, node|
summary.merge(node.results[:sender] => node.results[:data][:exit_code] == 0)
# Runs +cmd+ on every node in +node_uids+ through a Shell task and polls the
# tasks until all of them report finished.
#
# @param ctx [Object] context handed to every Shell task; its +task_id+ is
#   used in log messages
# @param node_uids [Array] ids of the nodes to run the command on
# @param cmd [String] shell command placed into each task's parameters
# @param timeout [Integer] timeout placed into each task's parameters
# @return [Array] ids of the nodes whose task reports +failed?+; if any
#   error is raised while running or polling, the whole +node_uids+ list is
#   returned, i.e. every node is treated as failed
def self.run_shell_task(ctx, node_uids, cmd, timeout=3600)
  shell_tasks = node_uids.map do |node_id|
    Shell.new(generate_shell_hook(node_id, cmd, timeout), ctx)
  end

  shell_tasks.each(&:run)

  # Only tasks that are still running are polled; the delay between polling
  # rounds comes from the configuration.
  while shell_tasks.any? { |t| !t.finished? } do
    shell_tasks.select { |t| !t.finished? }.each(&:status)
    sleep Astute.config.task_poll_delay
  end

  # select + map instead of inject: each failed task itself (not an
  # accumulator) is yielded, and the block's value is the node id to report.
  shell_tasks.select(&:failed?).map do |task|
    Astute.logger.error("#{ctx.task_id}: Provision command returned " \
      "non zero exit code on node: #{task.node_id}")
    task.node_id
  end
rescue => e
  Astute.logger.error("#{ctx.task_id}: cmd: #{cmd} " \
    "error: #{e.message}, trace #{e.backtrace}")
  node_uids
end
# Builds the hash describing a shell task for a single node.
#
# @param node_id [Object] node id; stored under "node_id" and interpolated
#   into the task "id" as "provision_<node_id>"
# @param cmd [String] command stored under parameters "cmd"
# @param timeout [Integer] value stored under parameters "timeout"
# @return [Hash] task description with "node_id", "id" and "parameters" keys
def self.generate_shell_hook(node_id, cmd, timeout)
  shell_parameters = {
    "cmd" => cmd,
    "cwd" => "/",
    "timeout" => timeout,
    "retries" => 0
  }

  {
    "node_id" => node_id,
    "id" => "provision_#{node_id}",
    "parameters" => shell_parameters
  }
end
end

View File

@ -61,7 +61,7 @@ module Astute
def sync_run
run
loop do
sleep 1
sleep Astute.config.task_poll_delay
status
break if finished?
end

View File

@ -31,6 +31,10 @@ module Astute
{}
end
# Returns the value stored under the 'node_id' key of @task.
def node_id
  task_description = @task
  task_description['node_id']
end
private
SHELL_MANIFEST_DIR = '/etc/puppet/shell_manifests'

View File

@ -102,12 +102,12 @@ describe Astute::ImageProvision do
.with([reboot_hook], ctx, 'provision')
.returns(nailgun_hook)
nailgun_hook.expects(:process).once
provisioner.reboot(ctx, node_ids, task_id="reboot_provisioned_nodes")
provisioner.reboot(ctx, node_ids, _task_id="reboot_provisioned_nodes")
end
it 'should not run hook if no nodes present' do
Astute::NailgunHooks.expects(:new).never
provisioner.reboot(ctx, [], task_id="reboot_provisioned_nodes")
provisioner.reboot(ctx, [], _task_id="reboot_provisioned_nodes")
end
end
@ -184,24 +184,28 @@ describe Astute::ImageProvision do
end
describe ".run_provision" do
it 'should run provision on nodes using shell magent' do
provisioner.expects(:run_shell_command).once.with(
ctx,
nodes.map { |n| n['uid'] },
'flock -n /var/lock/provision.lock provision',
Astute.config.provisioning_timeout
).returns({5 => true, 6 => true})
before do
provisioner.stubs(:sleep)
end
provisioner.run_provision(ctx, nodes.map { |n| n['uid'] }, [])
it 'should run provision on nodes using shell magent' do
Astute::Shell.any_instance.stubs(:process)
Astute::Shell.any_instance.expects(:run).once
Astute::Shell.any_instance.expects(:finished?).times(3)
.returns(false).
then.returns(true)
Astute::Shell.any_instance.expects(:failed?).once.returns(false)
provisioner.run_provision(ctx, [5], [])
end
it 'should run return failed nodes' do
provisioner.stubs(:run_shell_command).once.returns({5 => true, 6 => false})
provisioner.stubs(:run_shell_task).once.returns([6])
expect(provisioner.run_provision(ctx, nodes.map { |n| n['uid'] }, [])).to eql([6])
end
it 'should not erase info about alread failed nodes' do
provisioner.stubs(:run_shell_command).once.returns({5 => true, 6 => false})
provisioner.stubs(:run_shell_task).once.returns([6])
failed_uids = [3]
expect(provisioner.run_provision(
ctx,
@ -212,4 +216,3 @@ describe Astute::ImageProvision do
end
end