Merge "Strong puppet hung check"

Jenkins 2013-12-13 19:07:51 +00:00 committed by Gerrit Code Review
commit 75aa0877cb
7 changed files with 245 additions and 211 deletions

View File

@@ -19,4 +19,4 @@ PUPPET_FADE_TIMEOUT: 60
# PUPPET_FADE_INTERVAL is used in puppetd.rb file.
# Retry every PUPPET_FADE_INTERVAL seconds to check puppet state if it was
# in 'running' state.
- PUPPET_FADE_INTERVAL: 1
+ PUPPET_FADE_INTERVAL: 5

View File

@ -58,7 +58,7 @@ module Astute
conf[:PUPPET_FADE_TIMEOUT] = 60 # how long it can take for puppet to exit after dumping to last_run_summary conf[:PUPPET_FADE_TIMEOUT] = 60 # how long it can take for puppet to exit after dumping to last_run_summary
conf[:MC_RETRIES] = 5 # MClient tries to call mcagent before failure conf[:MC_RETRIES] = 5 # MClient tries to call mcagent before failure
conf[:MC_RETRY_INTERVAL] = 1 # MClient sleeps for ## sec between retries conf[:MC_RETRY_INTERVAL] = 1 # MClient sleeps for ## sec between retries
conf[:PUPPET_FADE_INTERVAL] = 1 # retry every ## seconds to check puppet state if it was running conf[:PUPPET_FADE_INTERVAL] = 5 # retry every ## seconds to check puppet state if it was running
conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision
conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot
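The two settings above work as a pair: PUPPET_FADE_TIMEOUT caps how long Astute waits for a puppet agent to leave the running state, while PUPPET_FADE_INTERVAL (raised here from 1 to 5 seconds) is the pause between polls. A minimal, self-contained Ruby sketch of that polling pattern; puppet_status is an invented stand-in for the real MCollective last_run_summary query:

    PUPPET_FADE_TIMEOUT  = 60 # give up waiting after this many seconds
    PUPPET_FADE_INTERVAL = 5  # pause between polls

    # Stand-in for the real status query; always answers 'stopped' here so the
    # example finishes on the first pass.
    def puppet_status(uid)
      'stopped'
    end

    # Poll until every uid has left the running/idling state or the fade
    # timeout expires; whatever is still busy at the end is treated as hung.
    def wait_for_puppet_to_stop(uids)
      started = Time.now.to_i
      while Time.now.to_i - started < PUPPET_FADE_TIMEOUT
        uids = uids.select { |uid| ['running', 'idling'].include?(puppet_status(uid)) }
        break if uids.empty?
        sleep PUPPET_FADE_INTERVAL
      end
      uids
    end

    puts "hung nodes: #{wait_for_puppet_to_stop(['1', '2']).inspect}"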

View File

@@ -16,7 +16,7 @@ require 'yaml'
class Astute::DeploymentEngine::NailyFact < Astute::DeploymentEngine
- def deploy_piece(nodes, retries=2, change_node_status=true)
+ def deploy_piece(nodes, retries=2)
    return false unless validate_nodes(nodes)
    @ctx.reporter.report(nodes_status(nodes, 'deploying', {'progress' => 0}))
@@ -30,7 +30,7 @@ class Astute::DeploymentEngine::NailyFact < Astute::DeploymentEngine
    nodes.each { |node| upload_facts(node) }
    Astute.logger.info "#{@ctx.task_id}: Required attrs/metadata passed via facts extension. Starting deployment."
-   Astute::PuppetdDeployer.deploy(@ctx, nodes, retries, change_node_status)
+   Astute::PuppetdDeployer.deploy(@ctx, nodes, retries)
    nodes_roles = nodes.map { |n| {n['uid'] => n['role']} }
    Astute.logger.info "#{@ctx.task_id}: Finished deployment of nodes => roles: #{nodes_roles.inspect}"
  end
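The call site above shows the ordering in deploy_piece: facts are uploaded to every node first, then the whole batch is handed to PuppetdDeployer with only a retry count, since the change_node_status flag is gone from the interface. A rough, self-contained Ruby sketch of that ordering, with invented lambdas standing in for the real fact upload and puppet run:

    nodes = [{'uid' => '1', 'role' => 'primary-controller'}, {'uid' => '2', 'role' => 'compute'}]

    # Stand-ins for the real calls, just to show the order of operations.
    upload_facts = ->(node) { puts "facts uploaded to node #{node['uid']} (#{node['role']})" }
    run_puppet   = ->(batch, retries) { puts "puppet run on #{batch.map { |n| n['uid'] }.join(',')}, #{retries} retries allowed" }

    nodes.each { |node| upload_facts.call(node) } # facts go out per node first
    run_puppet.call(nodes, 2)                     # then one puppet run covers the whole batch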

View File

@@ -18,6 +18,147 @@ require 'timeout'
module Astute
module PuppetdDeployer
def self.deploy(ctx, nodes, retries=2)
@ctx = ctx
@nodes_roles = nodes.inject({}) { |h, n| h.merge({n['uid'] => n['role']}) }
@node_retries = nodes.inject({}) { |h, n| h.merge({n['uid'] => retries}) }
@nodes = nodes
Astute.logger.debug "Waiting for puppet to finish deployment on all
nodes (timeout = #{Astute.config.PUPPET_TIMEOUT} sec)..."
time_before = Time.now
deploy_nodes(@nodes.map { |n| n['uid'] })
time_spent = Time.now - time_before
Astute.logger.info "#{@ctx.task_id}: Spent #{time_spent} seconds on puppet run "\
"for following nodes(uids): #{@nodes.map {|n| n['uid']}.join(',')}"
end
private
# Runs puppetd.runonce only if puppet is stopped on the host at the time
# If it isn't stopped, we wait a bit and try again.
# Returns list of nodes uids which appear to be with hung puppet.
def self.puppetd_runonce(uids)
started = Time.now.to_i
while Time.now.to_i - started < Astute.config.PUPPET_FADE_TIMEOUT
running_uids = puppetd(uids).last_run_summary.select { |x|
['running', 'idling'].include?(x.results[:data][:status])
}.map { |n| n.results[:sender] }
stopped_uids = uids - running_uids
@nodes.select { |n| stopped_uids.include? n['uid'] }
.group_by { |n| n['debug'] }
.each do |debug, stop_nodes|
puppetd(stop_nodes.map { |n| n['uid'] }).runonce(:puppet_debug => debug)
end
break if running_uids.empty?
uids = running_uids
sleep Astute.config.PUPPET_FADE_INTERVAL
end
Astute.logger.debug "puppetd_runonce completed within #{Time.now.to_i - started} seconds."
Astute.logger.warn "Following nodes have puppet hung: '#{running_uids.join(',')}'" if running_uids.present?
running_uids
end
def self.calc_nodes_status(last_run, prev_run, hung_nodes=[])
# Finished are those which are not in running state,
# and changed their last_run time, which is changed after application of catalog,
# at the time of updating last_run_summary file. At that particular time puppet is
# still running, and will finish in a couple of seconds.
# If Puppet had crashed before it got a catalog (e.g. certificate problems),
# it didn't update last_run_summary file and switched to 'stopped' state.
stopped = last_run.select { |x| ['stopped', 'disabled'].include? x.results[:data][:status] }
# Select all finished nodes which not failed and changed last_run time.
succeed_nodes = stopped.select { |n|
prev_n = prev_run.find{|ps| ps.results[:sender] == n.results[:sender] }
n.results[:data][:status] == 'stopped' &&
n.results[:data][:resources]['failed'].to_i == 0 &&
n.results[:data][:resources]['failed_to_restart'].to_i == 0 &&
n.results[:data][:time]['last_run'] != (prev_n && prev_n.results[:data][:time]['last_run'])
}.map{|x| x.results[:sender] }
stopped_nodes = stopped.map { |x| x.results[:sender] }
error_nodes = stopped_nodes - succeed_nodes
running_nodes = last_run.map {|n| n.results[:sender]} - stopped_nodes
# Hunged nodes can change state at this moment(success, error or still run),
# but we should to turn it on only in error_nodes
succeed_nodes -= hung_nodes
error_nodes = (error_nodes + hung_nodes).uniq
running_nodes = last_run.map {|n| n.results[:sender]} - stopped_nodes - hung_nodes
nodes_to_check = running_nodes + succeed_nodes + error_nodes
unless nodes_to_check.size == last_run.size
raise "Should never happen. Internal error in nodes statuses calculation. Statuses calculated for: #{nodes_to_check.inspect},"
"nodes passed to check statuses of: #{last_run.map {|n| n.results[:sender]}}"
end
{'succeed' => succeed_nodes, 'error' => error_nodes, 'running' => running_nodes}
end
def self.puppetd(uids)
puppetd = MClient.new(@ctx, "puppetd", Array(uids))
puppetd.on_respond_timeout do |uids|
nodes = uids.map do |uid|
{ 'uid' => uid, 'status' => 'error', 'error_type' => 'deploy', 'role' => @nodes_roles[uid] }
end
@ctx.report_and_update_status('nodes' => nodes)
end
puppetd
end
def self.processing_error_nodes(error_nodes)
nodes_to_report = []
nodes_to_retry = []
error_nodes.each do |uid|
if @node_retries[uid] > 0
@node_retries[uid] -= 1
Astute.logger.debug "Puppet on node #{uid.inspect} will be restarted. "\
"#{@node_retries[uid]} retries remained."
nodes_to_retry << uid
else
Astute.logger.debug "Node #{uid.inspect} has failed to deploy. There is no more retries for puppet run."
nodes_to_report << {'uid' => uid, 'status' => 'error', 'error_type' => 'deploy', 'role' => @nodes_roles[uid] }
end
end
return nodes_to_report, nodes_to_retry
end
def self.processing_running_nodes(running_nodes)
nodes_to_report = []
if running_nodes.present?
begin
# Pass nodes because logs calculation needs IP address of node, not just uid
nodes_progress = @ctx.deploy_log_parser.progress_calculate(running_nodes, @nodes)
if nodes_progress.present?
Astute.logger.debug "Got progress for nodes: #{nodes_progress.inspect}"
# Nodes with progress are running, so they are not included in nodes_to_report yet
nodes_progress.map! { |x| x.merge!('status' => 'deploying', 'role' => @nodes_roles[x['uid']]) }
nodes_to_report = nodes_progress
end
rescue => e
Astute.logger.warn "Some error occurred when parse logs for nodes progress: #{e.message}, "\
"trace: #{e.format_backtrace}"
end
end
nodes_to_report
end
def self.processing_succeed_nodes(succeed_nodes)
succeed_nodes.map do |uid|
{ 'uid' => uid, 'status' => 'ready', 'role' => @nodes_roles[uid] }
end
end
# As I (Andrey Danin) understand, Puppet agent goes through these steps:
# * Puppetd has 'stopped' state.
# * We run it as a run_once, and puppetd goes to 'idling' state - it trying to
@@ -31,153 +172,39 @@ module Astute
# * After puppetd finished all internal jobs connected with finished catalog,
# it goes to 'idling' state.
# * After a short time it goes to 'stopped' state because we ran it as a run_once.
-    private
-    # Runs puppetd.runonce only if puppet is stopped on the host at the time
-    # If it isn't stopped, we wait a bit and try again.
-    # Returns list of nodes uids which appear to be with hung puppet.
-    def self.puppetd_runonce(puppetd, uids, nodes)
-      debug_mode_dict = nodes.inject({}) {|dict, node| dict[node['uid']] = node['debug']; dict}
-      started = Time.now.to_i
-      while Time.now.to_i - started < Astute.config.PUPPET_FADE_TIMEOUT
-        puppetd.discover(:nodes => uids)
-        last_run = puppetd.last_run_summary
-        running_uids = last_run.select {|x| x.results[:data][:status] != 'stopped'}.map {|n| n.results[:sender]}
-        stopped_uids = uids - running_uids
-        # If stopped_uids is empty this cycle will not be called.
-        stopped_uids.each do |uid|
-          puppetd.discover(:nodes => [uid])
-          puppetd.runonce(:puppet_debug => debug_mode_dict[uid])
-        end
-        uids = running_uids
-        break if uids.empty?
-        sleep Astute.config.PUPPET_FADE_INTERVAL
-      end
-      Astute.logger.debug "puppetd_runonce completed within #{Time.now.to_i - started} seconds."
-      Astute.logger.debug "Following nodes have puppet hung: '#{running_uids.join(',')}'" if running_uids.any?
-      running_uids
-    end
-    def self.calc_nodes_status(last_run, prev_run)
-      # Finished are those which are not in running state,
-      # and changed their last_run time, which is changed after application of catalog,
-      # at the time of updating last_run_summary file. At that particular time puppet is
-      # still running, and will finish in a couple of seconds.
-      # If Puppet had crashed before it got a catalog (e.g. certificate problems),
-      # it didn't update last_run_summary file and switched to 'stopped' state.
-      stopped = last_run.select {|x| x.results[:data][:status] == 'stopped'}
-      # Select all finished nodes which not failed and changed last_run time.
-      succeed_nodes = stopped.select { |n|
-        prev_n = prev_run.find{|ps| ps.results[:sender] == n.results[:sender] }
-        n.results[:data][:resources]['failed'].to_i == 0 &&
-        n.results[:data][:resources]['failed_to_restart'].to_i == 0 &&
-        n.results[:data][:time]['last_run'] != (prev_n && prev_n.results[:data][:time]['last_run'])
-      }.map{|x| x.results[:sender] }
-      stopped_nodes = stopped.map {|x| x.results[:sender]}
-      error_nodes = stopped_nodes - succeed_nodes
-      # Running are all which didn't appear in finished
-      running_nodes = last_run.map {|n| n.results[:sender]} - stopped_nodes
-      nodes_to_check = running_nodes + succeed_nodes + error_nodes
-      unless nodes_to_check.size == last_run.size
-        raise "Shoud never happen. Internal error in nodes statuses calculation. Statuses calculated for: #{nodes_to_check.inspect},"
-              "nodes passed to check statuses of: #{last_run.map {|n| n.results[:sender]}}"
-      end
-      {'succeed' => succeed_nodes, 'error' => error_nodes, 'running' => running_nodes}
-    end
-    public
-    def self.deploy(ctx, nodes, retries=2, change_node_status=true)
-      # TODO: can we hide retries, ignore_failure into @ctx ?
-      uids = nodes.map { |n| n['uid'] }
-      nodes_roles = {}
-      nodes.each { |n| nodes_roles[n['uid']] = n['role'] }
-      # Keep info about retries for each node
-      node_retries = {}
-      uids.each {|x| node_retries.merge!({x => retries}) }
-      Astute.logger.debug "Waiting for puppet to finish deployment on all nodes (timeout = #{Astute.config.PUPPET_TIMEOUT} sec)..."
-      time_before = Time.now
-      Timeout::timeout(Astute.config.PUPPET_TIMEOUT) do
-        puppetd = MClient.new(ctx, "puppetd", uids)
-        puppetd.on_respond_timeout do |uids|
-          nodes = uids.map do |uid|
-            { 'uid' => uid, 'status' => 'error', 'error_type' => 'deploy', 'role' => nodes_roles[uid] }
-          end
-          ctx.report_and_update_status('nodes' => nodes)
-        end
-        prev_summary = puppetd.last_run_summary
-        puppetd_runonce(puppetd, uids, nodes)
-        nodes_to_check = uids
-        last_run = puppetd.last_run_summary
-        while nodes_to_check.any?
-          calc_nodes = calc_nodes_status(last_run, prev_summary)
-          Astute.logger.debug "Nodes statuses: #{calc_nodes.inspect}"
-          # At least we will report about successfully deployed nodes
-          nodes_to_report = []
-          if change_node_status
-            nodes_to_report.concat(calc_nodes['succeed'].map do |uid|
-              { 'uid' => uid, 'status' => 'ready', 'role' => nodes_roles[uid] }
-            end)
-          end
-          # Process retries
-          nodes_to_retry = []
-          calc_nodes['error'].each do |uid|
-            if node_retries[uid] > 0
-              node_retries[uid] -= 1
-              Astute.logger.debug "Puppet on node #{uid.inspect} will be restarted. "\
-                                  "#{node_retries[uid]} retries remained."
-              nodes_to_retry << uid
-            else
-              Astute.logger.debug "Node #{uid.inspect} has failed to deploy. There is no more retries for puppet run."
-              nodes_to_report << {'uid' => uid, 'status' => 'error', 'error_type' => 'deploy', 'role' => nodes_roles[uid] } if change_node_status
-            end
-          end
-          if nodes_to_retry.any?
-            Astute.logger.info "Retrying to run puppet for following error nodes: #{nodes_to_retry.join(',')}"
-            puppetd_runonce(puppetd, nodes_to_retry, nodes)
-            # We need this magic with prev_summary to reflect new puppetd run statuses..
-            prev_summary.delete_if { |x| nodes_to_retry.include?(x.results[:sender]) }
-            prev_summary += last_run.select { |x| nodes_to_retry.include?(x.results[:sender]) }
-          end
-          # /end of processing retries
-          if calc_nodes['running'].any?
-            begin
-              # Pass nodes because logs calculation needs IP address of node, not just uid
-              nodes_progress = ctx.deploy_log_parser.progress_calculate(calc_nodes['running'], nodes)
-              if nodes_progress.any?
-                Astute.logger.debug "Got progress for nodes: #{nodes_progress.inspect}"
-                # Nodes with progress are running, so they are not included in nodes_to_report yet
-                nodes_progress.map! { |x| x.merge!('status' => 'deploying', 'role' => nodes_roles[x['uid']]) }
-                nodes_to_report += nodes_progress
-              end
-            rescue => e
-              Astute.logger.warn "Some error occurred when parse logs for nodes progress: #{e.message}, "\
-                                 "trace: #{e.format_backtrace}"
-            end
-          end
-          ctx.report_and_update_status('nodes' => nodes_to_report) if nodes_to_report.any?
-          # we will iterate only over running nodes and those that we restart deployment for
-          nodes_to_check = calc_nodes['running'] + nodes_to_retry
-          break if nodes_to_check.empty?
-          sleep Astute.config.PUPPET_DEPLOY_INTERVAL
-          puppetd.discover(:nodes => nodes_to_check)
-          last_run = puppetd.last_run_summary
-        end
-      end
-      time_spent = Time.now - time_before
-      Astute.logger.info "#{ctx.task_id}: Spent #{time_spent} seconds on puppet run "\
-                         "for following nodes(uids): #{nodes.map {|n| n['uid']}.join(',')}"
-    end
+    def self.deploy_nodes(nodes_to_check)
+      Timeout::timeout(Astute.config.PUPPET_TIMEOUT) do
+        prev_summary = puppetd(nodes_to_check).last_run_summary
+        hung_nodes = puppetd_runonce(nodes_to_check)
+        while nodes_to_check.present?
+          last_run = puppetd(nodes_to_check).last_run_summary
+          calc_nodes = calc_nodes_status(last_run, prev_summary, hung_nodes)
+          Astute.logger.debug "Nodes statuses: #{calc_nodes.inspect}"
+          report_succeed = processing_succeed_nodes calc_nodes['succeed']
+          report_error, nodes_to_retry = processing_error_nodes(calc_nodes['error'])
+          report_running = processing_running_nodes(calc_nodes['running'])
+          nodes_to_report = report_succeed + report_error + report_running
+          @ctx.report_and_update_status('nodes' => nodes_to_report) if nodes_to_report.present?
+          if nodes_to_retry.present?
+            Astute.logger.info "Retrying to run puppet for following error nodes: #{nodes_to_retry.join(',')}"
+            hung_nodes = puppetd_runonce(nodes_to_retry)
+            # We need this magic with prev_summary to reflect new puppetd run statuses..
+            prev_summary.delete_if { |x| nodes_to_retry.include?(x.results[:sender]) }
+            prev_summary += last_run.select { |x| nodes_to_retry.include?(x.results[:sender]) }
+          end
+          # we will iterate only over running nodes and those that we restart deployment for
+          nodes_to_check = calc_nodes['running'] + nodes_to_retry
+          break if nodes_to_check.empty?
+          sleep Astute.config.PUPPET_DEPLOY_INTERVAL
+        end
+      end
+    end
  end
end
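Taken together, the new methods above are the strong hung check itself: puppetd_runonce returns whichever uids never left the running or idling state within PUPPET_FADE_TIMEOUT, and calc_nodes_status then forces those uids into the error bucket even when their run summary would otherwise look clean. A stripped-down, self-contained Ruby sketch of that classification step, using plain hashes in place of the MCollective result objects and omitting the prev_run timestamp comparison:

    # Each entry mimics the parts of last_run_summary the classifier looks at.
    last_run = [
      { 'uid' => '1', 'status' => 'stopped', 'failed' => 0 }, # finished cleanly
      { 'uid' => '2', 'status' => 'stopped', 'failed' => 1 }, # finished with failures
      { 'uid' => '3', 'status' => 'running', 'failed' => 0 }  # still applying a catalog
    ]

    # hung_uids come from the runonce phase: agents that never stopped in time.
    def classify(last_run, hung_uids = [])
      stopped = last_run.select { |r| r['status'] == 'stopped' }
      succeed = stopped.select { |r| r['failed'].to_i == 0 }.map { |r| r['uid'] }
      error   = stopped.map { |r| r['uid'] } - succeed
      running = last_run.map { |r| r['uid'] } - stopped.map { |r| r['uid'] }

      # The hung-check twist: a hung node can only ever count as an error.
      succeed -= hung_uids
      error    = (error + hung_uids).uniq
      running -= hung_uids

      { 'succeed' => succeed, 'error' => error, 'running' => running }
    end

    p classify(last_run)        # {"succeed"=>["1"], "error"=>["2"], "running"=>["3"]}
    p classify(last_run, ['3']) # node 3 is hung, so it moves from running to error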

View File

@@ -33,6 +33,7 @@ Dir[File.join(File.dirname(__FILE__), 'unit/fixtures/*.rb')].each { |file| requi
# resetting time to sleep significantly increases tests speed
Astute.config.PUPPET_DEPLOY_INTERVAL = 0
Astute.config.PUPPET_FADE_INTERVAL = 0
+ Astute.config.PUPPET_FADE_TIMEOUT = 1
Astute.config.MC_RETRY_INTERVAL = 0
Astute.config.PROVISIONING_TIMEOUT = 0
Astute.config.REBOOT_TIMEOUT = 0

View File

@@ -60,7 +60,7 @@ describe "NailyFact DeploymentEngine" do
    it "it should not raise an exception if deployment mode is unknown" do
      deploy_engine.expects(:upload_facts).times(deploy_data.size)
-      Astute::PuppetdDeployer.stubs(:deploy).with(ctx, deploy_data, instance_of(Fixnum), true).once
+      Astute::PuppetdDeployer.stubs(:deploy).with(ctx, deploy_data, instance_of(Fixnum)).once
      expect {deploy_engine.deploy(deploy_data)}.to_not raise_exception
    end
  end
@@ -74,8 +74,8 @@ describe "NailyFact DeploymentEngine" do
      deploy_engine.expects(:upload_facts).times(deploy_data.size)
      # we got two calls, one for controller (high priority), and another for all computes (same low priority)
-      Astute::PuppetdDeployer.expects(:deploy).with(ctx, controller_nodes, instance_of(Fixnum), true).once
-      Astute::PuppetdDeployer.expects(:deploy).with(ctx, compute_nodes, instance_of(Fixnum), true).once
+      Astute::PuppetdDeployer.expects(:deploy).with(ctx, controller_nodes, instance_of(Fixnum)).once
+      Astute::PuppetdDeployer.expects(:deploy).with(ctx, compute_nodes, instance_of(Fixnum)).once
      expect {deploy_engine.deploy(deploy_data)}.to_not raise_exception
    end
@@ -122,12 +122,12 @@ describe "NailyFact DeploymentEngine" do
      deploy_engine.expects(:upload_facts).at_least_once
      primary_controller = deploy_data.find { |n| n['role'] == 'primary-controller' }
-      Astute::PuppetdDeployer.expects(:deploy).with(ctx, [primary_controller], 2, true).once
+      Astute::PuppetdDeployer.expects(:deploy).with(ctx, [primary_controller], 2).once
      controller_nodes.each do |n|
-        Astute::PuppetdDeployer.expects(:deploy).with(ctx, [n], 2, true).once
+        Astute::PuppetdDeployer.expects(:deploy).with(ctx, [n], 2).once
      end
-      Astute::PuppetdDeployer.expects(:deploy).with(ctx, compute_nodes, instance_of(Fixnum), true).once
+      Astute::PuppetdDeployer.expects(:deploy).with(ctx, compute_nodes, instance_of(Fixnum)).once
      deploy_engine.deploy(deploy_data)
    end

View File

@@ -22,15 +22,15 @@ describe "Puppetd" do
  context "PuppetdDeployer" do
    let(:reporter) { mock('reporter') }
    let(:ctx) do
      Context.new("task id", ProxyReporter::DeploymentProxyReporter.new(reporter), Astute::LogParser::NoParsing.new)
    end
    let(:nodes) { [{'uid' => '1', 'role' => 'compute'}] }
    let(:rpcclient) { mock_rpcclient(nodes) }
    let(:last_run_result) do
      {
        :statuscode =>0,
@@ -48,113 +48,94 @@ describe "Puppetd" do
        :sender=>"1"
      }
    end
    let(:last_run_result_running) do
      res = deep_copy(last_run_result)
      res[:data].merge!(:status => 'running', :running => 1, :stopped => 0)
      res
    end
    let(:last_run_result_fail) do
      res = deep_copy(last_run_result_running)
      res[:data].merge!(:runtime => 1358426000,
                        :time => {"last_run" => 1358426000},
                        :resources => {"failed" => 1}
                       )
      res
    end
    let(:last_run_failed) do
      res = deep_copy(last_run_result_fail)
      res[:data].merge!(:status => 'stopped', :stopped => 1, :running => 0)
      res
    end
    let(:last_run_result_finished) do
      res = deep_copy last_run_result
      res[:data][:time]['last_run'] = 1358428000
      res[:data][:status] = 'stopped'
      res
    end
    context 'reportet behavior' do
+      let(:last_run_result) do
+        {
+          :data=> {
+            :time=>{"last_run"=>1358425701},
+            :status => "running",
+            :resources => {'failed' => 0},
+            :running => 1,
+            :idling => 0
+          },
+          :sender=>"1"
+        }
+      end
      let(:prepare_mcollective_env) do
        last_run_result_new = deep_copy last_run_result
        last_run_result_new[:data][:time]['last_run'] = 1358426000
        rpcclient_new_res = mock_mc_result(last_run_result_new)
        rpcclient_finished_res = mock_mc_result(last_run_result_finished)
        rpcclient_valid_result = mock_mc_result(last_run_result)
        rpcclient.stubs(:last_run_summary).returns([rpcclient_valid_result]).then.
            returns([rpcclient_valid_result]).then.
+            returns([ mock_mc_result(last_run_result_running) ]).then.
+            returns([ mock_mc_result(last_run_result_running) ]).then.
            returns([rpcclient_new_res]).then.
            returns([rpcclient_finished_res])
        rpcclient
      end
      it "reports ready status for node if puppet deploy finished successfully" do
        prepare_mcollective_env
        reporter.expects(:report).with('nodes' => [{'uid' => '1', 'status' => 'ready', 'progress' => 100, 'role' => 'compute'}])
        rpcclient.expects(:runonce).at_least_once.returns([mock_mc_result(last_run_result)])
        Astute::PuppetdDeployer.deploy(ctx, nodes, retries=0)
      end
-      it "doesn't report ready status for node if change_node_status disabled" do
-        prepare_mcollective_env
-        reporter.expects(:report).never
-        rpcclient.expects(:runonce).at_least_once.returns([mock_mc_result(last_run_result)])
-        Astute::PuppetdDeployer.deploy(ctx, nodes, retries=0, change_node_status=false)
-      end
      context 'multiroles behavior' do
        let(:nodes) { [{'uid' => '1', 'role' => 'compute'}] }
        let(:nodes_multiroles) { [{'uid' => '1', 'role' => 'controller'}] }
        before(:each) do
          @ctx = Context.new("task id",
                             ProxyReporter::DeploymentProxyReporter.new(reporter, nodes + nodes_multiroles),
                             Astute::LogParser::NoParsing.new
                            )
        end
        it "it should not send final status before all roles of node will deploy" do
          prepare_mcollective_env
          reporter.expects(:report).with('nodes' => [{'uid' => '1', 'status' => 'deploying', 'progress' => 50, 'role' => 'compute'}])
          rpcclient.expects(:runonce).at_least_once.returns([mock_mc_result(last_run_result)])
          Astute::PuppetdDeployer.deploy(@ctx, nodes, retries=0)
        end
      end
    end
    context "puppet state transitions" do
      let(:last_run_result_idle_pre) do
        res = deep_copy(last_run_result)
        res[:data].merge!(:status => 'idling', :idling => 1, :stopped => 0)
        res
      end
      let(:last_run_result_idle_post) do
        res = deep_copy(last_run_result_fail)
        res[:data].merge!(:status => 'idling', :idling => 1, :running => 0)
@@ -172,11 +153,11 @@ describe "Puppetd" do
            returns([ mock_mc_result(last_run_result_fail) ]).then.
            returns([ mock_mc_result(last_run_result_fail) ]).then.
            returns([ mock_mc_result(last_run_failed) ])
        reporter.expects(:report).with('nodes' => [{'status' => 'error', 'error_type' => 'deploy', 'uid' => '1', 'role' => 'compute'}])
        rpcclient.expects(:runonce).once.
            returns([ mock_mc_result(last_run_result) ])
        Astute::PuppetdDeployer.deploy(ctx, nodes, 0)
      end
@@ -188,11 +169,11 @@ describe "Puppetd" do
            returns([ mock_mc_result(last_run_result_running) ]).then.
            returns([ mock_mc_result(last_run_result_fail) ]).then.
            returns([ mock_mc_result(last_run_failed) ])
        reporter.expects(:report).with('nodes' => [{'status' => 'error', 'error_type' => 'deploy', 'uid' => '1', 'role' => 'compute'}])
        rpcclient.expects(:runonce).once.
            returns([ mock_mc_result(last_run_result) ])
        Astute::PuppetdDeployer.deploy(ctx, nodes, 0)
      end
@@ -202,26 +183,26 @@ describe "Puppetd" do
            returns([ mock_mc_result(last_run_result) ]).then.
            returns([ mock_mc_result(last_run_result_running) ]).then.
            returns([ mock_mc_result(last_run_failed) ])
        reporter.expects(:report).with('nodes' => [{'status' => 'error', 'error_type' => 'deploy', 'uid' => '1', 'role' => 'compute'}])
        rpcclient.expects(:runonce).once.
            returns([ mock_mc_result(last_run_result) ])
        Astute::PuppetdDeployer.deploy(ctx, nodes, 0)
      end
-      it "publishes error status for node if puppet failed (a cycle w/ one running state only)" do
+      it "publishes error status for node if puppet failed (a cycle with one running state only)" do
        rpcclient.stubs(:last_run_summary).times(5).
            returns([ mock_mc_result(last_run_result) ]).then.
            returns([ mock_mc_result(last_run_result) ]).then.
            returns([ mock_mc_result(last_run_result_running) ]).then.
            returns([ mock_mc_result(last_run_result_fail) ]).then.
            returns([ mock_mc_result(last_run_failed) ])
        reporter.expects(:report).with('nodes' => [{'status' => 'error', 'error_type' => 'deploy', 'uid' => '1', 'role' => 'compute'}])
        rpcclient.expects(:runonce).once.
            returns([ mock_mc_result(last_run_result) ])
        Astute::PuppetdDeployer.deploy(ctx, nodes, 0)
      end
@@ -230,30 +211,55 @@ describe "Puppetd" do
            returns([ mock_mc_result(last_run_result) ]).then.
            returns([ mock_mc_result(last_run_result) ]).then.
            returns([ mock_mc_result(last_run_failed) ])
        reporter.expects(:report).with('nodes' => [{'status' => 'error', 'error_type' => 'deploy', 'uid' => '1', 'role' => 'compute'}])
        rpcclient.expects(:runonce).once.
            returns([ mock_mc_result(last_run_result) ])
        Astute::PuppetdDeployer.deploy(ctx, nodes, 0)
      end
    end
-    it "doesn't publish error status for node if change_node_status disabled" do
-      reporter.expects(:report).never
-      rpcclient_valid_result = mock_mc_result(last_run_result)
-      rpcclient_new_res = mock_mc_result(last_run_result_fail)
-      rpcclient_finished_res = mock_mc_result(last_run_failed)
-      rpcclient.stubs(:last_run_summary).returns([rpcclient_valid_result]).then.
-          returns([rpcclient_valid_result]).then.
-          returns([rpcclient_new_res]).then.
-          returns([rpcclient_finished_res])
-      rpcclient.expects(:runonce).at_least_once.returns([rpcclient_valid_result])
-      MClient.any_instance.stubs(:rpcclient).returns(rpcclient)
-      Astute::PuppetdDeployer.deploy(ctx, nodes, retries=0, change_node_status=false)
-    end
+    context '' do
+      around(:each) do |example|
+        old_value = Astute.config.PUPPET_FADE_INTERVAL
+        example.run
+        Astute.config.PUPPET_FADE_INTERVAL = old_value
+      end
+      before(:each) do
+        Astute.config.PUPPET_FADE_INTERVAL = 1
+      end
+      it "publishes error status for node if puppet running alien task (attempts been exhausted)" do
+        rpcclient.stubs(:last_run_summary).at_least(3).
+          returns([ mock_mc_result(last_run_result_running) ]).then.
+          returns([ mock_mc_result(last_run_result_running) ]).then.
+          returns([ mock_mc_result(last_run_result_running) ]).then.
+          returns([ mock_mc_result(last_run_result_running) ])
+        reporter.expects(:report).with('nodes' => [{'status' => 'error', 'error_type' => 'deploy', 'uid' => '1', 'role' => 'compute'}])
+        rpcclient.expects(:runonce).never
+        Astute::PuppetdDeployer.deploy(ctx, nodes, 0)
+      end
+      it "ignore exit code of puppet running of alien task (waited for alien task stop and launched own)" do
+        rpcclient.stubs(:last_run_summary).at_least(3).
+          returns([ mock_mc_result(last_run_result_running) ]).then.
+          returns([ mock_mc_result(last_run_result_running) ]).then.
+          returns([ mock_mc_result(last_run_failed) ]).then.
+          returns([ mock_mc_result(last_run_failed) ]).then.
+          returns([ mock_mc_result(last_run_result_running) ]).then.
+          returns([ mock_mc_result(last_run_result_running) ]).then.
+          returns([ mock_mc_result(last_run_result_finished) ])
+        rpcclient.expects(:runonce).at_least(1).returns([ mock_mc_result(last_run_result) ])
+        reporter.expects(:report).with('nodes' => [{'uid' => '1', 'status' => 'ready', 'progress' => 100, 'role' => 'compute'}])
+        Astute::PuppetdDeployer.deploy(ctx, nodes, 1)
+      end
+    end
    it "retries to run puppet if it fails" do
@@ -268,7 +274,7 @@ describe "Puppetd" do
          returns([rpcclient_failed]).then.
          returns([rpcclient_fail]).then.
          returns([rpcclient_succeed])
      reporter.expects(:report).with('nodes' => [{'uid' => '1', 'status' => 'ready', 'progress' => 100, 'role' => 'compute'}])
      rpcclient.expects(:runonce).at_least_once.returns([rpcclient_valid_result])