Fixes for race condition when adding primitives without OCF present

* Change the service "status" to report "stopped" if the
  primitive has failures on the node.
* Enable "status" failure check.
* Support error detection for missing OCF monitor operations.
* Add operations status debug method
* Add forgotten cib_reset to the wait_for_online

Change-Id: I34fbf8b4a7d2420fb568719f473bc9b40063cc82
This commit is contained in:
Dmitry Ilyin 2016-08-16 13:16:05 -05:00 committed by Alex Schultz
parent fe4d448938
commit c1c2031c84
7 changed files with 68 additions and 26 deletions

View File

@ -19,6 +19,9 @@
- symmetric-cluster
- no-quorum-policy
# Show the debug messages for the resource operations status calculation
:debug_show_operations: false
# don't actually do any changes to the system
# only show what command would have been run
:debug_enabled: false
@ -48,10 +51,14 @@
:status_mode_simple: :global
# cleanup the primitive during these actions?
:cleanup_on_status: false
:cleanup_on_start: true
:cleanup_on_stop: true
# set the primitive status to stopped if there are failures
# forcing the primitive to be started again and cleaned up
# on this node
:cleanup_on_status: true
# try to stop and disable the basic service on these provider actions
# the basic service is the service managed by the system
# init scripts or the upstart/systemd units

View File

@ -31,7 +31,10 @@ module Pacemaker
message = "Waiting #{max_wait_time} seconds for Pacemaker to become online"
message += " (#{comment})" if comment
debug message
retry_block { online? }
retry_block do
cib_reset 'wait_for_online'
online?
end
debug 'Pacemaker is online'
end

View File

@ -102,7 +102,7 @@ module Pacemaker
node_status_string = '?' unless node_status_string.is_a? String
node_status_string = node_status_string.upcase
node_block = "#{node_name}: #{node_status_string}"
node_block += ' (F)' if primitive_has_failures?(primitive, node_name) && (!primitive_is_running? primitive, node_name)
node_block += ' (F)' if primitive_has_failures? primitive, node_name
node_block += ' (L)' if service_location_exists? primitive_full_name(primitive), node_name
nodes << node_block
end
@ -115,5 +115,24 @@ module Pacemaker
report += " at '#{tag}'" if tag
report + "\n"
end
# Build the multi-line debug report that shows which operations were
# taken into account for a resource on a single node, followed by the
# 'status' and 'failed' values stored in the resource.
# Operation attributes that are missing are rendered as '?'.
# @param [Array<Hash>] operations the 'operation', 'rc-code' and 'op-status' keys are read
# @param [Hash<String => String>] resource the 'id', 'status' and 'failed' keys are read
# @param [String] node_name
# @return [String] the report; every line is terminated by a newline
def resource_operations_report(operations, resource, node_name)
  lines = []
  lines << "Operations status debug start for the node: '#{node_name}'"
  lines << "Resource: '#{resource['id']}'"
  operations.each do |entry|
    name = entry.fetch('operation', '?').capitalize
    rc = entry.fetch('rc-code', '?')
    op_status = entry.fetch('op-status', '?')
    # pad the operation name so that the rc/op columns line up
    lines << "* #{name.ljust 7}: rc:#{rc} op:#{op_status}"
  end
  lines << "Status: #{resource['status']} Failed: #{resource['failed']}"
  lines << "Operations status debug end for the node: '#{node_name}'"
  lines.map { |line| "#{line}\n" }.join
end
end
end

View File

@ -84,8 +84,9 @@ module Pacemaker
# decode lrm_resources section of CIB
# @param lrm_resources [REXML::Element]
# @param [String] node_name
# @return [Hash<String => Hash>]
def decode_lrm_resources(lrm_resources)
def decode_lrm_resources(lrm_resources, node_name=nil)
resources = {}
lrm_resources.each do |lrm_resource|
resource = attributes_to_hash lrm_resource
@ -97,6 +98,7 @@ module Pacemaker
resource.store 'ops', ops
resource.store 'status', determine_primitive_status(ops)
resource.store 'failed', failed_operations_found?(ops)
debug resource_operations_report ops, resource, node_name if pacemaker_options[:debug_show_operations]
resources.store id, resource
end
resources
@ -128,7 +130,7 @@ module Pacemaker
next unless lrm
lrm_resources = cib_section_lrm_resources lrm
next unless lrm_resources
resources = decode_lrm_resources lrm_resources
resources = decode_lrm_resources lrm_resources, node_name
node.store 'primitives', resources
@node_status_structure.store node_name, node
end
@ -141,8 +143,9 @@ module Pacemaker
# @return [TrueClass,FalseClass]
def failed_operations_found?(ops)
ops.each do |op|
# skip incompleate ops
next unless op['op-status'] == '0'
# skip pending ops
next if op['op-status'] == '-1'
# skip useless ops
next unless %w(start stop monitor promote).include? op['operation']
@ -173,8 +176,8 @@ module Pacemaker
.fetch('status', nil)
else
statuses = []
node_status.each do |_k, v|
status = v.fetch('primitives', {})
node_status.each do |_node_name, node_status|
status = node_status.fetch('primitives', {})
.fetch(primitive, {})
.fetch('status', nil)
statuses << status

View File

@ -147,12 +147,6 @@ Puppet::Type.type(:service).provide(:pacemaker_xml, parent: Puppet::Provider::Pa
cib_reset 'service_status'
wait_for_online 'service_status'
if pacemaker_options[:cleanup_on_status]
if !pacemaker_options[:cleanup_only_if_failures] || primitive_has_failures?(name, hostname)
cleanup
end
end
out = if primitive_is_master? name
service_status_mode pacemaker_options[:status_mode_master]
elsif primitive_is_clone? name
@ -168,6 +162,13 @@ Puppet::Type.type(:service).provide(:pacemaker_xml, parent: Puppet::Provider::Pa
end
end
if pacemaker_options[:cleanup_on_status]
if out == :running and primitive_has_failures? name, hostname
debug "Primitive: '#{name}' has failures on the node: '#{hostname}' Service status set to 'stopped'."
out = :stopped
end
end
debug "Return: '#{out}' (#{out.class})"
debug cluster_debug_report "#{@resource} status"
out

View File

@ -75,13 +75,13 @@ describe Puppet::Provider::PacemakerXML do
'then' => 'p_neutron-dhcp-agent',
},
'order-test1-test2-Mandatory' => {
'first'=>'test1',
'first-action'=>'promote',
'id'=>'order-test1-test2-Mandatory',
'kind'=>'Mandatory',
'symmetrical'=>'true',
'then'=>'test2',
'then-action'=>'start',
'first' => 'test1',
'first-action' => 'promote',
'id' => 'order-test1-test2-Mandatory',
'kind' => 'Mandatory',
'symmetrical' => 'true',
'then' => 'test2',
'then-action' => 'start',
},
}
end
@ -278,7 +278,7 @@ Pacemaker debug block start at 'test'
-> Simple primitive: 'p_ceilometer-alarm-evaluator'
node-1: STOP | node-2: STOP (F) | node-3: STOP (F)
-> Simple primitive: 'p_heat-engine'
node-1: START (L) | node-2: STOP | node-3: STOP
node-1: START (F) (L) | node-2: STOP | node-3: STOP
-> Simple primitive: 'p_ceilometer-agent-central' (M)
node-1: STOP | node-2: STOP (F) | node-3: STOP (F)
-> Simple primitive: 'vip__management'
@ -292,7 +292,7 @@ Pacemaker debug block start at 'test'
-> Clone primitive: 'p_mysql-clone'
node-1: START (L) | node-2: START (L) | node-3: STOP
-> Simple primitive: 'p_neutron-dhcp-agent'
node-1: START (L) | node-2: STOP | node-3: STOP
node-1: START (F) (L) | node-2: STOP | node-3: STOP
-> Simple primitive: 'vip__public'
node-1: START (L) | node-2: STOP (L) | node-3: STOP (L)
-> Clone primitive: 'p_haproxy-clone'
@ -354,14 +354,14 @@ Pacemaker debug block end at 'test'
it 'can determine if primitive is failed or not globally' do
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central').to eq true
expect(subject.primitive_has_failures? 'p_heat-engine').to eq false
expect(subject.primitive_has_failures? 'p_heat-engine').to eq true
expect(subject.primitive_has_failures? 'UNKNOWN').to eq nil
end
it 'can determine if primitive is failed or not locally' do
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central', 'node-1').to eq false
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central', 'node-2').to eq true
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-1').to eq false
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-1').to eq true
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-2').to eq false
expect(subject.primitive_has_failures? 'UNKNOWN', 'node-1').to eq nil
end

View File

@ -166,6 +166,15 @@ describe Puppet::Type.type(:service).provider(:pacemaker_xml) do
provider.stubs(:service_location_exists?).returns(true)
expect(provider.status).to eq :running
end
it 'counts a service as stopped if the primitive has failures' do
provider.stubs(:get_primitive_puppet_status).returns(:running)
provider.stubs(:service_location_exists?).returns(true)
provider.expects(:primitive_has_failures?).with(name, hostname).returns(true)
expect(provider.status).to eq :stopped
provider.expects(:primitive_has_failures?).with(name, hostname).returns(false)
expect(provider.status).to eq :running
end
end
context '#start' do