From 8200221a30b8d9dee403fcfaec25f5014341f204 Mon Sep 17 00:00:00 2001
From: Trent Lloyd
Date: Fri, 9 Feb 2018 14:08:04 +0800
Subject: [PATCH] Enforce no-quorum-policy=stop for all cluster sizes

Previously quorum was only enforced on clusters with 3 or more nodes,
under the mistaken assumption that it is not possible to have quorum
with only 2 nodes. The corosync votequorum agent that is configured
allows for quorum in 2-node scenarios using the "two_node" option,
which the charm already configures.

In this scenario corosync requires that both nodes be present in order
to initially form cluster quorum, but it also allows a single surviving
node to keep quorum, or take over, once it has started while in contact
with the other node. The net effect of this change is that nodes are
unable to start up independently (which is when split-brain situations
are frequently seen, due to network start-up delays, etc). There is no
change to the runtime behavior (there is still a risk that both nodes
go active if the network connection between them is interrupted; this
is an inherent risk of two-node clusters and requires a 3-node cluster
to fix).

Thus we update the CRM configuration to always set
no-quorum-policy=stop, regardless of whether the cluster has 2 or 3+
nodes.

In the event that you need to start up a cluster manually with only 1
node, first verify that the second node is definitely either powered
off, or that corosync/pacemaker and all managed resources on it are
stopped (so we can be sure it will not go split-brain, because it
cannot start up again until it is in contact with the other node).
Then you can override cluster startup with this command, which
temporarily sets the expected votes to 1 instead of 2:

$ corosync-quorumtool -e1

Once the second node comes back up and corosync reconnects, the
expected vote count is automatically reset to the configured value (as
it also is if corosync is restarted).

Change-Id: Ica6a3ba387a4ab362400a25ff2ba0145e0218e1f
---
 hooks/utils.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/hooks/utils.py b/hooks/utils.py
index 67c9d44..687796c 100644
--- a/hooks/utils.py
+++ b/hooks/utils.py
@@ -552,16 +552,12 @@ def configure_monitor_host():
 def configure_cluster_global():
     """Configure global cluster options"""
     log('Applying global cluster configuration', level=DEBUG)
-    if int(config('cluster_count')) >= 3:
-        # NOTE(jamespage) if 3 or more nodes, then quorum can be
-        # managed effectively, so stop if quorum lost
-        log('Configuring no-quorum-policy to stop', level=DEBUG)
-        cmd = "crm configure property no-quorum-policy=stop"
-    else:
-        # NOTE(jamespage) if less that 3 nodes, quorum not possible
-        # so ignore
-        log('Configuring no-quorum-policy to ignore', level=DEBUG)
-        cmd = "crm configure property no-quorum-policy=ignore"
+    # NOTE(lathiat) quorum in a two-node scenario is handled by
+    # corosync two_node=1. In this case quorum is required for
+    # initial cluster startup but not if a node was previously in
+    # contact with the full cluster.
+    log('Configuring no-quorum-policy to stop', level=DEBUG)
+    cmd = "crm configure property no-quorum-policy=stop"
     pcmk.commit(cmd)
 
     cmd = ('crm configure rsc_defaults $id="rsc-options" '
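
For reference, a minimal sketch (not part of the patch; the helper name
and its guard argument are hypothetical) of the manual single-node
start-up override described in the commit message, assuming the
operator has already confirmed the second node is down:

#!/usr/bin/env python3
"""Illustrative sketch of the manual quorum override described above."""

import subprocess


def force_single_node_quorum(peer_confirmed_down):
    """Hypothetical helper: temporarily set expected votes to 1.

    Only safe once the second node is powered off, or corosync/pacemaker
    and all managed resources on it are stopped. Corosync resets the
    expected vote count to the configured value when the peer rejoins
    (or when corosync is restarted).
    """
    if not peer_confirmed_down:
        raise RuntimeError('refusing quorum override: peer state not confirmed')
    # Equivalent to running `corosync-quorumtool -e1` by hand.
    subprocess.check_call(['corosync-quorumtool', '-e1'])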