From e02c6257ae127c86d5c5bc41045b6cd841a46fbe Mon Sep 17 00:00:00 2001 From: Liam Young Date: Wed, 9 Sep 2020 09:30:46 +0000 Subject: [PATCH] Fix adding of stonith controlled resources. There appears to be a window between a pacemaker remote resource being added and the location properties for that resource being added. In this window the resource is down and pacemaker may fence the node. The window is present because the charm charm currently does: 1) Set stonith-enabled=true cluster property 2) Add maas stonith device that controls pacemaker remote node that has not yet been added. 3) Add pacemaker remote node 4) Add pacemaker location rules. I think the following two fixes are needed: 1) For initial deploys update the charm so it does not enable stonith until stonith resources and pacemaker remotes have been added. 2) For scale-out do not add the new pacemaker remote stonith resource until the corresponding pacemaker resource has been added along with its location rules. Depends-On: Ib8a667d0d82ef3dcd4da27e62460b4f0ce32ee43 Change-Id: I7e2f568d829f6d0bfc7859a7d0ea239203bbc490 Closes-Bug: #1884284 --- hooks/hooks.py | 28 +++++++++++++------ hooks/utils.py | 44 ++++++++++++++++++++++++++++-- unit_tests/test_hacluster_hooks.py | 16 ++++++++--- unit_tests/test_hacluster_utils.py | 38 ++++++++++++++++++++++---- 4 files changed, 104 insertions(+), 22 deletions(-) diff --git a/hooks/hooks.py b/hooks/hooks.py index c86c1c1..d8a8fde 100755 --- a/hooks/hooks.py +++ b/hooks/hooks.py @@ -117,6 +117,8 @@ from utils import ( notify_peers_of_series_upgrade, clear_series_upgrade_notification, get_hostname, + disable_stonith, + is_stonith_configured, ) from charmhelpers.contrib.charmsupport import nrpe @@ -169,12 +171,13 @@ def get_transport(): return val -def run_initial_steup(): +def run_initial_setup(): """Run global setup.""" failure_timeout = config('failure_timeout') configure_cluster_global(failure_timeout) configure_monitor_host() - configure_stonith() + if not is_stonith_configured(): + disable_stonith() @hooks.hook('config-changed') @@ -206,7 +209,7 @@ def config_changed(): if configure_corosync(): try_pcmk_wait() if is_leader(): - run_initial_steup() + run_initial_setup() update_nrpe_config() @@ -376,7 +379,7 @@ def ha_relation_changed(): # Only configure the cluster resources # from the oldest peer unit. if is_leader(): - run_initial_steup() + run_initial_setup() log('Setting cluster symmetry', level=INFO) set_cluster_symmetry() log('Deleting Resources' % (delete_resources), level=DEBUG) @@ -521,15 +524,22 @@ def ha_relation_changed(): log('Configuring any remote nodes', level=INFO) remote_resources = configure_pacemaker_remote_resources() resources.update(remote_resources) - stonith_remote_res = configure_pacemaker_remote_stonith_resource() - resources.update(stonith_remote_res) - if stonith_remote_res: - stonith_peer_res = configure_peer_stonith_resource() - resources.update(stonith_peer_res) configure_resources_on_remotes( resources=resources, clones=clones, groups=groups) + + stonith_resources = {} + stonith_remote_res = configure_pacemaker_remote_stonith_resource() + stonith_resources.update(stonith_remote_res) + if stonith_remote_res: + stonith_peer_res = configure_peer_stonith_resource() + stonith_resources.update(stonith_peer_res) + configure_resources_on_remotes( + resources=stonith_resources, + clones=clones, + groups=groups) + configure_stonith() else: log('Deferring configuration of any remote nodes', level=INFO) diff --git a/hooks/utils.py b/hooks/utils.py index 76f7998..e3fbc70 100644 --- a/hooks/utils.py +++ b/hooks/utils.py @@ -30,6 +30,9 @@ import itertools from base64 import b64decode +from charmhelpers.core.strutils import ( + bool_from_string, +) from charmhelpers.core.hookenv import ( local_unit, log, @@ -37,6 +40,8 @@ from charmhelpers.core.hookenv import ( ERROR, INFO, WARNING, + leader_get, + leader_set, relation_get, relation_set, related_units, @@ -107,6 +112,7 @@ SYSTEMD_OVERRIDES_FILE = '{}/overrides.conf' MAAS_DNS_CONF_DIR = '/etc/maas_dns' +STONITH_CONFIGURED = 'stonith-configured' class MAASConfigIncomplete(Exception): @@ -508,8 +514,8 @@ def parse_data(relid, unit, key): def configure_stonith(): if configure_pacemaker_remote_stonith_resource(): configure_peer_stonith_resource() - log('Not disabling STONITH as pacemaker remotes are present', - level=INFO) + enable_stonith() + set_stonith_configured(True) else: log('Disabling STONITH', level=INFO) cmd = "crm configure property stonith-enabled=false" @@ -699,10 +705,21 @@ def configure_maas_stonith_resource(stonith_hostnames): "op monitor interval=25 start-delay=25 " "timeout=25")} _configure_stonith_resource(ctxt) + return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']} + + +def enable_stonith(): + """Enable stonith via the global property stonith-enabled.""" pcmk.commit( "crm configure property stonith-enabled=true", failure_is_fatal=True) - return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']} + + +def disable_stonith(): + """Disable stonith via the global property stonith-enabled.""" + pcmk.commit( + "crm configure property stonith-enabled=false", + failure_is_fatal=True) def get_ip_addr_from_resource_params(params): @@ -1503,3 +1520,24 @@ def clear_series_upgrade_notification(): relation_set( relation_id=rel_id, relation_settings=relation_data) + + +def set_stonith_configured(is_configured): + """Set the STONITH_CONFIGURED state. + + :param is_configured: Flag to check peers relation data for. + :type is_configured: bool + :returns: List of IPs of nodes that are ready to join the cluster + :rtype: List + """ + leader_set({STONITH_CONFIGURED: is_configured}) + + +def is_stonith_configured(): + """Get the STONITH_CONFIGURED state. + + :returns: State of STONITH_CONFIGURED state. + :rtype: bool + """ + configured = leader_get(STONITH_CONFIGURED) or 'False' + return bool_from_string(configured) diff --git a/unit_tests/test_hacluster_hooks.py b/unit_tests/test_hacluster_hooks.py index 8a80f60..5e525df 100644 --- a/unit_tests/test_hacluster_hooks.py +++ b/unit_tests/test_hacluster_hooks.py @@ -40,6 +40,7 @@ class TestCorosyncConf(unittest.TestCase): shutil.rmtree(self.tmpdir) os.remove(self.tmpfile.name) + @mock.patch.object(hooks, 'is_stonith_configured') @mock.patch.object(hooks, 'configure_peer_stonith_resource') @mock.patch.object(hooks, 'get_member_ready_nodes') @mock.patch.object(hooks, 'configure_resources_on_remotes') @@ -74,7 +75,8 @@ class TestCorosyncConf(unittest.TestCase): configure_pacemaker_remote_stonith_resource, configure_resources_on_remotes, get_member_ready_nodes, - configure_peer_stonith_resource): + configure_peer_stonith_resource, + is_stonith_configured): def fake_crm_opt_exists(res_name): # res_ubuntu will take the "update resource" route @@ -83,6 +85,7 @@ class TestCorosyncConf(unittest.TestCase): crm_opt_exists.side_effect = fake_crm_opt_exists commit.return_value = 0 + is_stonith_configured.return_value = False is_leader.return_value = True related_units.return_value = ['ha/0', 'ha/1', 'ha/2'] get_cluster_nodes.return_value = ['10.0.3.2', '10.0.3.3', '10.0.3.4'] @@ -157,6 +160,7 @@ class TestCorosyncConf(unittest.TestCase): commit.assert_any_call( 'crm -w -F configure %s %s %s' % (kw, name, params)) + @mock.patch.object(hooks, 'is_stonith_configured') @mock.patch.object(hooks, 'configure_peer_stonith_resource') @mock.patch.object(hooks, 'get_member_ready_nodes') @mock.patch.object(hooks, 'configure_resources_on_remotes') @@ -190,7 +194,9 @@ class TestCorosyncConf(unittest.TestCase): set_cluster_symmetry, configure_pacemaker_remote_resources, configure_pacemaker_remote_stonith_resource, configure_resources_on_remotes, get_member_ready_nodes, - configure_peer_stonith_resource): + configure_peer_stonith_resource, + is_stonith_configured): + is_stonith_configured.return_value = False validate_dns_ha.return_value = True crm_opt_exists.return_value = False is_leader.return_value = True @@ -353,6 +359,7 @@ class TestHooks(test_utils.CharmTestCase): apt_install.assert_called_once_with(expected_pkgs, fatal=True) setup_ocf_files.assert_called_once_with() + @mock.patch.object(hooks, 'is_stonith_configured') @mock.patch.object(hooks, 'configure_stonith') @mock.patch.object(hooks, 'relation_ids') @mock.patch.object(hooks, 'hanode_relation_joined') @@ -373,8 +380,10 @@ class TestHooks(test_utils.CharmTestCase): mock_maintenance_mode, mock_hanode_relation_joined, mock_relation_ids, - mock_configure_stonith): + mock_configure_stonith, + mock_is_stonith_configured): + mock_is_stonith_configured.return_value = False mock_config.side_effect = self.test_config.get mock_relation_ids.return_value = ['hanode:1'] mock_wait_for_pcmk.return_value = True @@ -395,7 +404,6 @@ class TestHooks(test_utils.CharmTestCase): self.test_config.set('maintenance-mode', False) hooks.config_changed() mock_maintenance_mode.assert_called_with(False) - mock_configure_stonith.assert_called_with() @mock.patch.object(hooks, 'needs_maas_dns_migration') @mock.patch.object(hooks, 'relation_ids') diff --git a/unit_tests/test_hacluster_utils.py b/unit_tests/test_hacluster_utils.py index 9f15a5b..06518a3 100644 --- a/unit_tests/test_hacluster_utils.py +++ b/unit_tests/test_hacluster_utils.py @@ -877,9 +877,6 @@ class UtilsTestCase(unittest.TestCase): "timeout=25") commit_calls = [ mock.call(cmd, failure_is_fatal=True), - mock.call( - 'crm configure property stonith-enabled=true', - failure_is_fatal=True), ] commit.assert_has_calls(commit_calls) @@ -922,9 +919,6 @@ class UtilsTestCase(unittest.TestCase): 'stonith:external/maas', ("params url='http://maas/2.0' apikey='apikey' hostnames='node1' " "op monitor interval=25 start-delay=25 timeout=25")) - commit.assert_called_once_with( - 'crm configure property stonith-enabled=true', - failure_is_fatal=True) @mock.patch.object(utils, 'config') @mock.patch('pcmk.commit') @@ -1232,3 +1226,35 @@ class UtilsTestCase(unittest.TestCase): mock.call('crm -w -F resource stop st-maas-1234'), mock.call('crm -w -F configure delete st-maas-1234')] mock_commit.assert_has_calls(commit_calls) + + @mock.patch.object(utils, 'leader_set') + def test_set_stonith_configured(self, leader_set): + utils.set_stonith_configured(True) + leader_set.assert_called_once_with( + {'stonith-configured': True}) + + @mock.patch.object(utils, 'leader_get') + def test_is_stonith_configured(self, leader_get): + leader_get.return_value = 'True' + self.assertTrue( + utils.is_stonith_configured()) + leader_get.return_value = 'False' + self.assertFalse( + utils.is_stonith_configured()) + leader_get.return_value = '' + self.assertFalse( + utils.is_stonith_configured()) + + @mock.patch('pcmk.commit') + def test_enable_stonith(self, commit): + utils.enable_stonith() + commit.assert_called_once_with( + 'crm configure property stonith-enabled=true', + failure_is_fatal=True) + + @mock.patch('pcmk.commit') + def test_disable_stonith(self, commit): + utils.disable_stonith() + commit.assert_called_once_with( + 'crm configure property stonith-enabled=false', + failure_is_fatal=True)