Fix adding of stonith controlled resources.

There appears to be a window between a pacemaker remote resource
being added and the location properties for that resource being
added. In this window the resource is down and pacemaker may fence
the node.

The window is present because the charm charm currently does:

1) Set stonith-enabled=true cluster property
2) Add maas stonith device that controls pacemaker remote node that
   has not yet been added.
3) Add pacemaker remote node
4) Add pacemaker location rules.

I think the following two fixes are needed:

1) For initial deploys update the charm so it does not enable stonith
   until stonith resources and pacemaker remotes have been added.

2) For scale-out do not add the new pacemaker remote stonith resource
   until the corresponding pacemaker resource has been added along
   with its location rules.

Depends-On: Ib8a667d0d82ef3dcd4da27e62460b4f0ce32ee43
Change-Id: I7e2f568d829f6d0bfc7859a7d0ea239203bbc490
Closes-Bug: #1884284
This commit is contained in:
Liam Young 2020-09-09 09:30:46 +00:00
parent fc18baa623
commit e02c6257ae
4 changed files with 104 additions and 22 deletions

View File

@ -117,6 +117,8 @@ from utils import (
notify_peers_of_series_upgrade,
clear_series_upgrade_notification,
get_hostname,
disable_stonith,
is_stonith_configured,
)
from charmhelpers.contrib.charmsupport import nrpe
@ -169,12 +171,13 @@ def get_transport():
return val
def run_initial_steup():
def run_initial_setup():
"""Run global setup."""
failure_timeout = config('failure_timeout')
configure_cluster_global(failure_timeout)
configure_monitor_host()
configure_stonith()
if not is_stonith_configured():
disable_stonith()
@hooks.hook('config-changed')
@ -206,7 +209,7 @@ def config_changed():
if configure_corosync():
try_pcmk_wait()
if is_leader():
run_initial_steup()
run_initial_setup()
update_nrpe_config()
@ -376,7 +379,7 @@ def ha_relation_changed():
# Only configure the cluster resources
# from the oldest peer unit.
if is_leader():
run_initial_steup()
run_initial_setup()
log('Setting cluster symmetry', level=INFO)
set_cluster_symmetry()
log('Deleting Resources' % (delete_resources), level=DEBUG)
@ -521,15 +524,22 @@ def ha_relation_changed():
log('Configuring any remote nodes', level=INFO)
remote_resources = configure_pacemaker_remote_resources()
resources.update(remote_resources)
stonith_remote_res = configure_pacemaker_remote_stonith_resource()
resources.update(stonith_remote_res)
if stonith_remote_res:
stonith_peer_res = configure_peer_stonith_resource()
resources.update(stonith_peer_res)
configure_resources_on_remotes(
resources=resources,
clones=clones,
groups=groups)
stonith_resources = {}
stonith_remote_res = configure_pacemaker_remote_stonith_resource()
stonith_resources.update(stonith_remote_res)
if stonith_remote_res:
stonith_peer_res = configure_peer_stonith_resource()
stonith_resources.update(stonith_peer_res)
configure_resources_on_remotes(
resources=stonith_resources,
clones=clones,
groups=groups)
configure_stonith()
else:
log('Deferring configuration of any remote nodes', level=INFO)

View File

@ -30,6 +30,9 @@ import itertools
from base64 import b64decode
from charmhelpers.core.strutils import (
bool_from_string,
)
from charmhelpers.core.hookenv import (
local_unit,
log,
@ -37,6 +40,8 @@ from charmhelpers.core.hookenv import (
ERROR,
INFO,
WARNING,
leader_get,
leader_set,
relation_get,
relation_set,
related_units,
@ -107,6 +112,7 @@ SYSTEMD_OVERRIDES_FILE = '{}/overrides.conf'
MAAS_DNS_CONF_DIR = '/etc/maas_dns'
STONITH_CONFIGURED = 'stonith-configured'
class MAASConfigIncomplete(Exception):
@ -508,8 +514,8 @@ def parse_data(relid, unit, key):
def configure_stonith():
if configure_pacemaker_remote_stonith_resource():
configure_peer_stonith_resource()
log('Not disabling STONITH as pacemaker remotes are present',
level=INFO)
enable_stonith()
set_stonith_configured(True)
else:
log('Disabling STONITH', level=INFO)
cmd = "crm configure property stonith-enabled=false"
@ -699,10 +705,21 @@ def configure_maas_stonith_resource(stonith_hostnames):
"op monitor interval=25 start-delay=25 "
"timeout=25")}
_configure_stonith_resource(ctxt)
return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
def enable_stonith():
"""Enable stonith via the global property stonith-enabled."""
pcmk.commit(
"crm configure property stonith-enabled=true",
failure_is_fatal=True)
return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
def disable_stonith():
"""Disable stonith via the global property stonith-enabled."""
pcmk.commit(
"crm configure property stonith-enabled=false",
failure_is_fatal=True)
def get_ip_addr_from_resource_params(params):
@ -1503,3 +1520,24 @@ def clear_series_upgrade_notification():
relation_set(
relation_id=rel_id,
relation_settings=relation_data)
def set_stonith_configured(is_configured):
"""Set the STONITH_CONFIGURED state.
:param is_configured: Flag to check peers relation data for.
:type is_configured: bool
:returns: List of IPs of nodes that are ready to join the cluster
:rtype: List
"""
leader_set({STONITH_CONFIGURED: is_configured})
def is_stonith_configured():
"""Get the STONITH_CONFIGURED state.
:returns: State of STONITH_CONFIGURED state.
:rtype: bool
"""
configured = leader_get(STONITH_CONFIGURED) or 'False'
return bool_from_string(configured)

View File

@ -40,6 +40,7 @@ class TestCorosyncConf(unittest.TestCase):
shutil.rmtree(self.tmpdir)
os.remove(self.tmpfile.name)
@mock.patch.object(hooks, 'is_stonith_configured')
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
@mock.patch.object(hooks, 'get_member_ready_nodes')
@mock.patch.object(hooks, 'configure_resources_on_remotes')
@ -74,7 +75,8 @@ class TestCorosyncConf(unittest.TestCase):
configure_pacemaker_remote_stonith_resource,
configure_resources_on_remotes,
get_member_ready_nodes,
configure_peer_stonith_resource):
configure_peer_stonith_resource,
is_stonith_configured):
def fake_crm_opt_exists(res_name):
# res_ubuntu will take the "update resource" route
@ -83,6 +85,7 @@ class TestCorosyncConf(unittest.TestCase):
crm_opt_exists.side_effect = fake_crm_opt_exists
commit.return_value = 0
is_stonith_configured.return_value = False
is_leader.return_value = True
related_units.return_value = ['ha/0', 'ha/1', 'ha/2']
get_cluster_nodes.return_value = ['10.0.3.2', '10.0.3.3', '10.0.3.4']
@ -157,6 +160,7 @@ class TestCorosyncConf(unittest.TestCase):
commit.assert_any_call(
'crm -w -F configure %s %s %s' % (kw, name, params))
@mock.patch.object(hooks, 'is_stonith_configured')
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
@mock.patch.object(hooks, 'get_member_ready_nodes')
@mock.patch.object(hooks, 'configure_resources_on_remotes')
@ -190,7 +194,9 @@ class TestCorosyncConf(unittest.TestCase):
set_cluster_symmetry, configure_pacemaker_remote_resources,
configure_pacemaker_remote_stonith_resource,
configure_resources_on_remotes, get_member_ready_nodes,
configure_peer_stonith_resource):
configure_peer_stonith_resource,
is_stonith_configured):
is_stonith_configured.return_value = False
validate_dns_ha.return_value = True
crm_opt_exists.return_value = False
is_leader.return_value = True
@ -353,6 +359,7 @@ class TestHooks(test_utils.CharmTestCase):
apt_install.assert_called_once_with(expected_pkgs, fatal=True)
setup_ocf_files.assert_called_once_with()
@mock.patch.object(hooks, 'is_stonith_configured')
@mock.patch.object(hooks, 'configure_stonith')
@mock.patch.object(hooks, 'relation_ids')
@mock.patch.object(hooks, 'hanode_relation_joined')
@ -373,8 +380,10 @@ class TestHooks(test_utils.CharmTestCase):
mock_maintenance_mode,
mock_hanode_relation_joined,
mock_relation_ids,
mock_configure_stonith):
mock_configure_stonith,
mock_is_stonith_configured):
mock_is_stonith_configured.return_value = False
mock_config.side_effect = self.test_config.get
mock_relation_ids.return_value = ['hanode:1']
mock_wait_for_pcmk.return_value = True
@ -395,7 +404,6 @@ class TestHooks(test_utils.CharmTestCase):
self.test_config.set('maintenance-mode', False)
hooks.config_changed()
mock_maintenance_mode.assert_called_with(False)
mock_configure_stonith.assert_called_with()
@mock.patch.object(hooks, 'needs_maas_dns_migration')
@mock.patch.object(hooks, 'relation_ids')

View File

@ -877,9 +877,6 @@ class UtilsTestCase(unittest.TestCase):
"timeout=25")
commit_calls = [
mock.call(cmd, failure_is_fatal=True),
mock.call(
'crm configure property stonith-enabled=true',
failure_is_fatal=True),
]
commit.assert_has_calls(commit_calls)
@ -922,9 +919,6 @@ class UtilsTestCase(unittest.TestCase):
'stonith:external/maas',
("params url='http://maas/2.0' apikey='apikey' hostnames='node1' "
"op monitor interval=25 start-delay=25 timeout=25"))
commit.assert_called_once_with(
'crm configure property stonith-enabled=true',
failure_is_fatal=True)
@mock.patch.object(utils, 'config')
@mock.patch('pcmk.commit')
@ -1232,3 +1226,35 @@ class UtilsTestCase(unittest.TestCase):
mock.call('crm -w -F resource stop st-maas-1234'),
mock.call('crm -w -F configure delete st-maas-1234')]
mock_commit.assert_has_calls(commit_calls)
@mock.patch.object(utils, 'leader_set')
def test_set_stonith_configured(self, leader_set):
utils.set_stonith_configured(True)
leader_set.assert_called_once_with(
{'stonith-configured': True})
@mock.patch.object(utils, 'leader_get')
def test_is_stonith_configured(self, leader_get):
leader_get.return_value = 'True'
self.assertTrue(
utils.is_stonith_configured())
leader_get.return_value = 'False'
self.assertFalse(
utils.is_stonith_configured())
leader_get.return_value = ''
self.assertFalse(
utils.is_stonith_configured())
@mock.patch('pcmk.commit')
def test_enable_stonith(self, commit):
utils.enable_stonith()
commit.assert_called_once_with(
'crm configure property stonith-enabled=true',
failure_is_fatal=True)
@mock.patch('pcmk.commit')
def test_disable_stonith(self, commit):
utils.disable_stonith()
commit.assert_called_once_with(
'crm configure property stonith-enabled=false',
failure_is_fatal=True)