Create null stonith resource for lxd containers.

If stonith is enabled, then when a compute node is detected as failed
it is powered down. The failed node can host a lxd container which is
also part of the cluster. In this case, because stonith is enabled at a
global level, pacemaker will try to power off the lxd container
too. But the container does not have a stonith device, and this causes
the container to be marked as unclean (but not down). This running
unclean state prevents resources from being moved and causes any
pacemaker-remotes that are associated with the lost container to
lose their connection, which prevents masakari hostmonitor from
ascertaining the cluster health.

The way to work around this is to create a dummy stonith device for
the lxd containers. This allows the cluster to properly mark the lost
container as down and resources are relocated.

Change-Id: Ic45dbdd9d8581f25549580c7e98a8d6e0bf8c3e7
Partial-Bug: #1889094
This commit is contained in:
Liam Young 2020-07-29 11:59:43 +00:00
parent 574cc217be
commit b40a6754b0
4 changed files with 117 additions and 38 deletions

View File

@ -88,6 +88,7 @@ from utils import (
configure_cluster_global,
configure_pacemaker_remote_resources,
configure_pacemaker_remote_stonith_resource,
configure_peer_stonith_resource,
configure_resources_on_remotes,
enable_lsb_services,
disable_lsb_services,
@ -115,6 +116,7 @@ from utils import (
enable_ha_services,
notify_peers_of_series_upgrade,
clear_series_upgrade_notification,
get_hostname,
)
from charmhelpers.contrib.charmsupport import nrpe
@ -249,7 +251,9 @@ def upgrade_charm():
def hanode_relation_joined(relid=None):
    """Publish this unit's address and hostname on the hanode relation.

    Both values are written with a single relation_set call: the address
    comes from get_relation_ip('hanode') and the hostname from
    get_hostname().

    :param relid: Relation id the settings are written to; passed through
                  to relation_set unchanged.
    """
    relation_set(
        relation_id=relid,
        relation_settings={
            'private-address': get_relation_ip('hanode'),
            'hostname': get_hostname(),
        },
    )
@ -516,9 +520,12 @@ def ha_relation_changed():
if len(get_member_ready_nodes()) >= int(config('cluster_count')):
log('Configuring any remote nodes', level=INFO)
remote_resources = configure_pacemaker_remote_resources()
stonith_resource = configure_pacemaker_remote_stonith_resource()
resources.update(remote_resources)
resources.update(stonith_resource)
stonith_remote_res = configure_pacemaker_remote_stonith_resource()
resources.update(stonith_remote_res)
if stonith_remote_res:
stonith_peer_res = configure_peer_stonith_resource()
resources.update(stonith_peer_res)
configure_resources_on_remotes(
resources=resources,
clones=clones,

View File

@ -506,6 +506,7 @@ def parse_data(relid, unit, key):
def configure_stonith():
if configure_pacemaker_remote_stonith_resource():
configure_peer_stonith_resource()
log('Not disabling STONITH as pacemaker remotes are present',
level=INFO)
else:
@ -633,46 +634,74 @@ def remove_legacy_maas_stonith_resources():
'crm -w -F configure delete {}'.format(resource_name))
def configure_maas_stonith_resource(stonith_hostnames):
"""Create stonith resource for the given hostname.
def _configure_stonith_resource(ctxt):
    """Create or update the stonith primitive described by ``ctxt``.

    ``ctxt`` is updated in place: a 'hostnames' entry is added holding the
    space separated, sorted, de-duplicated list of every entry of
    'stonith_hostnames' plus the short name (the text before the first
    '.') of each dotted entry, and 'resource_params' is replaced by its
    formatted value.

    If the resource already exists it is updated, otherwise it is created
    with a ``crm configure primitive`` command.

    :param ctxt: Resource description. Must contain 'stonith_hostnames',
                 'stonith_resource_name', 'stonith_plugin' and
                 'resource_params' (a str.format template which may refer
                 to any key of ctxt, including 'hostnames').
    :type ctxt: dict
    :raises ValueError: If any value in ctxt is empty.
    """
    names = set()
    for host in ctxt['stonith_hostnames']:
        names.add(host)
        if '.' in host:
            names.add(host.split('.')[0])
    ctxt['hostnames'] = ' '.join(sorted(names))

    missing = sorted(key for key, value in ctxt.items() if not value)
    if missing:
        # Name only the unset keys: ctxt can carry credentials (callers put
        # an API key in it) and those must not be copied into the error.
        raise ValueError(
            "Missing configuration: {}".format(', '.join(missing)))

    ctxt['resource_params'] = ctxt['resource_params'].format(**ctxt)
    if pcmk.is_resource_present(ctxt['stonith_resource_name']):
        pcmk.crm_update_resource(
            ctxt['stonith_resource_name'],
            ctxt['stonith_plugin'],
            ctxt['resource_params'])
    else:
        cmd = (
            "crm configure primitive {stonith_resource_name} "
            "{stonith_plugin} {resource_params}").format(**ctxt)
        pcmk.commit(cmd, failure_is_fatal=True)
def configure_null_stonith_resource(stonith_hostnames):
    """Create null stonith resource for the given hostnames.

    :param stonith_hostnames: The hostnames that the stonith management system
                              refers to the remote node as.
    :type stonith_hostnames: List
    :returns: Stonith resource dict {res_name: res_type}
    :rtype: dict
    """
    # The host list is expanded (short names added, de-duplicated, sorted)
    # by _configure_stonith_resource, so the raw list is handed over as is.
    ctxt = {
        'stonith_plugin': 'stonith:null',
        'stonith_hostnames': stonith_hostnames,
        'stonith_resource_name': 'st-null',
        'resource_params': (
            "params hostlist='{hostnames}' "
            "op monitor interval=25 start-delay=25 "
            "timeout=25")}
    _configure_stonith_resource(ctxt)
    # NOTE (gnuoy): Not enabling the global stonith-enabled setting as it
    # does not make sense to have stonith-enabled when the only resources
    # are null resources, so defer enabling stonith-enabled to the 'real'
    # stonith resources.
    return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
def configure_maas_stonith_resource(stonith_hostnames):
"""Create maas stonith resource for the given hostname.
:param stonith_hostnames: The hostnames that the stonith management system
refers to the remote node as.
:type stonith_hostname: List
"""
ctxt = {
'stonith_plugin': 'stonith:external/maas',
'stonith_hostnames': stonith_hostnames,
'stonith_resource_name': 'st-maas',
'url': config('maas_url'),
'apikey': config('maas_credentials'),
'hostnames': ' '.join(sorted(hostnames))}
if all(ctxt.values()):
ctxt['stonith_resource_name'] = 'st-maas'
remove_legacy_maas_stonith_resources()
ctxt['resource_params'] = (
'resource_params': (
"params url='{url}' apikey='{apikey}' hostnames='{hostnames}' "
"op monitor interval=25 start-delay=25 "
"timeout=25").format(**ctxt)
if pcmk.is_resource_present(ctxt['stonith_resource_name']):
pcmk.crm_update_resource(
ctxt['stonith_resource_name'],
'stonith:external/maas',
ctxt['resource_params'])
else:
cmd = (
"crm configure primitive {stonith_resource_name} "
"stonith:external/maas {resource_params}").format(**ctxt)
pcmk.commit(cmd, failure_is_fatal=True)
pcmk.commit(
"crm configure property stonith-enabled=true",
failure_is_fatal=True)
else:
raise ValueError("Missing configuration: {}".format(ctxt))
return {ctxt['stonith_resource_name']: 'stonith:external/maas'}
"timeout=25")}
_configure_stonith_resource(ctxt)
pcmk.commit(
"crm configure property stonith-enabled=true",
failure_is_fatal=True)
return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
def get_ip_addr_from_resource_params(params):
@ -816,6 +845,23 @@ def configure_pacemaker_remote_stonith_resource():
return stonith_resource
def configure_peer_stonith_resource():
    """Create a null stonith resource for this unit and its hanode peers.

    The host list starts with this unit's own hostname and is extended
    with every non-empty 'hostname' value published by units on the
    'hanode' relation.

    :returns: Stonith resource dict {res_name: res_type}
    :rtype: dict
    """
    hostnames = [get_hostname()]
    for relid in relation_ids('hanode'):
        for unit in related_units(relid):
            peer_hostname = relation_get('hostname', unit, relid)
            # Units that have not published a hostname are skipped.
            if peer_hostname:
                hostnames.append(peer_hostname)
    return configure_null_stonith_resource(hostnames)
def configure_pacemaker_remote_resources():
"""Create resources corresponding to the pacemaker remote nodes.

View File

@ -39,6 +39,7 @@ class TestCorosyncConf(unittest.TestCase):
shutil.rmtree(self.tmpdir)
os.remove(self.tmpfile.name)
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
@mock.patch.object(hooks, 'get_member_ready_nodes')
@mock.patch.object(hooks, 'configure_resources_on_remotes')
@mock.patch.object(hooks, 'configure_pacemaker_remote_stonith_resource')
@ -71,7 +72,8 @@ class TestCorosyncConf(unittest.TestCase):
configure_pacemaker_remote_resources,
configure_pacemaker_remote_stonith_resource,
configure_resources_on_remotes,
get_member_ready_nodes):
get_member_ready_nodes,
configure_peer_stonith_resource):
def fake_crm_opt_exists(res_name):
# res_ubuntu will take the "update resource" route
@ -154,6 +156,7 @@ class TestCorosyncConf(unittest.TestCase):
commit.assert_any_call(
'crm -w -F configure %s %s %s' % (kw, name, params))
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
@mock.patch.object(hooks, 'get_member_ready_nodes')
@mock.patch.object(hooks, 'configure_resources_on_remotes')
@mock.patch.object(hooks, 'configure_pacemaker_remote_stonith_resource')
@ -185,7 +188,8 @@ class TestCorosyncConf(unittest.TestCase):
validate_dns_ha, setup_maas_api, write_maas_dns_addr,
set_cluster_symmetry, configure_pacemaker_remote_resources,
configure_pacemaker_remote_stonith_resource,
configure_resources_on_remotes, get_member_ready_nodes):
configure_resources_on_remotes, get_member_ready_nodes,
configure_peer_stonith_resource):
validate_dns_ha.return_value = True
crm_opt_exists.return_value = False
is_leader.return_value = True
@ -426,18 +430,22 @@ class TestHooks(test_utils.CharmTestCase):
write_maas_dns_address.assert_called_with(
"res_keystone_public_hostname", "172.16.0.1")
@mock.patch.object(hooks, 'get_hostname')
@mock.patch.object(hooks, 'get_relation_ip')
@mock.patch.object(hooks, 'relation_set')
def test_hanode_relation_joined(self,
                                mock_relation_set,
                                mock_get_relation_ip,
                                mock_get_hostname):
    """hanode_relation_joined publishes the unit address and hostname."""
    mock_get_hostname.return_value = 'juju-c2419e-0-lxd-1'
    mock_get_relation_ip.return_value = '10.10.10.2'

    hooks.hanode_relation_joined('hanode:1')

    mock_get_relation_ip.assert_called_once_with('hanode')
    mock_get_hostname.assert_called_once_with()
    mock_relation_set.assert_called_once_with(
        relation_id='hanode:1',
        relation_settings={
            'private-address': '10.10.10.2',
            'hostname': 'juju-c2419e-0-lxd-1'})
@mock.patch.object(hooks, 'ha_relation_changed')
@mock.patch.object(hooks, 'is_waiting_unit_series_upgrade_set')

View File

@ -844,6 +844,24 @@ class UtilsTestCase(unittest.TestCase):
]
commit.assert_has_calls(commit_calls)
@mock.patch('pcmk.commit')
@mock.patch('pcmk.is_resource_present')
def test_configure_null_stonith_resource(self, is_resource_present,
                                         commit):
    """A missing st-null resource is created with exactly one commit."""
    is_resource_present.return_value = False

    result = utils.configure_null_stonith_resource(['node1'])

    cmd = (
        "crm configure primitive st-null "
        "stonith:null "
        "params hostlist='node1' "
        "op monitor interval=25 start-delay=25 "
        "timeout=25")
    # Exactly one commit is expected: creating the null resource must not
    # also set the global stonith-enabled property.
    commit.assert_called_once_with(cmd, failure_is_fatal=True)
    self.assertEqual(result, {'st-null': 'stonith:null'})
@mock.patch.object(utils, 'config')
@mock.patch.object(utils, 'remove_legacy_maas_stonith_resources')
@mock.patch('pcmk.commit')