Create null stonith resource for lxd containers.
If stonith is enabled, then when a compute node is detected as failed it is powered down. This can include an lxd container which is also part of the cluster. In this case, because stonith is enabled at a global level, pacemaker will try to power off the lxd container too. But the container does not have a stonith device, and this causes the container to be marked as unclean (but not down). This running unclean state prevents resources from being moved and causes any pacemaker-remotes that are associated with the lost container to lose their connection, which prevents masakari hostmonitor from ascertaining the cluster health. The way to work around this is to create a dummy stonith device for the lxd containers. This allows the cluster to properly mark the lost container as down so that resources are relocated. Change-Id: Ic45dbdd9d8581f25549580c7e98a8d6e0bf8c3e7 Partial-Bug: #1889094
This commit is contained in:
parent
574cc217be
commit
b40a6754b0
|
@ -88,6 +88,7 @@ from utils import (
|
|||
configure_cluster_global,
|
||||
configure_pacemaker_remote_resources,
|
||||
configure_pacemaker_remote_stonith_resource,
|
||||
configure_peer_stonith_resource,
|
||||
configure_resources_on_remotes,
|
||||
enable_lsb_services,
|
||||
disable_lsb_services,
|
||||
|
@ -115,6 +116,7 @@ from utils import (
|
|||
enable_ha_services,
|
||||
notify_peers_of_series_upgrade,
|
||||
clear_series_upgrade_notification,
|
||||
get_hostname,
|
||||
)
|
||||
|
||||
from charmhelpers.contrib.charmsupport import nrpe
|
||||
|
@ -249,7 +251,9 @@ def upgrade_charm():
|
|||
def hanode_relation_joined(relid=None):
|
||||
relation_set(
|
||||
relation_id=relid,
|
||||
relation_settings={'private-address': get_relation_ip('hanode')}
|
||||
relation_settings={
|
||||
'private-address': get_relation_ip('hanode'),
|
||||
'hostname': get_hostname()}
|
||||
)
|
||||
|
||||
|
||||
|
@ -516,9 +520,12 @@ def ha_relation_changed():
|
|||
if len(get_member_ready_nodes()) >= int(config('cluster_count')):
|
||||
log('Configuring any remote nodes', level=INFO)
|
||||
remote_resources = configure_pacemaker_remote_resources()
|
||||
stonith_resource = configure_pacemaker_remote_stonith_resource()
|
||||
resources.update(remote_resources)
|
||||
resources.update(stonith_resource)
|
||||
stonith_remote_res = configure_pacemaker_remote_stonith_resource()
|
||||
resources.update(stonith_remote_res)
|
||||
if stonith_remote_res:
|
||||
stonith_peer_res = configure_peer_stonith_resource()
|
||||
resources.update(stonith_peer_res)
|
||||
configure_resources_on_remotes(
|
||||
resources=resources,
|
||||
clones=clones,
|
||||
|
|
106
hooks/utils.py
106
hooks/utils.py
|
@ -506,6 +506,7 @@ def parse_data(relid, unit, key):
|
|||
|
||||
def configure_stonith():
|
||||
if configure_pacemaker_remote_stonith_resource():
|
||||
configure_peer_stonith_resource()
|
||||
log('Not disabling STONITH as pacemaker remotes are present',
|
||||
level=INFO)
|
||||
else:
|
||||
|
@ -633,46 +634,74 @@ def remove_legacy_maas_stonith_resources():
|
|||
'crm -w -F configure delete {}'.format(resource_name))
|
||||
|
||||
|
||||
def configure_maas_stonith_resource(stonith_hostnames):
|
||||
"""Create stonith resource for the given hostname.
|
||||
def _configure_stonith_resource(ctxt):
    """Create or update the stonith primitive described by ``ctxt``.

    Every name in ``ctxt['stonith_hostnames']`` is expanded to the name
    itself plus, for dotted names, the part before the first dot. The unique
    names are sorted, joined with spaces and stored in ``ctxt['hostnames']``,
    which is then available to the ``ctxt['resource_params']`` template.
    Both ``ctxt['hostnames']`` and ``ctxt['resource_params']`` are updated in
    place.

    :param ctxt: Resource description. Must provide 'stonith_hostnames',
                 'stonith_resource_name', 'stonith_plugin' and
                 'resource_params'; any other keys can be referenced from the
                 'resource_params' template.
    :type ctxt: dict
    :raises ValueError: If any value in ctxt is falsy (e.g. empty or unset).
    """
    names = set()
    for host in ctxt['stonith_hostnames']:
        names.add(host)
        if '.' in host:
            # Also register the short (unqualified) form of the name.
            names.add(host.partition('.')[0])
    ctxt['hostnames'] = ' '.join(sorted(names))

    # Refuse to touch the cluster unless every setting has a value.
    if not all(ctxt.values()):
        raise ValueError("Missing configuration: {}".format(ctxt))

    ctxt['resource_params'] = ctxt['resource_params'].format(**ctxt)
    res_name = ctxt['stonith_resource_name']
    if pcmk.is_resource_present(res_name):
        pcmk.crm_update_resource(
            res_name,
            ctxt['stonith_plugin'],
            ctxt['resource_params'])
        return
    pcmk.commit(
        ("crm configure primitive {stonith_resource_name} "
         "{stonith_plugin} {resource_params}").format(**ctxt),
        failure_is_fatal=True)
|
||||
|
||||
|
||||
def configure_null_stonith_resource(stonith_hostnames):
    """Create null stonith resource for the given hostnames.

    :param stonith_hostnames: The hostnames that the stonith management system
                              refers to the remote node as.
    :type stonith_hostnames: List
    :returns: Stonith resource dict {res_name: res_type}
    :rtype: dict
    :raises ValueError: If no hostnames are supplied (raised by
                        _configure_stonith_resource).
    """
    # Short-name expansion and de-duplication of the hostnames is done by
    # _configure_stonith_resource, so the list is passed through untouched.
    ctxt = {
        'stonith_plugin': 'stonith:null',
        'stonith_hostnames': stonith_hostnames,
        'stonith_resource_name': 'st-null',
        'resource_params': (
            "params hostlist='{hostnames}' "
            "op monitor interval=25 start-delay=25 "
            "timeout=25")}
    _configure_stonith_resource(ctxt)
    # NOTE (gnuoy): Not enabling the global stonith-enabled setting as it
    # does not make sense to have stonith-enabled when the only resources
    # are null resources, so defer enabling stonith-enabled to the 'real'
    # stonith resources.
    return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
|
||||
|
||||
|
||||
def configure_maas_stonith_resource(stonith_hostnames):
|
||||
"""Create maas stonith resource for the given hostname.
|
||||
|
||||
:param stonith_hostnames: The hostnames that the stonith management system
|
||||
refers to the remote node as.
|
||||
:type stonith_hostname: List
|
||||
"""
|
||||
ctxt = {
|
||||
'stonith_plugin': 'stonith:external/maas',
|
||||
'stonith_hostnames': stonith_hostnames,
|
||||
'stonith_resource_name': 'st-maas',
|
||||
'url': config('maas_url'),
|
||||
'apikey': config('maas_credentials'),
|
||||
'hostnames': ' '.join(sorted(hostnames))}
|
||||
if all(ctxt.values()):
|
||||
ctxt['stonith_resource_name'] = 'st-maas'
|
||||
remove_legacy_maas_stonith_resources()
|
||||
ctxt['resource_params'] = (
|
||||
'resource_params': (
|
||||
"params url='{url}' apikey='{apikey}' hostnames='{hostnames}' "
|
||||
"op monitor interval=25 start-delay=25 "
|
||||
"timeout=25").format(**ctxt)
|
||||
if pcmk.is_resource_present(ctxt['stonith_resource_name']):
|
||||
pcmk.crm_update_resource(
|
||||
ctxt['stonith_resource_name'],
|
||||
'stonith:external/maas',
|
||||
ctxt['resource_params'])
|
||||
else:
|
||||
cmd = (
|
||||
"crm configure primitive {stonith_resource_name} "
|
||||
"stonith:external/maas {resource_params}").format(**ctxt)
|
||||
pcmk.commit(cmd, failure_is_fatal=True)
|
||||
pcmk.commit(
|
||||
"crm configure property stonith-enabled=true",
|
||||
failure_is_fatal=True)
|
||||
else:
|
||||
raise ValueError("Missing configuration: {}".format(ctxt))
|
||||
return {ctxt['stonith_resource_name']: 'stonith:external/maas'}
|
||||
"timeout=25")}
|
||||
_configure_stonith_resource(ctxt)
|
||||
pcmk.commit(
|
||||
"crm configure property stonith-enabled=true",
|
||||
failure_is_fatal=True)
|
||||
return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
|
||||
|
||||
|
||||
def get_ip_addr_from_resource_params(params):
|
||||
|
@ -816,6 +845,23 @@ def configure_pacemaker_remote_stonith_resource():
|
|||
return stonith_resource
|
||||
|
||||
|
||||
def configure_peer_stonith_resource():
    """Create a null stonith resource for lxd containers.

    The host list starts with the local hostname and is followed by the
    'hostname' value published by each unit on the 'hanode' relation; units
    that have not published one are skipped.

    :returns: Stonith resource dict {res_name: res_type}
    :rtype: dict
    """
    hostnames = [get_hostname()]
    peer_hostnames = (
        relation_get('hostname', unit, relid)
        for relid in relation_ids('hanode')
        for unit in related_units(relid))
    # Peers that have not yet advertised a hostname yield a falsy value.
    hostnames.extend(name for name in peer_hostnames if name)
    return configure_null_stonith_resource(hostnames)
|
||||
|
||||
|
||||
def configure_pacemaker_remote_resources():
|
||||
"""Create resources corresponding to the pacemaker remote nodes.
|
||||
|
||||
|
|
|
@ -39,6 +39,7 @@ class TestCorosyncConf(unittest.TestCase):
|
|||
shutil.rmtree(self.tmpdir)
|
||||
os.remove(self.tmpfile.name)
|
||||
|
||||
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
|
||||
@mock.patch.object(hooks, 'get_member_ready_nodes')
|
||||
@mock.patch.object(hooks, 'configure_resources_on_remotes')
|
||||
@mock.patch.object(hooks, 'configure_pacemaker_remote_stonith_resource')
|
||||
|
@ -71,7 +72,8 @@ class TestCorosyncConf(unittest.TestCase):
|
|||
configure_pacemaker_remote_resources,
|
||||
configure_pacemaker_remote_stonith_resource,
|
||||
configure_resources_on_remotes,
|
||||
get_member_ready_nodes):
|
||||
get_member_ready_nodes,
|
||||
configure_peer_stonith_resource):
|
||||
|
||||
def fake_crm_opt_exists(res_name):
|
||||
# res_ubuntu will take the "update resource" route
|
||||
|
@ -154,6 +156,7 @@ class TestCorosyncConf(unittest.TestCase):
|
|||
commit.assert_any_call(
|
||||
'crm -w -F configure %s %s %s' % (kw, name, params))
|
||||
|
||||
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
|
||||
@mock.patch.object(hooks, 'get_member_ready_nodes')
|
||||
@mock.patch.object(hooks, 'configure_resources_on_remotes')
|
||||
@mock.patch.object(hooks, 'configure_pacemaker_remote_stonith_resource')
|
||||
|
@ -185,7 +188,8 @@ class TestCorosyncConf(unittest.TestCase):
|
|||
validate_dns_ha, setup_maas_api, write_maas_dns_addr,
|
||||
set_cluster_symmetry, configure_pacemaker_remote_resources,
|
||||
configure_pacemaker_remote_stonith_resource,
|
||||
configure_resources_on_remotes, get_member_ready_nodes):
|
||||
configure_resources_on_remotes, get_member_ready_nodes,
|
||||
configure_peer_stonith_resource):
|
||||
validate_dns_ha.return_value = True
|
||||
crm_opt_exists.return_value = False
|
||||
is_leader.return_value = True
|
||||
|
@ -426,18 +430,22 @@ class TestHooks(test_utils.CharmTestCase):
|
|||
write_maas_dns_address.assert_called_with(
|
||||
"res_keystone_public_hostname", "172.16.0.1")
|
||||
|
||||
@mock.patch.object(hooks, 'get_hostname')
|
||||
@mock.patch.object(hooks, 'get_relation_ip')
|
||||
@mock.patch.object(hooks, 'relation_set')
|
||||
def test_hanode_relation_joined(self,
|
||||
mock_relation_set,
|
||||
mock_get_relation_ip):
|
||||
mock_get_relation_ip,
|
||||
mock_get_hostname):
|
||||
mock_get_hostname.return_value = 'juju-c2419e-0-lxd-1'
|
||||
mock_get_relation_ip.return_value = '10.10.10.2'
|
||||
hooks.hanode_relation_joined('hanode:1')
|
||||
mock_get_relation_ip.assert_called_once_with('hanode')
|
||||
mock_relation_set.assert_called_once_with(
|
||||
relation_id='hanode:1',
|
||||
relation_settings={'private-address': '10.10.10.2'}
|
||||
)
|
||||
relation_settings={
|
||||
'private-address': '10.10.10.2',
|
||||
'hostname': 'juju-c2419e-0-lxd-1'})
|
||||
|
||||
@mock.patch.object(hooks, 'ha_relation_changed')
|
||||
@mock.patch.object(hooks, 'is_waiting_unit_series_upgrade_set')
|
||||
|
|
|
@ -844,6 +844,24 @@ class UtilsTestCase(unittest.TestCase):
|
|||
]
|
||||
commit.assert_has_calls(commit_calls)
|
||||
|
||||
@mock.patch.object(utils, 'remove_legacy_maas_stonith_resources')
@mock.patch('pcmk.commit')
@mock.patch('pcmk.is_resource_present')
def test_configure_null_stonith_resource(self, is_resource_present,
                                         commit, remove_legacy):
    """A missing st-null resource is created with 'crm configure'."""
    # Force the "create" branch rather than the "update" branch.
    is_resource_present.return_value = False

    utils.configure_null_stonith_resource(['node1'])

    expected_cmd = (
        "crm configure primitive st-null "
        "stonith:null "
        "params hostlist='node1' "
        "op monitor interval=25 start-delay=25 "
        "timeout=25")
    commit.assert_has_calls(
        [mock.call(expected_cmd, failure_is_fatal=True)])
|
||||
|
||||
@mock.patch.object(utils, 'config')
|
||||
@mock.patch.object(utils, 'remove_legacy_maas_stonith_resources')
|
||||
@mock.patch('pcmk.commit')
|
||||
|
|
Loading…
Reference in New Issue