Create null stonith resource for lxd containers.
If stonith is enabled, then when a compute node is detected as failed it is powered down. This can include an lxd container which is also part of the cluster. In this case, because stonith is enabled at a global level, pacemaker will try to power off the lxd container too. But the container does not have a stonith device, and this causes the container to be marked as unclean (but not down). This running unclean state prevents resources from being moved and causes any pacemaker-remotes that are associated with the lost container to lose their connection, which prevents masakari hostmonitor from ascertaining the cluster health. The way to work around this is to create a dummy stonith device for the lxd containers. This allows the cluster to properly mark the lost container as down so that resources can be relocated. Change-Id: Ic45dbdd9d8581f25549580c7e98a8d6e0bf8c3e7 Partial-Bug: #1889094
This commit is contained in:
parent
574cc217be
commit
b40a6754b0
|
@ -88,6 +88,7 @@ from utils import (
|
||||||
configure_cluster_global,
|
configure_cluster_global,
|
||||||
configure_pacemaker_remote_resources,
|
configure_pacemaker_remote_resources,
|
||||||
configure_pacemaker_remote_stonith_resource,
|
configure_pacemaker_remote_stonith_resource,
|
||||||
|
configure_peer_stonith_resource,
|
||||||
configure_resources_on_remotes,
|
configure_resources_on_remotes,
|
||||||
enable_lsb_services,
|
enable_lsb_services,
|
||||||
disable_lsb_services,
|
disable_lsb_services,
|
||||||
|
@ -115,6 +116,7 @@ from utils import (
|
||||||
enable_ha_services,
|
enable_ha_services,
|
||||||
notify_peers_of_series_upgrade,
|
notify_peers_of_series_upgrade,
|
||||||
clear_series_upgrade_notification,
|
clear_series_upgrade_notification,
|
||||||
|
get_hostname,
|
||||||
)
|
)
|
||||||
|
|
||||||
from charmhelpers.contrib.charmsupport import nrpe
|
from charmhelpers.contrib.charmsupport import nrpe
|
||||||
|
@ -249,7 +251,9 @@ def upgrade_charm():
|
||||||
def hanode_relation_joined(relid=None):
|
def hanode_relation_joined(relid=None):
|
||||||
relation_set(
|
relation_set(
|
||||||
relation_id=relid,
|
relation_id=relid,
|
||||||
relation_settings={'private-address': get_relation_ip('hanode')}
|
relation_settings={
|
||||||
|
'private-address': get_relation_ip('hanode'),
|
||||||
|
'hostname': get_hostname()}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -516,9 +520,12 @@ def ha_relation_changed():
|
||||||
if len(get_member_ready_nodes()) >= int(config('cluster_count')):
|
if len(get_member_ready_nodes()) >= int(config('cluster_count')):
|
||||||
log('Configuring any remote nodes', level=INFO)
|
log('Configuring any remote nodes', level=INFO)
|
||||||
remote_resources = configure_pacemaker_remote_resources()
|
remote_resources = configure_pacemaker_remote_resources()
|
||||||
stonith_resource = configure_pacemaker_remote_stonith_resource()
|
|
||||||
resources.update(remote_resources)
|
resources.update(remote_resources)
|
||||||
resources.update(stonith_resource)
|
stonith_remote_res = configure_pacemaker_remote_stonith_resource()
|
||||||
|
resources.update(stonith_remote_res)
|
||||||
|
if stonith_remote_res:
|
||||||
|
stonith_peer_res = configure_peer_stonith_resource()
|
||||||
|
resources.update(stonith_peer_res)
|
||||||
configure_resources_on_remotes(
|
configure_resources_on_remotes(
|
||||||
resources=resources,
|
resources=resources,
|
||||||
clones=clones,
|
clones=clones,
|
||||||
|
|
106
hooks/utils.py
106
hooks/utils.py
|
@ -506,6 +506,7 @@ def parse_data(relid, unit, key):
|
||||||
|
|
||||||
def configure_stonith():
|
def configure_stonith():
|
||||||
if configure_pacemaker_remote_stonith_resource():
|
if configure_pacemaker_remote_stonith_resource():
|
||||||
|
configure_peer_stonith_resource()
|
||||||
log('Not disabling STONITH as pacemaker remotes are present',
|
log('Not disabling STONITH as pacemaker remotes are present',
|
||||||
level=INFO)
|
level=INFO)
|
||||||
else:
|
else:
|
||||||
|
@ -633,46 +634,74 @@ def remove_legacy_maas_stonith_resources():
|
||||||
'crm -w -F configure delete {}'.format(resource_name))
|
'crm -w -F configure delete {}'.format(resource_name))
|
||||||
|
|
||||||
|
|
||||||
def configure_maas_stonith_resource(stonith_hostnames):
|
def _configure_stonith_resource(ctxt):
|
||||||
"""Create stonith resource for the given hostname.
|
hostnames = []
|
||||||
|
for host in ctxt['stonith_hostnames']:
|
||||||
|
hostnames.append(host)
|
||||||
|
if '.' in host:
|
||||||
|
hostnames.append(host.split('.')[0])
|
||||||
|
ctxt['hostnames'] = ' '.join(sorted(list(set(hostnames))))
|
||||||
|
if all(ctxt.values()):
|
||||||
|
ctxt['resource_params'] = ctxt['resource_params'].format(**ctxt)
|
||||||
|
if pcmk.is_resource_present(ctxt['stonith_resource_name']):
|
||||||
|
pcmk.crm_update_resource(
|
||||||
|
ctxt['stonith_resource_name'],
|
||||||
|
ctxt['stonith_plugin'],
|
||||||
|
ctxt['resource_params'])
|
||||||
|
else:
|
||||||
|
cmd = (
|
||||||
|
"crm configure primitive {stonith_resource_name} "
|
||||||
|
"{stonith_plugin} {resource_params}").format(**ctxt)
|
||||||
|
pcmk.commit(cmd, failure_is_fatal=True)
|
||||||
|
else:
|
||||||
|
raise ValueError("Missing configuration: {}".format(ctxt))
|
||||||
|
|
||||||
|
|
||||||
|
def configure_null_stonith_resource(stonith_hostnames):
|
||||||
|
"""Create null stonith resource for the given hostname.
|
||||||
|
|
||||||
:param stonith_hostnames: The hostnames that the stonith management system
|
:param stonith_hostnames: The hostnames that the stonith management system
|
||||||
refers to the remote node as.
|
refers to the remote node as.
|
||||||
:type stonith_hostname: List
|
:type stonith_hostname: List
|
||||||
"""
|
"""
|
||||||
hostnames = []
|
|
||||||
for host in stonith_hostnames:
|
|
||||||
hostnames.append(host)
|
|
||||||
if '.' in host:
|
|
||||||
hostnames.append(host.split('.')[0])
|
|
||||||
hostnames = list(set(hostnames))
|
|
||||||
ctxt = {
|
ctxt = {
|
||||||
|
'stonith_plugin': 'stonith:null',
|
||||||
|
'stonith_hostnames': stonith_hostnames,
|
||||||
|
'stonith_resource_name': 'st-null',
|
||||||
|
'resource_params': (
|
||||||
|
"params hostlist='{hostnames}' "
|
||||||
|
"op monitor interval=25 start-delay=25 "
|
||||||
|
"timeout=25")}
|
||||||
|
_configure_stonith_resource(ctxt)
|
||||||
|
# NOTE (gnuoy): Not enabling the global stonith-enabled setting as it
|
||||||
|
# does not make sense to have stonith-enabled when the only resources
|
||||||
|
# are null resources, so defer enabling stonith-enabled to the 'real'
|
||||||
|
# stonith resources.
|
||||||
|
return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
|
||||||
|
|
||||||
|
|
||||||
|
def configure_maas_stonith_resource(stonith_hostnames):
|
||||||
|
"""Create maas stonith resource for the given hostname.
|
||||||
|
|
||||||
|
:param stonith_hostnames: The hostnames that the stonith management system
|
||||||
|
refers to the remote node as.
|
||||||
|
:type stonith_hostname: List
|
||||||
|
"""
|
||||||
|
ctxt = {
|
||||||
|
'stonith_plugin': 'stonith:external/maas',
|
||||||
|
'stonith_hostnames': stonith_hostnames,
|
||||||
|
'stonith_resource_name': 'st-maas',
|
||||||
'url': config('maas_url'),
|
'url': config('maas_url'),
|
||||||
'apikey': config('maas_credentials'),
|
'apikey': config('maas_credentials'),
|
||||||
'hostnames': ' '.join(sorted(hostnames))}
|
'resource_params': (
|
||||||
if all(ctxt.values()):
|
|
||||||
ctxt['stonith_resource_name'] = 'st-maas'
|
|
||||||
remove_legacy_maas_stonith_resources()
|
|
||||||
ctxt['resource_params'] = (
|
|
||||||
"params url='{url}' apikey='{apikey}' hostnames='{hostnames}' "
|
"params url='{url}' apikey='{apikey}' hostnames='{hostnames}' "
|
||||||
"op monitor interval=25 start-delay=25 "
|
"op monitor interval=25 start-delay=25 "
|
||||||
"timeout=25").format(**ctxt)
|
"timeout=25")}
|
||||||
if pcmk.is_resource_present(ctxt['stonith_resource_name']):
|
_configure_stonith_resource(ctxt)
|
||||||
pcmk.crm_update_resource(
|
pcmk.commit(
|
||||||
ctxt['stonith_resource_name'],
|
"crm configure property stonith-enabled=true",
|
||||||
'stonith:external/maas',
|
failure_is_fatal=True)
|
||||||
ctxt['resource_params'])
|
return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
|
||||||
else:
|
|
||||||
cmd = (
|
|
||||||
"crm configure primitive {stonith_resource_name} "
|
|
||||||
"stonith:external/maas {resource_params}").format(**ctxt)
|
|
||||||
pcmk.commit(cmd, failure_is_fatal=True)
|
|
||||||
pcmk.commit(
|
|
||||||
"crm configure property stonith-enabled=true",
|
|
||||||
failure_is_fatal=True)
|
|
||||||
else:
|
|
||||||
raise ValueError("Missing configuration: {}".format(ctxt))
|
|
||||||
return {ctxt['stonith_resource_name']: 'stonith:external/maas'}
|
|
||||||
|
|
||||||
|
|
||||||
def get_ip_addr_from_resource_params(params):
|
def get_ip_addr_from_resource_params(params):
|
||||||
|
@ -816,6 +845,23 @@ def configure_pacemaker_remote_stonith_resource():
|
||||||
return stonith_resource
|
return stonith_resource
|
||||||
|
|
||||||
|
|
||||||
|
def configure_peer_stonith_resource():
|
||||||
|
"""Create a null stonith resource for lxd containers.
|
||||||
|
|
||||||
|
:returns: Stonith resource dict {res_name: res_type}
|
||||||
|
:rtype: dict
|
||||||
|
"""
|
||||||
|
hostnames = [get_hostname()]
|
||||||
|
stonith_resource = {}
|
||||||
|
for relid in relation_ids('hanode'):
|
||||||
|
for unit in related_units(relid):
|
||||||
|
stonith_hostname = relation_get('hostname', unit, relid)
|
||||||
|
if stonith_hostname:
|
||||||
|
hostnames.append(stonith_hostname)
|
||||||
|
stonith_resource = configure_null_stonith_resource(hostnames)
|
||||||
|
return stonith_resource
|
||||||
|
|
||||||
|
|
||||||
def configure_pacemaker_remote_resources():
|
def configure_pacemaker_remote_resources():
|
||||||
"""Create resources corresponding to the pacemaker remote nodes.
|
"""Create resources corresponding to the pacemaker remote nodes.
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,7 @@ class TestCorosyncConf(unittest.TestCase):
|
||||||
shutil.rmtree(self.tmpdir)
|
shutil.rmtree(self.tmpdir)
|
||||||
os.remove(self.tmpfile.name)
|
os.remove(self.tmpfile.name)
|
||||||
|
|
||||||
|
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
|
||||||
@mock.patch.object(hooks, 'get_member_ready_nodes')
|
@mock.patch.object(hooks, 'get_member_ready_nodes')
|
||||||
@mock.patch.object(hooks, 'configure_resources_on_remotes')
|
@mock.patch.object(hooks, 'configure_resources_on_remotes')
|
||||||
@mock.patch.object(hooks, 'configure_pacemaker_remote_stonith_resource')
|
@mock.patch.object(hooks, 'configure_pacemaker_remote_stonith_resource')
|
||||||
|
@ -71,7 +72,8 @@ class TestCorosyncConf(unittest.TestCase):
|
||||||
configure_pacemaker_remote_resources,
|
configure_pacemaker_remote_resources,
|
||||||
configure_pacemaker_remote_stonith_resource,
|
configure_pacemaker_remote_stonith_resource,
|
||||||
configure_resources_on_remotes,
|
configure_resources_on_remotes,
|
||||||
get_member_ready_nodes):
|
get_member_ready_nodes,
|
||||||
|
configure_peer_stonith_resource):
|
||||||
|
|
||||||
def fake_crm_opt_exists(res_name):
|
def fake_crm_opt_exists(res_name):
|
||||||
# res_ubuntu will take the "update resource" route
|
# res_ubuntu will take the "update resource" route
|
||||||
|
@ -154,6 +156,7 @@ class TestCorosyncConf(unittest.TestCase):
|
||||||
commit.assert_any_call(
|
commit.assert_any_call(
|
||||||
'crm -w -F configure %s %s %s' % (kw, name, params))
|
'crm -w -F configure %s %s %s' % (kw, name, params))
|
||||||
|
|
||||||
|
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
|
||||||
@mock.patch.object(hooks, 'get_member_ready_nodes')
|
@mock.patch.object(hooks, 'get_member_ready_nodes')
|
||||||
@mock.patch.object(hooks, 'configure_resources_on_remotes')
|
@mock.patch.object(hooks, 'configure_resources_on_remotes')
|
||||||
@mock.patch.object(hooks, 'configure_pacemaker_remote_stonith_resource')
|
@mock.patch.object(hooks, 'configure_pacemaker_remote_stonith_resource')
|
||||||
|
@ -185,7 +188,8 @@ class TestCorosyncConf(unittest.TestCase):
|
||||||
validate_dns_ha, setup_maas_api, write_maas_dns_addr,
|
validate_dns_ha, setup_maas_api, write_maas_dns_addr,
|
||||||
set_cluster_symmetry, configure_pacemaker_remote_resources,
|
set_cluster_symmetry, configure_pacemaker_remote_resources,
|
||||||
configure_pacemaker_remote_stonith_resource,
|
configure_pacemaker_remote_stonith_resource,
|
||||||
configure_resources_on_remotes, get_member_ready_nodes):
|
configure_resources_on_remotes, get_member_ready_nodes,
|
||||||
|
configure_peer_stonith_resource):
|
||||||
validate_dns_ha.return_value = True
|
validate_dns_ha.return_value = True
|
||||||
crm_opt_exists.return_value = False
|
crm_opt_exists.return_value = False
|
||||||
is_leader.return_value = True
|
is_leader.return_value = True
|
||||||
|
@ -426,18 +430,22 @@ class TestHooks(test_utils.CharmTestCase):
|
||||||
write_maas_dns_address.assert_called_with(
|
write_maas_dns_address.assert_called_with(
|
||||||
"res_keystone_public_hostname", "172.16.0.1")
|
"res_keystone_public_hostname", "172.16.0.1")
|
||||||
|
|
||||||
|
@mock.patch.object(hooks, 'get_hostname')
|
||||||
@mock.patch.object(hooks, 'get_relation_ip')
|
@mock.patch.object(hooks, 'get_relation_ip')
|
||||||
@mock.patch.object(hooks, 'relation_set')
|
@mock.patch.object(hooks, 'relation_set')
|
||||||
def test_hanode_relation_joined(self,
|
def test_hanode_relation_joined(self,
|
||||||
mock_relation_set,
|
mock_relation_set,
|
||||||
mock_get_relation_ip):
|
mock_get_relation_ip,
|
||||||
|
mock_get_hostname):
|
||||||
|
mock_get_hostname.return_value = 'juju-c2419e-0-lxd-1'
|
||||||
mock_get_relation_ip.return_value = '10.10.10.2'
|
mock_get_relation_ip.return_value = '10.10.10.2'
|
||||||
hooks.hanode_relation_joined('hanode:1')
|
hooks.hanode_relation_joined('hanode:1')
|
||||||
mock_get_relation_ip.assert_called_once_with('hanode')
|
mock_get_relation_ip.assert_called_once_with('hanode')
|
||||||
mock_relation_set.assert_called_once_with(
|
mock_relation_set.assert_called_once_with(
|
||||||
relation_id='hanode:1',
|
relation_id='hanode:1',
|
||||||
relation_settings={'private-address': '10.10.10.2'}
|
relation_settings={
|
||||||
)
|
'private-address': '10.10.10.2',
|
||||||
|
'hostname': 'juju-c2419e-0-lxd-1'})
|
||||||
|
|
||||||
@mock.patch.object(hooks, 'ha_relation_changed')
|
@mock.patch.object(hooks, 'ha_relation_changed')
|
||||||
@mock.patch.object(hooks, 'is_waiting_unit_series_upgrade_set')
|
@mock.patch.object(hooks, 'is_waiting_unit_series_upgrade_set')
|
||||||
|
|
|
@ -844,6 +844,24 @@ class UtilsTestCase(unittest.TestCase):
|
||||||
]
|
]
|
||||||
commit.assert_has_calls(commit_calls)
|
commit.assert_has_calls(commit_calls)
|
||||||
|
|
||||||
|
@mock.patch.object(utils, 'remove_legacy_maas_stonith_resources')
|
||||||
|
@mock.patch('pcmk.commit')
|
||||||
|
@mock.patch('pcmk.is_resource_present')
|
||||||
|
def test_configure_null_stonith_resource(self, is_resource_present,
|
||||||
|
commit, remove_legacy):
|
||||||
|
is_resource_present.return_value = False
|
||||||
|
utils.configure_null_stonith_resource(['node1'])
|
||||||
|
cmd = (
|
||||||
|
"crm configure primitive st-null "
|
||||||
|
"stonith:null "
|
||||||
|
"params hostlist='node1' "
|
||||||
|
"op monitor interval=25 start-delay=25 "
|
||||||
|
"timeout=25")
|
||||||
|
commit_calls = [
|
||||||
|
mock.call(cmd, failure_is_fatal=True),
|
||||||
|
]
|
||||||
|
commit.assert_has_calls(commit_calls)
|
||||||
|
|
||||||
@mock.patch.object(utils, 'config')
|
@mock.patch.object(utils, 'config')
|
||||||
@mock.patch.object(utils, 'remove_legacy_maas_stonith_resources')
|
@mock.patch.object(utils, 'remove_legacy_maas_stonith_resources')
|
||||||
@mock.patch('pcmk.commit')
|
@mock.patch('pcmk.commit')
|
||||||
|
|
Loading…
Reference in New Issue