Adjust quorum after node removal

Add an `update-ring` action for that purpose.
Also print more details on various Pacemaker failures.
Also remove some dead code.

Func-Test-PR: https://github.com/openstack-charmers/zaza-openstack-tests/pull/369
Change-Id: I35c0c9ce67fd459b9c3099346705d43d76bbdfe4
Closes-Bug: #1400481
Related-Bug: #1874719
Co-Authored-By: Aurelien Lourot <aurelien.lourot@canonical.com>
Co-Authored-By: Felipe Reyes <felipe.reyes@canonical.com>
Alvaro Uria 2020-07-01 18:41:43 +02:00 committed by Aurelien Lourot
parent 6e1f20040c
commit 457f88eda6
14 changed files with 522 additions and 76 deletions


@ -67,14 +67,37 @@ in a container on existing machines 0, 1, and 2:
This section lists Juju [actions][juju-docs-actions] supported by the charm.
Actions allow specific operations to be performed on a per-unit basis.
* `pause`
* `resume`
* `status`
* `cleanup`
* `update-ring`
To display action descriptions run `juju actions hacluster`. If the charm is
not deployed then see the file `actions.yaml`.
### update-ring action
The `update-ring` action requires the parameter `i-really-mean-it=true` to
confirm that tidying up the list of available corosync nodes in the ring is
really intended. The expected workflow is:
1. `juju run-action hacluster/N pause --wait`. This makes sure no Pacemaker
resources run on the unit.
2. `juju remove-unit principal-unit/N`. Repeat this step once per unit to be
removed (e.g. to scale back from 6 to 3 units).
3. `juju run-action hacluster/leader update-ring i-really-mean-it=true --wait`.
This step removes the stale corosync nodes from the ring and updates
corosync.conf to list the new set of nodes (min_quorum is recalculated; see
the sketch below).
In case a unit goes into a lost state (e.g. due to a hardware failure), the
initial step (pausing the unit) can be skipped. Unit removal may also be
replaced by `juju remove-machine N --force`, where N is the ID of the Juju
machine hosting the unit to be removed.
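As a rough illustration of the quorum arithmetic involved (a sketch only,
assuming corosync's usual majority rule; `min_quorum` here is an illustrative
helper, not the charm's actual code):

```python
def min_quorum(node_count):
    # Majority quorum: strictly more than half of the ring must be
    # present. The charm recomputes the equivalent value when it
    # rewrites corosync.conf during the update-ring action.
    return node_count // 2 + 1

assert min_quorum(6) == 4  # before scaling back
assert min_quorum(3) == 2  # after scaling back from 6 to 3 units
```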
# Bugs
Please report bugs on [Launchpad][lp-bugs-charm-hacluster].


@ -12,3 +12,12 @@ cleanup:
default: "all"
type: string
description: Resource name to cleanup
update-ring:
description: Trigger a cleanup of the corosync node list (generally after unit removal)
params:
i-really-mean-it:
type: boolean
description: |
This must be toggled to enable actually performing this action
required:
- i-really-mean-it


@ -18,6 +18,7 @@ import os
import subprocess
import sys
import traceback
import uuid
sys.path.append('hooks/')
@ -39,13 +40,21 @@ from charmhelpers.core.hookenv import (
action_fail,
action_get,
action_set,
is_leader,
log,
relation_ids,
relation_set,
)
from utils import (
emit_corosync_conf,
is_update_ring_requested,
pause_unit,
resume_unit,
update_node_list,
)
import pcmk
def pause(args):
"""Pause the hacluster services.
@ -98,8 +107,46 @@ def cleanup(args):
"'{}'".format(resource_name))
def update_ring(args):
"""Update corosync.conf list of nodes (generally after unit removal)."""
if not action_get('i-really-mean-it'):
action_fail('i-really-mean-it is a required parameter')
return
if not is_leader():
action_fail('only the Juju leader can run this action')
return
diff_nodes = update_node_list()
if not diff_nodes:
# No differences between discovered Pacemaker nodes and
# Juju nodes (ie. no node removal)
action_set({'result': 'noop'})
return
# Trigger emit_corosync_conf() and corosync-cfgtool -R
# for all the hanode peer units to run
relid = relation_ids('hanode')
if len(relid) < 1:
action_fail('no peer ha nodes')
return
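# A fresh UUID acts as a one-shot trigger: each peer stores the last
# value it saw and only reacts when the value changes.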
corosync_update_uuid = uuid.uuid1().hex
reldata = {'trigger-corosync-update': corosync_update_uuid}
relation_set(relation_id=relid[0],
relation_settings=reldata)
# Trigger the same logic in the leader (no hanode-relation-changed
# hook will be received by self)
if (is_update_ring_requested(corosync_update_uuid) and
emit_corosync_conf()):
cmd = 'corosync-cfgtool -R'
pcmk.commit(cmd)
action_set({'result': 'success'})
ACTIONS = {"pause": pause, "resume": resume,
"status": status, "cleanup": cleanup}
"status": status, "cleanup": cleanup,
"update-ring": update_ring}
def main(args):

actions/update-ring (symbolic link)

@ -0,0 +1 @@
actions.py


@ -71,8 +71,8 @@ options:
description: |
When enabled pacemaker will be put in maintenance mode, this will allow
administrators to manipulate cluster resources (e.g. stop daemons, reboot
machines, etc). Pacemaker will not monitor the resources while maintenance
mode is enabled and node removals won't be processed.
service_start_timeout:
type: int
default: 180


@ -0,0 +1 @@
hooks.py


@ -45,6 +45,8 @@ from charmhelpers.core.hookenv import (
related_units,
relation_ids,
relation_set,
remote_unit,
principal_unit,
config,
Hooks,
UnregisteredHookError,
@ -122,6 +124,8 @@ from utils import (
disable_stonith,
is_stonith_configured,
emit_systemd_overrides_file,
trigger_corosync_update_from_leader,
emit_corosync_conf,
)
from charmhelpers.contrib.charmsupport import nrpe
@ -293,6 +297,22 @@ def hanode_relation_changed(relid=None):
ha_relation_changed()
@hooks.hook('hanode-relation-departed')
def hanode_relation_departed(relid=None):
if config('maintenance-mode'):
log('pcmk is in maintenance mode - skip any action', DEBUG)
return
# Note(aluria): all units will update the corosync.conf list of nodes
# so that the stored configuration stays up to date. However, corosync
# reloads (or restarts) won't be triggered at this point (the
# update-ring action will do that)
if emit_corosync_conf():
log('corosync.conf updated')
else:
log('corosync.conf not updated')
@hooks.hook('ha-relation-joined',
'ha-relation-changed',
'peer-availability-relation-joined',
@ -306,9 +326,22 @@ def ha_relation_changed():
level=INFO)
return
relid_hanode = relation_ids('hanode')
if relid_hanode:
log('Ready to form cluster - informing peers', level=DEBUG)
relation_set(relation_id=relid_hanode[0], ready=True)
# If a trigger-corosync-update attribute exists in the relation,
# the Juju leader may have requested all its peers to update
# the corosync.conf list of nodes. If it's the case, no other
# action will be run (a future hook re: ready=True may trigger
# other logic)
if (remote_unit() != principal_unit() and
trigger_corosync_update_from_leader(
remote_unit(), relid_hanode[0]
)):
return
else:
log('Ready to form cluster, but not related to peers just yet',
level=INFO)
@ -563,8 +596,12 @@ def ha_relation_changed():
@hooks.hook()
def stop():
# NOTE(lourot): This seems to always fail with
# 'ERROR: node <node_name> not found in the CIB', which means that the node
# has already been removed from the cluster. Thus failure_is_fatal=False.
# We might consider getting rid of this call.
pcmk.delete_node(socket.gethostname(), failure_is_fatal=False)
apt_purge(['corosync', 'pacemaker'], fatal=True)


@ -40,18 +40,41 @@ class PropertyNotFound(Exception):
def wait_for_pcmk(retries=12, sleep=10):
"""Wait for pacemaker/corosync to fully come up.
:param retries: Number of times to check for crm's output before raising.
:type retries: int
:param sleep: Number of seconds to sleep between retries.
:type sleep: int
:raises: ServicesNotUp
"""
expected_hostname = socket.gethostname()
last_exit_code = None
last_output = None
for i in range(retries):
if i > 0:
time.sleep(sleep)
last_exit_code, last_output = subprocess.getstatusoutput(
'crm node list')
if expected_hostname in last_output:
return
msg = ('Pacemaker or Corosync are still not fully up after waiting for '
'{} retries. '.format(retries))
if last_exit_code != 0:
msg += 'Last exit code: {}. '.format(last_exit_code)
if 'not supported between' in last_output:
# NOTE(lourot): transient crmsh bug
# https://github.com/ClusterLabs/crmsh/issues/764
msg += 'This looks like ClusterLabs/crmsh#764. '
elif 'node1' in last_output:
# NOTE(lourot): transient bug on deployment. The charm will recover
# later but the corosync ring will still show an offline 'node1' node.
# The corosync ring can then be cleaned up by running the 'update-ring'
# action.
msg += 'This looks like lp:1874719. '
msg += 'Last output: {}'.format(last_output)
raise ServicesNotUp(msg)
def commit(cmd, failure_is_fatal=False):
@ -64,7 +87,7 @@ def commit(cmd, failure_is_fatal=False):
:raises: subprocess.CalledProcessError
"""
if failure_is_fatal:
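# check_output() folds stderr into the captured output so that callers
# catching CalledProcessError can inspect crmsh's error message.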
return subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
else:
return subprocess.call(cmd.split())
@ -77,24 +100,6 @@ def is_resource_present(resource):
return True
def crm_opt_exists(opt_name):
output = subprocess.getstatusoutput("crm configure show")[1]
if opt_name in output:
@ -168,6 +173,53 @@ def list_nodes():
return sorted(nodes)
def set_node_status_to_maintenance(node_name):
"""See https://crmsh.github.io/man-2.0/#cmdhelp_node_maintenance
:param node_name: Name of the node to set to maintenance.
:type node_name: str
:raises: subprocess.CalledProcessError
"""
log('Setting node {} to maintenance'.format(node_name))
commit('crm -w -F node maintenance {}'.format(node_name),
failure_is_fatal=True)
def delete_node(node_name, failure_is_fatal=True):
"""See https://crmsh.github.io/man-2.0/#cmdhelp_node_delete
:param node_name: Name of the node to be removed from the cluster.
:type node_name: str
:param failure_is_fatal: Whether to raise exception if command fails.
:type failure_is_fatal: bool
:raises: subprocess.CalledProcessError
"""
log('Deleting node {} from the cluster'.format(node_name))
cmd = 'crm -w -F node delete {}'.format(node_name)
for attempt in [2, 1, 0]:
try:
commit(cmd, failure_is_fatal=failure_is_fatal)
return
except subprocess.CalledProcessError as e:
output = e.output.decode('utf-8').strip()
log('"{}" failed with "{}"'.format(cmd, output), WARNING)
if output == 'ERROR: node {} not found in the CIB'.format(
node_name):
# NOTE(lourot): Sometimes seen when called from the
# `update-ring` action.
log('{} was already removed from the cluster, moving on'.format(
node_name), WARNING)
return
if '/cmdline' in output:
# NOTE(lourot): older versions of crmsh may fail with
# https://github.com/ClusterLabs/crmsh/issues/283 . If that's
# the case let's retry.
log('This looks like ClusterLabs/crmsh#283.', WARNING)
if attempt > 0:
log('Retrying...', WARNING)
continue
raise
def get_property_from_xml(name, output):
"""Read a configuration property from the XML generated by 'crm configure show
xml'


@ -118,6 +118,27 @@ class MAASConfigIncomplete(Exception):
pass
class RemoveCorosyncNodeFailed(Exception):
def __init__(self, node_name, called_process_error):
msg = 'Removing {} from the cluster failed. {} output={}'.format(
node_name, called_process_error, called_process_error.output)
super(RemoveCorosyncNodeFailed, self).__init__(msg)
class EnableStonithFailed(Exception):
def __init__(self, called_process_error):
msg = 'Enabling STONITH failed. {} output={}'.format(
called_process_error, called_process_error.output)
super(EnableStonithFailed, self).__init__(msg)
class DisableStonithFailed(Exception):
def __init__(self, called_process_error):
msg = 'Disabling STONITH failed. {} output={}'.format(
called_process_error, called_process_error.output)
super(DisableStonithFailed, self).__init__(msg)
def disable_upstart_services(*services):
for service in services:
with open("/etc/init/{}.override".format(service), "wt") as override:
@ -516,9 +537,13 @@ def configure_stonith():
enable_stonith()
set_stonith_configured(True)
else:
# NOTE(lourot): We enter here when no MAAS STONITH resource could be
# created. Disabling STONITH for now. We're not calling
# set_stonith_configured(), so that enabling STONITH will be retried
# later. (STONITH is now always enabled in this charm.)
# Without MAAS, we keep entering here, which isn't really an issue,
# except that this fails in rare cases, thus failure_is_fatal=False.
disable_stonith(failure_is_fatal=False)
def configure_monitor_host():
@ -661,17 +686,33 @@ def configure_maas_stonith_resource(stonith_hostnames):
def enable_stonith():
"""Enable stonith via the global property stonith-enabled."""
pcmk.commit(
"crm configure property stonith-enabled=true",
failure_is_fatal=True)
"""Enable stonith via the global property stonith-enabled.
:raises: EnableStonithFailed
"""
log('Enabling STONITH', level=INFO)
try:
pcmk.commit(
"crm configure property stonith-enabled=true",
failure_is_fatal=True)
except subprocess.CalledProcessError as e:
raise EnableStonithFailed(e)
def disable_stonith(failure_is_fatal=True):
"""Disable stonith via the global property stonith-enabled.
:param failure_is_fatal: Whether to raise exception if command fails.
:type failure_is_fatal: bool
:raises: DisableStonithFailed
"""
log('Disabling STONITH', level=INFO)
try:
pcmk.commit(
"crm configure property stonith-enabled=false",
failure_is_fatal=failure_is_fatal)
except subprocess.CalledProcessError as e:
raise DisableStonithFailed(e)
def get_ip_addr_from_resource_params(params):
@ -950,13 +991,14 @@ def restart_corosync_on_change():
def wrap(f):
def wrapped_f(*args, **kwargs):
checksums = {}
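# Only track config checksums when the unit is not paused: a paused
# unit must not be restarted behind the operator's back.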
if not is_unit_paused_set():
for path in COROSYNC_CONF_FILES:
checksums[path] = file_hash(path)
return_data = f(*args, **kwargs)
# NOTE: this assumes that this call is always done around
# configure_corosync, which returns True if configuration
# files were actually generated
if return_data and not is_unit_paused_set():
for path in COROSYNC_CONF_FILES:
if checksums[path] != file_hash(path):
validated_restart_corosync()
@ -974,11 +1016,12 @@ def try_pcmk_wait():
try:
pcmk.wait_for_pcmk()
log("Pacemaker is ready", DEBUG)
except pcmk.ServicesNotUp as e:
status_msg = "Pacemaker is down. Please manually start it."
status_set('blocked', status_msg)
full_msg = "{} {}".format(status_msg, e)
log(full_msg, ERROR)
raise pcmk.ServicesNotUp(full_msg)
@restart_corosync_on_change()
@ -1003,9 +1046,10 @@ def services_running():
if not (pacemaker_status and corosync_status):
# OS perspective
return False
# Functional test of pacemaker. This will raise if pacemaker doesn't get
# fully ready in time:
pcmk.wait_for_pcmk()
return True
def validated_restart_corosync(retries=10):
@ -1184,6 +1228,20 @@ def node_has_resources(node_name):
return has_resources
def node_is_dc(node_name):
"""Check if this node is the designated controller.
@param node_name: The name of the node to check
@returns boolean - True if node_name is the DC
"""
out = subprocess.check_output(['crm_mon', '-X']).decode('utf-8')
root = ET.fromstring(out)
for current_dc in root.iter("current_dc"):
if current_dc.attrib.get('name') == node_name:
return True
return False
def set_unit_status():
"""Set the workload status for this unit
@ -1493,3 +1551,58 @@ def is_stonith_configured():
"""
configured = leader_get(STONITH_CONFIGURED) or 'False'
return bool_from_string(configured)
def update_node_list():
"""Delete a node from the corosync ring when a Juju unit is removed.
:returns: Set of pcmk nodes not part of Juju hanode relation
:rtype: Set[str]
:raises: RemoveCorosyncNodeFailed
"""
pcmk_nodes = set(pcmk.list_nodes())
juju_nodes = {socket.gethostname()}
juju_hanode_rel = get_ha_nodes()
for corosync_id, addr in juju_hanode_rel.items():
peer_node_name = utils.get_hostname(addr, fqdn=False)
juju_nodes.add(peer_node_name)
diff_nodes = pcmk_nodes.difference(juju_nodes)
log("pcmk_nodes[{}], juju_nodes[{}], diff[{}]"
"".format(pcmk_nodes, juju_nodes, diff_nodes),
DEBUG)
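# Put each stale node into maintenance first so that Pacemaker stops
# managing its resources, then delete it from the cluster.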
for old_node in diff_nodes:
try:
pcmk.set_node_status_to_maintenance(old_node)
pcmk.delete_node(old_node)
except subprocess.CalledProcessError as e:
raise RemoveCorosyncNodeFailed(old_node, e)
return diff_nodes
def is_update_ring_requested(corosync_update_uuid):
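"""Check whether the given update request has already been handled.
The UUID is stored in the unit's local kv store, so the same trigger
is only acted upon once per unit.
:param corosync_update_uuid: UUID identifying the update-ring request.
:type corosync_update_uuid: str
:returns: True if this is a new (not yet processed) request.
:rtype: bool
"""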
log("Setting corosync-update-uuid=<uuid> in local kv", DEBUG)
with unitdata.HookData()() as t:
kv = t[0]
stored_value = kv.get('corosync-update-uuid')
if not stored_value or stored_value != corosync_update_uuid:
kv.set('corosync-update-uuid', corosync_update_uuid)
return True
return False
def trigger_corosync_update_from_leader(unit, rid):
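"""Apply a corosync update requested by the Juju leader, if any.
If the relation carries a trigger-corosync-update UUID not seen before,
rewrite corosync.conf and reload corosync via corosync-cfgtool -R.
:param unit: Remote unit from which to read the relation data.
:type unit: str
:param rid: Relation ID from which to read the relation data.
:type rid: str
:returns: True if corosync.conf was updated and corosync reloaded.
:rtype: bool
"""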
corosync_update_uuid = relation_get(
attribute='trigger-corosync-update',
unit=unit, rid=rid,
)
if (corosync_update_uuid and
is_update_ring_requested(corosync_update_uuid) and
emit_corosync_conf()):
cmd = 'corosync-cfgtool -R'
pcmk.commit(cmd)
return True
return False


@ -23,7 +23,11 @@ configure:
tests:
- zaza.openstack.charm_tests.hacluster.tests.HaclusterTest
- zaza.openstack.charm_tests.hacluster.tests.HaclusterScaleBackAndForthTest
tests_options:
hacluster:
principle-app-name: keystone
hacluster-charm-name: hacluster
force_deploy:
- groovy-victoria


@ -42,6 +42,9 @@ class TestCorosyncConf(unittest.TestCase):
os.remove(self.tmpfile.name)
@mock.patch.object(pcmk.unitdata, 'kv')
@mock.patch.object(hooks, 'remote_unit')
@mock.patch.object(hooks, 'principal_unit')
@mock.patch.object(hooks, 'trigger_corosync_update_from_leader')
@mock.patch.object(hooks, 'is_stonith_configured')
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
@mock.patch.object(hooks, 'get_member_ready_nodes')
@ -78,7 +81,9 @@ class TestCorosyncConf(unittest.TestCase):
configure_resources_on_remotes,
get_member_ready_nodes,
configure_peer_stonith_resource,
is_stonith_configured,
trigger_corosync_update_from_leader,
principal_unit, remote_unit, mock_kv):
def fake_crm_opt_exists(res_name):
# res_ubuntu will take the "update resource" route
@ -104,6 +109,8 @@ class TestCorosyncConf(unittest.TestCase):
'cluster_count': 3,
'failure_timeout': 180,
'cluster_recheck_interval': 60}
trigger_corosync_update_from_leader.return_value = False
principal_unit.return_value = remote_unit.return_value = ""
config.side_effect = lambda key: cfg.get(key)
@ -165,6 +172,9 @@ class TestCorosyncConf(unittest.TestCase):
commit.assert_any_call(
'crm -w -F configure %s %s %s' % (kw, name, params))
@mock.patch.object(hooks, 'remote_unit')
@mock.patch.object(hooks, 'principal_unit')
@mock.patch.object(hooks, 'trigger_corosync_update_from_leader')
@mock.patch.object(hooks, 'is_stonith_configured')
@mock.patch.object(hooks, 'configure_peer_stonith_resource')
@mock.patch.object(hooks, 'get_member_ready_nodes')
@ -200,7 +210,9 @@ class TestCorosyncConf(unittest.TestCase):
configure_pacemaker_remote_stonith_resource,
configure_resources_on_remotes, get_member_ready_nodes,
configure_peer_stonith_resource,
is_stonith_configured,
trigger_corosync_update_from_leader,
principal_unit, remote_unit):
is_stonith_configured.return_value = False
validate_dns_ha.return_value = True
crm_opt_exists.return_value = False
@ -218,6 +230,8 @@ class TestCorosyncConf(unittest.TestCase):
'cluster_count': 3,
'maas_url': 'http://maas/MAAAS/',
'maas_credentials': 'secret'}
trigger_corosync_update_from_leader.return_value = False
principal_unit.return_value = remote_unit.return_value = ""
config.side_effect = lambda key: cfg.get(key)
@ -248,6 +262,9 @@ class TestCorosyncConf(unittest.TestCase):
'params bar ip_address="172.16.0.1" maas_url="http://maas/MAAAS/" '
'maas_credentials="secret"')
@mock.patch.object(hooks, 'remote_unit')
@mock.patch.object(hooks, 'principal_unit')
@mock.patch.object(hooks, 'trigger_corosync_update_from_leader')
@mock.patch.object(hooks, 'setup_maas_api')
@mock.patch.object(hooks, 'validate_dns_ha')
@mock.patch('pcmk.wait_for_pcmk')
@ -270,7 +287,9 @@ class TestCorosyncConf(unittest.TestCase):
relation_set, get_cluster_nodes, related_units, configure_stonith,
configure_monitor_host, configure_cluster_global,
configure_corosync, is_leader, crm_opt_exists,
wait_for_pcmk, validate_dns_ha, setup_maas_api,
trigger_corosync_update_from_leader,
principal_unit, remote_unit):
def fake_validate():
raise utils.MAASConfigIncomplete('DNS HA invalid config')
@ -289,6 +308,8 @@ class TestCorosyncConf(unittest.TestCase):
'cluster_count': 3,
'maas_url': 'http://maas/MAAAS/',
'maas_credentials': None}
trigger_corosync_update_from_leader.return_value = False
principal_unit.return_value = remote_unit.return_value = ""
config.side_effect = lambda key: cfg.get(key)
@ -391,7 +412,6 @@ class TestHooks(test_utils.CharmTestCase):
mock_is_stonith_configured.return_value = False
mock_config.side_effect = self.test_config.get
mock_relation_ids.return_value = ['hanode:1']
mock_is_leader.return_value = True
hooks.config_changed()
mock_maintenance_mode.assert_not_called()


@ -93,6 +93,10 @@ class UtilsTestCaseWriteTmp(unittest.TestCase):
class UtilsTestCase(unittest.TestCase):
def _testdata(self, filename):
return os.path.join(os.path.dirname(__file__),
'testdata',
filename)
@mock.patch.object(utils, 'config')
def test_get_transport(self, mock_config):
@ -430,20 +434,19 @@ class UtilsTestCase(unittest.TestCase):
])
@mock.patch('pcmk.commit')
@mock.patch.object(utils, 'config')
@mock.patch.object(utils, 'configure_pacemaker_remote_stonith_resource')
def test_configure_stonith_no_maas(
self,
mock_cfg_pcmkr_rstonith_res,
mock_config,
mock_commit):
cfg = {
'stonith_enabled': 'false'}
mock_config.side_effect = lambda key: cfg.get(key)
# Without MAAS this function will return no resource:
mock_cfg_pcmkr_rstonith_res.return_value = []
utils.configure_stonith()
mock_commit.assert_called_once_with(
'crm configure property stonith-enabled=false',
failure_is_fatal=False)
@mock.patch.object(utils, 'relation_get')
def test_parse_data_json(self, relation_get):
@ -1260,3 +1263,57 @@ class UtilsTestCase(unittest.TestCase):
commit.assert_called_once_with(
'crm configure property stonith-enabled=false',
failure_is_fatal=True)
@mock.patch('subprocess.check_output')
def test_node_is_dc(self, mock_subprocess):
with open(self._testdata('test_crm_mon.xml'), 'r') as fd:
mock_subprocess.return_value = "".join(
fd.readlines()).encode("utf-8")
self.assertTrue(utils.node_is_dc('juju-2eebcf-0'))
@mock.patch.object(utils.unitdata, 'HookData')
def test_is_update_ring_requested(self, HookData):
hook_data = self.MockHookData()
HookData.return_value = hook_data
self.assertTrue(
utils.is_update_ring_requested('random-uuid-generated')
)
self.assertEqual(
hook_data.kv.get('corosync-update-uuid'),
'random-uuid-generated',
)
# No change in uuid means no new request has been issued
self.assertFalse(
utils.is_update_ring_requested('random-uuid-generated')
)
@mock.patch('pcmk.commit')
@mock.patch.object(utils, 'emit_corosync_conf')
@mock.patch.object(utils, 'is_update_ring_requested')
@mock.patch.object(utils, 'relation_get')
def test_trigger_corosync_update_from_leader(self, mock_relation_get,
mock_is_update_ring_req,
mock_emit_corosync_conf,
mock_commit):
# corosync-update-uuid is set and has changed:
mock_relation_get.return_value = 'random-uuid-generated'
mock_is_update_ring_req.return_value = True
mock_emit_corosync_conf.return_value = True
self.assertTrue(
utils.trigger_corosync_update_from_leader(
'hacluster/0',
'hanode:0',
),
)
mock_commit.assert_has_calls([mock.call('corosync-cfgtool -R')])
# corosync-update-uuid isn't set:
mock_relation_get.return_value = None
self.assertFalse(
utils.trigger_corosync_update_from_leader(
'hacluster/0',
'hanode:0',
),
)


@ -163,7 +163,8 @@ class TestPcmk(unittest.TestCase):
# Pacemaker is up
gethostname.return_value = 'hanode-1'
getstatusoutput.return_value = (0, 'Hostname: hanode-1')
# Here we are asserting that it doesn't raise anything:
pcmk.wait_for_pcmk(retries=2, sleep=0)
@mock.patch('subprocess.check_output')
def test_crm_version(self, mock_check_output):

unit_tests/testdata/test_crm_mon.xml (new vendored file)

@ -0,0 +1,81 @@
<?xml version="1.0"?>
<crm_mon version="1.1.18">
<summary>
<stack type="corosync" />
<current_dc present="true" version="1.1.18-2b07d5c5a9" name="juju-2eebcf-0" id="1000" with_quorum="true" />
<last_update time="Mon Jul 20 09:15:49 2020" />
<last_change time="Mon Jul 20 09:09:40 2020" user="hacluster" client="crmd" origin="juju-2eebcf-2" />
<nodes_configured number="3" expected_votes="unknown" />
<resources_configured number="5" disabled="0" blocked="0" />
<cluster_options stonith-enabled="false" symmetric-cluster="true" no-quorum-policy="stop" maintenance-mode="false" />
</summary>
<nodes>
<node name="juju-2eebcf-0" id="1000" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="true" resources_running="2" type="member" />
<node name="juju-2eebcf-2" id="1001" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="false" resources_running="2" type="member" />
<node name="juju-2eebcf-3" id="1002" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="false" resources_running="1" type="member" />
</nodes>
<resources>
<group id="grp_ks_vips" number_resources="1" >
<resource id="res_ks_0dd3a53_vip" resource_agent="ocf::heartbeat:IPaddr2" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="juju-2eebcf-0" id="1000" cached="false"/>
</resource>
</group>
<clone id="cl_ks_haproxy" multi_state="false" unique="false" managed="true" failed="false" failure_ignored="false" >
<resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="juju-2eebcf-2" id="1001" cached="false"/>
</resource>
<resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="juju-2eebcf-0" id="1000" cached="false"/>
</resource>
<resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="juju-2eebcf-3" id="1002" cached="false"/>
</resource>
</clone>
<resource id="res_ks_bc84550_vip" resource_agent="ocf::heartbeat:IPaddr2" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="juju-2eebcf-2" id="1001" cached="false"/>
</resource>
</resources>
<node_attributes>
<node name="juju-2eebcf-0">
</node>
<node name="juju-2eebcf-2">
</node>
<node name="juju-2eebcf-3">
</node>
</node_attributes>
<node_history>
<node name="juju-2eebcf-2">
<resource_history id="res_ks_bc84550_vip" orphan="false" migration-threshold="1000000">
<operation_history call="56" task="start" last-rc-change="Mon Jul 20 09:09:37 2020" last-run="Mon Jul 20 09:09:37 2020" exec-time="548ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="57" task="monitor" interval="10000ms" last-rc-change="Mon Jul 20 09:09:38 2020" exec-time="534ms" queue-time="0ms" rc="0" rc_text="ok" />
</resource_history>
<resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
<operation_history call="64" task="probe" last-rc-change="Mon Jul 20 09:09:40 2020" last-run="Mon Jul 20 09:09:40 2020" exec-time="554ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="64" task="probe" last-rc-change="Mon Jul 20 09:09:40 2020" last-run="Mon Jul 20 09:09:40 2020" exec-time="554ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="70" task="monitor" interval="5000ms" last-rc-change="Mon Jul 20 09:09:41 2020" exec-time="804ms" queue-time="0ms" rc="0" rc_text="ok" />
</resource_history>
</node>
<node name="juju-2eebcf-0">
<resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
<operation_history call="80" task="start" last-rc-change="Mon Jul 20 09:09:41 2020" last-run="Mon Jul 20 09:09:41 2020" exec-time="824ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="82" task="monitor" interval="5000ms" last-rc-change="Mon Jul 20 09:09:42 2020" exec-time="534ms" queue-time="0ms" rc="0" rc_text="ok" />
</resource_history>
<resource_history id="res_ks_0dd3a53_vip" orphan="false" migration-threshold="1000000">
<operation_history call="79" task="probe" last-rc-change="Mon Jul 20 09:09:41 2020" last-run="Mon Jul 20 09:09:41 2020" exec-time="753ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="79" task="probe" last-rc-change="Mon Jul 20 09:09:41 2020" last-run="Mon Jul 20 09:09:41 2020" exec-time="753ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="81" task="monitor" interval="10000ms" last-rc-change="Mon Jul 20 09:09:42 2020" exec-time="542ms" queue-time="0ms" rc="0" rc_text="ok" />
</resource_history>
</node>
<node name="juju-2eebcf-3">
<resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
<operation_history call="62" task="probe" last-rc-change="Mon Jul 20 09:09:40 2020" last-run="Mon Jul 20 09:09:40 2020" exec-time="547ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="62" task="probe" last-rc-change="Mon Jul 20 09:09:40 2020" last-run="Mon Jul 20 09:09:40 2020" exec-time="547ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="68" task="monitor" interval="5000ms" last-rc-change="Mon Jul 20 09:09:41 2020" exec-time="751ms" queue-time="0ms" rc="0" rc_text="ok" />
</resource_history>
</node>
</node_history>
<tickets>
</tickets>
<bans>
</bans>
</crm_mon>