Adjust quorum after node removal
Add an `update-ring` action for that purpose. Also print more on various
pacemaker failures and remove some dead code.

Func-Test-PR: https://github.com/openstack-charmers/zaza-openstack-tests/pull/369
Change-Id: I35c0c9ce67fd459b9c3099346705d43d76bbdfe4
Closes-Bug: #1400481
Related-Bug: #1874719
Co-Authored-By: Aurelien Lourot <aurelien.lourot@canonical.com>
Co-Authored-By: Felipe Reyes <felipe.reyes@canonical.com>
This commit is contained in:
parent 6e1f20040c
commit 457f88eda6

README.md | 31
@@ -67,14 +67,37 @@ in a container on existing machines 0, 1, and 2:
 This section lists Juju [actions][juju-docs-actions] supported by the charm.
 Actions allow specific operations to be performed on a per-unit basis.

 * `pause`
 * `resume`
 * `status`
 * `cleanup`
+* `update-ring`

 To display action descriptions run `juju actions hacluster`. If the charm is
 not deployed then see file ``actions.yaml``.

+### update-ring action
+
+The `update-ring` action requires a parameter (`i-really-mean-it=true`) to make
+sure tidying up the list of available corosync nodes in the ring is intended.
+
+The operation expects:
+
+1. `juju run-action hacluster/N pause --wait`. This will make sure no Pacemaker
+   resources run on the unit.
+
+2. `juju remove-unit principal-unit/N`. Repeat this step for as many units as
+   need to be removed (e.g. to scale back from 6 to 3 units).
+
+3. `juju run-action hacluster/leader update-ring i-really-mean-it=true --wait`.
+   This step will remove corosync nodes from the ring and update corosync.conf
+   to list an updated number of nodes (min_quorum is recalculated).
+
+In case a unit goes into a lost state (e.g. caused by a hardware failure), the
+initial step (pausing the unit) can be skipped. Unit removal may also be
+replaced by `juju remove-machine N --force`, where N is the Juju machine ID
+where the unit to be removed runs.

 # Bugs

 Please report bugs on [Launchpad][lp-bugs-charm-hacluster].
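Why trimming the ring matters for quorum: corosync grants quorum to a strict majority of the *configured* node list, so entries left over after `juju remove-unit` keep that bar artificially high. A rough, self-contained illustration of the majority arithmetic the README's `min_quorum` remark refers to (plain Python, not charm code):

```python
def min_quorum(node_count):
    """Smallest number of live votes that still forms a strict majority."""
    return node_count // 2 + 1

# Scaling back from 6 to 3 units without running update-ring leaves corosync
# expecting a majority of the old membership:
assert min_quorum(6) == 4   # unreachable with only 3 remaining nodes
assert min_quorum(3) == 2   # what the recalculated node list yields
```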
@@ -12,3 +12,12 @@ cleanup:
       default: "all"
       type: string
       description: Resource name to cleanup
+update-ring:
+  description: Trigger corosync node members cleanup
+  params:
+    i-really-mean-it:
+      type: boolean
+      description: |
+        This must be toggled to enable actually performing this action
+  required:
+    - i-really-mean-it
@@ -18,6 +18,7 @@ import os
 import subprocess
 import sys
 import traceback
+import uuid

 sys.path.append('hooks/')

@@ -39,13 +40,21 @@ from charmhelpers.core.hookenv import (
     action_fail,
     action_get,
     action_set,
+    is_leader,
     log,
+    relation_ids,
+    relation_set,
 )
 from utils import (
+    emit_corosync_conf,
+    is_update_ring_requested,
     pause_unit,
     resume_unit,
+    update_node_list,
 )

 import pcmk


 def pause(args):
     """Pause the hacluster services.
@@ -98,8 +107,46 @@ def cleanup(args):
                     "'{}'".format(resource_name))


+def update_ring(args):
+    """Update corosync.conf list of nodes (generally after unit removal)."""
+    if not action_get('i-really-mean-it'):
+        action_fail('i-really-mean-it is a required parameter')
+        return
+
+    if not is_leader():
+        action_fail('only the Juju leader can run this action')
+        return
+
+    diff_nodes = update_node_list()
+    if not diff_nodes:
+        # No differences between discovered Pacemaker nodes and
+        # Juju nodes (ie. no node removal)
+        action_set({'result': 'noop'})
+        return
+
+    # Trigger emit_corosync_conf() and corosync-cfgtool -R
+    # for all the hanode peer units to run
+    relid = relation_ids('hanode')
+    if len(relid) < 1:
+        action_fail('no peer ha nodes')
+        return
+
+    corosync_update_uuid = uuid.uuid1().hex
+    reldata = {'trigger-corosync-update': corosync_update_uuid}
+    relation_set(relation_id=relid[0],
+                 relation_settings=reldata)
+
+    # Trigger the same logic in the leader (no hanode-relation-changed
+    # hook will be received by self)
+    if (is_update_ring_requested(corosync_update_uuid) and
+            emit_corosync_conf()):
+        cmd = 'corosync-cfgtool -R'
+        pcmk.commit(cmd)
+
+    action_set({'result': 'success'})
+
+
 ACTIONS = {"pause": pause, "resume": resume,
-           "status": status, "cleanup": cleanup}
+           "status": status, "cleanup": cleanup,
+           "update-ring": update_ring}


 def main(args):
@@ -0,0 +1 @@
+actions.py
@@ -71,8 +71,8 @@ options:
     description: |
       When enabled pacemaker will be put in maintenance mode, this will allow
       administrators to manipulate cluster resources (e.g. stop daemons, reboot
-      machines, etc). Pacemaker will not monitor the resources while maintence
-      mode is enabled.
+      machines, etc). Pacemaker will not monitor the resources while maintenance
+      mode is enabled and node removals won't be processed.
   service_start_timeout:
     type: int
     default: 180
@@ -0,0 +1 @@
+hooks.py
@@ -45,6 +45,8 @@ from charmhelpers.core.hookenv import (
     related_units,
     relation_ids,
     relation_set,
+    remote_unit,
+    principal_unit,
     config,
     Hooks,
     UnregisteredHookError,
@@ -122,6 +124,8 @@ from utils import (
     disable_stonith,
     is_stonith_configured,
     emit_systemd_overrides_file,
+    trigger_corosync_update_from_leader,
+    emit_corosync_conf,
 )

 from charmhelpers.contrib.charmsupport import nrpe
@@ -293,6 +297,22 @@ def hanode_relation_changed(relid=None):
     ha_relation_changed()


+@hooks.hook('hanode-relation-departed')
+def hanode_relation_departed(relid=None):
+    if config('maintenance-mode'):
+        log('pcmk is in maintenance mode - skip any action', DEBUG)
+        return
+
+    # Note(aluria): all units will update corosync.conf list of nodes
+    # in the aim of having up to date stored configurations. However,
+    # corosync reloads (or restarts) won't be triggered at this point
+    # (update-ring action will do)
+    if emit_corosync_conf():
+        log('corosync.conf updated')
+    else:
+        log('corosync.conf not updated')
+
+
 @hooks.hook('ha-relation-joined',
             'ha-relation-changed',
             'peer-availability-relation-joined',
@@ -306,9 +326,22 @@ def ha_relation_changed():
             level=INFO)
         return

-    if relation_ids('hanode'):
+    relid_hanode = relation_ids('hanode')
+    if relid_hanode:
         log('Ready to form cluster - informing peers', level=DEBUG)
-        relation_set(relation_id=relation_ids('hanode')[0], ready=True)
+        relation_set(relation_id=relid_hanode[0], ready=True)
+
+        # If a trigger-corosync-update attribute exists in the relation,
+        # the Juju leader may have requested all its peers to update
+        # the corosync.conf list of nodes. If it's the case, no other
+        # action will be run (a future hook re: ready=True may trigger
+        # other logic)
+        if (remote_unit() != principal_unit() and
+                trigger_corosync_update_from_leader(
+                    remote_unit(), relid_hanode[0]
+                )):
+            return
+
     else:
         log('Ready to form cluster, but not related to peers just yet',
             level=INFO)
@@ -563,8 +596,12 @@ def ha_relation_changed():

 @hooks.hook()
 def stop():
-    cmd = 'crm -w -F node delete %s' % socket.gethostname()
-    pcmk.commit(cmd)
+    # NOTE(lourot): This seems to always fail with
+    # 'ERROR: node <node_name> not found in the CIB', which means that the node
+    # has already been removed from the cluster. Thus failure_is_fatal=False.
+    # We might consider getting rid of this call.
+    pcmk.delete_node(socket.gethostname(), failure_is_fatal=False)

     apt_purge(['corosync', 'pacemaker'], fatal=True)

hooks/pcmk.py | 112

@@ -40,18 +40,41 @@ class PropertyNotFound(Exception):


 def wait_for_pcmk(retries=12, sleep=10):
-    crm_up = None
-    hostname = socket.gethostname()
+    """Wait for pacemaker/corosync to fully come up.
+
+    :param retries: Number of times to check for crm's output before raising.
+    :type retries: int
+    :param sleep: Number of seconds to sleep between retries.
+    :type sleep: int
+    :raises: ServicesNotUp
+    """
+    expected_hostname = socket.gethostname()
+    last_exit_code = None
+    last_output = None
     for i in range(retries):
-        if crm_up:
-            return True
-        output = subprocess.getstatusoutput("crm node list")[1]
-        crm_up = hostname in output
-        time.sleep(sleep)
-    if not crm_up:
-        raise ServicesNotUp("Pacemaker or Corosync are still down after "
-                            "waiting for {} retries. Last output: {}"
-                            "".format(retries, output))
+        if i > 0:
+            time.sleep(sleep)
+        last_exit_code, last_output = subprocess.getstatusoutput(
+            'crm node list')
+        if expected_hostname in last_output:
+            return
+
+    msg = ('Pacemaker or Corosync are still not fully up after waiting for '
+           '{} retries. '.format(retries))
+    if last_exit_code != 0:
+        msg += 'Last exit code: {}. '.format(last_exit_code)
+    if 'not supported between' in last_output:
+        # NOTE(lourot): transient crmsh bug
+        # https://github.com/ClusterLabs/crmsh/issues/764
+        msg += 'This looks like ClusterLabs/crmsh#764. '
+    elif 'node1' in last_output:
+        # NOTE(lourot): transient bug on deployment. The charm will recover
+        # later but the corosync ring will still show an offline 'node1' node.
+        # The corosync ring can then be cleaned up by running the 'update-ring'
+        # action.
+        msg += 'This looks like lp:1874719. '
+    msg += 'Last output: {}'.format(last_output)
+    raise ServicesNotUp(msg)


 def commit(cmd, failure_is_fatal=False):
@@ -64,7 +87,7 @@ def commit(cmd, failure_is_fatal=False):
     :raises: subprocess.CalledProcessError
     """
     if failure_is_fatal:
-        return subprocess.check_call(cmd.split())
+        return subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
     else:
         return subprocess.call(cmd.split())

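The change above swaps `check_call` for `check_output` with stderr folded into stdout, so a failing crm command surfaces its error text on `CalledProcessError.output` — which the new `delete_node()` further down relies on when deciding whether a node was already gone. A minimal sketch of that behaviour (the command is illustrative; the `FileNotFoundError` branch just covers machines without crmsh installed):

```python
import subprocess

def run_fatal(cmd):
    """Run cmd; on failure raise CalledProcessError whose .output carries stderr."""
    return subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)

try:
    run_fatal('crm node delete no-such-node')
except subprocess.CalledProcessError as e:
    print('command failed with:', e.output.decode('utf-8').strip())
except FileNotFoundError:
    print('crm is not installed on this machine')
```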
@@ -77,24 +100,6 @@ def is_resource_present(resource):
     return True


-def standby(node=None):
-    if node is None:
-        cmd = "crm -F node standby"
-    else:
-        cmd = "crm -F node standby %s" % node
-
-    commit(cmd)
-
-
-def online(node=None):
-    if node is None:
-        cmd = "crm -F node online"
-    else:
-        cmd = "crm -F node online %s" % node
-
-    commit(cmd)
-
-
 def crm_opt_exists(opt_name):
     output = subprocess.getstatusoutput("crm configure show")[1]
     if opt_name in output:
@@ -168,6 +173,53 @@ def list_nodes():
     return sorted(nodes)


+def set_node_status_to_maintenance(node_name):
+    """See https://crmsh.github.io/man-2.0/#cmdhelp_node_maintenance
+
+    :param node_name: Name of the node to set to maintenance.
+    :type node_name: str
+    :raises: subprocess.CalledProcessError
+    """
+    log('Setting node {} to maintenance'.format(node_name))
+    commit('crm -w -F node maintenance {}'.format(node_name),
+           failure_is_fatal=True)
+
+
+def delete_node(node_name, failure_is_fatal=True):
+    """See https://crmsh.github.io/man-2.0/#cmdhelp_node_delete
+
+    :param node_name: Name of the node to be removed from the cluster.
+    :type node_name: str
+    :param failure_is_fatal: Whether to raise exception if command fails.
+    :type failure_is_fatal: bool
+    :raises: subprocess.CalledProcessError
+    """
+    log('Deleting node {} from the cluster'.format(node_name))
+    cmd = 'crm -w -F node delete {}'.format(node_name)
+    for attempt in [2, 1, 0]:
+        try:
+            commit(cmd, failure_is_fatal=failure_is_fatal)
+        except subprocess.CalledProcessError as e:
+            output = e.output.decode('utf-8').strip()
+            log('"{}" failed with "{}"'.format(cmd, output), WARNING)
+            if output == 'ERROR: node {} not found in the CIB'.format(
+                    node_name):
+                # NOTE(lourot): Sometimes seen when called from the
+                # `update-ring` action.
+                log('{} was already removed from the cluster, moving on',
+                    WARNING)
+                return
+            if '/cmdline' in output:
+                # NOTE(lourot): older versions of crmsh may fail with
+                # https://github.com/ClusterLabs/crmsh/issues/283 . If that's
+                # the case let's retry.
+                log('This looks like ClusterLabs/crmsh#283.', WARNING)
+                if attempt > 0:
+                    log('Retrying...', WARNING)
+                    continue
+            raise
+
+
 def get_property_from_xml(name, output):
     """Read a configuration property from the XML generated by 'crm configure show
     xml'
hooks/utils.py | 159

@@ -118,6 +118,27 @@ class MAASConfigIncomplete(Exception):
     pass


+class RemoveCorosyncNodeFailed(Exception):
+    def __init__(self, node_name, called_process_error):
+        msg = 'Removing {} from the cluster failed. {} output={}'.format(
+            node_name, called_process_error, called_process_error.output)
+        super(RemoveCorosyncNodeFailed, self).__init__(msg)
+
+
+class EnableStonithFailed(Exception):
+    def __init__(self, called_process_error):
+        msg = 'Enabling STONITH failed. {} output={}'.format(
+            called_process_error, called_process_error.output)
+        super(EnableStonithFailed, self).__init__(msg)
+
+
+class DisableStonithFailed(Exception):
+    def __init__(self, called_process_error):
+        msg = 'Disabling STONITH failed. {} output={}'.format(
+            called_process_error, called_process_error.output)
+        super(DisableStonithFailed, self).__init__(msg)
+
+
 def disable_upstart_services(*services):
     for service in services:
         with open("/etc/init/{}.override".format(service), "wt") as override:
@@ -516,9 +537,13 @@ def configure_stonith():
         enable_stonith()
         set_stonith_configured(True)
     else:
-        log('Disabling STONITH', level=INFO)
-        cmd = "crm configure property stonith-enabled=false"
-        pcmk.commit(cmd)
+        # NOTE(lourot): We enter here when no MAAS STONITH resource could be
+        # created. Disabling STONITH for now. We're not calling
+        # set_stonith_configured(), so that enabling STONITH will be retried
+        # later. (STONITH is now always enabled in this charm.)
+        # Without MAAS, we keep entering here, which isn't really an issue,
+        # except that this fails in rare cases, thus failure_is_fatal=False.
+        disable_stonith(failure_is_fatal=False)


 def configure_monitor_host():
@@ -661,17 +686,33 @@ def configure_maas_stonith_resource(stonith_hostnames):


 def enable_stonith():
-    """Enable stonith via the global property stonith-enabled."""
-    pcmk.commit(
-        "crm configure property stonith-enabled=true",
-        failure_is_fatal=True)
+    """Enable stonith via the global property stonith-enabled.
+
+    :raises: EnableStonithFailed
+    """
+    log('Enabling STONITH', level=INFO)
+    try:
+        pcmk.commit(
+            "crm configure property stonith-enabled=true",
+            failure_is_fatal=True)
+    except subprocess.CalledProcessError as e:
+        raise EnableStonithFailed(e)


-def disable_stonith():
-    """Disable stonith via the global property stonith-enabled."""
-    pcmk.commit(
-        "crm configure property stonith-enabled=false",
-        failure_is_fatal=True)
+def disable_stonith(failure_is_fatal=True):
+    """Disable stonith via the global property stonith-enabled.
+
+    :param failure_is_fatal: Whether to raise exception if command fails.
+    :type failure_is_fatal: bool
+    :raises: DisableStonithFailed
+    """
+    log('Disabling STONITH', level=INFO)
+    try:
+        pcmk.commit(
+            "crm configure property stonith-enabled=false",
+            failure_is_fatal=failure_is_fatal)
+    except subprocess.CalledProcessError as e:
+        raise DisableStonithFailed(e)


 def get_ip_addr_from_resource_params(params):
@@ -950,13 +991,14 @@ def restart_corosync_on_change():
     def wrap(f):
         def wrapped_f(*args, **kwargs):
             checksums = {}
-            for path in COROSYNC_CONF_FILES:
-                checksums[path] = file_hash(path)
+            if not is_unit_paused_set():
+                for path in COROSYNC_CONF_FILES:
+                    checksums[path] = file_hash(path)
             return_data = f(*args, **kwargs)
             # NOTE: this assumes that this call is always done around
             # configure_corosync, which returns true if configuration
             # files where actually generated
-            if return_data:
+            if return_data and not is_unit_paused_set():
                 for path in COROSYNC_CONF_FILES:
                     if checksums[path] != file_hash(path):
                         validated_restart_corosync()
@@ -974,11 +1016,12 @@ def try_pcmk_wait():
     try:
         pcmk.wait_for_pcmk()
         log("Pacemaker is ready", DEBUG)
-    except pcmk.ServicesNotUp:
-        msg = ("Pacemaker is down. Please manually start it.")
-        log(msg, ERROR)
-        status_set('blocked', msg)
-        raise pcmk.ServicesNotUp(msg)
+    except pcmk.ServicesNotUp as e:
+        status_msg = "Pacemaker is down. Please manually start it."
+        status_set('blocked', status_msg)
+        full_msg = "{} {}".format(status_msg, e)
+        log(full_msg, ERROR)
+        raise pcmk.ServicesNotUp(full_msg)


 @restart_corosync_on_change()
@@ -1003,9 +1046,10 @@ def services_running():
     if not (pacemaker_status and corosync_status):
         # OS perspective
         return False
-    else:
-        # Functional test of pacemaker
-        return pcmk.wait_for_pcmk()
+    # Functional test of pacemaker. This will raise if pacemaker doesn't get
+    # fully ready in time:
+    pcmk.wait_for_pcmk()
+    return True


 def validated_restart_corosync(retries=10):
@@ -1184,6 +1228,20 @@ def node_has_resources(node_name):
     return has_resources


+def node_is_dc(node_name):
+    """Check if this node is the designated controller.
+
+    @param node_name: The name of the node to check
+    @returns boolean - True if node_name is the DC
+    """
+    out = subprocess.check_output(['crm_mon', '-X']).decode('utf-8')
+    root = ET.fromstring(out)
+    for current_dc in root.iter("current_dc"):
+        if current_dc.attrib.get('name') == node_name:
+            return True
+    return False
+
+
 def set_unit_status():
     """Set the workload status for this unit
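`node_is_dc()` above decides whether a node is the designated controller by walking the XML that `crm_mon -X` emits (the `test_crm_mon.xml` fixture added at the bottom of this change is a full example). A tiny self-contained sketch of the same lookup against a trimmed-down sample document:

```python
import xml.etree.ElementTree as ET

sample = ('<crm_mon version="1.1.18"><summary>'
          '<current_dc present="true" name="juju-2eebcf-0" id="1000" />'
          '</summary></crm_mon>')

root = ET.fromstring(sample)
is_dc = any(dc.attrib.get('name') == 'juju-2eebcf-0'
            for dc in root.iter('current_dc'))
print(is_dc)  # True
```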
@@ -1493,3 +1551,58 @@ def is_stonith_configured():
     """
     configured = leader_get(STONITH_CONFIGURED) or 'False'
     return bool_from_string(configured)
+
+
+def update_node_list():
+    """Delete a node from the corosync ring when a Juju unit is removed.
+
+    :returns: Set of pcmk nodes not part of Juju hanode relation
+    :rtype: Set[str]
+    :raises: RemoveCorosyncNodeFailed
+    """
+    pcmk_nodes = set(pcmk.list_nodes())
+    juju_nodes = {socket.gethostname()}
+    juju_hanode_rel = get_ha_nodes()
+    for corosync_id, addr in juju_hanode_rel.items():
+        peer_node_name = utils.get_hostname(addr, fqdn=False)
+        juju_nodes.add(peer_node_name)
+
+    diff_nodes = pcmk_nodes.difference(juju_nodes)
+    log("pcmk_nodes[{}], juju_nodes[{}], diff[{}]"
+        "".format(pcmk_nodes, juju_nodes, diff_nodes),
+        DEBUG)
+
+    for old_node in diff_nodes:
+        try:
+            pcmk.set_node_status_to_maintenance(old_node)
+            pcmk.delete_node(old_node)
+        except subprocess.CalledProcessError as e:
+            raise RemoveCorosyncNodeFailed(old_node, e)
+
+    return diff_nodes
+
+
+def is_update_ring_requested(corosync_update_uuid):
+    log("Setting corosync-update-uuid=<uuid> in local kv", DEBUG)
+    with unitdata.HookData()() as t:
+        kv = t[0]
+        stored_value = kv.get('corosync-update-uuid')
+        if not stored_value or stored_value != corosync_update_uuid:
+            kv.set('corosync-update-uuid', corosync_update_uuid)
+            return True
+    return False
+
+
+def trigger_corosync_update_from_leader(unit, rid):
+    corosync_update_uuid = relation_get(
+        attribute='trigger-corosync-update',
+        unit=unit, rid=rid,
+    )
+    if (corosync_update_uuid and
+            is_update_ring_requested(corosync_update_uuid) and
+            emit_corosync_conf()):
+        cmd = 'corosync-cfgtool -R'
+        pcmk.commit(cmd)
+        return True
+
+    return False
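`is_update_ring_requested()` and `trigger_corosync_update_from_leader()` implement a "run once per trigger value" handshake: the leader publishes a fresh UUID on the hanode relation, and each unit acts only when the value it has stored changes. A stripped-down sketch of that idempotency pattern, using a plain dict in place of the charm's unitdata key/value store:

```python
import uuid

_kv = {}  # stand-in for charmhelpers' unitdata kv store

def should_refresh(trigger_value):
    """Return True only the first time a given trigger value is seen."""
    if trigger_value and _kv.get('corosync-update-uuid') != trigger_value:
        _kv['corosync-update-uuid'] = trigger_value
        return True
    return False

request = uuid.uuid1().hex                 # what the update-ring action publishes
assert should_refresh(request) is True     # regenerate corosync.conf, reload corosync
assert should_refresh(request) is False    # a replay of the same request is a no-op
assert should_refresh(None) is False       # no trigger set on the relation yet
```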
@@ -23,7 +23,11 @@ configure:

 tests:
   - zaza.openstack.charm_tests.hacluster.tests.HaclusterTest
+  - zaza.openstack.charm_tests.hacluster.tests.HaclusterScaleBackAndForthTest

 tests_options:
+  hacluster:
+    principle-app-name: keystone
+    hacluster-charm-name: hacluster
   force_deploy:
     - groovy-victoria
@@ -42,6 +42,9 @@ class TestCorosyncConf(unittest.TestCase):
         os.remove(self.tmpfile.name)

     @mock.patch.object(pcmk.unitdata, 'kv')
+    @mock.patch.object(hooks, 'remote_unit')
+    @mock.patch.object(hooks, 'principal_unit')
+    @mock.patch.object(hooks, 'trigger_corosync_update_from_leader')
     @mock.patch.object(hooks, 'is_stonith_configured')
     @mock.patch.object(hooks, 'configure_peer_stonith_resource')
     @mock.patch.object(hooks, 'get_member_ready_nodes')

@@ -78,7 +81,9 @@ class TestCorosyncConf(unittest.TestCase):
             configure_resources_on_remotes,
             get_member_ready_nodes,
             configure_peer_stonith_resource,
-            is_stonith_configured, mock_kv):
+            is_stonith_configured,
+            trigger_corosync_update_from_leader,
+            principal_unit, remote_unit, mock_kv):

         def fake_crm_opt_exists(res_name):
             # res_ubuntu will take the "update resource" route

@@ -104,6 +109,8 @@ class TestCorosyncConf(unittest.TestCase):
                'cluster_count': 3,
                'failure_timeout': 180,
                'cluster_recheck_interval': 60}
+        trigger_corosync_update_from_leader.return_value = False
+        principal_unit.return_value = remote_unit.return_value = ""

         config.side_effect = lambda key: cfg.get(key)

@@ -165,6 +172,9 @@ class TestCorosyncConf(unittest.TestCase):
         commit.assert_any_call(
             'crm -w -F configure %s %s %s' % (kw, name, params))

+    @mock.patch.object(hooks, 'remote_unit')
+    @mock.patch.object(hooks, 'principal_unit')
+    @mock.patch.object(hooks, 'trigger_corosync_update_from_leader')
     @mock.patch.object(hooks, 'is_stonith_configured')
     @mock.patch.object(hooks, 'configure_peer_stonith_resource')
     @mock.patch.object(hooks, 'get_member_ready_nodes')

@@ -200,7 +210,9 @@ class TestCorosyncConf(unittest.TestCase):
            configure_pacemaker_remote_stonith_resource,
            configure_resources_on_remotes, get_member_ready_nodes,
            configure_peer_stonith_resource,
-           is_stonith_configured):
+           is_stonith_configured,
+           trigger_corosync_update_from_leader,
+           principal_unit, remote_unit):
         is_stonith_configured.return_value = False
         validate_dns_ha.return_value = True
         crm_opt_exists.return_value = False

@@ -218,6 +230,8 @@ class TestCorosyncConf(unittest.TestCase):
                'cluster_count': 3,
                'maas_url': 'http://maas/MAAAS/',
                'maas_credentials': 'secret'}
+        trigger_corosync_update_from_leader.return_value = False
+        principal_unit.return_value = remote_unit.return_value = ""

         config.side_effect = lambda key: cfg.get(key)

@@ -248,6 +262,9 @@ class TestCorosyncConf(unittest.TestCase):
             'params bar ip_address="172.16.0.1" maas_url="http://maas/MAAAS/" '
             'maas_credentials="secret"')

+    @mock.patch.object(hooks, 'remote_unit')
+    @mock.patch.object(hooks, 'principal_unit')
+    @mock.patch.object(hooks, 'trigger_corosync_update_from_leader')
     @mock.patch.object(hooks, 'setup_maas_api')
     @mock.patch.object(hooks, 'validate_dns_ha')
     @mock.patch('pcmk.wait_for_pcmk')

@@ -270,7 +287,9 @@ class TestCorosyncConf(unittest.TestCase):
            relation_set, get_cluster_nodes, related_units, configure_stonith,
            configure_monitor_host, configure_cluster_global,
            configure_corosync, is_leader, crm_opt_exists,
-           wait_for_pcmk, validate_dns_ha, setup_maas_api):
+           wait_for_pcmk, validate_dns_ha, setup_maas_api,
+           trigger_corosync_update_from_leader,
+           principal_unit, remote_unit):

         def fake_validate():
             raise utils.MAASConfigIncomplete('DNS HA invalid config')

@@ -289,6 +308,8 @@ class TestCorosyncConf(unittest.TestCase):
                'cluster_count': 3,
                'maas_url': 'http://maas/MAAAS/',
                'maas_credentials': None}
+        trigger_corosync_update_from_leader.return_value = False
+        principal_unit.return_value = remote_unit.return_value = ""

         config.side_effect = lambda key: cfg.get(key)

@@ -391,7 +412,6 @@ class TestHooks(test_utils.CharmTestCase):
         mock_is_stonith_configured.return_value = False
         mock_config.side_effect = self.test_config.get
         mock_relation_ids.return_value = ['hanode:1']
-        mock_wait_for_pcmk.return_value = True
         mock_is_leader.return_value = True
         hooks.config_changed()
         mock_maintenance_mode.assert_not_called()
@@ -93,6 +93,10 @@ class UtilsTestCaseWriteTmp(unittest.TestCase):


 class UtilsTestCase(unittest.TestCase):
+    def _testdata(self, filename):
+        return os.path.join(os.path.dirname(__file__),
+                            'testdata',
+                            filename)

     @mock.patch.object(utils, 'config')
     def test_get_transport(self, mock_config):

@@ -430,20 +434,19 @@ class UtilsTestCase(unittest.TestCase):
         ])

     @mock.patch('pcmk.commit')
     @mock.patch.object(utils, 'config')
     @mock.patch.object(utils, 'configure_pacemaker_remote_stonith_resource')
-    def test_configure_stonith_stonith_enabled_false(
+    def test_configure_stonith_no_maas(
             self,
             mock_cfg_pcmkr_rstonith_res,
             mock_config,
             mock_commit):
-        cfg = {
-            'stonith_enabled': 'false'}
-        mock_config.side_effect = lambda key: cfg.get(key)
+        # Without MAAS this function will return no resource:
+        mock_cfg_pcmkr_rstonith_res.return_value = []

         utils.configure_stonith()

         mock_commit.assert_called_once_with(
-            'crm configure property stonith-enabled=false')
+            'crm configure property stonith-enabled=false',
+            failure_is_fatal=False)

     @mock.patch.object(utils, 'relation_get')
     def test_parse_data_json(self, relation_get):

@@ -1260,3 +1263,57 @@ class UtilsTestCase(unittest.TestCase):
         commit.assert_called_once_with(
             'crm configure property stonith-enabled=false',
             failure_is_fatal=True)
+
+    @mock.patch('subprocess.check_output')
+    def test_node_is_dc(self, mock_subprocess):
+        with open(self._testdata('test_crm_mon.xml'), 'r') as fd:
+            mock_subprocess.return_value = "".join(
+                fd.readlines()).encode("utf-8")
+
+        self.assertTrue(utils.node_is_dc('juju-2eebcf-0'))
+
+    @mock.patch.object(utils.unitdata, 'HookData')
+    def test_is_update_ring_requested(self, HookData):
+        hook_data = self.MockHookData()
+        HookData.return_value = hook_data
+        self.assertTrue(
+            utils.is_update_ring_requested('random-uuid-generated')
+        )
+        self.assertEquals(
+            hook_data.kv.get('corosync-update-uuid'),
+            'random-uuid-generated',
+        )
+        # No change in uuid means no new request has been issued
+        self.assertFalse(
+            utils.is_update_ring_requested('random-uuid-generated')
+        )
+
+    @mock.patch('pcmk.commit')
+    @mock.patch.object(utils, 'emit_corosync_conf')
+    @mock.patch.object(utils, 'is_update_ring_requested')
+    @mock.patch.object(utils, 'relation_get')
+    def test_trigger_corosync_update_from_leader(self, mock_relation_get,
+                                                 mock_is_update_ring_req,
+                                                 mock_emit_corosync_conf,
+                                                 mock_commit):
+        # corosync-update-uuid is set and has changed:
+        mock_relation_get.return_value = 'random-uuid-generated'
+        mock_is_update_ring_req.return_value = True
+
+        mock_emit_corosync_conf.return_value = True
+        self.assertTrue(
+            utils.trigger_corosync_update_from_leader(
+                'hacluster/0',
+                'hanode:0',
+            ),
+        )
+        mock_commit.assert_has_calls([mock.call('corosync-cfgtool -R')])
+
+        # corosync-update-uuid isn't set:
+        mock_relation_get.return_value = None
+        self.assertFalse(
+            utils.trigger_corosync_update_from_leader(
+                'hacluster/0',
+                'hanode:0',
+            ),
+        )
@@ -163,7 +163,8 @@ class TestPcmk(unittest.TestCase):
         # Pacemaker is up
         gethostname.return_value = 'hanode-1'
         getstatusoutput.return_value = (0, 'Hosname: hanode-1')
-        self.assertTrue(pcmk.wait_for_pcmk(retries=2, sleep=0))
+        # Here we are asserting that it doesn't raise anything:
+        pcmk.wait_for_pcmk(retries=2, sleep=0)

     @mock.patch('subprocess.check_output')
     def test_crm_version(self, mock_check_output):
@@ -0,0 +1,81 @@
+<?xml version="1.0"?>
+<crm_mon version="1.1.18">
+    <summary>
+        <stack type="corosync" />
+        <current_dc present="true" version="1.1.18-2b07d5c5a9" name="juju-2eebcf-0" id="1000" with_quorum="true" />
+        <last_update time="Mon Jul 20 09:15:49 2020" />
+        <last_change time="Mon Jul 20 09:09:40 2020" user="hacluster" client="crmd" origin="juju-2eebcf-2" />
+        <nodes_configured number="3" expected_votes="unknown" />
+        <resources_configured number="5" disabled="0" blocked="0" />
+        <cluster_options stonith-enabled="false" symmetric-cluster="true" no-quorum-policy="stop" maintenance-mode="false" />
+    </summary>
+    <nodes>
+        <node name="juju-2eebcf-0" id="1000" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="true" resources_running="2" type="member" />
+        <node name="juju-2eebcf-2" id="1001" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="false" resources_running="2" type="member" />
+        <node name="juju-2eebcf-3" id="1002" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="false" resources_running="1" type="member" />
+    </nodes>
+    <resources>
+        <group id="grp_ks_vips" number_resources="1" >
+            <resource id="res_ks_0dd3a53_vip" resource_agent="ocf::heartbeat:IPaddr2" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
+                <node name="juju-2eebcf-0" id="1000" cached="false"/>
+            </resource>
+        </group>
+        <clone id="cl_ks_haproxy" multi_state="false" unique="false" managed="true" failed="false" failure_ignored="false" >
+            <resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
+                <node name="juju-2eebcf-2" id="1001" cached="false"/>
+            </resource>
+            <resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
+                <node name="juju-2eebcf-0" id="1000" cached="false"/>
+            </resource>
+            <resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
+                <node name="juju-2eebcf-3" id="1002" cached="false"/>
+            </resource>
+        </clone>
+        <resource id="res_ks_bc84550_vip" resource_agent="ocf::heartbeat:IPaddr2" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
+            <node name="juju-2eebcf-2" id="1001" cached="false"/>
+        </resource>
+    </resources>
+    <node_attributes>
+        <node name="juju-2eebcf-0">
+        </node>
+        <node name="juju-2eebcf-2">
+        </node>
+        <node name="juju-2eebcf-3">
+        </node>
+    </node_attributes>
+    <node_history>
+        <node name="juju-2eebcf-2">
+            <resource_history id="res_ks_bc84550_vip" orphan="false" migration-threshold="1000000">
+                <operation_history call="56" task="start" last-rc-change="Mon Jul 20 09:09:37 2020" last-run="Mon Jul 20 09:09:37 2020" exec-time="548ms" queue-time="0ms" rc="0" rc_text="ok" />
+                <operation_history call="57" task="monitor" interval="10000ms" last-rc-change="Mon Jul 20 09:09:38 2020" exec-time="534ms" queue-time="0ms" rc="0" rc_text="ok" />
+            </resource_history>
+            <resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
+                <operation_history call="64" task="probe" last-rc-change="Mon Jul 20 09:09:40 2020" last-run="Mon Jul 20 09:09:40 2020" exec-time="554ms" queue-time="0ms" rc="0" rc_text="ok" />
+                <operation_history call="64" task="probe" last-rc-change="Mon Jul 20 09:09:40 2020" last-run="Mon Jul 20 09:09:40 2020" exec-time="554ms" queue-time="0ms" rc="0" rc_text="ok" />
+                <operation_history call="70" task="monitor" interval="5000ms" last-rc-change="Mon Jul 20 09:09:41 2020" exec-time="804ms" queue-time="0ms" rc="0" rc_text="ok" />
+            </resource_history>
+        </node>
+        <node name="juju-2eebcf-0">
+            <resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
+                <operation_history call="80" task="start" last-rc-change="Mon Jul 20 09:09:41 2020" last-run="Mon Jul 20 09:09:41 2020" exec-time="824ms" queue-time="0ms" rc="0" rc_text="ok" />
+                <operation_history call="82" task="monitor" interval="5000ms" last-rc-change="Mon Jul 20 09:09:42 2020" exec-time="534ms" queue-time="0ms" rc="0" rc_text="ok" />
+            </resource_history>
+            <resource_history id="res_ks_0dd3a53_vip" orphan="false" migration-threshold="1000000">
+                <operation_history call="79" task="probe" last-rc-change="Mon Jul 20 09:09:41 2020" last-run="Mon Jul 20 09:09:41 2020" exec-time="753ms" queue-time="0ms" rc="0" rc_text="ok" />
+                <operation_history call="79" task="probe" last-rc-change="Mon Jul 20 09:09:41 2020" last-run="Mon Jul 20 09:09:41 2020" exec-time="753ms" queue-time="0ms" rc="0" rc_text="ok" />
+                <operation_history call="81" task="monitor" interval="10000ms" last-rc-change="Mon Jul 20 09:09:42 2020" exec-time="542ms" queue-time="0ms" rc="0" rc_text="ok" />
+            </resource_history>
+        </node>
+        <node name="juju-2eebcf-3">
+            <resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
+                <operation_history call="62" task="probe" last-rc-change="Mon Jul 20 09:09:40 2020" last-run="Mon Jul 20 09:09:40 2020" exec-time="547ms" queue-time="0ms" rc="0" rc_text="ok" />
+                <operation_history call="62" task="probe" last-rc-change="Mon Jul 20 09:09:40 2020" last-run="Mon Jul 20 09:09:40 2020" exec-time="547ms" queue-time="0ms" rc="0" rc_text="ok" />
+                <operation_history call="68" task="monitor" interval="5000ms" last-rc-change="Mon Jul 20 09:09:41 2020" exec-time="751ms" queue-time="0ms" rc="0" rc_text="ok" />
+            </resource_history>
+        </node>
+    </node_history>
+    <tickets>
+    </tickets>
+    <bans>
+    </bans>
+</crm_mon>