diff --git a/actions.yaml b/actions.yaml index e269c2a..dd4e72a 100644 --- a/actions.yaml +++ b/actions.yaml @@ -22,11 +22,26 @@ status: type: boolean description: Show cluster status history update-ring: - description: Trigger corosync node members cleanup + description: | + Trigger corosync node members cleanup. + + WARNING This action updates the corosync cluster members by adding or + removing nodes, which may lead to a loss of quorum and other unexpected + side-effects. It is strongly encouraged to manually remove nodes + individually using the delete-node-from-ring action. params: i-really-mean-it: type: boolean description: | - This must be toggled to enable actually performing this action + This must be toggled to enable actually performing this action. required: - i-really-mean-it +delete-node-from-ring: + description: Delete a node from the corosync ring. Must be run on the hacluster leader node. + params: + node: + type: string + description: | + Node name to be removed. i.e. hostname of the node + required: + - node diff --git a/actions/actions.py b/actions/actions.py index c369842..19a4b25 100755 --- a/actions/actions.py +++ b/actions/actions.py @@ -110,23 +110,7 @@ def cleanup(args): "'{}'".format(resource_name)) -def update_ring(args): - """Update corosync.conf list of nodes (generally after unit removal).""" - if not action_get('i-really-mean-it'): - action_fail('i-really-mean-it is a required parameter') - return - - if not is_leader(): - action_fail('only the Juju leader can run this action') - return - - diff_nodes = update_node_list() - if not diff_nodes: - # No differences between discovered Pacemaker nodes and - # Juju nodes (ie. no node removal) - action_set({'result': 'noop'}) - return - +def _trigger_corosync_update(): # Trigger emit_corosync_conf() and corosync-cfgtool -R # for all the hanode peer units to run relid = relation_ids('hanode') @@ -144,11 +128,66 @@ def update_ring(args): emit_corosync_conf()): cmd = 'corosync-cfgtool -R' pcmk.commit(cmd) - action_set({'result': 'success'}) + + +def update_ring(args): + """Update corosync.conf list of nodes (generally after unit removal).""" + if not function_get('i-really-mean-it'): + function_fail('i-really-mean-it is a required parameter') + return + + if not is_leader(): + function_fail('only the Juju leader can run this action') + return + + diff_nodes = update_node_list() + log("Unexpected node(s) found and removed: {}" + .format(",".join(list(diff_nodes)))) + if not diff_nodes: + # No differences between discovered Pacemaker nodes and + # Juju nodes (ie. no node removal) + function_set({'result': 'No changes required.'}) + return + + # Notify the cluster + _trigger_corosync_update() + + function_set( + {"result": + "Nodes removed: {}" + .format(" ".join(list(diff_nodes)))}) + + +def delete_node_from_ring(args): + """Delete a node from the corosync ring.""" + + node = function_get('node') + if not node: + function_fail('node is a required parameter') + return + + if not is_leader(): + function_fail('only the Juju leader can run this action') + return + + # Delete the node from the live corosync env + try: + pcmk.set_node_status_to_maintenance(node) + pcmk.delete_node(node, failure_is_fatal=True) + except subprocess.CalledProcessError as e: + function_fail( + "Removing {} from the cluster failed. {} output={}" + .format(node, e, e.output)) + + # Notify the cluster + _trigger_corosync_update() + + function_set({'result': 'success'}) ACTIONS = {"pause": pause, "resume": resume, "status": status, "cleanup": cleanup, + "delete-node-from-ring": delete_node_from_ring, "update-ring": update_ring} diff --git a/actions/delete-node-from-ring b/actions/delete-node-from-ring new file mode 120000 index 0000000..405a394 --- /dev/null +++ b/actions/delete-node-from-ring @@ -0,0 +1 @@ +actions.py \ No newline at end of file diff --git a/hooks/hooks.py b/hooks/hooks.py index 8af82b7..5704116 100755 --- a/hooks/hooks.py +++ b/hooks/hooks.py @@ -18,7 +18,6 @@ import copy import glob import os import shutil -import socket import sys import traceback @@ -598,11 +597,11 @@ def ha_relation_changed(): @hooks.hook() def stop(): - # NOTE(lourot): This seems to always fail with - # 'ERROR: node not found in the CIB', which means that the node - # has already been removed from the cluster. Thus failure_is_fatal=False. - # We might consider getting rid of this call. - pcmk.delete_node(socket.gethostname(), failure_is_fatal=False) + # The pcmk.delete_node handles several known failure modes so + # failure_is_fatal=True actually helps as it causes retries. + node = get_hostname() + pcmk.set_node_status_to_maintenance(node) + pcmk.delete_node(node, failure_is_fatal=True) apt_purge(['corosync', 'pacemaker'], fatal=True) diff --git a/hooks/utils.py b/hooks/utils.py index ab6c6bb..155b6c9 100644 --- a/hooks/utils.py +++ b/hooks/utils.py @@ -1553,19 +1553,32 @@ def is_stonith_configured(): return bool_from_string(configured) +def get_hanode_hostnames(): + """Hostnames of nodes in the hanode relation. + + :returns: List of hostnames of nodes in the hanode relation. + :rtype: List + """ + hanode_hostnames = [get_hostname()] + for relid in relation_ids('hanode'): + for unit in related_units(relid): + hostname = relation_get('hostname', rid=relid, unit=unit) + if hostname: + hanode_hostnames.append(hostname) + + hanode_hostnames.sort() + return hanode_hostnames + + def update_node_list(): - """Delete a node from the corosync ring when a Juju unit is removed. + """Determine and delete unexpected nodes from the corosync ring. :returns: Set of pcmk nodes not part of Juju hanode relation :rtype: Set[str] :raises: RemoveCorosyncNodeFailed """ pcmk_nodes = set(pcmk.list_nodes()) - juju_nodes = {socket.gethostname()} - juju_hanode_rel = get_ha_nodes() - for corosync_id, addr in juju_hanode_rel.items(): - peer_node_name = utils.get_hostname(addr, fqdn=False) - juju_nodes.add(peer_node_name) + juju_nodes = set(get_hanode_hostnames()) diff_nodes = pcmk_nodes.difference(juju_nodes) log("pcmk_nodes[{}], juju_nodes[{}], diff[{}]" diff --git a/unit_tests/test_hacluster_utils.py b/unit_tests/test_hacluster_utils.py index f80ab06..3e38a4e 100644 --- a/unit_tests/test_hacluster_utils.py +++ b/unit_tests/test_hacluster_utils.py @@ -1317,3 +1317,53 @@ class UtilsTestCase(unittest.TestCase): 'hanode:0', ), ) + + @mock.patch.object(utils, 'get_hostname') + @mock.patch.object(utils, 'relation_get') + @mock.patch.object(utils, 'related_units') + @mock.patch.object(utils, 'relation_ids') + def test_get_hanode_hostnames( + self, _relation_ids, _related_units, _relation_get, _get_hostname): + _data = { + "node/1": {"hostname": "beta"}, + "node/3": {"hostname": "alpha"}, + } + + def _rel_data(_param, unit, rid): + return _data[unit][_param] + + _relation_ids.return_value = ["hanode:1"] + _related_units.return_value = ["node/1", "node/3"] + _relation_get.side_effect = _rel_data + _get_hostname.return_value = "gamma" + _expected = ["alpha", "beta", "gamma"] + + self.assertEqual( + utils.get_hanode_hostnames(), + _expected) + + @mock.patch.object(utils, 'get_hanode_hostnames') + @mock.patch.object(utils, 'pcmk') + def test_update_node_list(self, _pcmk, _get_hanode_hostnames): + _pcmk.list_nodes.return_value = ["zeta", "beta", "psi"] + _get_hanode_hostnames.return_value = ["alpha", "beta", "gamma"] + _expected = set(["zeta", "psi"]) + + self.assertEqual( + utils.update_node_list(), + _expected) + + _pcmk.set_node_status_to_maintenance.assert_has_calls( + [mock.call('zeta'), + mock.call('psi')], + any_order=True) + _pcmk.delete_node.assert_has_calls( + [mock.call('zeta'), + mock.call('psi')], + any_order=True) + + # Raise RemoveCorosyncNodeFailed + _pcmk.delete_node.side_effect = subprocess.CalledProcessError( + 127, "fake crm command") + with self.assertRaises(utils.RemoveCorosyncNodeFailed): + utils.update_node_list()