From b8c2213dfbd4ae417be95a8ce1b1c973eee9e55c Mon Sep 17 00:00:00 2001 From: David Ames Date: Fri, 12 Jul 2019 16:16:46 -0700 Subject: [PATCH] Notify bootstrapped action It turns out a subsequent required step after a cold boot bootstrap is notifying the cluster of the new bootstrap UUID. The notify-bootstrapped action should be run on a different node than the one which ran the bootstrap-pxc action. This action will ensure the cluster converges on the correct bootstrap UUID. A subsequent patch stacked on this one will include tests for the new cold boot actions. Change-Id: Idee12d5f7e28498c5ab6ccb9605f751c6427ac30 Partial-Bug: #1744393 --- actions.yaml | 4 +- actions/actions.py | 33 +++++++++-- actions/notify-bootstrapped | 1 + hooks/percona_hooks.py | 10 +++- hooks/percona_utils.py | 97 +++++++++++++++++++++----------- unit_tests/test_percona_utils.py | 55 +++++++++++++++--- 6 files changed, 154 insertions(+), 46 deletions(-) create mode 120000 actions/notify-bootstrapped diff --git a/actions.yaml b/actions.yaml index 1adc042..7a5ae21 100644 --- a/actions.yaml +++ b/actions.yaml @@ -29,6 +29,8 @@ bootstrap-pxc: Bootstrap this unit of Percona. *WARNING* This action will bootstrap this unit of Percona cluster. This should only occur in a recovery scenario. Make sure this unit has the - highest sequence number in grstate.dat or data loss may occur. + highest sequence number in grastate.dat or data loss may occur. See upstream Percona documentation for context https://www.percona.com/blog/2014/09/01/galera-replication-how-to-recover-a-pxc-cluster/ +notify-bootstrapped: + descripttion: Notify the cluster of the new bootstrap uuid. diff --git a/actions/actions.py b/actions/actions.py index 4c8d561..dcb3340 100755 --- a/actions/actions.py +++ b/actions/actions.py @@ -112,16 +112,28 @@ def backup(args): def bootstrap_pxc(args): + """ Force a bootstrap on this node + + This action will run bootstrap-pxc on this node bootstrapping the cluster. + This action should only be run after a cold start requiring a bootstrap. + This action should only be run on the node with the highest sequence number + as displayed in workgoup status and found in grastate.dat. + If this unit has the highest sequence number and is not the juju leader + node, a subsequent action run of notify-bootstrapped is required. + """ + try: # Force safe to bootstrap - percona_utils.set_grstate_safe_to_bootstrap() + percona_utils.set_grastate_safe_to_bootstrap() # Boostrap this node percona_utils.bootstrap_pxc() - except (percona_utils.GRStateFileNotFound, OSError) as e: + percona_utils.notify_bootstrapped() + except (percona_utils.GRAStateFileNotFound, OSError) as e: action_set({ 'output': e.output, 'return-code': e.returncode}) - action_fail("The GRState file does not exist or cannot be written to.") + action_fail("The GRAState file does not exist or cannot " + "be written to.") except (subprocess.CalledProcessError, Exception) as e: action_set({ 'output': e.output, @@ -130,16 +142,27 @@ def bootstrap_pxc(args): action_fail("The bootstrap-pxc failed. " "See traceback in show-action-output") action_set({ - 'output': "Bootstrap succeded. " + 'output': "Bootstrap succeeded. " "Wait for the other units to run update-status"}) percona_utils.assess_status(percona_utils.register_configs()) +def notify_bootstrapped(args): + """Notify the cluster of the new bootstrap cluster UUID. + + As a consequence of timing, this action will often need to be executed + after the bootstrap-pxc action. It will need to be run on a different unit + than was bootstrap-pxc was executed on. + """ + percona_utils.notify_bootstrapped() + + # A dictionary of all the defined actions to callables (which take # parsed arguments). ACTIONS = {"pause": pause, "resume": resume, "backup": backup, "complete-cluster-series-upgrade": complete_cluster_series_upgrade, - "bootstrap-pxc": bootstrap_pxc} + "bootstrap-pxc": bootstrap_pxc, + "notify-bootstrapped": notify_bootstrapped} def main(args): diff --git a/actions/notify-bootstrapped b/actions/notify-bootstrapped new file mode 120000 index 0000000..405a394 --- /dev/null +++ b/actions/notify-bootstrapped @@ -0,0 +1 @@ +actions.py \ No newline at end of file diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py index ef13e74..13df074 100755 --- a/hooks/percona_hooks.py +++ b/hooks/percona_hooks.py @@ -106,6 +106,7 @@ from percona_utils import ( get_db_helper, mark_seeded, seeded, install_mysql_ocf, + maybe_notify_bootstrapped, notify_bootstrapped, is_bootstrapped, clustered_once, @@ -145,6 +146,7 @@ from percona_utils import ( get_slave_status, delete_replication_user, list_replication_users, + check_mysql_connection, ) from charmhelpers.core.unitdata import kv @@ -331,7 +333,8 @@ def render_config_restart_on_changed(hosts, bootstrap=False): def update_client_db_relations(): """ Upate client db relations IFF ready """ - if leader_node_is_ready() or client_node_is_ready(): + if ((leader_node_is_ready() or + client_node_is_ready()) and check_mysql_connection()): for r_id in relation_ids('shared-db'): for unit in related_units(r_id): shared_db_changed(r_id, unit) @@ -633,6 +636,8 @@ def cluster_changed(): peer_echo(includes=inc_list) # NOTE(jamespage): deprecated - leader-election + maybe_notify_bootstrapped() + cluster_joined() config_changed() @@ -951,6 +956,9 @@ def ha_relation_changed(): @hooks.hook('leader-settings-changed') def leader_settings_changed(): '''Re-trigger install once leader has seeded passwords into install''' + + maybe_notify_bootstrapped() + config_changed() # NOTE(tkurek): re-set 'master' relation data if relation_ids('master'): diff --git a/hooks/percona_utils.py b/hooks/percona_utils.py index 6933b16..e2959d9 100644 --- a/hooks/percona_utils.py +++ b/hooks/percona_utils.py @@ -113,8 +113,8 @@ class DesyncedException(Exception): pass -class GRStateFileNotFound(Exception): - """Raised when the grstate file does not exist""" +class GRAStateFileNotFound(Exception): + """Raised when the grastate file does not exist""" pass @@ -548,8 +548,9 @@ def is_bootstrapped(): DEBUG) return False elif len(set(uuids)) > 1: - raise Exception("Found inconsistent bootstrap uuids: " - "{}".format((uuids))) + log("Found inconsistent bootstrap uuids: " + "{}".format(uuids), level=WARNING) + return False else: log("All {} percona units reporting clustered".format(min_size), DEBUG) @@ -693,7 +694,8 @@ def charm_check_func(): not check_mysql_connection()): return ('blocked', 'MySQL is down. Sequence Number: {}. Safe To Bootstrap: {}' - .format(get_grstate_seqno(), get_grstate_safe_to_bootstrap())) + .format(get_grastate_seqno(), + get_grastate_safe_to_bootstrap())) @retry_on_exception(num_retries=10, base_delay=2, @@ -1489,58 +1491,89 @@ def check_mysql_connection(): return False -def get_grstate_seqno(): +def get_grastate_seqno(): """Get GR State safe sequence number. - Read the grstate yaml file to determine the sequence number for this + Read the grastate yaml file to determine the sequence number for this instance. :returns: int Sequence Number """ - grstate_file = os.path.join(resolve_data_dir(), "grastate.dat") - if os.path.exists(grstate_file): - with open(grstate_file, 'r') as f: - grstate = yaml.safe_load(f) - return grstate.get("seqno") + grastate_file = os.path.join(resolve_data_dir(), "grastate.dat") + if os.path.exists(grastate_file): + with open(grastate_file, 'r') as f: + grastate = yaml.safe_load(f) + return grastate.get("seqno") -def get_grstate_safe_to_bootstrap(): +def get_grastate_safe_to_bootstrap(): """Get GR State safe to bootstrap. - Read the grstate yaml file to determine if it is safe to bootstrap from + Read the grastate yaml file to determine if it is safe to bootstrap from this instance. :returns: int Safe to bootstrap 0 or 1 """ - grstate_file = os.path.join(resolve_data_dir(), "grastate.dat") - if os.path.exists(grstate_file): - with open(grstate_file, 'r') as f: - grstate = yaml.safe_load(f) - return grstate.get("safe_to_bootstrap") + grastate_file = os.path.join(resolve_data_dir(), "grastate.dat") + if os.path.exists(grastate_file): + with open(grastate_file, 'r') as f: + grastate = yaml.safe_load(f) + return grastate.get("safe_to_bootstrap") -def set_grstate_safe_to_bootstrap(): +def set_grastate_safe_to_bootstrap(): """Set GR State safe to bootstrap. - Update the grstate yaml file to indicate it is safe to bootstrap from + Update the grastate yaml file to indicate it is safe to bootstrap from this instance. - :side effect: Writes the grstate.dat file. - :raises GRStateFileNotFound: If grstate.dat file does not exist. + :side effect: Writes the grastate.dat file. + :raises GRAStateFileNotFound: If grastate.dat file does not exist. :returns: None """ - grstate_file = os.path.join(resolve_data_dir(), "grastate.dat") - if not os.path.exists(grstate_file): - raise GRStateFileNotFound("{} file does not exist" - .format(grstate_file)) - with open(grstate_file, 'r') as f: - grstate = yaml.safe_load(f) + grastate_file = os.path.join(resolve_data_dir(), "grastate.dat") + if not os.path.exists(grastate_file): + raise GRAStateFileNotFound("{} file does not exist" + .format(grastate_file)) + with open(grastate_file, 'r') as f: + grastate = yaml.safe_load(f) # Force safe to bootstrap - grstate["safe_to_bootstrap"] = 1 + grastate["safe_to_bootstrap"] = 1 - with open(grstate_file, 'w') as f: - f.write(yaml.dump(grstate)) + with open(grastate_file, 'w') as f: + f.write(yaml.dump(grastate)) + + +def maybe_notify_bootstrapped(): + """Maybe notify bootstrapped. + + In the event of a subsequent bootstrap after deploy time, as in the case of + a cold start, it is necessary to re-notify the cluster relation of the new + bootstrap UUID. + + This function checks that the cluster has been clustered before and + notified clients, checks for agreement with the leader on the bootstrap + UUID and calls notify_bootstrapped to inform the cluster peers of the UUID. + + :side effect: calls kv() + :side effect: may call notify_bootstrapped() + :returns: None + """ + + if not check_mysql_connection(): + log("MySQL is down: deferring notify bootstrapped", DEBUG) + return + + kvstore = kv() + # Using INITIAL_CLIENT_UPDATE_KEY as this is a step beyond merely + # clustered, but rather clustered and clients were previously notified. + if kvstore.get(INITIAL_CLIENT_UPDATE_KEY, False): + # Handle a change of bootstrap UUID after cold start bootstrap + lead_cluster_state_uuid = leader_get('bootstrap-uuid') + cluster_state_uuid = get_wsrep_value('wsrep_cluster_state_uuid') + if lead_cluster_state_uuid == cluster_state_uuid: + notify_bootstrapped(cluster_uuid=cluster_state_uuid) diff --git a/unit_tests/test_percona_utils.py b/unit_tests/test_percona_utils.py index ac828f7..3602b8d 100644 --- a/unit_tests/test_percona_utils.py +++ b/unit_tests/test_percona_utils.py @@ -14,6 +14,8 @@ os.environ['JUJU_UNIT_NAME'] = 'percona-cluster/2' class UtilsTests(CharmTestCase): TO_PATCH = [ 'config', + 'kv', + 'leader_get', 'log', 'relation_ids', 'related_units', @@ -87,7 +89,6 @@ class UtilsTests(CharmTestCase): os.remove(tmpfile.name) self.assertEqual(len(lines), 5) - print("XXX", lines) self.assertEqual(lines[0], "#somedata\n") self.assertEqual(lines[1], "{} {}\n".format(list(_map.keys())[0], @@ -438,18 +439,18 @@ class UtilsTests(CharmTestCase): @mock.patch("percona_utils.resolve_data_dir") @mock.patch("percona_utils.os") - def test_get_grstate_seqno(self, _os, _resolve_dd): + def test_get_grastate_seqno(self, _os, _resolve_dd): _resolve_dd.return_value = "/tmp" _seqno = "25" _os.path.exists.return_value = True self.yaml.safe_load.return_value = {"seqno": _seqno} with patch_open() as (_open, _file): _open.return_value = _file - self.assertEqual(_seqno, percona_utils.get_grstate_seqno()) + self.assertEqual(_seqno, percona_utils.get_grastate_seqno()) @mock.patch("percona_utils.resolve_data_dir") @mock.patch("percona_utils.os") - def test_get_grstate_safe_to_bootstrap(self, _os, _resolve_dd): + def test_get_grastate_safe_to_bootstrap(self, _os, _resolve_dd): _resolve_dd.return_value = "/tmp" _bootstrap = "0" _os.path.exists.return_value = True @@ -457,11 +458,11 @@ class UtilsTests(CharmTestCase): with patch_open() as (_open, _file): _open.return_value = _file self.assertEqual( - _bootstrap, percona_utils.get_grstate_safe_to_bootstrap()) + _bootstrap, percona_utils.get_grastate_safe_to_bootstrap()) @mock.patch("percona_utils.resolve_data_dir") @mock.patch("percona_utils.os") - def test_set_grstate_safe_to_bootstrap(self, _os, _resolve_dd): + def test_set_grastate_safe_to_bootstrap(self, _os, _resolve_dd): _resolve_dd.return_value = "/tmp" _bootstrap = "0" _os.path.exists.return_value = True @@ -469,10 +470,50 @@ class UtilsTests(CharmTestCase): with patch_open() as (_open, _file): _open.return_value = _file _file.write = mock.MagicMock() - percona_utils.set_grstate_safe_to_bootstrap() + percona_utils.set_grastate_safe_to_bootstrap() self.yaml.dump.assert_called_once_with({"safe_to_bootstrap": 1}) _file.write.assert_called_once() + @mock.patch("percona_utils.check_mysql_connection") + @mock.patch("percona_utils.get_wsrep_value") + @mock.patch("percona_utils.notify_bootstrapped") + def test_maybe_notify_bootstrapped( + self, _notify_bootstrapped, + _get_wsrep_value, _check_mysql_connection): + kvstore = mock.MagicMock() + kvstore.get.return_value = True + self.kv.return_value = kvstore + + _check_mysql_connection.return_value = False + + _uuid = "uuid-uuid" + self.leader_get.return_value = _uuid + _get_wsrep_value.return_value = _uuid + + # mysql not runnig + percona_utils.maybe_notify_bootstrapped() + _notify_bootstrapped.assert_not_called() + + # No clients initialized + _check_mysql_connection.return_value = True + kvstore.get.return_value = False + percona_utils.maybe_notify_bootstrapped() + _notify_bootstrapped.assert_not_called() + + # Differing UUID + _check_mysql_connection.return_value = True + kvstore.get.return_value = True + _get_wsrep_value.return_value = "not-the-same-uuid" + percona_utils.maybe_notify_bootstrapped() + _notify_bootstrapped.assert_not_called() + + # Differing UUID + _check_mysql_connection.return_value = True + kvstore.get.return_value = True + _get_wsrep_value.return_value = _uuid + percona_utils.maybe_notify_bootstrapped() + _notify_bootstrapped.assert_called_once_with(cluster_uuid=_uuid) + class UtilsTestsStatus(CharmTestCase):