diff --git a/README.md b/README.md
index 3dca3c8..17aa49b 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,25 @@ requirements and resources available.
 HA/Clustering
 -------------
 
+When more than one unit of the charm is deployed with the hacluster charm,
+the percona charm will bring up an Active/Active cluster. The process of
+clustering the units together takes some time. Due to the asynchronous
+nature of hook execution, it is possible for client relation hooks to
+execute before the cluster is complete. In some cases, this can lead to
+client charm errors.
+
+To guarantee that client relation hooks will not execute until clustering
+is complete, use the min-cluster-size configuration setting:
+
+    juju deploy -n 3 percona-cluster
+    juju config percona-cluster min-cluster-size=3
+
+When min-cluster-size is not set the charm will still cluster; however,
+there is no guarantee that client relation hooks will not execute before
+clustering is complete.
+
+Single unit deployments behave as expected.
+
 There are two mutually exclusive high availability options: using virtual
 IP(s) or DNS. In both cases, a relationship to hacluster is required which
 provides the corosync back end HA functionality.
diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py
index d9bba37..2758a14 100755
--- a/hooks/percona_hooks.py
+++ b/hooks/percona_hooks.py
@@ -10,7 +10,6 @@ from charmhelpers.core.hookenv import (
     Hooks, UnregisteredHookError,
     is_relation_made,
     log,
-    local_unit,
     relation_get,
     relation_set,
     relation_id,
@@ -26,6 +25,7 @@ from charmhelpers.core.hookenv import (
     is_leader,
     network_get_primary_address,
     charm_name,
+    leader_get,
 )
 from charmhelpers.core.host import (
     service_restart,
@@ -50,9 +50,7 @@ from charmhelpers.contrib.database.mysql import (
 from charmhelpers.contrib.hahelpers.cluster import (
     is_elected_leader,
     is_clustered,
-    oldest_peer,
     DC_RESOURCE_NAME,
-    peer_units,
     get_hacluster_config,
 )
 from charmhelpers.payload.execd import execd_preinstall
@@ -87,7 +85,6 @@ from percona_utils import (
     get_db_helper,
     mark_seeded, seeded,
     install_mysql_ocf,
-    is_sufficient_peers,
     notify_bootstrapped,
     is_bootstrapped,
     get_wsrep_value,
@@ -97,6 +94,8 @@ from percona_utils import (
     create_binlogs_directory,
     bootstrap_pxc,
     get_cluster_host_ip,
+    client_node_is_ready,
+    leader_node_is_ready,
 )
 
 
@@ -228,23 +227,19 @@ def render_config_restart_on_changed(clustered, hosts, bootstrap=False):
 
 
 def update_shared_db_rels():
-    for r_id in relation_ids('shared-db'):
-        for unit in related_units(r_id):
-            shared_db_changed(r_id, unit)
+    """Update client shared-db relations IFF ready.
+    """
+    if leader_node_is_ready() or client_node_is_ready():
+        for r_id in relation_ids('shared-db'):
+            for unit in related_units(r_id):
+                shared_db_changed(r_id, unit)
 
 
 @hooks.hook('upgrade-charm')
 @harden()
 def upgrade():
-    check_bootstrap = False
-    try:
-        if is_leader():
-            check_bootstrap = True
-    except:
-        if oldest_peer(peer_units()):
-            check_bootstrap = True
-
-    if check_bootstrap and not is_bootstrapped() and is_sufficient_peers():
+    if leader_node_is_ready():
         # If this is the leader but we have not yet broadcast the cluster uuid
         # then do so now.
         wsrep_ready = get_wsrep_value('wsrep_ready') or ""
@@ -276,35 +271,17 @@ def config_changed():
     # applies if min-cluster-size is provided and is used to avoid extraneous
     # configuration changes and premature bootstrapping as the cluster is
     # deployed.
-    if is_sufficient_peers():
-        try:
-            # NOTE(jamespage): try with leadership election
-            if is_leader():
-                log("Leader unit - bootstrap required=%s" % (not bootstrapped),
-                    DEBUG)
-                render_config_restart_on_changed(clustered, hosts,
-                                                 bootstrap=not bootstrapped)
-            elif bootstrapped:
-                log("Cluster is bootstrapped - configuring mysql on this node",
-                    DEBUG)
-                render_config_restart_on_changed(clustered, hosts)
-            else:
-                log("Not configuring", DEBUG)
-
-        except NotImplementedError:
-            # NOTE(jamespage): fallback to legacy behaviour.
-            oldest = oldest_peer(peer_units())
-            if oldest:
-                log("Leader unit - bootstrap required=%s" % (not bootstrapped),
-                    DEBUG)
-                render_config_restart_on_changed(clustered, hosts,
-                                                 bootstrap=not bootstrapped)
-            elif bootstrapped:
-                log("Cluster is bootstrapped - configuring mysql on this node",
-                    DEBUG)
-                render_config_restart_on_changed(clustered, hosts)
-            else:
-                log("Not configuring", DEBUG)
+    if is_leader():
+        log("Leader unit - bootstrap required=%s" % (not bootstrapped),
+            DEBUG)
+        render_config_restart_on_changed(clustered, hosts,
+                                         bootstrap=not bootstrapped)
+    elif bootstrapped:
+        log("Cluster is bootstrapped - configuring mysql on this node",
+            DEBUG)
+        render_config_restart_on_changed(clustered, hosts)
+    else:
+        log("Not configuring", DEBUG)
 
     # Notify any changes to the access network
     update_shared_db_rels()
@@ -336,7 +313,7 @@ def cluster_joined():
         relation_set(relation_settings=relation_settings)
 
     # Ensure all new peers are aware
-    cluster_state_uuid = relation_get('bootstrap-uuid', unit=local_unit())
+    cluster_state_uuid = leader_get('bootstrap-uuid')
     if cluster_state_uuid:
         notify_bootstrapped(cluster_rid=relation_id(),
                             cluster_uuid=cluster_state_uuid)
@@ -486,7 +463,7 @@ def shared_db_changed(relation_id=None, unit=None):
             "until bootstrapped", DEBUG)
         return
 
-    if not is_elected_leader(DC_RESOURCE_NAME):
+    if not is_leader() and client_node_is_ready():
         # NOTE(jamespage): relation level data candidate
         log('Service is peered, clearing shared-db relation '
             'as this service unit is not the leader')
@@ -504,6 +481,10 @@ def shared_db_changed(relation_id=None, unit=None):
         relation_set(relation_id=rel_id, **peerdb_settings)
         return
 
+    # Bail if leader is not ready
+    if not leader_node_is_ready():
+        return
+
     settings = relation_get(unit=unit, rid=relation_id)
     access_network = config('access-network')
     db_helper = get_db_helper()
@@ -720,6 +701,7 @@ def main():
         hooks.execute(sys.argv)
     except UnregisteredHookError as e:
         log('Unknown hook {} - skipping.'.format(e))
+    update_shared_db_rels()
     assess_status(register_configs())
 
 
diff --git a/hooks/percona_utils.py b/hooks/percona_utils.py
index 62d5574..c7c22b7 100644
--- a/hooks/percona_utils.py
+++ b/hooks/percona_utils.py
@@ -31,6 +31,9 @@ from charmhelpers.core.hookenv import (
     status_set,
     network_get_primary_address,
     application_version_set,
+    is_leader,
+    leader_get,
+    leader_set,
 )
 from charmhelpers.fetch import (
     apt_install,
@@ -48,6 +51,7 @@ from charmhelpers.contrib.openstack.utils import (
     make_assess_status_func,
     pause_unit,
     resume_unit,
+    is_unit_paused_set,
 )
 
 # NOTE: python-mysqldb is installed by charmhelpers.contrib.database.mysql so
@@ -128,26 +132,38 @@ def resolve_hostname_to_ip(hostname):
 
 
 def is_sufficient_peers():
-    """If min-cluster-size has been provided, check that we have sufficient
-    number of peers to proceed with bootstrapping percona cluster.
+ """Sufficient number of expected peers to build a complete cluster + + If min-cluster-size has been provided, check that we have sufficient + number of peers as expected for a complete cluster. + + If not defined assume a single unit. + + @returns boolean """ + min_size = config('min-cluster-size') if min_size: - size = 0 - for rid in relation_ids('cluster'): - size = len(related_units(rid)) + log("Checking for minimum of {} peer units".format(min_size), + level=DEBUG) # Include this unit - size += 1 - if min_size > size: - log("Insufficient number of units to configure percona cluster " - "(expected=%s, got=%s)" % (min_size, size), level=INFO) + units = 1 + for rid in relation_ids('cluster'): + units += len(related_units(rid)) + + if units < min_size: + log("Insufficient number of peer units to form cluster " + "(expected=%s, got=%s)" % (min_size, units), level=INFO) return False else: - log("Sufficient units available to configure percona cluster " - "(>=%s)" % (min_size), level=DEBUG) - - return True + log("Sufficient number of peer units to form cluster {}" + "".format(min_size, level=DEBUG)) + return True + else: + log("min-cluster-size is not defined, race conditions may occur if " + "this is not a single unit deployment.", level=WARNING) + return True def get_cluster_hosts(): @@ -339,9 +355,10 @@ def get_wsrep_value(key): def is_bootstrapped(): - if not is_sufficient_peers(): - return False + """ Check that this unit is bootstrapped + @returns boolean + """ uuids = [] rids = relation_ids('cluster') or [] for rid in rids: @@ -402,6 +419,8 @@ def notify_bootstrapped(cluster_rid=None, cluster_uuid=None): (cluster_uuid), DEBUG) for rid in rids: relation_set(relation_id=rid, **{'bootstrap-uuid': cluster_uuid}) + if is_leader(): + leader_set(**{'bootstrap-uuid': cluster_uuid}) def cluster_in_sync(): @@ -583,3 +602,94 @@ def get_cluster_host_ip(): ) return cluster_addr + + +def cluster_ready(): + """Determine if each node in the cluster is ready and the cluster is + complete with the expected number of peers. + + Once cluster_ready returns True it is safe to execute client relation + hooks. Having min-cluster-size set will guarantee cluster_ready will not + return True until the expected number of peers are clustered and ready. + + If min-cluster-size is not set it must assume the cluster is ready in order + to allow for single unit deployments. 
+
+    @returns boolean
+    """
+    min_size = config('min-cluster-size')
+    units = 1
+    for relation_id in relation_ids('cluster'):
+        units += len(related_units(relation_id))
+    if not min_size:
+        min_size = units
+
+    if not is_sufficient_peers():
+        return False
+    elif min_size > 1:
+        uuids = []
+        for relation_id in relation_ids('cluster'):
+            units = related_units(relation_id) or []
+            units.append(local_unit())
+            for unit in units:
+                # A unit is only clustered once it has published
+                # bootstrap-uuid on the cluster relation.
+                bootstrap_uuid = relation_get(attribute='bootstrap-uuid',
+                                              rid=relation_id,
+                                              unit=unit)
+                if not bootstrap_uuid:
+                    log("{} is not yet clustered".format(unit), DEBUG)
+                    return False
+                uuids.append(bootstrap_uuid)
+
+        if len(uuids) < min_size:
+            log("Fewer than the minimum cluster size ({}) of percona units "
+                "reporting clustered".format(min_size), DEBUG)
+            return False
+        elif len(set(uuids)) > 1:
+            raise Exception("Found inconsistent bootstrap uuids: "
+                            "{}".format(uuids))
+        else:
+            log("All {} percona units reporting clustered".format(min_size),
+                DEBUG)
+            return True
+
+    log("Assuming this is a single unit deployment; reporting cluster ready",
+        DEBUG)
+    return True
+
+
+def client_node_is_ready():
+    """Determine if the client node is ready: the cluster is complete and
+    the leader node has set the shared-db client data.
+
+    @returns boolean
+    """
+    # Bail if this unit is paused
+    if is_unit_paused_set():
+        return False
+    if not cluster_ready():
+        return False
+    # The leader publishes a password per shared-db relation once access
+    # has been configured for that client.
+    for rid in relation_ids('shared-db'):
+        if leader_get(attribute='{}_password'.format(rid)):
+            return True
+    return False
+
+
+def leader_node_is_ready():
+    """Determine if the leader node is ready to handle client relationship
+    hooks.
+
+    True IFF percona is not paused, this is the leader node, and the
+    cluster is complete.
+
+    @returns boolean
+    """
+    # Paused check must run before other checks
+    # Bail if this unit is paused
+    if is_unit_paused_set():
+        return False
+    return (is_leader() and cluster_ready())
diff --git a/unit_tests/test_percona_hooks.py b/unit_tests/test_percona_hooks.py
index 4e1d1e2..2181750 100644
--- a/unit_tests/test_percona_hooks.py
+++ b/unit_tests/test_percona_hooks.py
@@ -22,7 +22,6 @@ TO_PATCH = ['log', 'config',
             'get_iface_for_address',
             'get_netmask_for_address',
             'is_bootstrapped',
-            'is_sufficient_peers',
             'network_get_primary_address',
             'resolve_network_cidr',
             'unit_get',
diff --git a/unit_tests/test_percona_utils.py b/unit_tests/test_percona_utils.py
index a584f3c..61d10f6 100644
--- a/unit_tests/test_percona_utils.py
+++ b/unit_tests/test_percona_utils.py
@@ -10,6 +10,8 @@ import percona_utils
 
 from test_utils import CharmTestCase
 
+os.environ['JUJU_UNIT_NAME'] = 'percona-cluster/2'
+
 
 class UtilsTests(unittest.TestCase):
     def setUp(self):
@@ -176,22 +178,28 @@ class UtilsTests(unittest.TestCase):
                                             mock.call(rid=88, unit=2)])
         self.assertEqual(hosts, ['10.100.0.1', '10.100.0.2', '10.100.0.3'])
 
-    @mock.patch.object(percona_utils, 'log', lambda *args, **kwargs: None)
+    @mock.patch.object(percona_utils, 'is_leader')
     @mock.patch.object(percona_utils, 'related_units')
    @mock.patch.object(percona_utils, 'relation_ids')
     @mock.patch.object(percona_utils, 'config')
     def test_is_sufficient_peers(self, mock_config,
                                  mock_relation_ids,
-                                 mock_related_units):
+                                 mock_related_units, mock_is_leader):
+        mock_is_leader.return_value = False
         _config = {'min-cluster-size': None}
         mock_config.side_effect = lambda key: _config.get(key)
         self.assertTrue(percona_utils.is_sufficient_peers())
 
+        mock_is_leader.return_value = False
         mock_relation_ids.return_value = ['cluster:0']
         mock_related_units.return_value = ['test/0']
         _config = {'min-cluster-size': 3}
+        mock_config.side_effect = lambda key: _config.get(key)
         self.assertFalse(percona_utils.is_sufficient_peers())
 
+        mock_is_leader.return_value = False
         mock_related_units.return_value = ['test/0', 'test/1']
+        _config = {'min-cluster-size': 3}
+        mock_config.side_effect = lambda key: _config.get(key)
         self.assertTrue(percona_utils.is_sufficient_peers())
 
     @mock.patch.object(percona_utils, 'lsb_release')
@@ -235,15 +243,20 @@ class UtilsTests(unittest.TestCase):
 
 
 TO_PATCH = [
-    # 'status_set',
     'is_sufficient_peers',
     'is_bootstrapped',
     'config',
     'cluster_in_sync',
+    'is_leader',
+    'related_units',
+    'relation_ids',
+    'relation_get',
+    'leader_get',
+    'is_unit_paused_set',
 ]
 
 
-class TestAssessStatus(CharmTestCase):
+class UtilsTestsCTC(CharmTestCase):
     def setUp(self):
         CharmTestCase.setUp(self, percona_utils, TO_PATCH)
@@ -335,3 +348,102 @@
         asf.assert_called_once_with('some-config')
         # ports=None whilst port checks are disabled.
         f.assert_called_once_with('assessor', services='s1', ports=None)
+
+    @mock.patch.object(percona_utils, 'is_sufficient_peers')
+    def test_cluster_ready(self, mock_is_sufficient_peers):
+
+        # Not sufficient number of peers
+        mock_is_sufficient_peers.return_value = False
+        self.assertFalse(percona_utils.cluster_ready())
+
+        # Not all cluster ready
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = ['cluster:0']
+        self.related_units.return_value = ['test/0', 'test/1']
+        self.relation_get.return_value = False
+        _config = {'min-cluster-size': 3}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertFalse(percona_utils.cluster_ready())
+
+        # All cluster ready
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = ['cluster:0']
+        self.related_units.return_value = ['test/0', 'test/1']
+        self.relation_get.return_value = 'UUID'
+        _config = {'min-cluster-size': 3}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertTrue(percona_utils.cluster_ready())
+
+        # Not all cluster ready no min-cluster-size
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = ['cluster:0']
+        self.related_units.return_value = ['test/0', 'test/1']
+        self.relation_get.return_value = False
+        _config = {'min-cluster-size': None}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertFalse(percona_utils.cluster_ready())
+
+        # All cluster ready no min-cluster-size
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = ['cluster:0']
+        self.related_units.return_value = ['test/0', 'test/1']
+        self.relation_get.return_value = 'UUID'
+        _config = {'min-cluster-size': None}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertTrue(percona_utils.cluster_ready())
+
+        # Assume single unit no min-cluster-size
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = []
+        self.related_units.return_value = []
+        self.relation_get.return_value = None
+        _config = {'min-cluster-size': None}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertTrue(percona_utils.cluster_ready())
+
+    @mock.patch.object(percona_utils, 'cluster_ready')
+    def test_client_node_is_ready(self, mock_cluster_ready):
+        # Paused
+        self.is_unit_paused_set.return_value = True
+        self.assertFalse(percona_utils.client_node_is_ready())
+
+        # Cluster not ready
+        mock_cluster_ready.return_value = False
+        self.is_unit_paused_set.return_value = False
+        self.assertFalse(percona_utils.client_node_is_ready())
+
+        # Not ready
+        self.is_unit_paused_set.return_value = False
+        mock_cluster_ready.return_value = True
+        self.relation_ids.return_value = ['shared-db:0']
+        self.leader_get.return_value = {}
+        self.assertFalse(percona_utils.client_node_is_ready())
+
+        # Ready
+        self.is_unit_paused_set.return_value = False
+        mock_cluster_ready.return_value = True
+        self.relation_ids.return_value = ['shared-db:0']
+        self.leader_get.return_value = {'shared-db:0_password': 'password'}
+        self.assertTrue(percona_utils.client_node_is_ready())
+
+    @mock.patch.object(percona_utils, 'cluster_ready')
+    def test_leader_node_is_ready(self, mock_cluster_ready):
+        # Paused
+        self.is_unit_paused_set.return_value = True
+        self.assertFalse(percona_utils.leader_node_is_ready())
+
+        # Not leader
+        self.is_unit_paused_set.return_value = False
+        self.is_leader.return_value = False
+        self.assertFalse(percona_utils.leader_node_is_ready())
+
+        # Not cluster ready
+        self.is_unit_paused_set.return_value = False
+        self.is_leader.return_value = True
+        mock_cluster_ready.return_value = False
+        self.assertFalse(percona_utils.leader_node_is_ready())
+
+        # Leader ready
+        self.is_unit_paused_set.return_value = False
+        self.is_leader.return_value = True
+        mock_cluster_ready.return_value = True
+        self.assertTrue(percona_utils.leader_node_is_ready())
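
Note for reviewers: the readiness rule that cluster_ready() implements can be
summarised in a small standalone sketch. The snippet below is illustrative
only (cluster_is_ready and the dict of peer uuids are not charm APIs); it
replaces the charmhelpers relation and config calls with plain values so the
consensus logic can be exercised outside a Juju hook: at least
min-cluster-size units must all report the same bootstrap-uuid before client
relation hooks are allowed to run.

    # Illustrative sketch of the consensus rule in cluster_ready().
    def cluster_is_ready(min_cluster_size, bootstrap_uuids):
        """bootstrap_uuids maps unit name -> bootstrap-uuid (None if unset)."""
        units = len(bootstrap_uuids)
        # Without min-cluster-size, fall back to the units currently related,
        # which lets a single unit report ready immediately.
        min_size = min_cluster_size or units

        if units < min_size:
            return False  # insufficient peers (is_sufficient_peers() fails)

        if min_size > 1:
            uuids = [u for u in bootstrap_uuids.values() if u]
            if len(uuids) < min_size:
                return False  # some unit has not yet clustered
            if len(set(uuids)) > 1:
                raise Exception("Found inconsistent bootstrap uuids: "
                                "{}".format(uuids))
        return True

    # A 3-unit deployment with min-cluster-size=3 is not ready until every
    # unit has published the shared bootstrap uuid.
    peers = {'percona-cluster/0': 'uuid-a',
             'percona-cluster/1': 'uuid-a',
             'percona-cluster/2': None}
    assert not cluster_is_ready(3, peers)
    peers['percona-cluster/2'] = 'uuid-a'
    assert cluster_is_ready(3, peers)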