Wait until clustered before running client hooks

A Percona cluster takes some time to form completely. The charm
previously ran client shared-db-relation-changed hooks whenever they
were queued, even if the cluster was not yet complete. This can lead
to split-brain scenarios or other unexpected behavior.

This change confirms the entire cluster is ready before running
client shared-db-relation-changed hooks.

min-cluster-size can now be used to help guarantee the cluster is
ready with the expected number of nodes. If min-cluster-size is not
set, the charm will still determine, from the information available,
whether all the cluster nodes are ready. Single node deployments
remain possible.

Partial-Bug: #1655417
Change-Id: Ie9deb266a9682e86f3a9cbc1103b655b13a8295e
David Ames 2017-02-07 15:10:11 -08:00
parent 8815918715
commit 58d3641dee
5 changed files with 288 additions and 66 deletions

README.md

@@ -93,6 +93,25 @@ requirements and resources available.
 
 HA/Clustering
 -------------
+
+When more than one unit of the charm is deployed with the hacluster charm
+the percona charm will bring up an Active/Active cluster. The process of
+clustering the units together takes some time. Due to the nature of
+asynchronous hook execution, it is possible that client relation hooks
+may execute before the cluster is complete. In some cases, this can lead
+to client charm errors.
+
+To guarantee that client relation hooks will not execute until clustering
+is complete, use the min-cluster-size configuration setting:
+
+    juju deploy -n 3 percona-cluster
+    juju config percona-cluster min-cluster-size=3
+
+When min-cluster-size is not set the charm will still cluster; however,
+there is no guarantee that client relation hooks will not execute before
+clustering is complete.
+
+Single unit deployments behave as expected.
+
 There are two mutually exclusive high availability options: using virtual
 IP(s) or DNS. In both cases, a relationship to hacluster is required which
 provides the corosync back end HA functionality.

hooks/percona_hooks.py

@@ -10,7 +10,6 @@ from charmhelpers.core.hookenv import (
     Hooks, UnregisteredHookError,
     is_relation_made,
     log,
-    local_unit,
     relation_get,
     relation_set,
     relation_id,
@@ -26,6 +25,7 @@ from charmhelpers.core.hookenv import (
     is_leader,
     network_get_primary_address,
     charm_name,
+    leader_get,
 )
 from charmhelpers.core.host import (
     service_restart,
@@ -50,9 +50,7 @@ from charmhelpers.contrib.database.mysql import (
 from charmhelpers.contrib.hahelpers.cluster import (
     is_elected_leader,
     is_clustered,
-    oldest_peer,
     DC_RESOURCE_NAME,
-    peer_units,
     get_hacluster_config,
 )
 from charmhelpers.payload.execd import execd_preinstall
@@ -87,7 +85,6 @@ from percona_utils import (
     get_db_helper,
     mark_seeded, seeded,
     install_mysql_ocf,
-    is_sufficient_peers,
     notify_bootstrapped,
     is_bootstrapped,
     get_wsrep_value,
@@ -97,6 +94,8 @@ from percona_utils import (
     create_binlogs_directory,
     bootstrap_pxc,
     get_cluster_host_ip,
+    client_node_is_ready,
+    leader_node_is_ready,
 )
@@ -228,23 +227,19 @@ def render_config_restart_on_changed(clustered, hosts, bootstrap=False):
 def update_shared_db_rels():
-    for r_id in relation_ids('shared-db'):
-        for unit in related_units(r_id):
-            shared_db_changed(r_id, unit)
+    """Update client shared-db relations IFF ready
+    """
+    if leader_node_is_ready() or client_node_is_ready():
+        for r_id in relation_ids('shared-db'):
+            for unit in related_units(r_id):
+                shared_db_changed(r_id, unit)
 
 
 @hooks.hook('upgrade-charm')
 @harden()
 def upgrade():
-    check_bootstrap = False
-    try:
-        if is_leader():
-            check_bootstrap = True
-    except:
-        if oldest_peer(peer_units()):
-            check_bootstrap = True
-
-    if check_bootstrap and not is_bootstrapped() and is_sufficient_peers():
+    if leader_node_is_ready():
         # If this is the leader but we have not yet broadcast the cluster uuid
         # then do so now.
         wsrep_ready = get_wsrep_value('wsrep_ready') or ""
@@ -276,35 +271,17 @@ def config_changed():
     # applies if min-cluster-size is provided and is used to avoid extraneous
     # configuration changes and premature bootstrapping as the cluster is
     # deployed.
-    if is_sufficient_peers():
-        try:
-            # NOTE(jamespage): try with leadership election
-            if is_leader():
-                log("Leader unit - bootstrap required=%s" % (not bootstrapped),
-                    DEBUG)
-                render_config_restart_on_changed(clustered, hosts,
-                                                 bootstrap=not bootstrapped)
-            elif bootstrapped:
-                log("Cluster is bootstrapped - configuring mysql on this node",
-                    DEBUG)
-                render_config_restart_on_changed(clustered, hosts)
-            else:
-                log("Not configuring", DEBUG)
-        except NotImplementedError:
-            # NOTE(jamespage): fallback to legacy behaviour.
-            oldest = oldest_peer(peer_units())
-            if oldest:
-                log("Leader unit - bootstrap required=%s" % (not bootstrapped),
-                    DEBUG)
-                render_config_restart_on_changed(clustered, hosts,
-                                                 bootstrap=not bootstrapped)
-            elif bootstrapped:
-                log("Cluster is bootstrapped - configuring mysql on this node",
-                    DEBUG)
-                render_config_restart_on_changed(clustered, hosts)
-            else:
-                log("Not configuring", DEBUG)
+    if is_leader():
+        log("Leader unit - bootstrap required=%s" % (not bootstrapped),
+            DEBUG)
+        render_config_restart_on_changed(clustered, hosts,
+                                         bootstrap=not bootstrapped)
+    elif bootstrapped:
+        log("Cluster is bootstrapped - configuring mysql on this node",
+            DEBUG)
+        render_config_restart_on_changed(clustered, hosts)
+    else:
+        log("Not configuring", DEBUG)
 
     # Notify any changes to the access network
     update_shared_db_rels()
@@ -336,7 +313,7 @@ def cluster_joined():
     relation_set(relation_settings=relation_settings)
 
     # Ensure all new peers are aware
-    cluster_state_uuid = relation_get('bootstrap-uuid', unit=local_unit())
+    cluster_state_uuid = leader_get('bootstrap-uuid')
     if cluster_state_uuid:
         notify_bootstrapped(cluster_rid=relation_id(),
                             cluster_uuid=cluster_state_uuid)
@@ -486,7 +463,7 @@ def shared_db_changed(relation_id=None, unit=None):
             "until bootstrapped", DEBUG)
         return
 
-    if not is_elected_leader(DC_RESOURCE_NAME):
+    if not is_leader() and client_node_is_ready():
         # NOTE(jamespage): relation level data candidate
         log('Service is peered, clearing shared-db relation '
             'as this service unit is not the leader')
@@ -504,6 +481,10 @@ def shared_db_changed(relation_id=None, unit=None):
         relation_set(relation_id=rel_id, **peerdb_settings)
         return
 
+    # Bail if the leader is not ready
+    if not leader_node_is_ready():
+        return
+
     settings = relation_get(unit=unit, rid=relation_id)
     access_network = config('access-network')
     db_helper = get_db_helper()
@@ -720,6 +701,7 @@ def main():
         hooks.execute(sys.argv)
     except UnregisteredHookError as e:
         log('Unknown hook {} - skipping.'.format(e))
+    update_shared_db_rels()
     assess_status(register_configs())
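
The single line added to main() is what makes the readiness gate converge: update_shared_db_rels() is retried at the end of every hook execution, so a client notification skipped while the cluster was still forming is delivered by a later hook run. Below is a minimal standalone sketch of that defer-and-retry pattern; the names cluster_state and node_is_ready and the print call are illustrative stand-ins for the charm-helpers calls and shared_db_changed(), not charm code:

    # Sketch: defer-and-retry gating as added to percona_hooks.main().
    # Stand-ins: node_is_ready() plays leader_node_is_ready() /
    # client_node_is_ready(); the print plays shared_db_changed().

    cluster_state = {'peers_ready': 1, 'min_size': 3}
    notified = set()


    def node_is_ready():
        """Stand-in readiness check: all expected peers are clustered."""
        return cluster_state['peers_ready'] >= cluster_state['min_size']


    def update_shared_db_rels(relations):
        """Notify clients IFF the cluster is ready; otherwise a no-op."""
        if not node_is_ready():
            return
        for rel in relations:
            if rel not in notified:
                notified.add(rel)
                print('notifying {}'.format(rel))


    def main(hook, relations=('shared-db:0',)):
        hook()                            # whatever hook Juju queued
        update_shared_db_rels(relations)  # retry deferred notifications


    # A hook fires before the cluster is complete: nothing is sent.
    main(lambda: None)
    # Peers finish clustering; the next hook run flushes the notification.
    cluster_state['peers_ready'] = 3
    main(lambda: None)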

hooks/percona_utils.py

@@ -31,6 +31,9 @@ from charmhelpers.core.hookenv import (
     status_set,
     network_get_primary_address,
     application_version_set,
+    is_leader,
+    leader_get,
+    leader_set,
 )
 from charmhelpers.fetch import (
     apt_install,
@@ -48,6 +51,7 @@ from charmhelpers.contrib.openstack.utils import (
     make_assess_status_func,
     pause_unit,
     resume_unit,
+    is_unit_paused_set,
 )
 
 # NOTE: python-mysqldb is installed by charmhelpers.contrib.database.mysql so
@@ -128,26 +132,38 @@ def resolve_hostname_to_ip(hostname):
 def is_sufficient_peers():
-    """If min-cluster-size has been provided, check that we have sufficient
-    number of peers to proceed with bootstrapping percona cluster.
+    """Sufficient number of expected peers to build a complete cluster
+
+    If min-cluster-size has been provided, check that we have a sufficient
+    number of peers for a complete cluster.
+
+    If not defined, assume a single unit.
+
+    @returns boolean
     """
     min_size = config('min-cluster-size')
     if min_size:
-        size = 0
-        for rid in relation_ids('cluster'):
-            size = len(related_units(rid))
         log("Checking for minimum of {} peer units".format(min_size),
             level=DEBUG)
-        # Include this unit
-        size += 1
-        if min_size > size:
-            log("Insufficient number of units to configure percona cluster "
-                "(expected=%s, got=%s)" % (min_size, size), level=INFO)
+
+        # Include this unit
+        units = 1
+        for rid in relation_ids('cluster'):
+            units += len(related_units(rid))
+
+        if units < min_size:
+            log("Insufficient number of peer units to form cluster "
+                "(expected=%s, got=%s)" % (min_size, units), level=INFO)
             return False
-        else:
-            log("Sufficient units available to configure percona cluster "
-                "(>=%s)" % (min_size), level=DEBUG)
-            return True
+
+        log("Sufficient number of peer units to form cluster "
+            "(>={})".format(min_size), level=DEBUG)
+        return True
+    else:
+        log("min-cluster-size is not defined; race conditions may occur if "
+            "this is not a single unit deployment.", level=WARNING)
+        return True
 
 
 def get_cluster_hosts():
@@ -338,9 +354,10 @@ def get_wsrep_value(key):
 def is_bootstrapped():
-    if not is_sufficient_peers():
-        return False
+    """Check that this unit is bootstrapped
+
+    @returns boolean
+    """
     uuids = []
     rids = relation_ids('cluster') or []
     for rid in rids:
@@ -401,6 +418,8 @@ def notify_bootstrapped(cluster_rid=None, cluster_uuid=None):
         (cluster_uuid), DEBUG)
     for rid in rids:
         relation_set(relation_id=rid, **{'bootstrap-uuid': cluster_uuid})
+    if is_leader():
+        leader_set(**{'bootstrap-uuid': cluster_uuid})
 
 
 def cluster_in_sync():
@@ -582,3 +601,94 @@ def get_cluster_host_ip():
     )
 
     return cluster_addr
+
+
+def cluster_ready():
+    """Determine if each node in the cluster is ready and the cluster is
+    complete with the expected number of peers.
+
+    Once cluster_ready returns True it is safe to execute client relation
+    hooks. Having min-cluster-size set will guarantee cluster_ready will
+    not return True until the expected number of peers are clustered and
+    ready.
+
+    If min-cluster-size is not set, assume the cluster is ready in order
+    to allow for single unit deployments.
+
+    @returns boolean
+    """
+    min_size = config('min-cluster-size')
+    units = 1
+    for relation_id in relation_ids('cluster'):
+        units += len(related_units(relation_id))
+    if not min_size:
+        min_size = units
+
+    if not is_sufficient_peers():
+        return False
+    elif min_size > 1:
+        uuids = []
+        for relation_id in relation_ids('cluster'):
+            units = related_units(relation_id) or []
+            units.append(local_unit())
+            for unit in units:
+                bootstrap_uuid = relation_get(attribute='bootstrap-uuid',
+                                              rid=relation_id,
+                                              unit=unit)
+                if not bootstrap_uuid:
+                    log("{} is not yet clustered".format(unit), DEBUG)
+                    return False
+                uuids.append(bootstrap_uuid)
+
+        if len(uuids) < min_size:
+            log("Fewer than minimum cluster size ({}) percona units "
+                "reporting clustered".format(min_size), DEBUG)
+            return False
+        elif len(set(uuids)) > 1:
+            raise Exception("Found inconsistent bootstrap uuids: "
+                            "{}".format(uuids))
+        else:
+            log("All {} percona units reporting clustered".format(min_size),
+                DEBUG)
+            return True
+
+    log("Must assume this is a single unit; returning 'cluster ready'",
+        DEBUG)
+    return True
+
+
+def client_node_is_ready():
+    """Determine if the leader node has set shared-db client data
+
+    @returns boolean
+    """
+    # Bail if this unit is paused
+    if is_unit_paused_set():
+        return False
+    if not cluster_ready():
+        return False
+    for rid in relation_ids('shared-db'):
+        if leader_get(attribute='{}_password'.format(rid)):
+            return True
+    return False
+
+
+def leader_node_is_ready():
+    """Determine if the leader node is ready to handle client relation
+    hooks.
+
+    True IFF percona is installed, not paused, this is the leader node
+    and the cluster is complete.
+
+    @returns boolean
+    """
+    # The paused check must run before any other checks
+    if is_unit_paused_set():
+        return False
+    return is_leader() and cluster_ready()
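
At its core, cluster_ready() is a consensus check over the bootstrap-uuid each peer publishes: every expected unit must report a uuid, and all reported uuids must agree, otherwise the peers bootstrapped independently (the split-brain case the commit message warns about). A condensed standalone sketch of that check follows, with plain values in place of relation data; uuid_consensus is an illustrative name, not a charm function:

    # Condensed sketch of the bootstrap-uuid consensus performed inside
    # cluster_ready(). The charm gathers the same values with
    # relation_get()/related_units() instead of taking a list.

    def uuid_consensus(reported_uuids, min_size):
        """True when at least min_size peers report one identical uuid."""
        if any(uuid is None for uuid in reported_uuids):
            return False              # some peer is not yet clustered
        if len(reported_uuids) < min_size:
            return False              # fewer peers than expected
        if len(set(reported_uuids)) > 1:
            # Peers bootstrapped separately: the split-brain case.
            raise Exception('inconsistent bootstrap uuids: '
                            '{}'.format(reported_uuids))
        return True


    # Three units expected, all reporting the same uuid: ready.
    assert uuid_consensus(['abc', 'abc', 'abc'], min_size=3)
    # A peer that has not yet joined reports no uuid: not ready.
    assert not uuid_consensus(['abc', None, 'abc'], min_size=3)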

unit_tests/test_percona_hooks.py

@@ -22,7 +22,6 @@ TO_PATCH = ['log', 'config',
             'get_iface_for_address',
             'get_netmask_for_address',
             'is_bootstrapped',
-            'is_sufficient_peers',
             'network_get_primary_address',
             'resolve_network_cidr',
             'unit_get',

unit_tests/test_percona_utils.py

@@ -10,6 +10,8 @@ import percona_utils
 from test_utils import CharmTestCase
 
+os.environ['JUJU_UNIT_NAME'] = 'percona-cluster/2'
+
 
 class UtilsTests(unittest.TestCase):
     def setUp(self):
@@ -176,22 +178,28 @@ class UtilsTests(unittest.TestCase):
                                           mock.call(rid=88, unit=2)])
         self.assertEqual(hosts, ['10.100.0.1', '10.100.0.2', '10.100.0.3'])
 
     @mock.patch.object(percona_utils, 'log', lambda *args, **kwargs: None)
+    @mock.patch.object(percona_utils, 'is_leader')
     @mock.patch.object(percona_utils, 'related_units')
     @mock.patch.object(percona_utils, 'relation_ids')
     @mock.patch.object(percona_utils, 'config')
     def test_is_sufficient_peers(self, mock_config, mock_relation_ids,
-                                 mock_related_units):
+                                 mock_related_units, mock_is_leader):
+        mock_is_leader.return_value = False
         _config = {'min-cluster-size': None}
         mock_config.side_effect = lambda key: _config.get(key)
         self.assertTrue(percona_utils.is_sufficient_peers())
 
+        mock_is_leader.return_value = False
         mock_relation_ids.return_value = ['cluster:0']
         mock_related_units.return_value = ['test/0']
         _config = {'min-cluster-size': 3}
         mock_config.side_effect = lambda key: _config.get(key)
         self.assertFalse(percona_utils.is_sufficient_peers())
 
+        mock_is_leader.return_value = False
         mock_related_units.return_value = ['test/0', 'test/1']
         _config = {'min-cluster-size': 3}
         mock_config.side_effect = lambda key: _config.get(key)
         self.assertTrue(percona_utils.is_sufficient_peers())
 
     @mock.patch.object(percona_utils, 'lsb_release')
@@ -235,15 +243,20 @@ class UtilsTests(unittest.TestCase):
 TO_PATCH = [
     # 'status_set',
     'is_sufficient_peers',
     'is_bootstrapped',
     'config',
     'cluster_in_sync',
+    'is_leader',
+    'related_units',
+    'relation_ids',
+    'relation_get',
+    'leader_get',
+    'is_unit_paused_set',
 ]
 
 
-class TestAssessStatus(CharmTestCase):
+class UtilsTestsCTC(CharmTestCase):
     def setUp(self):
         CharmTestCase.setUp(self, percona_utils, TO_PATCH)
@@ -335,3 +348,102 @@ class TestAssessStatus(CharmTestCase):
         asf.assert_called_once_with('some-config')
         # ports=None whilst port checks are disabled.
         f.assert_called_once_with('assessor', services='s1', ports=None)
+
+    @mock.patch.object(percona_utils, 'is_sufficient_peers')
+    def test_cluster_ready(self, mock_is_sufficient_peers):
+        # Not a sufficient number of peers
+        mock_is_sufficient_peers.return_value = False
+        self.assertFalse(percona_utils.cluster_ready())
+
+        # Not all of the cluster is ready
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = ['cluster:0']
+        self.related_units.return_value = ['test/0', 'test/1']
+        self.relation_get.return_value = False
+        _config = {'min-cluster-size': 3}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertFalse(percona_utils.cluster_ready())
+
+        # All of the cluster is ready
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = ['cluster:0']
+        self.related_units.return_value = ['test/0', 'test/1']
+        self.relation_get.return_value = 'UUID'
+        _config = {'min-cluster-size': 3}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertTrue(percona_utils.cluster_ready())
+
+        # Not all of the cluster is ready, no min-cluster-size
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = ['cluster:0']
+        self.related_units.return_value = ['test/0', 'test/1']
+        self.relation_get.return_value = False
+        _config = {'min-cluster-size': None}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertFalse(percona_utils.cluster_ready())
+
+        # All of the cluster is ready, no min-cluster-size
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = ['cluster:0']
+        self.related_units.return_value = ['test/0', 'test/1']
+        self.relation_get.return_value = 'UUID'
+        _config = {'min-cluster-size': None}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertTrue(percona_utils.cluster_ready())
+
+        # Assume a single unit, no min-cluster-size
+        mock_is_sufficient_peers.return_value = True
+        self.relation_ids.return_value = []
+        self.related_units.return_value = []
+        self.relation_get.return_value = None
+        _config = {'min-cluster-size': None}
+        self.config.side_effect = lambda key: _config.get(key)
+        self.assertTrue(percona_utils.cluster_ready())
+
+    @mock.patch.object(percona_utils, 'cluster_ready')
+    def test_client_node_is_ready(self, mock_cluster_ready):
+        # Paused
+        self.is_unit_paused_set.return_value = True
+        self.assertFalse(percona_utils.client_node_is_ready())
+
+        # Cluster not ready
+        mock_cluster_ready.return_value = False
+        self.assertFalse(percona_utils.client_node_is_ready())
+
+        # Not ready
+        self.is_unit_paused_set.return_value = False
+        mock_cluster_ready.return_value = True
+        self.relation_ids.return_value = ['shared-db:0']
+        self.leader_get.return_value = {}
+        self.assertFalse(percona_utils.client_node_is_ready())
+
+        # Ready
+        self.is_unit_paused_set.return_value = False
+        mock_cluster_ready.return_value = True
+        self.relation_ids.return_value = ['shared-db:0']
+        self.leader_get.return_value = {'shared-db:0_password': 'password'}
+        self.assertTrue(percona_utils.client_node_is_ready())
+
+    @mock.patch.object(percona_utils, 'cluster_ready')
+    def test_leader_node_is_ready(self, mock_cluster_ready):
+        # Paused
+        self.is_unit_paused_set.return_value = True
+        self.assertFalse(percona_utils.leader_node_is_ready())
+
+        # Not the leader
+        self.is_unit_paused_set.return_value = False
+        self.is_leader.return_value = False
+        self.assertFalse(percona_utils.leader_node_is_ready())
+
+        # Cluster not ready
+        self.is_unit_paused_set.return_value = False
+        self.is_leader.return_value = True
+        mock_cluster_ready.return_value = False
+        self.assertFalse(percona_utils.leader_node_is_ready())
+
+        # Leader ready
+        self.is_unit_paused_set.return_value = False
+        self.is_leader.return_value = True
+        mock_cluster_ready.return_value = True
+        self.assertTrue(percona_utils.leader_node_is_ready())