Enhance coordination retry logic when joining a partitioning group

Currently the join is attempted in an infinite loop, which burns CPU
cycles continuously while trying to join the partitioning group and
keeps filling the logs until the entire disk/log partition is used up.
This happens when ZooKeeper is in bad shape and a ceilometer component
is using ZooKeeper for coordination.

DocImpact: Adds new option parameters for ceilometer coordination
Change-Id: Icf60381e30f3baf986cf9e008e133287765d9827
Closes-Bug: #1558735
This commit is contained in:
aggaatul 2016-03-17 12:04:19 -07:00 committed by Atul Aggarwal
parent dd2c6509c8
commit 3459bc59f2
1 changed files with 34 additions and 5 deletions

View File

@ -39,12 +39,25 @@ OPTS = [
cfg.FloatOpt('check_watchers',
default=10.0,
help='Number of seconds between checks to see if group '
'membership has changed')
'membership has changed'),
cfg.IntOpt('retry_backoff',
default=1,
help='Retry backoff factor when retrying to connect with'
'coordination backend'),
cfg.IntOpt('max_retry_interval',
default=30,
help='Maximum number of seconds between retry to join '
'partitioning group')
]
cfg.CONF.register_opts(OPTS, group='coordination')
class ErrorJoiningPartitioningGroup(Exception):
    """Signal that an attempt to join a partitioning group failed.

    The exception takes no arguments; it always carries the same
    fixed, translatable error message.
    """

    def __init__(self):
        # Build the message first so the super() call stays on one line.
        message = _LE(
            'Coordination join_group Error joining partitioning group')
        super(ErrorJoiningPartitioningGroup, self).__init__(message)
class MemberNotInGroupError(Exception):
def __init__(self, group_id, members, my_id):
super(MemberNotInGroupError, self).__init__(_LE(
@ -53,6 +66,10 @@ class MemberNotInGroupError(Exception):
{'group_id': group_id, 'members': members, 'me': my_id})
def retry_on_error_joining_partition(exception):
    """Tell whether ``exception`` is an ErrorJoiningPartitioningGroup.

    :param exception: the exception instance to inspect.
    :returns: True if ``exception`` is an ErrorJoiningPartitioningGroup
              (or a subclass instance), False otherwise.
    """
    is_join_error = isinstance(exception, ErrorJoiningPartitioningGroup)
    return is_join_error
def retry_on_member_not_in_group(exception):
    """Tell whether ``exception`` is a MemberNotInGroupError.

    :param exception: the exception instance to inspect.
    :returns: True if ``exception`` is a MemberNotInGroupError (or a
              subclass instance), False otherwise.
    """
    # NOTE(review): presumably used as a retry predicate like its sibling;
    # the call site is not visible here -- confirm.
    is_missing_member = isinstance(exception, MemberNotInGroupError)
    return is_missing_member
@ -128,12 +145,20 @@ class PartitionCoordinator(object):
if (not self._coordinator or not self._coordinator.is_started
or not group_id):
return
while True:
retry_backoff = cfg.CONF.coordination.retry_backoff * 1000
max_retry_interval = cfg.CONF.coordination.max_retry_interval * 1000
@retrying.retry(
wait_exponential_multiplier=retry_backoff,
wait_exponential_max=max_retry_interval,
retry_on_exception=retry_on_error_joining_partition,
wrap_exception=True)
def _inner():
try:
join_req = self._coordinator.join_group(group_id)
join_req.get()
LOG.info(_LI('Joined partitioning group %s'), group_id)
break
except tooz.coordination.MemberAlreadyExist:
return
except tooz.coordination.GroupNotCreated:
@ -142,10 +167,14 @@ class PartitionCoordinator(object):
create_grp_req.get()
except tooz.coordination.GroupAlreadyExist:
pass
raise ErrorJoiningPartitioningGroup()
except tooz.coordination.ToozError:
LOG.exception(_LE('Error joining partitioning group %s,'
' re-trying'), group_id)
self._groups.add(group_id)
raise ErrorJoiningPartitioningGroup()
self._groups.add(group_id)
return _inner()
def leave_group(self, group_id):
if group_id not in self._groups: