Enhance retry logic in coordination when joining a partitioning group

Joining a partitioning group is currently attempted in an infinite loop, so it burns CPU cycles continuously while trying to join and also keeps filling the logs, which can use up the entire disk/log partition. This happens when ZooKeeper is in bad shape and a ceilometer component is using ZooKeeper for coordination.

DocImpact: Adds new option parameters for ceilometer coordination
Change-Id: Icf60381e30f3baf986cf9e008e133287765d9827
Closes-Bug: #1558735
2016-03-17 12:04:19 -07:00 · 2016-03-17 12:04:19 -07:00 · 3459bc59f2
parent dd2c6509c8
commit 3459bc59f2
1 changed file with 34 additions and 5 deletions
--- a/ceilometer/coordination.py
+++ b/ceilometer/coordination.py
@ -39,12 +39,25 @@ OPTS = [
    cfg.FloatOpt('check_watchers',
                 default=10.0,
                 help='Number of seconds between checks to see if group '
-                      'membership has changed')
-
+                      'membership has changed'),
+    cfg.IntOpt('retry_backoff',
+               default=1,
+               help='Retry backoff factor when retrying to connect with'
+                    'coordination backend'),
+    cfg.IntOpt('max_retry_interval',
+               default=30,
+               help='Maximum number of seconds between retry to join '
+                    'partitioning group')
 ]
 cfg.CONF.register_opts(OPTS, group='coordination')


+class ErrorJoiningPartitioningGroup(Exception):
+    def __init__(self):
+        super(ErrorJoiningPartitioningGroup, self).__init__(_LE(
+            'Coordination join_group Error joining partitioning group'))
+
+
 class MemberNotInGroupError(Exception):
    def __init__(self, group_id, members, my_id):
        super(MemberNotInGroupError, self).__init__(_LE(
@ -53,6 +66,10 @@ class MemberNotInGroupError(Exception):
            {'group_id': group_id, 'members': members, 'me': my_id})


+def retry_on_error_joining_partition(exception):
+    return isinstance(exception, ErrorJoiningPartitioningGroup)
+
+
 def retry_on_member_not_in_group(exception):
    return isinstance(exception, MemberNotInGroupError)

@ -128,12 +145,20 @@ class PartitionCoordinator(object):
        if (not self._coordinator or not self._coordinator.is_started
                or not group_id):
            return
-        while True:
+
+        retry_backoff = cfg.CONF.coordination.retry_backoff * 1000
+        max_retry_interval = cfg.CONF.coordination.max_retry_interval * 1000
+
+        @retrying.retry(
+            wait_exponential_multiplier=retry_backoff,
+            wait_exponential_max=max_retry_interval,
+            retry_on_exception=retry_on_error_joining_partition,
+            wrap_exception=True)
+        def _inner():
            try:
                join_req = self._coordinator.join_group(group_id)
                join_req.get()
                LOG.info(_LI('Joined partitioning group %s'), group_id)
-                break
            except tooz.coordination.MemberAlreadyExist:
                return
            except tooz.coordination.GroupNotCreated:
@ -142,10 +167,14 @@ class PartitionCoordinator(object):
                    create_grp_req.get()
                except tooz.coordination.GroupAlreadyExist:
                    pass
+                raise ErrorJoiningPartitioningGroup()
            except tooz.coordination.ToozError:
                LOG.exception(_LE('Error joining partitioning group %s,'
                                  ' re-trying'), group_id)
-        self._groups.add(group_id)
+                raise ErrorJoiningPartitioningGroup()
+            self._groups.add(group_id)
+
+        return _inner()

    def leave_group(self, group_id):
        if group_id not in self._groups: