Do not hardcode flush_on_reconnect, move to oslo.cache config

Param flush_on_reconnect is very risky to use on production deployments. It can cause exponential raising of connections to memcached servers. Moreover this option makes sense only in keystone's oslo.cache config. This patch is moving flush_on_reconnect from code to oslo.cache config block to be configurable. Co-Authored-By: Hervé Beraud <hberaud@redhat.com> Change-Id: I8e6826bfb2c85e7ceed03e1667bd6a06b3dff469 Closes-Bug: #1888394
2020-07-21 15:42:17 +02:00 · 2020-07-21 15:42:17 +02:00 · a437b219ac
parent bc9c70fd66
commit a437b219ac
6 changed files with 101 additions and 22 deletions
--- a/oslo_cache/_memcache_pool.py
+++ b/oslo_cache/_memcache_pool.py
@ -211,27 +211,7 @@ class MemcacheClientPool(ConnectionPool):
        self._hosts_deaduntil = [0] * len(urls)

    def _create_connection(self):
-        # NOTE(morgan): Explicitly set flush_on_reconnect for pooled
-        # connections. This should ensure that stale data is never consumed
-        # from a server that pops in/out due to a network partition
-        # or disconnect.
-        #
-        # See the help from python-memcached:
-        #
-        # param flush_on_reconnect: optional flag which prevents a
-        #        scenario that can cause stale data to be read: If there's more
-        #        than one memcached server and the connection to one is
-        #        interrupted, keys that mapped to that server will get
-        #        reassigned to another. If the first server comes back, those
-        #        keys will map to it again. If it still has its data, get()s
-        #        can read stale data that was overwritten on another
-        #        server. This flag is off by default for backwards
-        #        compatibility.
-        #
-        # The normal non-pooled clients connect explicitly on each use and
-        # does not need the explicit flush_on_reconnect
-        return _MemcacheClient(self.urls, flush_on_reconnect=True,
-                               **self._arguments)
+        return _MemcacheClient(self.urls, **self._arguments)

    def _destroy_connection(self, conn):
        conn.disconnect_all()
--- a/oslo_cache/_opts.py
+++ b/oslo_cache/_opts.py
@ -110,6 +110,11 @@ FILE_OPTIONS = {
                   default=10,
                   help='Number of seconds that an operation will wait to get '
                        'a memcache client connection.'),
+        cfg.BoolOpt('memcache_pool_flush_on_reconnect',
+                    default=False,
+                    help='Global toggle if memcache will be flushed'
+                    ' on reconnect.'
+                    ' (oslo_cache.memcache_pool backend only).'),
        cfg.BoolOpt('tls_enabled',
                    default=False,
                    help='Global toggle for TLS usage when comunicating with'
@ -150,6 +155,25 @@ def configure(conf):
            conf.register_opt(option, group=section)


+def set_defaults(conf, memcache_pool_flush_on_reconnect=False):
+    """Set defaults for configuration variables.
+
+    Overrides default options values.
+
+    :param conf: Configuration object, managed by the caller.
+    :type conf: oslo.config.cfg.ConfigOpts
+
+    :param memcache_pool_flush_on_reconnect: The default state for the
+        ``flush_on_reconnect`` flag. By default deactivated
+    :type memcache_pool_flush_on_reconnect: bool
+    """
+    conf.register_opt(FILE_OPTIONS, group='cache')
+
+    cfg.set_defaults(
+        FILE_OPTIONS,
+        memcache_pool_flush_on_reconnect=memcache_pool_flush_on_reconnect)
+
+
 def list_opts():
    """Return a list of oslo_config options.

--- a/oslo_cache/backends/memcache_pool.py
+++ b/oslo_cache/backends/memcache_pool.py
@ -48,6 +48,8 @@ class PooledMemcachedBackend(memcached_backend.MemcachedBackend):
                'socket_timeout': arguments.get('socket_timeout', 3.0),
                'server_max_value_length':
                    arguments.get('server_max_value_length'),
+                'flush_on_reconnect': arguments.get('pool_flush_on_reconnect',
+                                                    False),
            },
            maxsize=arguments.get('pool_maxsize', 10),
            unused_timeout=arguments.get('pool_unused_timeout', 60),
--- a/oslo_cache/core.py
+++ b/oslo_cache/core.py
@ -139,10 +139,31 @@ def _build_cache_config(conf):
    # NOTE(yorik-sar): these arguments will be used for memcache-related
    # backends. Use setdefault for url to support old-style setting through
    # backend_argument=url:127.0.0.1:11211
+    #
+    # NOTE(morgan): Explicitly set flush_on_reconnect for pooled
+    # connections. This should ensure that stale data is never consumed
+    # from a server that pops in/out due to a network partition
+    # or disconnect.
+    #
+    # See the help from python-memcached:
+    #
+    # param flush_on_reconnect: optional flag which prevents a
+    #        scenario that can cause stale data to be read: If there's more
+    #        than one memcached server and the connection to one is
+    #        interrupted, keys that mapped to that server will get
+    #        reassigned to another. If the first server comes back, those
+    #        keys will map to it again. If it still has its data, get()s
+    #        can read stale data that was overwritten on another
+    #        server. This flag is off by default for backwards
+    #        compatibility.
+    #
+    # The normal non-pooled clients connect explicitly on each use and
+    # does not need the explicit flush_on_reconnect
    conf_dict.setdefault('%s.arguments.url' % prefix,
                         conf.cache.memcache_servers)
    for arg in ('dead_retry', 'socket_timeout', 'pool_maxsize',
-                'pool_unused_timeout', 'pool_connection_get_timeout'):
+                'pool_unused_timeout', 'pool_connection_get_timeout',
+                'pool_flush_on_reconnect'):
        value = getattr(conf.cache, 'memcache_' + arg)
        conf_dict['%s.arguments.%s' % (prefix, arg)] = value

--- a/oslo_cache/tests/unit/test_cache_basics.py
+++ b/oslo_cache/tests/unit/test_cache_basics.py
@ -352,6 +352,36 @@ class CacheRegionTest(test_cache.BaseTestCase):
                config_dict['test_prefix.arguments.tls_context'],
            )

+    def test_cache_dictionary_config_builder_flush_on_reconnect_enabled(self):
+        """Validate we build a sane dogpile.cache dictionary config."""
+        self.config_fixture.config(group='cache',
+                                   enabled=True,
+                                   config_prefix='test_prefix',
+                                   backend='oslo_cache.dict',
+                                   memcache_pool_flush_on_reconnect=True)
+
+        config_dict = cache._build_cache_config(self.config_fixture.conf)
+
+        self.assertTrue(self.config_fixture.conf.cache.
+                        memcache_pool_flush_on_reconnect)
+        self.assertTrue(config_dict['test_prefix.arguments'
+                                    '.pool_flush_on_reconnect'])
+
+    def test_cache_dictionary_config_builder_flush_on_reconnect_disabled(self):
+        """Validate we build a sane dogpile.cache dictionary config."""
+        self.config_fixture.config(group='cache',
+                                   enabled=True,
+                                   config_prefix='test_prefix',
+                                   backend='oslo_cache.dict',
+                                   memcache_pool_flush_on_reconnect=False)
+
+        config_dict = cache._build_cache_config(self.config_fixture.conf)
+
+        self.assertFalse(self.config_fixture.conf.cache.
+                         memcache_pool_flush_on_reconnect)
+        self.assertFalse(config_dict['test_prefix.arguments'
+                                     '.pool_flush_on_reconnect'])
+
    def test_cache_debug_proxy(self):
        single_value = 'Test Value'
        single_key = 'testkey'
--- a/releasenotes/notes/bug-1888394-5a53e7a9cb25375b.yaml
+++ b/releasenotes/notes/bug-1888394-5a53e7a9cb25375b.yaml
@ -0,0 +1,22 @@
+---
+fixes:
+  - |
+    [`bug 1888394 <https://bugs.launchpad.net/oslo.cache/+bug/1888394>`_]
+    If a memcache server disappears and then reconnects when multiple memcache
+    servers are used (specific to the python-memcached based backends) it is
+    possible that the server will contain stale data. To avoid this, param
+    flush_on_reconnect was used in code.
+
+    But unfortunatelly this option is causing another issue.
+    If memcache server disappears, or client had broken connection to memcache server,
+    clients start to flush server on reconnect.
+
+    This means that network connections will go UP and can cause server to be overloaded
+    until memcache will be unresponsive.
+
+    Simply said this option can cause loop of flushs and overloaded memcached servers.
+    This change is moving optional parameter `flush_on_reconnect` to oslo.cache config.
+
+features:
+  - Configuration option ``memcache_pool_flush_on_reconnect`` added to control
+    if flush will be sent to memcached server after reconnect.