Fix memory leak related to cached lookups

* It turned out that Mistral used a lot of memory because it
  used cached DB lookups for task executions (only task executions
  in a terminal state get cached) and the maximum size of the cache
  was too big: 20000 entries. In certain cases a single task execution
  may take a lot of memory (e.g. several megabytes), so 20000 cached
  objects make the memory footprint huge. Additionally, when a workflow
  completes, the corresponding task executions need to be invalidated
  in the cache, which didn't happen before this patch.
* This patch fixes the aforementioned issues by invalidating the
  cache per workflow execution and setting a smaller cache size
  (a simplified sketch of the new structure follows the change
  summary below).
* Fixed "Starting workflow .." log statement to not print the entire
  structure of the workflow definition into the workflow log, only its
  name and input parameters
* Minor style fixes

Change-Id: I0ee300f631a4bdfa2f618c2a10048267f27b3345
Closes-Bug: #1664864
(cherry picked from commit 086a3d43fa)
Renat Akhmerov 2017-02-15 18:16:57 +07:00
parent 12527abae8
commit 00199fdfe8
5 changed files with 127 additions and 15 deletions
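
The heart of the fix is replacing one big flat cache with a two-level
structure keyed by workflow execution id. A minimal sketch of the idea
(plain dicts instead of the bounded cachetools.LRUCache instances the
patch actually uses; the function names here are illustrative, not the
patch's API):

    import threading

    _CACHE_LOCK = threading.RLock()

    # First level:  workflow execution id -> per-workflow cache.
    # Second level: task name -> task executions in a terminal state.
    _TASK_EX_CACHE = {}

    def cache_task_executions(wf_ex_id, task_name, t_execs):
        # Only executions in a terminal state may be cached.
        with _CACHE_LOCK:
            _TASK_EX_CACHE.setdefault(wf_ex_id, {})[task_name] = t_execs

    def invalidate_cached_task_executions(wf_ex_id):
        # Called when the workflow completes: a single deletion drops
        # every cached task execution belonging to that workflow.
        with _CACHE_LOCK:
            _TASK_EX_CACHE.pop(wf_ex_id, None)

Keying the first level by workflow execution id is what makes
completion-time invalidation cheap: the old flat cache was keyed by
(wf_ex_id, task_name) tuples, so dropping one workflow's entries would
have required scanning all 20000 of them.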

View File

@@ -353,7 +353,7 @@ class ActionExecution(resource.Resource):
     output = types.jsontype
     created_at = wtypes.text
     updated_at = wtypes.text
-    params = types.jsontype
+    params = types.jsontype  # TODO(rakhmerov): What is this??

     @classmethod
     def sample(cls):

View File

@@ -80,7 +80,11 @@ class Workflow(object):
         assert not self.wf_ex

-        wf_trace.info(self.wf_ex, "Starting workflow: %s" % self.wf_def)
+        wf_trace.info(
+            self.wf_ex,
+            "Starting workflow [name=%s, input=%s]" %
+            (self.wf_def.name, utils.cut(input_dict))
+        )

         # TODO(rakhmerov): This call implicitly changes input_dict! Fix it!
         # After fix we need to move validation after adding risky fields.
@@ -156,7 +160,7 @@ class Workflow(object):
         # Since some lookup utils functions may use cache for completed tasks
         # we need to clean caches to make sure that stale objects can't be
         # retrieved.
-        lookup_utils.clean_caches()
+        lookup_utils.clear_caches()

         wf_service.update_workflow_execution_env(self.wf_ex, env)
@@ -256,6 +260,11 @@ class Workflow(object):
         # only if it completed successfully or failed.
         self.wf_ex.accepted = states.is_completed(state)

+        if states.is_completed(state):
+            # No need to keep task executions of this workflow in the
+            # lookup cache anymore.
+            lookup_utils.invalidate_cached_task_executions(self.wf_ex.id)
+
         if recursive and self.wf_ex.task_execution_id:
             parent_task_ex = db_api.get_task_execution(
                 self.wf_ex.task_execution_id

View File

@@ -245,7 +245,7 @@ class DbTestCase(BaseTest):
         action_manager.sync_db()

     def _clean_db(self):
-        lookup_utils.clean_caches()
+        lookup_utils.clear_caches()

         contexts = [
             get_context(default=False),

View File

@@ -0,0 +1,82 @@
+# Copyright 2017 - Nokia Networks.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from oslo_config import cfg
+
+from mistral.db.v2 import api as db_api
+from mistral.services import workflows as wf_service
+from mistral.tests.unit.engine import base
+from mistral.workflow import lookup_utils
+from mistral.workflow import states
+
+# Use the set_default method to set the value; otherwise in certain
+# test cases the change in value is not permanent.
+cfg.CONF.set_default('auth_enable', False, group='pecan')
+
+
+class LookupUtilsTest(base.EngineTestCase):
+    def test_task_execution_cache_invalidation(self):
+        wf_text = """---
+        version: '2.0'
+
+        wf:
+          tasks:
+            task1:
+              action: std.noop
+              on-success: join_task
+
+            task2:
+              action: std.noop
+              on-success: join_task
+
+            join_task:
+              join: all
+              on-success: task4
+
+            task4:
+              action: std.noop
+              pause-before: true
+        """
+
+        wf_service.create_workflows(wf_text)
+
+        # Start workflow.
+        wf_ex = self.engine.start_workflow('wf', {})
+
+        self.await_workflow_paused(wf_ex.id)
+
+        with db_api.transaction():
+            # Note: We need to reread the execution to access related tasks.
+            wf_ex = db_api.get_workflow_execution(wf_ex.id)
+
+            tasks = wf_ex.task_executions
+
+        self.assertEqual(4, len(tasks))
+
+        self._assert_single_item(tasks, name='task1', state=states.SUCCESS)
+        self._assert_single_item(tasks, name='task2', state=states.SUCCESS)
+        self._assert_single_item(tasks, name='join_task', state=states.SUCCESS)
+        self._assert_single_item(tasks, name='task4', state=states.IDLE)
+
+        # Expecting one cache entry because we know that the 'join'
+        # operation uses cached lookups and the workflow is not finished yet.
+        self.assertEqual(1, lookup_utils.get_task_execution_cache_size())
+
+        self.engine.resume_workflow(wf_ex.id)
+
+        self.await_workflow_success(wf_ex.id)
+
+        # Expecting the cache size to be 0 because the workflow has
+        # finished and invalidated the corresponding cache entry.
+        self.assertEqual(0, lookup_utils.get_task_execution_cache_size())

View File

@@ -35,8 +35,21 @@ import threading
 from mistral.db.v2 import api as db_api
 from mistral.workflow import states

-_TASK_EXECUTIONS_CACHE_LOCK = threading.RLock()
-_TASK_EXECUTIONS_CACHE = cachetools.LRUCache(maxsize=20000)
+
+def _create_lru_cache_for_workflow_execution(wf_ex_id):
+    return cachetools.LRUCache(maxsize=500)
+
+
+# This is a two-level caching structure.
+# First level: [<workflow execution id> -> <task execution cache>]
+# Second level (task execution cache): [<task_name> -> <task executions>]
+# The first level (by workflow execution id) allows invalidating the
+# needed cache entry when the workflow gets completed.
+_TASK_EX_CACHE = cachetools.LRUCache(
+    maxsize=100,
+    missing=_create_lru_cache_for_workflow_execution
+)
+
+_CACHE_LOCK = threading.RLock()
def find_task_executions_by_name(wf_ex_id, task_name):
@@ -46,10 +59,8 @@ def find_task_executions_by_name(wf_ex_id, task_name):
     :param task_name: Task name.
     :return: Task executions (possibly a cached value).
     """
-    cache_key = (wf_ex_id, task_name)
-
-    with _TASK_EXECUTIONS_CACHE_LOCK:
-        t_execs = _TASK_EXECUTIONS_CACHE.get(cache_key)
+    with _CACHE_LOCK:
+        t_execs = _TASK_EX_CACHE[wf_ex_id].get(task_name)

     if t_execs:
         return t_execs
@@ -66,8 +77,8 @@ def find_task_executions_by_name(wf_ex_id, task_name):
     )

     if all_finished:
-        with _TASK_EXECUTIONS_CACHE_LOCK:
-            _TASK_EXECUTIONS_CACHE[cache_key] = t_execs
+        with _CACHE_LOCK:
+            _TASK_EX_CACHE[wf_ex_id][task_name] = t_execs

     return t_execs
@@ -104,6 +115,16 @@ def find_cancelled_task_executions(wf_ex_id):
     return find_task_executions_with_state(wf_ex_id, states.CANCELLED)


-def clean_caches():
-    with _TASK_EXECUTIONS_CACHE_LOCK:
-        _TASK_EXECUTIONS_CACHE.clear()
+def get_task_execution_cache_size():
+    return len(_TASK_EX_CACHE)
+
+
+def invalidate_cached_task_executions(wf_ex_id):
+    with _CACHE_LOCK:
+        if wf_ex_id in _TASK_EX_CACHE:
+            del _TASK_EX_CACHE[wf_ex_id]
+
+
+def clear_caches():
+    with _CACHE_LOCK:
+        _TASK_EX_CACHE.clear()
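
A note on the missing= argument used above: in the cachetools releases
contemporary with this patch, looking up an absent key calls the given
factory and stores the result, so _TASK_EX_CACHE[wf_ex_id] transparently
creates a bounded per-workflow cache on first access (the keyword was
removed in later cachetools releases, where __missing__ has to be
overridden instead). A rough demonstration of the behavior, assuming
such a cachetools version is installed:

    import cachetools

    outer = cachetools.LRUCache(
        maxsize=100,
        missing=lambda key: cachetools.LRUCache(maxsize=500)
    )

    outer['wf-1']['task1'] = ['t_ex_1']  # inner cache created on first access
    assert 'wf-1' in outer
    del outer['wf-1']  # one deletion invalidates the whole workflow

With these bounds the structure can hold at most 100 * 500 = 50000
entries in the worst case, but per-workflow invalidation on completion
is meant to keep the real population far smaller than the old flat
20000-entry cache allowed.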