From 3b4136ff1eb27472cb6d2ee4d1603e2b92f22ac9 Mon Sep 17 00:00:00 2001
From: Renat Akhmerov <renat.akhmerov@gmail.com>
Date: Tue, 6 Nov 2018 11:17:38 +0700
Subject: [PATCH] Add batch size for integrity checker

* Added the new property 'execution_integrity_check_batch_size'
  under the [engine] group to limit the number of task executions
  that the integrity checker may process during one iteration.

Closes-Bug: #1801876
Change-Id: I3c5074c45c476ebff109617cb15d56c54575dd4f
---
 mistral/config.py                                   |  8 ++++++++
 mistral/engine/workflow_handler.py                  |  3 ++-
 mistral/tests/unit/engine/base.py                   |  4 ++--
 mistral/tests/unit/engine/test_direct_workflow.py   |  2 +-
 ..._checker_work_with_batches-56c1cd94200d4c38.yaml | 13 +++++++++++++
 5 files changed, 26 insertions(+), 4 deletions(-)
 create mode 100644 releasenotes/notes/make_integrity_checker_work_with_batches-56c1cd94200d4c38.yaml

diff --git a/mistral/config.py b/mistral/config.py
index e5785bf78..033f0f734 100644
--- a/mistral/config.py
+++ b/mistral/config.py
@@ -184,6 +184,14 @@ engine_opts = [
                ' set to a negative value Mistral will never be doing '
                ' this check.')
     ),
+    cfg.IntOpt(
+        'execution_integrity_check_batch_size',
+        default=5,
+        min=1,
+        help=_('A number of task executions in RUNNING state that the'
+               ' execution integrity checker can process in a single'
+               ' iteration.')
+    ),
     cfg.IntOpt(
         'action_definition_cache_time',
         default=60,
diff --git a/mistral/engine/workflow_handler.py b/mistral/engine/workflow_handler.py
index b9e14b011..a3476c371 100644
--- a/mistral/engine/workflow_handler.py
+++ b/mistral/engine/workflow_handler.py
@@ -131,7 +131,8 @@ def _check_and_fix_integrity(wf_ex_id):
 
         running_task_execs = db_api.get_task_executions(
             workflow_execution_id=wf_ex.id,
-            state=states.RUNNING
+            state=states.RUNNING,
+            limit=CONF.engine.execution_integrity_check_batch_size
         )
 
         for t_ex in running_task_execs:
diff --git a/mistral/tests/unit/engine/base.py b/mistral/tests/unit/engine/base.py
index 1f1062b34..81eff0c2f 100644
--- a/mistral/tests/unit/engine/base.py
+++ b/mistral/tests/unit/engine/base.py
@@ -271,8 +271,8 @@ class EngineTestCase(base.DbTestCase):
             lambda: self.is_workflow_in_state(ex_id, state),
             delay,
             timeout,
-            fail_message="Execution {ex_id} to reach {state} "
-                         "state but is in {current}",
+            fail_message="Execution {ex_id} must have reached state {state} "
+                         "state but it is in {current}",
             fail_message_formatter=lambda m: m.format(
                 ex_id=ex_id,
                 state=state,
diff --git a/mistral/tests/unit/engine/test_direct_workflow.py b/mistral/tests/unit/engine/test_direct_workflow.py
index e6141b6d0..f99dc5bc2 100644
--- a/mistral/tests/unit/engine/test_direct_workflow.py
+++ b/mistral/tests/unit/engine/test_direct_workflow.py
@@ -1044,7 +1044,7 @@ class DirectWorkflowEngineTest(base.EngineTestCase):
         """
 
         # Generate the workflow text.
-        task_cnt = 200
+        task_cnt = 50
 
         on_success_list_str = ''
 
diff --git a/releasenotes/notes/make_integrity_checker_work_with_batches-56c1cd94200d4c38.yaml b/releasenotes/notes/make_integrity_checker_work_with_batches-56c1cd94200d4c38.yaml
new file mode 100644
index 000000000..52ef30393
--- /dev/null
+++ b/releasenotes/notes/make_integrity_checker_work_with_batches-56c1cd94200d4c38.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    Workflow execution integrity checker mechanism was too agressive in case
+    of big workflows that have many task executions in RUNNING state at the
+    same time. The mechanism was selecting them all in one query and calling
+    "on_action_complete" for each of them within a single DB transaction.
+    That could lead to situations when this mechanism would totally block
+    all normal workflow processing whereas it should only be a "last chance"
+    aid in case of real infrastructure failures (e.g. MQ outage).
+    This issue has been fixed by adding a configurable batch size, so that
+    the checker can't select more than this number of task executions in
+    RUNNING state at once.