From 3b4136ff1eb27472cb6d2ee4d1603e2b92f22ac9 Mon Sep 17 00:00:00 2001 From: Renat Akhmerov Date: Tue, 6 Nov 2018 11:17:38 +0700 Subject: [PATCH] Add batch size for integrity checker * Added the new property 'execution_integrity_check_batch_size' under the [engine] group to limit the number of task executions that the integrity checker may process during one iteration. Closes-Bug: #1801876 Change-Id: I3c5074c45c476ebff109617cb15d56c54575dd4f --- mistral/config.py | 8 ++++++++ mistral/engine/workflow_handler.py | 3 ++- mistral/tests/unit/engine/base.py | 4 ++-- mistral/tests/unit/engine/test_direct_workflow.py | 2 +- ..._checker_work_with_batches-56c1cd94200d4c38.yaml | 13 +++++++++++++ 5 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/make_integrity_checker_work_with_batches-56c1cd94200d4c38.yaml diff --git a/mistral/config.py b/mistral/config.py index e5785bf78..033f0f734 100644 --- a/mistral/config.py +++ b/mistral/config.py @@ -184,6 +184,14 @@ engine_opts = [ ' set to a negative value Mistral will never be doing ' ' this check.') ), + cfg.IntOpt( + 'execution_integrity_check_batch_size', + default=5, + min=1, + help=_('A number of task executions in RUNNING state that the' + ' execution integrity checker can process in a single' + ' iteration.') + ), cfg.IntOpt( 'action_definition_cache_time', default=60, diff --git a/mistral/engine/workflow_handler.py b/mistral/engine/workflow_handler.py index b9e14b011..a3476c371 100644 --- a/mistral/engine/workflow_handler.py +++ b/mistral/engine/workflow_handler.py @@ -131,7 +131,8 @@ def _check_and_fix_integrity(wf_ex_id): running_task_execs = db_api.get_task_executions( workflow_execution_id=wf_ex.id, - state=states.RUNNING + state=states.RUNNING, + limit=CONF.engine.execution_integrity_check_batch_size ) for t_ex in running_task_execs: diff --git a/mistral/tests/unit/engine/base.py b/mistral/tests/unit/engine/base.py index 1f1062b34..81eff0c2f 100644 --- a/mistral/tests/unit/engine/base.py +++ b/mistral/tests/unit/engine/base.py @@ -271,8 +271,8 @@ class EngineTestCase(base.DbTestCase): lambda: self.is_workflow_in_state(ex_id, state), delay, timeout, - fail_message="Execution {ex_id} to reach {state} " - "state but is in {current}", + fail_message="Execution {ex_id} must have reached state {state} " + "state but it is in {current}", fail_message_formatter=lambda m: m.format( ex_id=ex_id, state=state, diff --git a/mistral/tests/unit/engine/test_direct_workflow.py b/mistral/tests/unit/engine/test_direct_workflow.py index e6141b6d0..f99dc5bc2 100644 --- a/mistral/tests/unit/engine/test_direct_workflow.py +++ b/mistral/tests/unit/engine/test_direct_workflow.py @@ -1044,7 +1044,7 @@ class DirectWorkflowEngineTest(base.EngineTestCase): """ # Generate the workflow text. - task_cnt = 200 + task_cnt = 50 on_success_list_str = '' diff --git a/releasenotes/notes/make_integrity_checker_work_with_batches-56c1cd94200d4c38.yaml b/releasenotes/notes/make_integrity_checker_work_with_batches-56c1cd94200d4c38.yaml new file mode 100644 index 000000000..52ef30393 --- /dev/null +++ b/releasenotes/notes/make_integrity_checker_work_with_batches-56c1cd94200d4c38.yaml @@ -0,0 +1,13 @@ +--- +fixes: + - | + Workflow execution integrity checker mechanism was too agressive in case + of big workflows that have many task executions in RUNNING state at the + same time. The mechanism was selecting them all in one query and calling + "on_action_complete" for each of them within a single DB transaction. + That could lead to situations when this mechanism would totally block + all normal workflow processing whereas it should only be a "last chance" + aid in case of real infrastructure failures (e.g. MQ outage). + This issue has been fixed by adding a configurable batch size, so that + the checker can't select more than this number of task executions in + RUNNING state at once.