Add batch size for integrity checker
* Added the new property 'execution_integrity_check_batch_size' under the [engine] group to limit the number of task executions that the integrity checker may process during one iteration. Closes-Bug: #1801876 Change-Id: I3c5074c45c476ebff109617cb15d56c54575dd4f
This commit is contained in:
parent
80a1bed67b
commit
3b4136ff1e
|
@ -184,6 +184,14 @@ engine_opts = [
|
|||
' set to a negative value Mistral will never be doing '
|
||||
' this check.')
|
||||
),
|
||||
cfg.IntOpt(
|
||||
'execution_integrity_check_batch_size',
|
||||
default=5,
|
||||
min=1,
|
||||
help=_('A number of task executions in RUNNING state that the'
|
||||
' execution integrity checker can process in a single'
|
||||
' iteration.')
|
||||
),
|
||||
cfg.IntOpt(
|
||||
'action_definition_cache_time',
|
||||
default=60,
|
||||
|
|
|
@ -131,7 +131,8 @@ def _check_and_fix_integrity(wf_ex_id):
|
|||
|
||||
running_task_execs = db_api.get_task_executions(
|
||||
workflow_execution_id=wf_ex.id,
|
||||
state=states.RUNNING
|
||||
state=states.RUNNING,
|
||||
limit=CONF.engine.execution_integrity_check_batch_size
|
||||
)
|
||||
|
||||
for t_ex in running_task_execs:
|
||||
|
|
|
@ -271,8 +271,8 @@ class EngineTestCase(base.DbTestCase):
|
|||
lambda: self.is_workflow_in_state(ex_id, state),
|
||||
delay,
|
||||
timeout,
|
||||
fail_message="Execution {ex_id} to reach {state} "
|
||||
"state but is in {current}",
|
||||
fail_message="Execution {ex_id} must have reached state {state} "
|
||||
"state but it is in {current}",
|
||||
fail_message_formatter=lambda m: m.format(
|
||||
ex_id=ex_id,
|
||||
state=state,
|
||||
|
|
|
@ -1044,7 +1044,7 @@ class DirectWorkflowEngineTest(base.EngineTestCase):
|
|||
"""
|
||||
|
||||
# Generate the workflow text.
|
||||
task_cnt = 200
|
||||
task_cnt = 50
|
||||
|
||||
on_success_list_str = ''
|
||||
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Workflow execution integrity checker mechanism was too agressive in case
|
||||
of big workflows that have many task executions in RUNNING state at the
|
||||
same time. The mechanism was selecting them all in one query and calling
|
||||
"on_action_complete" for each of them within a single DB transaction.
|
||||
That could lead to situations when this mechanism would totally block
|
||||
all normal workflow processing whereas it should only be a "last chance"
|
||||
aid in case of real infrastructure failures (e.g. MQ outage).
|
||||
This issue has been fixed by adding a configurable batch size, so that
|
||||
the checker can't select more than this number of task executions in
|
||||
RUNNING state at once.
|
Loading…
Reference in New Issue