From e7b2224768018ea0f66bd9b3427b71e2b7f38908 Mon Sep 17 00:00:00 2001 From: Andras Kovi Date: Thu, 7 Dec 2017 17:25:03 +0100 Subject: [PATCH] Fix race condition between task completion and child task processing Check the timestamp of the most recently completed child and proceed with the integrity restoration only if this happened later than the check delay. Change-Id: Icff2be3e1fbd6db78d371183521d7fbda8d3d72c Closes-Bug: 1736987 --- mistral/engine/workflow_handler.py | 31 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/mistral/engine/workflow_handler.py b/mistral/engine/workflow_handler.py index afa40035a..150eaed69 100644 --- a/mistral/engine/workflow_handler.py +++ b/mistral/engine/workflow_handler.py @@ -174,18 +174,29 @@ def check_and_fix_integrity(wf_ex): ) if all_finished: - # We found a task execution in RUNNING state for which all - # child executions are finished. We need to call - # "schedule_on_action_complete" on the task handler for any of - # the child executions so that the task state is calculated and - # updated properly. - LOG.warning( - "Found a task execution that is likely stuck in RUNNING state" - " because all child executions are finished," - " will try to recover [task_execution=%s]", t_ex.id + # Find the timestamp of the most recently finished child. + most_recent_child_timestamp = max( + [c_ex.updated_at or c_ex.created_at for c_ex in + child_executions] + ) + interval = timeutils.delta_seconds( + most_recent_child_timestamp, + timeutils.utcnow() ) - task_handler.schedule_on_action_complete(child_executions[-1]) + if interval > check_after_seconds: + # We found a task execution in RUNNING state for which all + # child executions are finished. We need to call + # "schedule_on_action_complete" on the task handler for any of + # the child executions so that the task state is calculated and + # updated properly. + LOG.warning( + "Found a task execution that is likely stuck in RUNNING" + " state because all child executions are finished," + " will try to recover [task_execution=%s]", t_ex.id + ) + + task_handler.schedule_on_action_complete(child_executions[-1]) def pause_workflow(wf_ex, msg=None):