Avoid endless backtraces on StorageFailure

If the storage is down (for instance, sqlalchemy), the logbook cannot be
loaded and taskflow goes into an endless loop of exceptions. To
mitigate this issue, use the retry feature of tenacity so that the
backtrace is displayed only every 5 seconds.

Partial-Bug: #2037050

Change-Id: I656abdf7325c46d3afb2cc7ca905f1a335fb0d2f
This commit is contained in:
Gregory Thiemonge 2023-09-22 08:21:08 -04:00
parent 07a1a3f417
commit 981052a6cb
2 changed files with 15 additions and 0 deletions

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Fixed a potential endless loop of exceptions when the storage is down and
Taskflow loads a logbook.

View File

@ -24,6 +24,7 @@ import time
import enum
from oslo_utils import timeutils
from oslo_utils import uuidutils
import tenacity
from taskflow import exceptions as excp
from taskflow import states
@ -31,6 +32,10 @@ from taskflow.types import notifier
from taskflow.utils import iter_utils
RETRY_ATTEMPTS = 3
RETRY_WAIT_TIMEOUT = 5
class JobPriority(enum.Enum):
"""Enum of job priorities (modeled after hadoop job priorities)."""
@ -251,6 +256,11 @@ class Job(object, metaclass=abc.ABCMeta):
"""The non-uniquely identifying name of this job."""
return self._name
@tenacity.retry(retry=tenacity.retry_if_exception_type(
exception_types=excp.StorageFailure),
stop=tenacity.stop_after_attempt(RETRY_ATTEMPTS),
wait=tenacity.wait_fixed(RETRY_WAIT_TIMEOUT),
reraise=True)
def _load_book(self):
book_uuid = self.book_uuid
if self._backend is not None and book_uuid is not None: