Avoid endless backtraces on StorageFailure

If the storage is down (for instance sqlalchemy), the logbook cannot be loaded, and taskflow then goes into an endless loop of exceptions. To mitigate this issue, use the retry feature of tenacity, so that the backtrace is displayed only every 5 seconds. Partial-Bug: #2037050 Change-Id: I656abdf7325c46d3afb2cc7ca905f1a335fb0d2f
2023-09-22 08:21:08 -04:00 · 2023-09-22 08:21:08 -04:00 · 981052a6cb
parent 07a1a3f417
commit 981052a6cb
2 changed files with 15 additions and 0 deletions
--- a/releasenotes/notes/fix-endless-loop-on-storage-failures-b98b30f0c34d25e1.yaml
+++ b/releasenotes/notes/fix-endless-loop-on-storage-failures-b98b30f0c34d25e1.yaml
@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixed potential endless loop of exceptions when the storage is down and
+    Taskflow loads a logbook.
--- a/taskflow/jobs/base.py
+++ b/taskflow/jobs/base.py
@ -24,6 +24,7 @@ import time
 import enum
 from oslo_utils import timeutils
 from oslo_utils import uuidutils
+import tenacity

 from taskflow import exceptions as excp
 from taskflow import states
@ -31,6 +32,10 @@ from taskflow.types import notifier
 from taskflow.utils import iter_utils


+RETRY_ATTEMPTS = 3
+RETRY_WAIT_TIMEOUT = 5
+
+
 class JobPriority(enum.Enum):
    """Enum of job priorities (modeled after hadoop job priorities)."""

@ -251,6 +256,11 @@ class Job(object, metaclass=abc.ABCMeta):
        """The non-uniquely identifying name of this job."""
        return self._name

+    @tenacity.retry(retry=tenacity.retry_if_exception_type(
+                    exception_types=excp.StorageFailure),
+                    stop=tenacity.stop_after_attempt(RETRY_ATTEMPTS),
+                    wait=tenacity.wait_fixed(RETRY_WAIT_TIMEOUT),
+                    reraise=True)
    def _load_book(self):
        book_uuid = self.book_uuid
        if self._backend is not None and book_uuid is not None: