From 2184d8e1e5bd3acb4073b7f1d4439f1d3bf658e6 Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Tue, 15 Sep 2020 13:42:08 -0500 Subject: [PATCH] Don't give up when an Exception happens in idl.run It's possible that idl.run() could have a bug where it raises an Exception for an extended period of time while ovsdb-server is down, but recovers once ovsdb-server comes back up. Specifically, python-ovs currently doesn't properly catch an exception when the socket type is 'ssl' that it catches for other protocols. Conflicts: ovsdbapp/backend/ovs_idl/connection.py Change-Id: Ia068650d2db3d5d8642771a6df5a260d692aea20 Closes-Bug: #1895727 (cherry picked from commit 83cf7aa6c81f1b2341b2bba1fe156047fa5d29f6) (cherry picked from commit 4807809ba7ec09a7e9cf533e334de282e0d373cd) (cherry picked from commit b49239d02408065e7e16ae5085c27df77ec4ac57) --- ovsdbapp/backend/ovs_idl/connection.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/ovsdbapp/backend/ovs_idl/connection.py b/ovsdbapp/backend/ovs_idl/connection.py index b06fefdd..75b8631c 100644 --- a/ovsdbapp/backend/ovs_idl/connection.py +++ b/ovsdbapp/backend/ovs_idl/connection.py @@ -15,6 +15,7 @@ import logging import os import threading +import time import traceback from ovs.db import idl @@ -93,9 +94,9 @@ class Connection(object): while self._is_running: # If we fail in an Idl call, we could have missed an update # from the server, leaving us out of sync with ovsdb-server. - # It is not safe to continue without restarting the connection, - # though it is likely that the error is unrecoverable, so only try - # a few times before bailing completely. + # It is not safe to continue without restarting the connection. + # Though it is likely that the error is unrecoverable, keep trying + # indefinitely just in case. 
try: self.idl.wait(self.poller) self.poller.fd_wait(self.txns.alert_fileno, poller.POLLIN) @@ -107,13 +108,18 @@ class Connection(object): # in python-ovs errors += 1 LOG.exception(e) - if errors <= 3: - with self.lock: - self.idl.force_reconnect() + with self.lock: + self.idl.force_reconnect() + try: idlutils.wait_for_change(self.idl, self.timeout) - continue - self._is_running = False - break + except Exception as e: + # This could throw the same exception as idl.run() + # or Exception("timeout"), either way continue + LOG.exception(e) + sleep = min(2 ** errors, 60) + LOG.info("Trying to recover, sleeping %s seconds", sleep) + time.sleep(sleep) + continue errors = 0 txn = self.txns.get_nowait() if txn is not None: