Prevent hung process/threads when server_endpoint is not reachable

This commit is to address an issue that can cause Shaker to get
stuck indefinately waiting for a zeroMQ message that will never
arrive.

To reproduce set server_endpoint to 10.0.0.1:8080 (assuming
nothing is listening and/or this address is unreachable).
Shaker will go through launching the heartbeat thread, and
creating the heat stack, but then will be stuck at:
Waiting for quorum of agents:
If debug is turned on, there are no heartbeats coming through.

In order to fix this situation this commit sends 3 "tcp pings"
to the server_endpoint in order to test it's response before
the heartbeat thread and heat stack are created.

Change-Id: Ifcfacfb383e7553b53c5b2a20ad39ceccb833c8f
This commit is contained in:
Oded Le'Sage 2020-02-05 09:23:58 -06:00
parent b948585b71
commit 639760a0fa
1 changed files with 102 additions and 4 deletions

View File

@ -14,13 +14,15 @@
# limitations under the License.
import multiprocessing
from oslo_log import log as logging
import socket
import time
import zmq
from oslo_log import log as logging
from shaker.agent import agent
from shaker.engine import utils
from six.moves import zip_longest
from timeit import default_timer as timer
LOG = logging.getLogger(__name__)
@ -29,13 +31,27 @@ HEARTBEAT_AGENT = '__heartbeat'
class MessageQueue(object):
def __init__(self, endpoint):
_, port = utils.split_address(endpoint)
ip, port = utils.split_address(endpoint)
context = zmq.Context()
self.socket = context.socket(zmq.REP)
self.socket.bind("tcp://*:%s" % port)
LOG.info('Listening on *:%s', port)
# Test that endpoint is actually reachable
# otherwise the process will get stuck indefinately waiting
# for a REQ/REP that will never happen.
# The code to support this was adapted from pypi package tcping
try:
LOG.info("Testing route to %s" % endpoint)
ping_test = Ping(ip, port)
ping_test.ping(3)
if ping_test._success == 0:
raise socket.timeout("No valid route to %s" % endpoint)
except socket.error as e:
LOG.exception(e)
raise
heartbeat = multiprocessing.Process(
target=agent.work,
kwargs=dict(agent_id=HEARTBEAT_AGENT, endpoint=endpoint,
@ -65,3 +81,85 @@ class MessageQueue(object):
else:
LOG.exception(e)
raise
class Socket(object):
def __init__(self, family, type_, timeout):
s = socket.socket(family, type_)
s.settimeout(timeout)
self._s = s
def connect(self, host, port=80):
self._s.connect((host, int(port)))
def shutdown(self):
self._s.shutdown(socket.SHUT_RD)
def close(self):
self._s.close()
class Timer(object):
def __init__(self):
self._start = 0
self._stop = 0
def start(self):
self._start = timer()
def stop(self):
self._stop = timer()
def cost(self, funcs, args):
self.start()
for func, arg in zip_longest(funcs, args):
if arg:
func(*arg)
else:
func()
self.stop()
return self._stop - self._start
class Ping(object):
def __init__(self, host, port=80, timeout=1):
self.timer = Timer()
self._success = 0
self._failed = 0
self._conn_times = []
self._host = host
self._port = port
self._timeout = timeout
def _create_socket(self, family, type_):
return Socket(family, type_, self._timeout)
def ping(self, count=10):
for n in range(1, count + 1):
s = self._create_socket(socket.AF_INET, socket.SOCK_STREAM)
try:
time.sleep(1)
cost_time = self.timer.cost(
(s.connect, s.shutdown),
((self._host, self._port), None))
s_runtime = 1000 * (cost_time)
LOG.debug("Connected to %s[:%s]: seq=%d time=%.2f ms" % (
self._host, self._port, n, s_runtime))
self._conn_times.append(s_runtime)
except socket.timeout:
LOG.error("Connected to %s[:%s]: seq=%d time out!" % (
self._host, self._port, n))
self._failed += 1
except KeyboardInterrupt:
raise KeyboardInterrupt()
else:
self._success += 1
finally:
s.close()