NSX-v3 update endpoint state only on timeout

This patch removes the NSX v3 client cluster logic that
forces a revalidate of all endpoints when endpoint
selection only finds DOWN endpoints. The revalidate
call can cause cascading backpressure under certain
circumstances.

Now DOWN endpoints are only returned to UP as part
of the endpoint keepalive ping that is controlled via
the conn_idle_timeout config property. Thus, the default
conn_idle_timeout is also decreased to 10s ensuring
endpoint revalidation occurs (by default) on a frequent
basis.

backport: liberty

Change-Id: I5423bce793892dd864353a23ca7c288b846a1ab6
Closes-Bug: #1541591
This commit is contained in:
Boden R 2016-02-03 14:39:27 -07:00
parent 772e43f576
commit e7acdfe91a
4 changed files with 6 additions and 17 deletions

View File

@ -369,7 +369,7 @@
# The amount of time in seconds to wait before ensuring connectivity to
# the NSX manager if no manager connection has been used.
# conn_idle_timeout = 60
# conn_idle_timeout = 10
# UUID of the default tier0 router that will be used for connecting to
# tier1 logical routers and configuring external networks

View File

@ -232,7 +232,7 @@ nsx_v3_opts = [
help=_("Maximum concurrent connections to each NSX "
"manager.")),
cfg.IntOpt('conn_idle_timeout',
default=60,
default=10,
help=_('Ensure connectivity to the NSX manager if a connection '
'is not used within timeout seconds.')),
cfg.IntOpt('redirects',

View File

@ -326,11 +326,6 @@ class ClusteredAPI(object):
if up == len(self._endpoints)
else ClusterHealth.ORANGE)
def revalidate_endpoints(self):
# validate each endpoint in serial
for endpoint in self._endpoints.values():
self._validate(endpoint)
def _validate(self, endpoint):
try:
with endpoint.pool.item() as conn:
@ -343,7 +338,7 @@ class ClusteredAPI(object):
"'%(ep)s' due to: %(err)s"),
{'ep': endpoint, 'err': e})
def _select_endpoint(self, revalidate=False):
def _select_endpoint(self):
connected = {}
for provider_id, endpoint in self._endpoints.items():
if endpoint.state == EndpointState.UP:
@ -352,12 +347,6 @@ class ClusteredAPI(object):
# connection can be used now
return endpoint
if not connected and revalidate:
LOG.debug("All endpoints DOWN; revalidating.")
# endpoints may have become available, try to revalidate
self.revalidate_endpoints()
return self._select_endpoint(revalidate=False)
# no free connections; randomly select a connected endpoint
# which will likely wait on pool.item() until a connection frees up
return (connected[random.choice(connected.keys())]
@ -382,8 +371,10 @@ class ClusteredAPI(object):
@contextlib.contextmanager
def endpoint_connection(self):
endpoint = self._select_endpoint(revalidate=True)
endpoint = self._select_endpoint()
if not endpoint:
# all endpoints are DOWN and will have their next
# state updated as per _endpoint_keepalive()
raise nsx_exc.ServiceClusterUnavailable(
cluster_id=self.cluster_id)

View File

@ -71,8 +71,6 @@ class NsxV3PluginTestCaseMixin(test_plugin.NeutronDbPluginV2TestCase,
self.cluster = nsx_cluster.NSXClusteredAPI(
http_provider=nsxlib_testcase.MemoryMockAPIProvider(self.mock_api))
self.cluster.revalidate_endpoints()
def _patch_object(*args, **kwargs):
patcher = mock.patch.object(*args, **kwargs)
patcher.start()