Add ZK cache stats

To observe the performance of the ZK connection and the new tree
caches, add some statsd metrics for each of these.  This will
let us monitor queue size over time.

Also, update the assertReportedStat method to output all received
stats if the expected stat was not found (like Zuul).

Change-Id: Ia7e1e0980fdc34007f80371ee0a77d4478948518
Depends-On: https://review.opendev.org/886552
This commit is contained in:
James E. Blair 2023-06-19 11:49:14 -07:00
parent 426951af18
commit 07c83f555d
5 changed files with 126 additions and 3 deletions

View File

@ -688,3 +688,45 @@ generic format ``<service-type>.<method>.<operation>``. For example, the
Since these calls reflect the internal operations of the
``openstacksdk``, the exact keys logged may vary across providers and
releases.
Internal metrics
^^^^^^^^^^^^^^^^
The following metrics are low-level performance metrics of the
launcher itself, primarily of interest to Nodepool developers, and are
subject to change in the future as development needs change:
.. zuul:stat:: nodepool.launcher.<hostname>.zk.client.connection_queue
:type: gauge
ZooKeeper client connection queue length.
.. zuul:stat:: nodepool.launcher.<hostname>.zk.node_cache.event_queue
:type: gauge
Node cache event queue length.
.. zuul:stat:: nodepool.launcher.<hostname>.zk.node_cache.playback_queue
:type: gauge
Node cache playback queue length.
.. zuul:stat:: nodepool.launcher.<hostname>.zk.request_cache.event_queue
:type: gauge
Request cache event queue length.
.. zuul:stat:: nodepool.launcher.<hostname>.zk.request_cache.playback_queue
:type: gauge
Request cache playback queue length.
.. zuul:stat:: nodepool.launcher.<hostname>.zk.image_cache.event_queue
:type: gauge
Image cache event queue length.
.. zuul:stat:: nodepool.launcher.<hostname>.zk.image_cache.playback_queue
:type: gauge
Image cache playback queue length.

View File

@ -1043,6 +1043,7 @@ class NodePool(threading.Thread):
self._cleanup_thread = None
self._delete_thread = None
self._stats_thread = None
self._local_stats_thread = None
self._submittedRequests = {}
self.ready = False
@ -1066,6 +1067,9 @@ class NodePool(threading.Thread):
self._stats_thread.stop()
self._stats_thread.join()
if self._local_stats_thread:
self._local_stats_thread.join()
# Don't let stop() return until all pool threads have been
# terminated.
self.log.debug("Stopping pool threads")
@ -1272,6 +1276,20 @@ class NodePool(threading.Thread):
for i in range(0, need):
createRequest(label.name)
def _localStats(self):
if not self.statsd:
self.log.info("Statsd not configured")
return
hostname = socket.gethostname()
key = f'nodepool.launcher.{hostname}'
while not self._stopped:
try:
if self.zk:
self.zk.reportStats(self.statsd, key)
except Exception:
self.log.exception("Unable to run local stats reporting:")
self._stop_event.wait(10)
def run(self):
'''
Start point for the NodePool thread.
@ -1309,6 +1327,11 @@ class NodePool(threading.Thread):
self._stats_thread = StatsWorker(self, self.stats_interval)
self._stats_thread.start()
if not self._local_stats_thread:
self._local_stats_thread = threading.Thread(
target=self._localStats)
self._local_stats_thread.start()
# Stop any PoolWorker threads if the pool was removed
# from the config.
pool_keys = set()

View File

@ -319,9 +319,9 @@ class BaseTestCase(testtools.TestCase):
if value:
self.assertNotEqual(kind, None)
for _ in iterate_timeout(5 * SECOND, Exception,
"Find statsd event",
interval=0.1):
start = time.time()
timeout = 5
while time.time() <= (start + timeout):
# Note our fake statsd just queues up results in a queue.
# We just keep going through them until we find one that
# matches, or fail out. If statsd pipelines are used,
@ -358,6 +358,10 @@ class BaseTestCase(testtools.TestCase):
# this key matches
return True
stats = itertools.chain.from_iterable(
[s.decode('utf-8').split('\n') for s in self.statsd.stats])
for stat in stats:
self.log.debug("Stat: %s", stat)
raise Exception("Key %s not found in reported stats" % key)

View File

@ -144,6 +144,30 @@ class TestLauncher(tests.DBTestCase):
self.assertReportedStat('nodepool.label.fake-label2.nodes.aborted',
value='0', kind='g')
hostname = socket.gethostname()
# Only check for presence since other threads could have
# outstanding requests and we don't know the value.
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
'.client.connection_queue')
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
'.node_cache.event_queue',
value='0', kind='g')
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
'.node_cache.playback_queue',
value='0', kind='g')
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
'.request_cache.event_queue',
value='0', kind='g')
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
'.request_cache.playback_queue',
value='0', kind='g')
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
'.image_cache.event_queue',
value='0', kind='g')
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
'.image_cache.playback_queue',
value='0', kind='g')
def test_node_assignment_order(self):
"""Test that nodes are assigned in the order requested"""
configfile = self.setup_config('node_many_labels.yaml')

View File

@ -1514,6 +1514,36 @@ class ZooKeeper(ZooKeeperBase):
'''
self.client.resetHosts(hosts)
def reportStats(self, statsd, root_key):
'''
Report stats using the supplied statsd object.
:param statsd statsd: A statsd instance
:param str root_key: The root key (eg 'nodepool.launcher.hostname')
'''
pipeline = statsd.pipeline()
key = f'{root_key}.zk.client.connection_queue'
pipeline.gauge(key, len(self.client.client._queue))
key = f'{root_key}.zk.node_cache.event_queue'
pipeline.gauge(key, self._node_cache._event_queue.qsize())
key = f'{root_key}.zk.node_cache.playback_queue'
pipeline.gauge(key, self._node_cache._playback_queue.qsize())
key = f'{root_key}.zk.request_cache.event_queue'
pipeline.gauge(key, self._request_cache._event_queue.qsize())
key = f'{root_key}.zk.request_cache.playback_queue'
pipeline.gauge(key, self._request_cache._playback_queue.qsize())
key = f'{root_key}.zk.image_cache.event_queue'
pipeline.gauge(key, self._image_cache._event_queue.qsize())
key = f'{root_key}.zk.image_cache.playback_queue'
pipeline.gauge(key, self._image_cache._playback_queue.qsize())
pipeline.send()
@contextmanager
def imageBuildLock(self, image, blocking=True, timeout=None):
'''