Update zoned executor stats

This gives us 3 sets of executor stats: zoned, unzoned, and the
existing top-level set (now deprecated, and a copy of the unzoned stats).

The zoned and unzoned stats are moved to a hierarchy that avoids
name collisions.

The only way to detect whether an executor in a zone is online
is to check for a registered function that includes its zone name.
Since the only function currently registered with a zone name is
"executor:execute", and that is unregistered when an executor is online
but not accepting, we need to add a new dummy function
("executor:online") in order to do this accounting.

This also updates the docs and adds a release note about the (minor)
change in stats meaning.

Change-Id: Ie28963426024f2d54275426794549f31ace9d998
This commit is contained in:
James E. Blair 2021-02-26 17:03:51 -08:00
parent 59c93ed108
commit 7599b6bdc0
5 changed files with 126 additions and 17 deletions

View File

@ -337,7 +337,14 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
.. stat:: zuul.executors
Holds metrics related to Zuul executors.
Holds metrics related to unzoned executors.
This is a copy of :stat:`zuul.executors.unzoned`. It does not
include information about zoned executors.
.. warning:: The metrics at this location are deprecated and will
be removed in a future version. Please begin using
:stat:`zuul.executors.unzoned` instead.
.. stat:: online
:type: gauge
@ -361,6 +368,61 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
executor to run on. This should ideally be at zero; persistent
higher values indicate more executor resources would be useful.
.. stat:: unzoned
Holds metrics related to unzoned executors.
.. stat:: online
:type: gauge
The number of unzoned Zuul executor processes online.
.. stat:: accepting
:type: gauge
The number of unzoned Zuul executor processes accepting new
jobs.
.. stat:: jobs_running
:type: gauge
The number of unzoned executor jobs running.
.. stat:: jobs_queued
:type: gauge
The number of jobs allocated nodes, but queued waiting for an
unzoned executor to run on. This should ideally be at zero;
persistent higher values indicate more executor resources
would be useful.
.. stat:: zone
Holds metrics related to zoned executors.
.. stat:: <zone>.online
:type: gauge
The number of Zuul executor processes online in this zone.
.. stat:: <zone>.accepting
:type: gauge
The number of Zuul executor processes accepting new jobs in
this zone.
.. stat:: <zone>.jobs_running
:type: gauge
The number of executor jobs running in this zone.
.. stat:: <zone>.jobs_queued
:type: gauge
The number of jobs allocated nodes, but queued waiting for an
executor in this zone to run on. This should ideally be at
zero; persistent higher values indicate more executor
resources would be useful.
.. stat:: zuul.scheduler

View File

@ -0,0 +1,12 @@
---
upgrade:
- |
Two sets of statsd metrics are now reported for executors: zoned
and unzoned. The existing statsd keys are now deprecated; new
statsd keys are available for both zoned and unzoned executors.
See :stat:`zuul.executors` for details.
fixes:
- |
If zoned executors were used with prior releases of Zuul, the
reported executor statistics would only represent a single,
unspecified zone. This has now been corrected.

View File

@ -114,11 +114,21 @@ class TestSchedulerZone(ZuulTestCase):
# Validate that the reported executor stats are correct. There must
# be two executors online and two accepting (one unzoned and one zoned)
self.assertReportedStat('zuul.executors.online', value='2', kind='g')
# TODO(corvus): remove deprecated top-level stats in 5.0
self.assertReportedStat(
'zuul.executors.online', value='1', kind='g')
self.assertReportedStat(
'zuul.executors.accepting', value='1', kind='g')
self.assertReportedStat(
'zuul.executors.test-provider_vpn.accepting', value='1', kind='g')
'zuul.executors.unzoned.online', value='1', kind='g')
self.assertReportedStat(
'zuul.executors.unzoned.accepting', value='1', kind='g')
self.assertReportedStat(
'zuul.executors.zone.test-provider_vpn.online',
value='1', kind='g')
self.assertReportedStat(
'zuul.executors.zone.test-provider_vpn.accepting',
value='1', kind='g')
self.gearman_server.hold_jobs_in_queue = True
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')

View File

@ -2699,10 +2699,17 @@ class ExecutorServer(BaseMergeServer):
if self.zone:
function_name += ':%s' % self.zone
# This function only exists so we can count how many executors
# are online.
online_name = 'executor:online'
if self.zone:
online_name += ':%s' % self.zone
self.executor_jobs = {
"executor:resume:%s" % self.hostname: self.resumeJob,
"executor:stop:%s" % self.hostname: self.stopJob,
function_name: self.executeJob,
online_name: self.noop,
}
self.executor_gearworker = ZuulGearWorker(
@ -2720,6 +2727,10 @@ class ExecutorServer(BaseMergeServer):
def _repoLock(self, connection_name, project_name):
# Look up the lock for this connection/project pair by delegating to
# self.repo_locks.getRepoLock() and return whatever it provides.
return self.repo_locks.getRepoLock(connection_name, project_name)
def noop(self, job):
"""A noop gearman job so we can register for statistics.

This handler is registered under the ``executor:online`` function
name (with ``:<zone>`` appended when the executor has a zone) only so
that online executors can be counted.  It performs no work of its own
and simply calls ``job.sendWorkComplete()``.
"""
job.sendWorkComplete()
def start(self):
# Start merger worker only if we process merge jobs
if self.process_merge_jobs:

View File

@ -436,39 +436,55 @@ class Scheduler(threading.Thread):
return
functions = getGearmanFunctions(self.rpc.gearworker.gearman)
functions.update(getGearmanFunctions(self.rpc_slow.gearworker.gearman))
executors_online = 0
mergers_online = 0
merge_queue = 0
merge_running = 0
for (name, (queued, running, registered)) in functions.items():
if name.startswith('executor:execute'):
executors_accepting = registered
execute_queue = queued - running
execute_running = running
tokens = name.split(':', 2)
if len(tokens) == 2:
# unzoned case
self.statsd.gauge('zuul.executors.unzoned.accepting',
registered)
self.statsd.gauge('zuul.executors.unzoned.jobs_running',
running)
self.statsd.gauge('zuul.executors.unzoned.jobs_queued',
execute_queue)
# TODO(corvus): Remove for 5.0:
self.statsd.gauge('zuul.executors.accepting',
executors_accepting)
registered)
self.statsd.gauge('zuul.executors.jobs_running',
execute_running)
running)
self.statsd.gauge('zuul.executors.jobs_queued',
execute_queue)
else:
# zoned case
zone = tokens[2]
self.statsd.gauge('zuul.executors.%s.accepting' %
self.statsd.gauge('zuul.executors.zone.%s.accepting' %
normalize_statsd_name(zone),
executors_accepting)
self.statsd.gauge('zuul.executors.%s.jobs_running' %
registered)
self.statsd.gauge('zuul.executors.zone.%s.jobs_running' %
normalize_statsd_name(zone),
execute_running)
self.statsd.gauge('zuul.executors.%s.jobs_queued' %
running)
self.statsd.gauge('zuul.executors.zone.%s.jobs_queued' %
normalize_statsd_name(zone),
execute_queue)
if name.startswith('executor:stop'):
executors_online += registered
if name.startswith('executor:online'):
tokens = name.split(':', 2)
if len(tokens) == 2:
# unzoned case
self.statsd.gauge('zuul.executors.unzoned.online',
registered)
# TODO(corvus): Remove for 5.0:
self.statsd.gauge('zuul.executors.online', registered)
else:
# zoned case
zone = tokens[2]
self.statsd.gauge('zuul.executors.zone.%s.online' %
normalize_statsd_name(zone),
registered)
if name == 'merger:merge':
mergers_online = registered
if name.startswith('merger:'):
@ -477,8 +493,6 @@ class Scheduler(threading.Thread):
self.statsd.gauge('zuul.mergers.online', mergers_online)
self.statsd.gauge('zuul.mergers.jobs_running', merge_running)
self.statsd.gauge('zuul.mergers.jobs_queued', merge_queue)
self.statsd.gauge('zuul.executors.online', executors_online)
self.statsd.gauge('zuul.scheduler.eventqueues.trigger',
self.trigger_event_queue.qsize())
self.statsd.gauge('zuul.scheduler.eventqueues.result',