diff --git a/doc/source/operation.rst b/doc/source/operation.rst index b8192f6b0..4de69f6e9 100644 --- a/doc/source/operation.rst +++ b/doc/source/operation.rst @@ -493,6 +493,52 @@ The following metrics are produced by a ``nodepool-builder`` process: Number of manual build requests outstanding (does not include currently running builds). +.. zuul:stat:: nodepool.image.<image name>.image_build_requests + :type: gauge + + Number of manual build requests outstanding (does not include + currently running builds) for the specified image. + +.. zuul:stat:: nodepool.builder.<hostname>.current_builds + :type: gauge + + The number of builds currently in progress. + +.. zuul:stat:: nodepool.builder.<hostname>.current_uploads + :type: gauge + + The number of uploads currently in progress. + +.. zuul:stat:: nodepool.builder.<hostname>.build_workers + :type: gauge + + The number of simultaneous build workers configured for this builder. + +.. zuul:stat:: nodepool.builder.<hostname>.upload_workers + :type: gauge + + The number of simultaneous upload workers configured for this builder. + +.. zuul:stat:: nodepool.builder.<hostname>.image.<image name>.build.state + :type: gauge + + Indicates whether a builder is currently building an image. The + value will be one of the following constants: + + 0: idle + 1: building + 3: paused + +.. zuul:stat:: nodepool.builder.<hostname>.image.<image name>.provider.<provider name>.upload.state + :type: gauge + + Indicates whether a builder is currently uploading an image.
The + value will be one of the following constants: + + 0: idle + 2: uploading + 3: paused + Nodepool launcher ~~~~~~~~~~~~~~~~~ diff --git a/nodepool/builder.py b/nodepool/builder.py index 0aca542f9..0a8e6f68e 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -49,6 +49,12 @@ DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'" # DIB process polling timeout, in milliseconds BUILD_PROCESS_POLL_TIMEOUT = 30 * 1000 +# Constants for image processing status +STATUS_IDLE = 0 +STATUS_BUILDING = 1 +STATUS_UPLOADING = 2 +STATUS_PAUSED = 3 + class DibImageFile(object): ''' @@ -120,6 +126,8 @@ class BaseWorker(threading.Thread): self._statsd = stats.get_client() self._interval = interval self._builder_id = builder_id + # Record what this worker is doing for each image + self._image_status = {} def _checkForZooKeeperChanges(self, new_config): ''' @@ -158,7 +166,8 @@ class CleanupWorker(BaseWorker): ''' def __init__(self, name, builder_id, config_path, secure_path, - interval, zk): + interval, zk, builder): + self.builder = builder super(CleanupWorker, self).__init__(builder_id, config_path, secure_path, interval, zk) self.log = logging.getLogger( @@ -412,14 +421,41 @@ class CleanupWorker(BaseWorker): ''' if not self._statsd: return - count = 0 + request_count = 0 + pipeline = self._statsd.pipeline() for image_name in self._zk.getImageNames(): + this_image_request_count = 0 request = self._zk.getBuildRequest(image_name) if request and request.pending: - count += 1 - pipeline = self._statsd.pipeline() + request_count += 1 + this_image_request_count += 1 + # Determine an overall build/upload state + build_status = STATUS_IDLE + for build_worker in self.builder._build_workers: + build_status = max( + build_worker._image_status.get( + image_name, STATUS_IDLE), + build_status) + key = (f'nodepool.builder.{self._hostname}.image.' 
+ f'{image_name}.build.state') + pipeline.gauge(key, build_status) + upload_status_by_provider = {} + for upload_worker in self.builder._upload_workers: + for provider_name, provider_status in \ + upload_worker._image_status.get(image_name, {}).items(): + upload_status_by_provider[provider_name] = max( + provider_status, upload_status_by_provider.get( + provider_name, STATUS_IDLE)) + for provider_name, upload_status in \ + upload_status_by_provider.items(): + key = (f'nodepool.builder.{self._hostname}.image.{image_name}.' + f'provider.{provider_name}.upload.state') + pipeline.gauge(key, upload_status) + key = f'nodepool.image.{image_name}.image_build_requests' + pipeline.gauge(key, this_image_request_count) key = 'nodepool.image_build_requests' - pipeline.gauge(key, count) + pipeline.gauge(key, request_count) + pipeline.send() def _cleanupCurrentProviderUploads(self, provider, image, build_id): @@ -670,10 +706,13 @@ class BuildWorker(BaseWorker): ''' # Check if diskimage builds are paused. if diskimage.pause: + self._image_status[diskimage.name] = STATUS_PAUSED return if self._zk.getImagePaused(diskimage.name): + self._image_status[diskimage.name] = STATUS_PAUSED return + self._image_status[diskimage.name] = STATUS_IDLE if not diskimage.image_types: # We don't know what formats to build. return @@ -703,10 +742,13 @@ class BuildWorker(BaseWorker): return self.log.info("Building image %s" % diskimage.name) + self._image_status[diskimage.name] = STATUS_BUILDING self._buildWrapper(diskimage) except exceptions.ZKLockException: # Lock is already held. Skip it. pass + finally: + self._image_status[diskimage.name] = STATUS_IDLE def _buildWrapper(self, diskimage): ''' @@ -779,10 +821,13 @@ class BuildWorker(BaseWorker): ''' # Check if diskimage builds are paused. 
if diskimage.pause: + self._image_status[diskimage.name] = STATUS_PAUSED return if self._zk.getImagePaused(diskimage.name): + self._image_status[diskimage.name] = STATUS_PAUSED return + self._image_status[diskimage.name] = STATUS_IDLE if not diskimage.image_types: # We don't know what formats to build. return @@ -800,6 +845,7 @@ class BuildWorker(BaseWorker): self.log.info( "Manual build request for image %s" % diskimage.name) + self._image_status[diskimage.name] = STATUS_BUILDING data = self._buildWrapper(diskimage) # Remove request on a successful build @@ -809,6 +855,8 @@ class BuildWorker(BaseWorker): except exceptions.ZKLockException: # Lock is already held. Skip it. pass + finally: + self._image_status[diskimage.name] = STATUS_IDLE def _buildImage(self, build_id, diskimage): ''' @@ -1241,10 +1289,15 @@ class UploadWorker(BaseWorker): :returns: True if an upload was attempted, False otherwise. ''' + + self._image_status.setdefault(image.name, {}) + # Check if image uploads are paused. if provider.diskimages.get(image.name).pause: + self._image_status[image.name][provider.name] = STATUS_PAUSED return False + self._image_status[image.name][provider.name] = STATUS_IDLE # Search for the most recent 'ready' image build builds = self._zk.getMostRecentBuilds(1, image.name, zk.READY) @@ -1300,6 +1353,8 @@ class UploadWorker(BaseWorker): upnum = self._zk.storeImageUpload( image.name, build.id, provider.name, data) + self._image_status[image.name][provider.name] =\ + STATUS_UPLOADING data = self._uploadImage(build.id, upnum, image.name, local_images, provider, build.username, build.python_path, @@ -1312,6 +1367,8 @@ class UploadWorker(BaseWorker): except exceptions.ZKLockException: # Lock is already held. Skip it. 
return False + finally: + self._image_status[image.name][provider.name] = STATUS_IDLE def run(self): @@ -1467,7 +1524,7 @@ class NodePoolBuilder(object): self._janitor = CleanupWorker( 0, builder_id, self._config_path, self._secure_path, - self.cleanup_interval, self.zk) + self.cleanup_interval, self.zk, self) self._janitor.start() # Wait until all threads are running. Otherwise, we have a race diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py index 25f88e3e2..48645cb83 100644 --- a/nodepool/tests/unit/test_builder.py +++ b/nodepool/tests/unit/test_builder.py @@ -17,6 +17,7 @@ import os import uuid import fixtures import mock +import socket import time from nodepool import builder, tests @@ -478,6 +479,14 @@ class TestNodePoolBuilder(tests.DBTestCase): '4096', 'g') self.assertReportedStat('nodepool.dib_image_build.' 'fake-image-vhd.vhd.size', '4096', 'g') + hostname = socket.gethostname() + self.assertReportedStat(f'nodepool.builder.{hostname}.' + 'image.fake-image-default-format.' + 'build.state', '0', 'g') + self.assertReportedStat(f'nodepool.builder.{hostname}.' + 'image.fake-image-default-format.' + 'provider.fake-provider-default-format.' 
+ 'upload.state', '0', 'g') def test_diskimage_build_parents(self): configfile = self.setup_config('node_diskimage_parents.yaml') diff --git a/releasenotes/notes/builder-stats-b29d568e53ea7d1b.yaml b/releasenotes/notes/builder-stats-b29d568e53ea7d1b.yaml new file mode 100644 index 000000000..167667e5f --- /dev/null +++ b/releasenotes/notes/builder-stats-b29d568e53ea7d1b.yaml @@ -0,0 +1,12 @@ +--- +features: + - | + The following new statsd keys are available for builders: + + * :zuul:stat:`nodepool.image.<image name>.image_build_requests` + * :zuul:stat:`nodepool.builder.<hostname>.current_builds` + * :zuul:stat:`nodepool.builder.<hostname>.current_uploads` + * :zuul:stat:`nodepool.builder.<hostname>.build_workers` + * :zuul:stat:`nodepool.builder.<hostname>.upload_workers` + * :zuul:stat:`nodepool.builder.<hostname>.image.<image name>.build.state` + * :zuul:stat:`nodepool.builder.<hostname>.image.<image name>.provider.<provider name>.upload.state`