From 7225354ec04f0466d95b6d68e51002723d6529c9 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 10 Jan 2019 14:45:21 -0500 Subject: [PATCH] Add a timeout for the image build A builder thread can wedge if the build process wedges. Add a timeout to the subprocess. Since it was the call to readline() that would block, we change the process to have DIB write directly to the log. This allows us to set a timeout in the Popen.wait() call. And we kill the dib subprocess, as well. The timeout value can be controlled in the diskimage configuration and defaults to 8 hours. Change-Id: I188e8a74dc39b55a4b50ade5c1a96832fea76a7d --- doc/source/configuration.rst | 6 ++ nodepool/builder.py | 45 +++++++---- nodepool/cmd/config_validator.py | 1 + nodepool/config.py | 5 +- nodepool/tests/__init__.py | 23 +++--- nodepool/tests/fake-image-create | 78 +++++++++++++------ .../tests/fixtures/config_validate/good.yaml | 1 + .../fixtures/diskimage_build_timeout.yaml | 26 +++++++ nodepool/tests/unit/test_builder.py | 9 +++ .../notes/build-timeout-bb68a1fd24f97a10.yaml | 6 ++ 10 files changed, 149 insertions(+), 51 deletions(-) create mode 100644 nodepool/tests/fixtures/diskimage_build_timeout.yaml create mode 100644 releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 27abacf86..7bec67054 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -277,6 +277,12 @@ Options Specifies the distro to be used as a base image to build the image using diskimage-builder. + .. attr:: build-timeout + :type: int + + How long (in seconds) to wait for the diskimage build before giving up. + The default is 8 hours. + .. attr:: elements :type: list diff --git a/nodepool/builder.py b/nodepool/builder.py index dce709e9e..16fe7148b 100755 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -739,21 +739,28 @@ class BuildWorker(BaseWorker): if 'qcow2' in img_types: qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS - cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' % + log_fn = self._getBuildLog(diskimage.name, build_id) + + cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' % (self.dib_cmd, img_types, qemu_img_options, filename, - img_elements)) + log_fn, img_elements)) self._pruneBuildLogs(diskimage.name) - log_fn = self._getBuildLog(diskimage.name, build_id) self.log.info('Running %s' % (cmd,)) self.log.info('Logging to %s' % (log_fn,)) start_time = time.monotonic() + + # We used to use readline() on stdout to output the lines to the + # build log. Unfortunately, this would block as long as the process + # ran (with no easy way to timeout the read) and wedge the builder. + # Now we use --logfile option to the dib command and set a timeout + # on the wait() call to prevent the wedge. + did_timeout = False try: p = subprocess.Popen( shlex.split(cmd), - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) except OSError as e: @@ -761,17 +768,20 @@ class BuildWorker(BaseWorker): "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) ) - with open(log_fn, 'wb') as log: - while True: - ln = p.stdout.readline() - log.write(ln) - log.flush() - if not ln: - break - - rc = p.wait() - m = "Exit code: %s\n" % rc - log.write(m.encode('utf8')) + try: + rc = p.wait(timeout=diskimage.build_timeout) + except subprocess.TimeoutExpired: + p.kill() + did_timeout = True + rc = 1 + self.log.error( + "Build timeout for image %s, build %s (log: %s)", + diskimage.name, build_id, log_fn) + else: + # Append return code to dib's log file + with open(log_fn, 'ab') as log: + m = "Exit code: %s\n" % rc + log.write(m.encode('utf8')) # It's possible the connection to the ZK cluster could have been # interrupted during the build. If so, wait for it to return. @@ -796,9 +806,10 @@ class BuildWorker(BaseWorker): self.log.info("ZooKeeper lost while building %s" % diskimage.name) self._zk.resetLostFlag() build_data.state = zk.FAILED - elif p.returncode: + elif p.returncode or did_timeout: self.log.info( - "DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) + "DIB failed creating %s (%s) (timeout=%s)" % ( + diskimage.name, p.returncode, did_timeout)) build_data.state = zk.FAILED else: self.log.info("DIB image %s is built" % diskimage.name) diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index c3dde57d7..f2646af34 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -44,6 +44,7 @@ class ConfigValidator: 'rebuild-age': int, 'env-vars': {str: str}, 'username': str, + 'build-timeout': int, } webapp = { diff --git a/nodepool/config.py b/nodepool/config.py index 2b5e4ebd4..7f09961c7 100755 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -118,6 +118,7 @@ class Config(ConfigValue): d.image_types = set(diskimage.get('formats', [])) d.pause = bool(diskimage.get('pause', False)) d.username = diskimage.get('username', 'zuul') + d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60)) self.diskimages[d.name] = d def setSecureDiskimageEnv(self, diskimages, secure_config_path): @@ -179,6 +180,7 @@ class DiskImage(ConfigValue): self.image_types = None self.pause = False self.username = None + self.build_timeout = None def __eq__(self, other): if isinstance(other, DiskImage): @@ -189,7 +191,8 @@ class DiskImage(ConfigValue): other.env_vars == self.env_vars and other.image_types == self.image_types and other.pause == self.pause and - other.username == self.username) + other.username == self.username and + other.build_timeout == self.build_timeout) return False def __repr__(self): diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 4a8e83d91..376b4fb5f 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -410,20 +410,25 @@ class DBTestCase(BaseTestCase): time.sleep(1) self.wait_for_threads() - def waitForBuild(self, image_name, build_id): + def waitForBuild(self, image_name, build_id, states=None): + if states is None: + states = (zk.READY,) + base = "-".join([image_name, build_id]) - while True: - self.wait_for_threads() - files = builder.DibImageFile.from_image_id( - self._config_images_dir.path, base) - if files: - break - time.sleep(1) while True: self.wait_for_threads() build = self.zk.getBuild(image_name, build_id) - if build and build.state == zk.READY: + if build and build.state in states: + break + time.sleep(1) + + # We should only expect a dib manifest with a successful build. + while build.state == zk.READY: + self.wait_for_threads() + files = builder.DibImageFile.from_image_id( + self._config_images_dir.path, base) + if files: break time.sleep(1) diff --git a/nodepool/tests/fake-image-create b/nodepool/tests/fake-image-create index 9d236ff16..1a2406fd2 100755 --- a/nodepool/tests/fake-image-create +++ b/nodepool/tests/fake-image-create @@ -1,10 +1,49 @@ #!/bin/bash +outfile= +outtypes=("qcow2") + +all_args=$* +logfile= +checksum= +no_tmpfs= +qemu_img_options= +x= + +TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@") +if [ $? -ne 0 ]; then + echo "Invalid option" + exit 1 +fi +eval set -- "$TEMP" +while true ; do + case "$1" in + --checksum) + checksum=1; shift 1;; + --no-tmpfs) + no_tmpfs=1; shift 1;; + --qemu-img-options) + qemu_img_options=$2; shift 2;; + --logfile) + logfile=$2; shift 2;; + -o) outfile=$2; shift 2;; + -t) IFS="," read -a outtypes <<< "$2"; shift 2;; + -x) x=1; shift;; + --) shift ; break ;; + *) echo "Unknown option : $1"; exit 1;; + esac +done + +# If --logfile was given, direct stdout to it, as well +if [ ! -z "$logfile" ]; then + exec > >(tee -a ${logfile}) +fi + echo "*** fake-image-create: start" echo "arguments:" echo "----" -echo $* +echo "$all_args" echo "----" if [[ "${SHOULD_FAIL}" == 'true' ]]; then @@ -38,30 +77,21 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2" exit 1 fi -outfile= -outtypes=("qcow2") - -TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@") -if [ $? -ne 0 ]; then - echo "Invalid option" - exit 1 +if [ ! -z "$logfile" ]; then + echo " -> logfile: $logfile" +fi +if [ ! -z "$checksum" ]; then + echo " -> set --checksum" +fi +if [ ! -z "$no_tmpfs" ]; then + echo " -> set --no-tmpfs" +fi +if [ ! -z "$qemu_img_options" ]; then + echo " -> qemu-img-options: $qemu_img_options" +fi +if [ ! -z "$x" ]; then + echo " -> debugging enabled" fi -eval set -- "$TEMP" -while true ; do - case "$1" in - --checksum) - echo " -> set --checksum"; shift 1;; - --no-tmpfs) - echo " -> set --no-tmpfs"; shift 1;; - --qemu-img-options) - echo " -> qemu-img-options: $2"; shift 2;; - -o) outfile=$2; shift 2;; - -t) IFS="," read -a outtypes <<< "$2"; shift 2;; - -x) echo " -> debugging enabled"; shift;; - --) shift ; break ;; - *) echo "Unknown option : $1"; exit 1;; - esac -done if [ -z "$outfile" ]; then echo "No output file specified." diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml index db7d1ecfd..fc5993741 100644 --- a/nodepool/tests/fixtures/config_validate/good.yaml +++ b/nodepool/tests/fixtures/config_validate/good.yaml @@ -130,6 +130,7 @@ diskimages: - cache-devstack release: trusty rebuild-age: 3600 + build-timeout: 3600 env-vars: TMPDIR: /opt/dib_tmp DIB_IMAGE_CACHE: /opt/dib_cache diff --git a/nodepool/tests/fixtures/diskimage_build_timeout.yaml b/nodepool/tests/fixtures/diskimage_build_timeout.yaml new file mode 100644 index 000000000..01770bddd --- /dev/null +++ b/nodepool/tests/fixtures/diskimage_build_timeout.yaml @@ -0,0 +1,26 @@ +elements-dir: . +images-dir: '{images_dir}' +build-log-dir: '{build_log_dir}' + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +labels: [] + +providers: [] + +diskimages: + - name: fake-image + formats: + - tar + elements: + - fedora + - vm + release: 21 + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py index c992bf7aa..cfea4b855 100644 --- a/nodepool/tests/unit/test_builder.py +++ b/nodepool/tests/unit/test_builder.py @@ -16,6 +16,8 @@ import os import uuid import fixtures +import mock +import subprocess from nodepool import builder, exceptions, tests from nodepool.driver.fake import provider as fakeprovider @@ -335,3 +337,10 @@ class TestNodePoolBuilder(tests.DBTestCase): self.assertEqual(build_default._formats, ['qcow2']) self.assertEqual(build_vhd._formats, ['vhd']) + + @mock.patch.object(subprocess.Popen, 'wait') + def test_diskimage_build_timeout(self, mock_wait): + mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1) + configfile = self.setup_config('diskimage_build_timeout.yaml') + self.useBuilder(configfile, cleanup_interval=0) + self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,)) diff --git a/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml new file mode 100644 index 000000000..6901903db --- /dev/null +++ b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + A new option (build-timeout) has been added to the builder diskimage + configuration to control how long the builder should wait for image + builds before giving up. The default is 8 hours.