From 20bb6a02551d1608e8e1b7a5a51cbc8aeac2284a Mon Sep 17 00:00:00 2001
From: Andreas Florath <andreas@florath.net>
Date: Wed, 20 Jul 2016 23:34:50 +0200
Subject: [PATCH] Add option to be able to run_functests.sh in parallel

Running the functional tests is time consuming.  This patch adds the
option `-j <job count>` to tests/run_functests.sh: when given, the
tests run in parallel, using up to <job count> jobs.
When using this, be sure to have enough resources (CPUs, RAM and HD
space) on the host.

In addition there was the need to change two things:
o The global /tmp/dib-test-should-fail was moved to the temporary
  build directory of each execution.
o Because the logs might now interleave, each log line now has the
  name of the test case as a prefix.

[In my environment, running the functests sequentially takes 15+
minutes; running them in parallel takes less than 6 minutes.]

Change-Id: Id9ea5131f0026c292ca6453ba2c80fe12c47f808
Signed-off-by: Andreas Florath <andreas@florath.net>
---
 doc/source/developer/developing_elements.rst |   5 +
 tests/run_functests.sh                       | 129 +++++++++++++++++--
 2 files changed, 126 insertions(+), 8 deletions(-)
diff --git a/doc/source/developer/developing_elements.rst b/doc/source/developer/developing_elements.rst
index 0cee5946e..aa5364406 100644
--- a/doc/source/developer/developing_elements.rst
+++ b/doc/source/developer/developing_elements.rst
@@ -385,6 +385,11 @@ line to run it.  If it should not be run as part of the default CI
 run, you can submit a change with it added to ``DEFAULT_SKIP_TESTS``
 in that file.
 
+Running the functional tests is time consuming.  Multiple parallel
+jobs can be started by specifying ``-j <job count>``.  Each of the
+jobs uses a lot of resources (CPU, disk space, RAM) - therefore the job
+count must be chosen carefully.
+
 python
 """"""
 
diff --git a/tests/run_functests.sh b/tests/run_functests.sh
index f43c1e2f7..4c6801eea 100755
--- a/tests/run_functests.sh
+++ b/tests/run_functests.sh
@@ -23,21 +23,65 @@ DEFAULT_SKIP_TESTS=(
     debian-minimal/testing-build-succeeds
 )
 
+function log_with_prefix {  # <prefix>: copy stdin to stdout as "<timestamp> [<prefix>] <line>"
+    local pr=$1 a
+    # IFS=/-r keep each line verbatim; '|| [ -n ]' keeps a last line without newline.
+    while IFS= read -r a || [ -n "$a" ]; do
+        printf '%s [%s] %s\n' "$(date +"%Y%m%d-%H%M%S.%N")" "$pr" "$a"
+    done
+}
+
+# Print a job-control status line: "[JOB-CONTROL] <date> <message>".
+function log_jc {
+    local stamp; stamp="$(date)"
+    printf '[JOB-CONTROL] %s %s\n' "${stamp}" "$1"
+}
+
+function job_cnt {  # print the number of PIDs reported by 'jobs -p'
+    local pids; pids=$(jobs -p)
+    wc -w <<<"${pids}"
+}
+
+# Wait for one background job and return its exit status.  'wait -n'
+# (wait for any job) is only available since bash 4.3.
+function wait_minus_n {
+    local major="${BASH_VERSINFO[0]}" minor="${BASH_VERSINFO[1]}"
+    local first_pid
+    if (( major > 4 || (major == 4 && minor >= 3) )); then
+        # Good way: wait on any job
+        wait -n
+    else
+        # Not that good way: wait on one specific job
+        # (others may be finished in the mean time)
+        first_pid=$(jobs -p | head -n 1)
+        wait ${first_pid}  # unquoted: with no jobs this is a plain 'wait'
+    fi
+    # $? is still the status of whichever 'wait' ran above.
+    return $?
+}
+
 # run_disk_element_test <test_element> <element>
 #  Run a disk-image-build .tar build of ELEMENT including any elements
 #  specified by TEST_ELEMENT
 function run_disk_element_test() {
     local test_element=$1
     local element=$2
+    local dont_use_tmp=$3
+    local use_tmp_flag=""
     local dest_dir=$(mktemp -d)
 
-    trap "rm -rf $dest_dir /tmp/dib-test-should-fail" EXIT
+    trap "rm -rf $dest_dir" EXIT
+
+    if [ "${dont_use_tmp}" = "yes" ]; then
+        use_tmp_flag="--no-tmpfs"
+    fi
 
     if break="after-error" break_outside_target=1 \
-        break_cmd="cp \$TMP_MOUNT_PATH/tmp/dib-test-should-fail /tmp/ 2>&1 > /dev/null || true" \
+        break_cmd="cp -v \$TMP_MOUNT_PATH/tmp/dib-test-should-fail ${dest_dir} || true" \
         DIB_SHOW_IMAGE_USAGE=1 \
         ELEMENTS_PATH=$DIB_ELEMENTS:$DIB_ELEMENTS/$element/test-elements \
-        $DIB_CMD -x -t tar,qcow2 -o $dest_dir/image -n $element $test_element; then
+        $DIB_CMD -x -t tar,qcow2 ${use_tmp_flag} -o $dest_dir/image -n $element $test_element 2>&1 \
+           | log_with_prefix "${element}/${test_element}"; then
 
         if ! [ -f "$dest_dir/image.qcow2" ]; then
             echo "Error: qcow2 build failed for element: $element, test-element: $test_element."
@@ -58,7 +102,7 @@ function run_disk_element_test() {
             fi
         fi
     else
-        if [ -f "/tmp/dib-test-should-fail" ]; then
+        if [ -f "${dest_dir}/dib-test-should-fail" ]; then
             echo "PASS: Element $element, test-element: $test_element"
         else
             echo "Error: Build failed for element: $element, test-element: $test_element."
@@ -79,7 +123,8 @@ function run_ramdisk_element_test() {
     local dest_dir=$(mktemp -d)
 
     if ELEMENTS_PATH=$DIB_ELEMENTS/$element/test-elements \
-        $DIB_CMD -x -o $dest_dir/image $element $test_element; then
+        $DIB_CMD -x -o $dest_dir/image $element $test_element \
+            | log_with_prefix "${element}/${test_element}"; then
         # TODO(dtantsur): test also kernel presence once we sort out its naming
         # problem (vmlinuz vs kernel)
         if ! [ -f "$dest_dir/image.initramfs" ]; then
@@ -109,12 +154,15 @@ for e in $DIB_ELEMENTS/*/test-elements/*; do
     TESTS+=("$element/$test_element")
 done
 
-while getopts ":hl" opt; do
+JOB_MAX_CNT=1
+
+while getopts ":hlpj:" opt; do
     case $opt in
         h)
             echo "run_functests.sh [-h] [-l] <test> <test> ..."
             echo "  -h : show this help"
             echo "  -l : list available tests"
+            echo "  -j <job count> : run tests in parallel, using up to <job count> jobs"
             echo "  <test> : functional test to run"
             echo "           Special test 'all' will run all tests"
             exit 0
@@ -128,6 +176,10 @@ while getopts ":hl" opt; do
             echo
             exit 0
             ;;
+        j)
+            JOB_MAX_CNT=${OPTARG}
+            echo "Running parallel - using [${JOB_MAX_CNT}] jobs"
+            ;;
         \?)
             echo "Invalid option: -$OPTARG"
             exit 1
@@ -136,6 +188,15 @@ while getopts ":hl" opt; do
 done
 shift $((OPTIND-1))
 
+# Parallel runs switch off using the tmp dir (tmpfs) for image
+# building: the mem check for it is currently based on the available
+# memory - and not on the free.  See #1618124 for more details.
+if [ "${JOB_MAX_CNT}" -gt 1 ]; then
+    DONT_USE_TMP="yes"
+else
+    DONT_USE_TMP="no"
+fi
+
 # cull the list of tests to run into TESTS_TO_RUN
 TESTS_TO_RUN=()
 title=""
@@ -171,7 +232,36 @@ for test in "${TESTS_TO_RUN[@]}"; do
 done
 echo "------"
 
+# wait_and_exit_on_failure <pid>
+#  Wait for PID; if it ended non-zero, exit the script with that status.
+function wait_and_exit_on_failure {
+    local pid=$1
+    local result=0
+    # 'result' stays local; '||' captures the status even under 'set -e'.
+    wait "${pid}" || result=$?
+    if [ "${result}" -ne 0 ]; then
+        exit "${result}"
+    fi
+    return 0
+}
+
+EXIT_CODE=0
 for test in "${TESTS_TO_RUN[@]}"; do
+    running_jobs_cnt=$(job_cnt)
+    log_jc "Number of running jobs [${running_jobs_cnt}] max jobs [${JOB_MAX_CNT}]"
+    if [ "${running_jobs_cnt}" -ge "${JOB_MAX_CNT}" ]; then
+        log_jc "Waiting for job to finish"
+        wait_minus_n
+        result=$?
+
+        if [ "${result}" -ne 0 ]; then
+            EXIT_CODE=1
+            # If a job fails, do not start any new ones.
+            break
+        fi
+    fi
+
+    log_jc "Starting new job"
+
     # from above; each array value is element/test_element.  split it
     # back up
     element=${test%/*}
@@ -186,7 +276,30 @@ for test in "${TESTS_TO_RUN[@]}"; do
     fi
 
     echo "Running $test ($element_type)"
-    run_${element_type}_element_test $test_element $element
+    run_${element_type}_element_test $test_element $element ${DONT_USE_TMP} &
 done
 
-echo "Tests passed!"
+# Wait for the rest of the jobs.  The job count is logged on every
+# pass, including the last one that finds nothing left to wait for.
+while true; do
+    running_jobs_cnt=$(job_cnt)
+    log_jc "Number of running jobs left [${running_jobs_cnt}]"
+
+    if [ "${running_jobs_cnt}" -eq 0 ]; then
+        break
+    fi
+
+    # Using '||' records a failed job without aborting the script if
+    # errexit ('set -e') is active, so the remaining jobs are still
+    # waited for.
+    wait_minus_n || EXIT_CODE=1
+done
+
+# Report the overall result: a single failed job fails the whole run.
+if [ "${EXIT_CODE}" -ne 0 ]; then
+    echo "At least one test failed"
+    exit 1
+fi
+
+echo "Tests passed!"
+exit 0