Add retry/rerun support for exec module

Add support for retries and reruns at specified intervals for
divingbell-exec scripts. Also adds support for timeouts.

Also update osh-infra-upgrade-host to allow gate to run.

Change-Id: I5f4cd43b13a467d94f67b358f3190f515256ae66
This commit is contained in:
Craig Anderson 2018-12-11 06:53:54 +00:00
parent 012800d854
commit 4ed467e512
4 changed files with 227 additions and 39 deletions

View File

@ -16,6 +16,8 @@
# limitations under the License.
*/}}
{{- $exec_loop_sleep_interval := 60 }}
set -e
cat <<'UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381' > {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh
@ -74,19 +76,28 @@ cd "${exec_path}"
{{- $_ := set $.Values "__blocking_policy" $keypath.blocking_policy }}
{{- end }}
{{- $_ := set $.Values "__timeout" 3600 }}
{{- $_ := set $.Values "__timeout" 1800 }}
{{- if hasKey $keypath "timeout" }}
{{- fail (print "NOT IMPLEMENTED: 'timeout' FOR '" $script "'") }}
{{- if eq ($keypath.timeout | toString) "infinite" }}
{{- fail (print "BAD 'timeout' FOR '" $script "': 'infinite' timeouts not supported.") }}
{{- end }}
{{- $_ := set $.Values "__timeout" $keypath.timeout }}
{{- end }}
{{- $_ := set $.Values "__rerun_interval" "infinite" }}
{{- if hasKey $keypath "rerun_interval" }}
{{- fail (print "NOT IMPLEMENTED: 'rerun_interval' FOR '" $script "'") }}
{{- /* "infinite" is the sentinel used by the default value and by the generated shell interval check. The legacy spelling "infinity" is accepted too, but normalized so it can never reach shell arithmetic as a bare word. */ -}}
{{- $_ := set $.Values "__rerun_interval" $keypath.rerun_interval }}
{{- if or (eq ($keypath.rerun_interval | toString) "infinite") (eq ($keypath.rerun_interval | toString) "infinity") }}
{{- $_ := set $.Values "__rerun_interval" "infinite" }}
{{- else }}
{{- /* A finite interval cannot be shorter than the exec loop period, and is only accepted together with a rerun_policy of "always". */ -}}
{{- if lt ($keypath.rerun_interval | int) $exec_loop_sleep_interval }}
{{- fail (print "BAD 'rerun_interval' FOR '" $script "': Got '" $keypath.rerun_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }}
{{- end }}
{{- if not (eq $.Values.__rerun_policy "always") }}
{{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' when defining a finite 'rerun_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'rerun_interval' of '" $keypath.rerun_interval "' for '" $script "'.") }}
{{- end }}
{{- end }}
{{- end }}
{{- $_ := set $.Values "__rerun_interval_persist" "false" }}
{{- $_ := set $.Values "__rerun_interval_persist" "true" }}
{{- if hasKey $keypath "rerun_interval_persist" }}
{{- fail (print "NOT IMPLEMENTED: 'rerun_interval_persist' FOR '" $script "'") }}
{{- $_ := set $.Values "__rerun_interval_persist" $keypath.rerun_interval_persist }}
@ -98,13 +109,20 @@ cd "${exec_path}"
{{- $_ := set $.Values "__rerun_max_count" $keypath.rerun_max_count }}
{{- end }}
{{- $_ := set $.Values "__retry_interval" $.Values.__rerun_interval }}
{{- $_ := set $.Values "__retry_interval" (print $.Values.__rerun_interval) }}
{{- if hasKey $keypath "retry_interval" }}
{{- fail (print "NOT IMPLEMENTED: 'retry_interval' FOR '" $script "'") }}
{{- /* "infinite" is the sentinel used by the generated shell interval check. The legacy spelling "infinity" is accepted too, but normalized so it can never reach shell arithmetic as a bare word. */ -}}
{{- $_ := set $.Values "__retry_interval" $keypath.retry_interval }}
{{- if or (eq ($keypath.retry_interval | toString) "infinite") (eq ($keypath.retry_interval | toString) "infinity") }}
{{- $_ := set $.Values "__retry_interval" "infinite" }}
{{- else }}
{{- /* A finite interval cannot be shorter than the exec loop period, and is only accepted with a rerun_policy of "always" or "once_successfully". */ -}}
{{- if lt ($keypath.retry_interval | int) $exec_loop_sleep_interval }}
{{- fail (print "BAD 'retry_interval' FOR '" $script "': Got '" $keypath.retry_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }}
{{- end }}
{{- if and (not (eq $.Values.__rerun_policy "always")) (not (eq $.Values.__rerun_policy "once_successfully")) }}
{{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' or 'once_successfully' when defining a finite 'retry_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'retry_interval' of '" $keypath.retry_interval "' for '" $script "'.") }}
{{- end }}
{{- end }}
{{- end }}
{{- $_ := set $.Values "__retry_interval_persist" "false" }}
{{- $_ := set $.Values "__retry_interval_persist" "true" }}
{{- if hasKey $keypath "retry_interval_persist" }}
{{- fail (print "NOT IMPLEMENTED: 'retry_interval_persist' FOR '" $script "'") }}
{{- $_ := set $.Values "__retry_interval_persist" $keypath.retry_interval_persist }}
@ -115,15 +133,43 @@ cd "${exec_path}"
{{- fail (print "NOT IMPLEMENTED: 'retry_max_count' FOR '" $script "'") }}
{{- $_ := set $.Values "__retry_max_count" $keypath.retry_max_count }}
{{- end }}
cat <<'UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526' > {{ $script }}
{{ $keypath.data }}
UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
chmod 700 {{ $script }}
# check rerun policy
hash_check=fail
if [[ {{ $.Values.__rerun_policy }} = always ]] || \
[[ ! -f ${hash}/exit_code ]] || \
([[ {{ $.Values.__rerun_policy }} = once_successfully ]] && \
[[ -f ${hash}/exit_code ]] && \
[[ $(cat ${hash}/exit_code) != 0 ]]); then
[[ $(cat ${hash}/exit_code) != 0 ]]); then
hash_check=pass
fi
# Decide whether enough time has passed since the previous run:
# rerun_interval applies after a success, retry_interval after a failure.
interval_check=fail
if [[ ! -f ${hash}/last_run_timestamp || ! -f ${hash}/exit_code ]]; then
  # no timestamp or no exit code on record: there is nothing to wait for
  interval_check=pass
elif [[ $(cat ${hash}/exit_code) = 0 ]]; then
  # previous run succeeded
  if [[ {{ $.Values.__rerun_interval }} = infinite ]] || \
     [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__rerun_interval }})) ]]; then
    interval_check=pass
  fi
else
  # previous run failed (recorded exit code is anything other than 0)
  if [[ {{ $.Values.__retry_interval }} = infinite ]] || \
     [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__retry_interval }})) ]]; then
    interval_check=pass
  fi
fi
if [[ $hash_check = pass ]] && [[ $interval_check = pass ]]; then
if [[ -f ${hash}/exit_code ]]; then
# remove previous run record, in case this run is interrupted
rm ${hash}/exit_code
fi
# write timestamp at beginning of execution
echo $(date +"%s") > "${hash}/last_run_timestamp"
{{- if hasKey $keypath "env" }}
{{- range $env_key, $env_val := $keypath.env }}
{{ $env_key }}={{ $env_val | squote }} \
@ -135,7 +181,26 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
{{ $arg | squote }} \
{{- end }}
{{- end }}
&& echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code"
&
# Watchdog for the command backgrounded just above: poll it until it exits or
# the configured timeout elapses, then record its exit status.
# PID of the most recent background job
pid=$!
time_waited=0
# polling period, in seconds
sleep_interval=5
# maximum run time in seconds, rendered from the chart's timeout value
timeout={{ $.Values.__timeout }}
while true; do
if [[ $time_waited -ge $timeout ]]; then
log.ERROR "Hit '$timeout' second timeout waiting for '{{ $script }}' - terminating."
# ask nicely first
kill $pid
sleep 10
# force kill if still running
ps $pid > /dev/null && kill -9 $pid
break
fi
# stop polling as soon as the process is no longer listed
ps $pid > /dev/null || break
sleep $sleep_interval
# only the sleeps are counted, so time_waited approximates elapsed time
time_waited=$(($time_waited + $sleep_interval))
done
# Persist the result: 0 when wait reports success, otherwise wait's status.
wait $pid && echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code"
{{- if hasKey $keypath "blocking_policy" }}
{{- if eq $keypath.blocking_policy "foreground_halt_pod_on_failure" }}
if [[ $(cat "${hash}/exit_code") != '0' ]]; then
@ -144,20 +209,16 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
{{- end }}
{{- end }}
fi
{{ end }}
{{- end }}
{{- end }}
exit 0
UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381
chmod 700 {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh
chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh
sleep 1
echo 'INFO Putting the daemon to sleep.'
while [ 1 ]; do
sleep 300
while true; do
chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh
sleep 2
echo 'INFO Putting the daemon to sleep.'
sleep {{ $exec_loop_sleep_interval }}
done
exit 0

View File

@ -209,6 +209,17 @@ The following set of options are fully implemented::
If any of that info changes, so will the hash, and it will be seen as a new
object which will be executed regardless of this setting.
``timeout`` may optionally be set to the number of seconds to wait for
script completion before termination. Default value is ``1800`` (30 min).
A value of ``infinite`` is not supported.
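``rerun_interval`` may optionally be set to the number of seconds to wait
before rerunning a given script which ran successfully the previous time.
A finite value must be at least ``60`` seconds (the exec loop sleep interval)
and requires a ``rerun_policy`` of ``always``. Default value is ``infinite``.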
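``retry_interval`` may optionally be set to the number of seconds to wait
before rerunning a given script which did not run successfully the previous
time. A finite value must be at least ``60`` seconds (the exec loop sleep
interval) and requires a ``rerun_policy`` of ``always`` or
``once_successfully``. Default behavior is to match the ``rerun_interval``.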
The following set of options are partially implemented::
``blocking_policy`` may optionally be set to ``background``, ``foreground``,
@ -223,30 +234,17 @@ The following set of options are partially implemeneted::
The following set of options are not yet implemented::
``script_timeout`` may optionally be set to the number of seconds to wait for
script completion before termination. Default value is ``3600`` (1 hour).
``rerun_interval`` may be optionally set to the number of seconds to wait
between rerunning a given script which ran successfully the previous time.
Default value is ``infinite``.
``rerun_interval_persist`` may be optionally set to ``true`` for
a given script. This allows a script to persist its rerun interval through a
pod/node restart. Otherwise, the time since last successful script execution
will not be considered on pod/node startup. Default value is ``false``.
``rerun_interval_persist`` may be optionally set to ``false`` for a given
script. This makes the script execute on pod/node startup regardless of the
interval since the last successful execution. Default value is ``true``.
``rerun_max_count`` may be optionally set to the maximum number of times a
succeeding script should be retried. Successful exec count does not persist
through pod/node restart. Default value is ``infinite``.
``retry_interval`` may be optionally set to the number of seconds to wait
between rerunning a given script which did not run successfully the previous
time. Default value is set to the ``rerun_interval``.
``retry_interval_persist`` may be optionally set to ``true`` for
a given script. This allows a script to persist its retry interval through a
pod/node restart. Otherwise, the time since last failed script execution
will not be considered on pod/node startup. Default value is ``false``.
``retry_interval_persist`` may be optionally set to ``false`` for a given
script. This makes the script execute on pod/node startup, regardless of the
time since the last execution. Default value is ``true``.
``retry_max_count`` may be optionally set to the maximum number of times a
failing script should be retried. Failed exec count does not persist

View File

@ -39,3 +39,15 @@
- upgrade-host
- start-zuul-console
- disable-local-nameserver
# Play that applies the deploy-apparmor role to every host, with privilege
# escalation enabled and fact gathering turned off. It can be selected on its
# own through the deploy-apparmor tag.
- hosts: all
vars_files:
- vars.yaml
vars:
work_dir: "{{ zuul.project.src_dir }}/{{ zuul_osh_infra_relative_path | default('') }}"
gather_facts: False
become: yes
roles:
- deploy-apparmor
tags:
- deploy-apparmor

View File

@ -1178,6 +1178,123 @@ manifests:
echo "[SUCCESS] exec test$(($i + 5)) passed successfully" >> "${TEST_RESULTS}"
done
# test timeout
local overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set17.yaml
echo 'conf:
exec:
011-timeout.sh:
timeout: 11
data: |
#!/bin/bash
sleep 60' > "${overrides_yaml}"
install_base "--values=${overrides_yaml}"
get_container_status exec
_test_clog_msg 'timeout waiting for'
echo '[SUCCESS] exec test17 passed successfully' >> "${TEST_RESULTS}"
# Test invalid timeout
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set18.yaml
echo 'conf:
exec:
011-timeout.sh:
timeout: infinite
data: |
#!/bin/bash
sleep 60' > "${overrides_yaml}"
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .timeout. FOR' || \
(echo "[FAIL] exec test18 did not receive expected 'BAD .timeout. FOR' error" && exit 1)
echo '[SUCCESS] exec test18 passed successfully' >> "${TEST_RESULTS}"
# Test invalid rerun_interval (too short)
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set19.yaml
echo 'conf:
exec:
012-rerun-interval.sh:
rerun_interval: 30
data: |
#!/bin/bash
true' > "${overrides_yaml}"
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .rerun_interval. FOR' || \
(echo "[FAIL] exec test19 did not receive expected 'BAD .rerun_interval. FOR' error" && exit 1)
echo '[SUCCESS] exec test19 passed successfully' >> "${TEST_RESULTS}"
# Test invalid retry_interval (too short)
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set20.yaml
echo 'conf:
exec:
012-retry-interval.sh:
retry_interval: 30
data: |
#!/bin/bash
true' > "${overrides_yaml}"
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .retry_interval. FOR' || \
(echo "[FAIL] exec test20 did not receive expected 'BAD .retry_interval. FOR' error" && exit 1)
echo '[SUCCESS] exec test20 passed successfully' >> "${TEST_RESULTS}"
# Test invalid rerun_interval combination
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set21.yaml
echo 'conf:
exec:
012-rerun-interval.sh:
rerun_interval: 60
rerun_policy: once_successfully
data: |
#!/bin/bash
true' > "${overrides_yaml}"
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \
(echo "[FAIL] exec test21 did not receive expected 'BAD COMBINATION' error" && exit 1)
echo '[SUCCESS] exec test21 passed successfully' >> "${TEST_RESULTS}"
# Test invalid retry_interval combination
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set22.yaml
echo 'conf:
exec:
012-retry-interval.sh:
retry_interval: 60
rerun_policy: never
data: |
#!/bin/bash
true' > "${overrides_yaml}"
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \
(echo "[FAIL] exec test22 did not receive expected 'BAD COMBINATION' error" && exit 1)
echo '[SUCCESS] exec test22 passed successfully' >> "${TEST_RESULTS}"
# test rerun_interval
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set23.yaml
echo 'conf:
exec:
012-rerun-interval.sh:
rerun_interval: 60
data: |
#!/bin/bash
echo script name: ${BASH_SOURCE} >> exec_testfile' > "${overrides_yaml}"
install_base "--values=${overrides_yaml}"
get_container_status exec
sleep 72
get_container_status exec
expected_result='script name: ./012-rerun-interval.sh
script name: ./012-rerun-interval.sh'
_test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test23"
echo '[SUCCESS] exec test23 passed successfully' >> "${TEST_RESULTS}"
# test retry_interval
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set24.yaml
echo 'conf:
exec:
012-retry-interval.sh:
retry_interval: 60
data: |
#!/bin/bash
echo script name: ${BASH_SOURCE} >> exec_testfile
false' > "${overrides_yaml}"
install_base "--values=${overrides_yaml}"
get_container_status exec
sleep 72
get_container_status exec
expected_result='script name: ./012-retry-interval.sh
script name: ./012-retry-interval.sh'
_test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test24"
echo '[SUCCESS] exec test24 passed successfully' >> "${TEST_RESULTS}"
}
# test daemonset value overrides for hosts and labels