From 05f11e6348f0cae8736d5f7b293982a265ea5c45 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Tue, 4 Apr 2017 13:21:30 +0300 Subject: [PATCH] Improve tolerance for failed nodes Raise SSH reconnects to 20 Disable any_errors_fatal Depends on https://github.com/kubernetes-incubator/kargo/pull/1201/ Change-Id: I0d20c9e0e80f87561c50957fdff1e576ec89646c --- utils/jenkins/kargo_deploy.sh | 3 ++- utils/kargo/kargo_default_common.yaml | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/jenkins/kargo_deploy.sh b/utils/jenkins/kargo_deploy.sh index 49fa2e4..d3fea0e 100755 --- a/utils/jenkins/kargo_deploy.sh +++ b/utils/jenkins/kargo_deploy.sh @@ -142,7 +142,8 @@ function with_ansible { until admin_node_command \ ANSIBLE_CONFIG=$ADMIN_WORKSPACE/utils/kargo/ansible.cfg \ ansible-playbook \ - --ssh-extra-args "-A\ -o\ StrictHostKeyChecking=no" -u ${ADMIN_USER} -b \ + --ssh-extra-args "-A\ -o\ StrictHostKeyChecking=no\ -o\ ConnectionAttempts=20" \ + -u ${ADMIN_USER} -b \ --become-user=root -i $ADMIN_WORKSPACE/inventory/inventory.cfg \ --forks=$ANSIBLE_FORKS --timeout $ANSIBLE_TIMEOUT $DEFAULT_OPTS \ -e ansible_ssh_user=${ADMIN_USER} \ diff --git a/utils/kargo/kargo_default_common.yaml b/utils/kargo/kargo_default_common.yaml index 0e8b513..ab571b5 100644 --- a/utils/kargo/kargo_default_common.yaml +++ b/utils/kargo/kargo_default_common.yaml @@ -50,6 +50,9 @@ upstream_dns_servers: # has some bugs when DHCP is enabled. resolvconf_mode: host_resolvconf +# Continue deploying other hosts even if one failed +any_errors_fatal: false + # Tweak kubelet monitoring parameters to node/endpoint node flapping kubelet_status_update_frequency: "20s" kube_controller_node_monitor_grace_period: "2m"