From 5a304af6e1f424fd5c5e2bba907428bc0d402cfd Mon Sep 17 00:00:00 2001 From: Rei Oliveira Date: Fri, 15 Mar 2024 11:40:26 -0300 Subject: [PATCH] Only wait for essential pods in cert recovery The certificate recovery role will trigger a restart of every pod in the k8s cluster so that they can be updated with the latest certificate information. After the pods restart, the procedure waits for every pod to recover and become READY. This change modifies that behaviour to only wait for essential pods to recover, namely those in the core namespaces armada, cert-manager, flux-helm and kube-system. Test case: PASS: Run certificate recovery with crashing pods in a custom namespace Closes-Bug: 2058751 Signed-off-by: Rei Oliveira Change-Id: I3ea403a3e324ecbb5f2c1f56d6ce1c8bd80fabee --- .../tasks/recover-k8s-leaf-certificates.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/playbookconfig/src/playbooks/roles/common/recover-subcloud-certificates/tasks/recover-k8s-leaf-certificates.yml b/playbookconfig/src/playbooks/roles/common/recover-subcloud-certificates/tasks/recover-k8s-leaf-certificates.yml index 9a2f2f1b3..abd1e970c 100644 --- a/playbookconfig/src/playbooks/roles/common/recover-subcloud-certificates/tasks/recover-k8s-leaf-certificates.yml +++ b/playbookconfig/src/playbooks/roles/common/recover-subcloud-certificates/tasks/recover-k8s-leaf-certificates.yml @@ -81,8 +81,9 @@ - name: Wait pods to restart (become READY) on controller shell: >- kubectl get po -l '!job-name' -A --no-headers -o - 'custom-columns=NAME:.metadata.name, + 'custom-columns=NAME:.metadata.name, NAMESPACE:.metadata.namespace, READY:.status.containerStatuses[*].ready,NODE:.spec.nodeName' + | grep "armada\|cert-manager\|flux-helm\|kube-system" | grep -v calico-node | grep $(hostname) | grep -cv true