Merge "Integrate Hashicorp Vault into platform B&R"

This commit is contained in:
Zuul 2024-05-09 21:39:50 +00:00 committed by Gerrit Code Review
commit a51946c563
9 changed files with 553 additions and 41 deletions

View File

@ -98,6 +98,18 @@ openstack_backup_filename_prefix: "{{ inventory_hostname }}_{{ openstack_app_nam
#
dc_vault_backup_filename_prefix: "{{ inventory_hostname }}_dc_vault_backup"
# This is the default value for including Hashicorp vault into the platform backup process.
# This value can be overridden by the user when calling for platform backup playbook,
# to include or not include the Hashicorp vault backup.
# If the Hashicorp vault application is either uploaded only or non-existent,
# the backup process will be omitted regardless of what this value is.
backup_hc_vault: false
# The hashicorp vault backup tarball will be named in this format:
# <hc_vault_backup_filename_prefix>_<timestamp>.tgz
#
hc_vault_backup_filename_prefix: "{{ inventory_hostname }}_hc_vault_backup"
restore_cinder_glance_data: false
# Default directory where the system backup tarballs fetched from the

View File

@ -16,6 +16,8 @@
ldap_schema_path: "{{ '/etc/openldap/schema' if os_release == 'centos' else '/etc/ldap/schema' }}"
backup_registry_filesystem_required: "{{ backup_registry_filesystem | bool }}"
should_use_old_image_backup: "{{ backup_user_images|bool == true }}"
include_hc_vault: "{{ backup_hc_vault | bool }}"
omit_hc_vault: false
- name: Do StarlingX backup
@ -64,6 +66,61 @@
msg: "default-registry-key not found. Platform backup cannot proceed without it."
when: kube_system_default_registry_key.rc != 0
- name: Check Hashicorp vault status
block:
- name: Check if vault is applied
shell: |
source /etc/platform/openrc
system application-show vault --format value --column status
register: vault_applied_exists
- name: Omit vault if status is empty or uploaded
set_fact:
include_hc_vault: false
omit_hc_vault: true
when: >-
vault_applied_exists.stdout | length == 0 or
vault_applied_exists.stdout == "uploaded"
- name: Fail vault if status is not applied
fail:
msg: "Hashicorp vault application is {{ vault_applied_exists.stdout }}, not applied."
when: vault_applied_exists.stdout != "applied"
when: include_hc_vault | bool
- name: Indicate if Hashicorp vault is omitted from status check
debug:
msg: "Hashicorp vault backup will be omitted because vault is not applied."
when: omit_hc_vault | bool
- name: Hashicorp vault precheck
block:
- name: Find vault manager pod
shell: >-
kubectl get pods -n vault | grep "vault-manager" | cut -d " " -f 1
register: vault_manager_pod_name
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
- name: Fail if vault manager pod is not found
fail:
msg: "Vault manager pod is not found"
when: vault_manager_pod_name.stdout | length == 0
- name: Check vault system health
shell: >-
kubectl exec -n "vault" "{{ vault_manager_pod_name.stdout }}" --
bash -c "source /opt/script/init.sh; snapshotPreCheck" 2>&1
register: vault_system_health
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
- name: Fail if vault health check returns error
fail:
msg: "Vault system health check returned error"
when: vault_system_health.rc != 0
when: include_hc_vault | bool
- name: Send application lifecycle notifications for pre-backup semantic check
command: /usr/bin/sysinv-utils notify backup-semantic-check
register: backup_semantic_check_notification_result
@ -143,6 +200,13 @@
state: directory
register: postgres_dir
- name: Create hashicorp vault temp dir
file:
path: "{{ tempdir.path }}/hc_vault_dir"
state: directory
register: hc_vault_dir
when: include_hc_vault | bool
- name: Backup roles, table spaces and schemas for databases.
shell: >-
sudo -u postgres pg_dumpall
@ -508,6 +572,33 @@
{{ pre_etcd_backup_notification_result.stderr }}.
when: pre_etcd_backup_notification_result.rc != 0
# Vault snapshot should be taken before the backup of etcd database.
# A k8s secret is created that is associated with the snapshot.
- name: Run hashicorp vault backup
block:
- name: Include hashicorp vault backup role
include_role:
name: vault/vault_backup
vars:
vault_backup_dir: "{{ hc_vault_dir.path }}"
vault_encrypt: false
encrypt_hc_vault_secret: ""
op_mode: "platform"
- name: Find result files
find:
paths: "{{ hc_vault_dir.path }}"
patterns: "hc-vault-snapshot-*.tar*"
register: hc_vault_backup_result
- name: Fail if incorrect number of files created from Hashicorp vault backup
fail:
msg: >
There was an error with the Hashicorp vault backup process.
Incorrect number of files produced.
when: hc_vault_backup_result.matched != 2
when: include_hc_vault | bool
- name: Create etcd snapshot temp dir
file:
path: "{{ tempdir.path }}/etcd-snapshot"
@ -591,6 +682,7 @@
user_images_backup_file: "{{ user_images_backup_filename_prefix }}_{{ backup_timestamp }}.tgz"
openstack_backup_file: "{{ openstack_backup_filename_prefix }}_{{ backup_timestamp }}.tgz"
dc_vault_backup_file: "{{ dc_vault_backup_filename_prefix }}_{{ backup_timestamp }}.tgz"
hc_vault_backup_file: "{{ hc_vault_backup_filename_prefix }}_{{ backup_timestamp }}.tgz"
- name: Set backup files absolute path
set_fact:
@ -598,6 +690,7 @@
user_images_backup_file_path: "{{ backup_dir }}/{{ user_images_backup_file }}"
openstack_backup_file_path: "{{ backup_dir }}/{{ openstack_backup_file }}"
dc_vault_backup_file_path: "{{ backup_dir }}/{{ dc_vault_backup_file }}"
hc_vault_backup_file_path: "{{ backup_dir }}/{{ hc_vault_backup_file }}"
- name: Save user uploaded images from local registry to an archive
include_tasks: export-user-local-registry-images.yml
@ -695,6 +788,24 @@
failed_when: tar_cmd.rc >= 2 or tar_cmd.rc < 0
when: check_mariadb_pod.rc == 0 or openstack_status.stdout == "uploaded"
- name: Create a tgz archive for Hashicorp vault backup
shell: >-
tar
--use-compress-program={{ compress_program }}
--exclude {{ exclude_targets | map('regex_replace', '^/', '')
| list | join(' --exclude ') }}
-cf {{ hc_vault_backup_file_path }}
$(ls -d
{{ hc_vault_dir.path }}
2> /dev/null)
args:
warn: false
# Changing the failed_when behavior to prevent the backup from failing on "file changed as we read it", which
# makes tar return 1
register: tar_cmd
failed_when: tar_cmd.rc >= 2 or tar_cmd.rc < 0
when: include_hc_vault | bool
- name: Notify the user backup tar file(s) are available
debug:
msg: >-
@ -714,12 +825,20 @@
- name: Transfer openstack backup tar files to the local machine if it exists
fetch:
src: "{{ openstack_backup_file_path}}"
src: "{{ openstack_backup_file_path }}"
dest: "{{ host_backup_dir }}/"
flat: yes
when: check_mariadb_pod.rc == 0 or openstack_status.stdout == "uploaded"
no_log: true
- name: Transfer Hashicorp vault backup tar files to the local machine if they exist
fetch:
src: "{{ hc_vault_backup_file_path }}"
dest: "{{ host_backup_dir }}/"
flat: yes
when: include_hc_vault | bool
no_log: true
# TODO transfer docker image archive which may be very big during remote play.
# Fetch module fills the memory and has a very slow transfer rate due to base64 encoding
# Maybe use synchronize module after upgrading ansible, backup-restore/transfer-file

View File

@ -0,0 +1,234 @@
#!/bin/bash
#
# Copyright (c) 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script validates if the hashicorp vault application is
# ready to be restored, and if not, attempts to reapply application
# Kubernetes namespace the vault application is deployed in.
VAULT_NS="vault"
# Guard flag: flipped to true after a completed reapply so the
# recovery path is attempted at most once per script invocation.
VAULT_REAPPLIED=false
# List of pauses (worst-case wait per action)
# app remove:
# 60s == VAULT_REMOVE_TRIES @ VAULT_REAPPLY_WAITTIME intervals
# PVC delete:
# 120s == PVC_DELETE_TRIES @ VAULT_REAPPLY_WAITTIME intervals
# cluster-key delete:
# 60s == CLUSTER_KEY_DELETE_TRIES @ VAULT_REAPPLY_WAITTIME intervals
# app apply:
# 300s == VAULT_APPLY_TRIES @ VAULT_REAPPLY_WAITTIME intervals
# post apply wait time:
# 30s == VAULT_UNSEAL_WAITTIME
# unseal per pod:
# 60s == SEALED_STATUS_TRIES @ SEALED_STATUS_WAITTIME intervals
# Number of tries for each action
MAIN_TRIES=2
SEALED_STATUS_TRIES=6
VAULT_REMOVE_TRIES=5
PVC_DELETE_TRIES=12
CLUSTER_KEY_DELETE_TRIES=6
VAULT_APPLY_TRIES=30
# Wait times (seconds, between polls)
SEALED_STATUS_WAITTIME=10
VAULT_REAPPLY_WAITTIME=10
VAULT_UNSEAL_WAITTIME=30
# Function to clean vault and reapply.
#
# Removes the vault application, deletes its PVCs and cluster-key
# secrets, re-applies the application, then pauses to let vault
# manager initialize and begin unsealing the new pods.
#
# Globals read: VAULT_NS, VAULT_REAPPLIED, *_TRIES / *_WAITTIME tunables.
# Globals set:  VAULT_REAPPLIED (true on a completed reapply).
# Returns: 0 on a completed reapply; 1 on any failure, or when a
#          reapply was already attempted during this run.
function reapplyVault {
    if $VAULT_REAPPLIED; then
        echo "Vault reapply already tried. Previous apply likely failed."
        return 1
    fi

    # application-remove: poll until status goes back to "uploaded"
    system application-remove vault
    rst=""
    for tries in $(seq $VAULT_REMOVE_TRIES); do
        sleep $VAULT_REAPPLY_WAITTIME
        rst="$(system application-show vault --format value --column status)"
        if [[ "$rst" == "uploaded" ]]; then
            echo "Vault remove completed"
            break
        fi
    done
    if [[ "$rst" != "uploaded" ]]; then
        echo "Vault Reapply: application-remove failed"
        return 1
    fi

    # remove PVC resources and wait until none remain
    kubectl delete pvc -n $VAULT_NS --all
    remainingPVC=-1
    for tries in $(seq $PVC_DELETE_TRIES); do
        sleep $VAULT_REAPPLY_WAITTIME
        remainingPVC="$(kubectl get pvc -n $VAULT_NS \
            --no-headers=true | wc -l)"
        if [[ $remainingPVC -eq 0 ]]; then
            echo "Vault PVC removal completed"
            break
        fi
    done
    if [[ $remainingPVC -ne 0 ]]; then
        echo "remove pvc resource failed"
        return 1
    fi

    # remove vault cluster-key secrets
    # (use $VAULT_NS consistently instead of the literal namespace)
    clusterKeys="$(kubectl get secrets -n $VAULT_NS \
        | grep 'cluster-key' | awk '{print $1}')"
    for cKey in $clusterKeys; do
        kubectl delete secret -n $VAULT_NS "$cKey"
        cKeyDelete=$?
        if [[ $cKeyDelete -ne 0 ]]; then
            echo "kubectl-delete-secret returned error"
            return 1
        fi
    done
    remainingCKey=-1
    for tries in $(seq $CLUSTER_KEY_DELETE_TRIES); do
        sleep $VAULT_REAPPLY_WAITTIME
        remainingCKey="$(kubectl get secrets -n $VAULT_NS \
            | grep 'cluster-key' | wc -l)"
        # BUGFIX: was "[[ remainingCKey -eq 0 ]]" (missing "$"); it only
        # worked because [[ ]] evaluates -eq operands arithmetically.
        if [[ $remainingCKey -eq 0 ]]; then
            echo "Vault secret removal completed"
            break
        fi
    done
    if [[ $remainingCKey -ne 0 ]]; then
        echo "remove cluster key secret failed"
        return 1
    fi

    # application-apply: poll until status reaches "applied"
    system application-apply vault
    rst=""
    for tries in $(seq $VAULT_APPLY_TRIES); do
        sleep $VAULT_REAPPLY_WAITTIME
        rst="$(system application-show vault --format value --column status)"
        if [[ "$rst" == "applied" ]]; then
            echo "Vault apply completed"
            break
        fi
    done
    if [[ "$rst" != "applied" ]]; then
        # BUGFIX: message previously said "application-remove failed"
        # even though this is the application-apply failure path.
        echo "Vault Reapply: application-apply failed"
        return 1
    fi

    # Wait for vault manager to initiate and unseal new pods.
    # Pod unseal validation is done in the main
    sleep $VAULT_UNSEAL_WAITTIME

    # reapply completed. return back to main
    VAULT_REAPPLIED=true
    return 0
}
###
# Main
#
# JSONPath that prints "<pod-name> <vault-sealed-label>" for every pod
# in the namespace, one pod per line.
# BUGFIX: previously built via an unquoted $JPATHFULL word-split piped
# through printf/tr, which was a no-op join but exposed the unquoted
# "[*]" to pathname expansion; assign the literal directly instead.
JPATH='{range .items[*]}{.metadata.name}{" "}'\
'{.metadata.labels.vault-sealed}{"\n"}{end}'

echo "Validating vault status"
source "/etc/platform/openrc"
for validateTries in $(seq $MAIN_TRIES); do
    echo "Attempting validation number $validateTries"

    # check if vault application is applied
    rst="$(system application-show vault --format value --column status)"
    if [[ "$rst" != "applied" ]]; then
        # if not, run recovery
        echo "Vault not applied. Attempting reapply..."
        reapplyVault
        reapplyVaultRC=$?
        # BUGFIX: was "[[ reapplyVaultRC -eq 0 ]]" (missing "$");
        # fixed for consistency with the other two reapply checks.
        if [[ $reapplyVaultRC -eq 0 ]]; then
            echo "Vault reapply completed. Reattempting validation."
            continue
        else
            echo "Vault reapply failed for trying to" \
                "fix not-applied vault application." \
                "Unable to ready vault for restore."
            exit 1
        fi
    fi

    # check if there is a running vault pod
    numRunningPods="$(kubectl get pods -n $VAULT_NS | \
        grep "^sva-vault-[0-9] " | grep "Running" | wc -l)"
    if [[ $numRunningPods -eq 0 ]]; then
        # if not, run recovery
        echo "No vault pods are running. Attempting reapply..."
        reapplyVault
        reapplyVaultRC=$?
        if [[ $reapplyVaultRC -eq 0 ]]; then
            echo "Vault reapply completed. Reattempting validation."
            continue
        else
            echo "Vault reapply failed for trying to" \
                "fix no running vault pods." \
                "Unable to ready vault for restore."
            exit 1
        fi
    fi

    # check for sealed status
    sealedPods=0
    prevSealedPods=0
    sealedExists=true
    while [[ $SEALED_STATUS_TRIES -gt 0 ]]; do
        # get number of sealed pods (vault-sealed label == "true")
        sealedPods="$( kubectl get pods -n $VAULT_NS -o jsonpath="$JPATH" | \
            grep "^sva-vault-[0-9] " | grep "true" | wc -l )"
        # check if there are no sealed pods, if so mark success and break loop
        if [[ $sealedPods -eq 0 ]]; then
            sealedExists=false
            break
        fi
        # if number of sealed pods decreased, reset wait counter
        # NOTE(review): resets to 5 while the initial budget is 6 —
        # confirm this off-by-one is intentional.
        if [[ $sealedPods -lt $prevSealedPods ]]; then
            SEALED_STATUS_TRIES=5
        else
            SEALED_STATUS_TRIES=$(($SEALED_STATUS_TRIES - 1))
        fi
        # wait for pods to unseal
        sleep $SEALED_STATUS_WAITTIME
        prevSealedPods=$sealedPods
    done

    # if there are still sealed pods, attempt reapply
    if $sealedExists; then
        echo "There are unsealable pods. Attempting reapply..."
        reapplyVault
        reapplyVaultRC=$?
        if [[ $reapplyVaultRC -eq 0 ]]; then
            echo "Vault reapply completed. Reattempting validation."
            continue
        else
            echo "Vault reapply failed for trying to" \
                "fix sealed vault pods." \
                "Unable to ready vault for restore."
            exit 1
        fi
    fi

    # all test passed. exit
    echo "All validation passed. Vault application is ready to be restored."
    exit 0
done

echo "All tries exhausted. Unable to ready vault for restore."
exit 1

View File

@ -5,9 +5,9 @@
# SPDX-License-Identifier: Apache-2.0
#
- name: Validate initial_backup_dir is supplied
- name: Validate if initial_backup_dir is supplied
fail:
msg: "--initial_backup_dir variable not provided"
msg: "initial_backup_dir variable not provided"
when: not initial_backup_dir
- name: Validate initial_backup_dir exists
@ -22,29 +22,62 @@
- name: Set vault backup directory fact
set_fact:
vault_backup_dir: "{{ initial_backup_dir }}/vault"
vault_backup_dir: "{{ initial_backup_dir }}/hc_vault"
- name: Create vault subdirectory in initial_backup_dir
file:
path: "{{ vault_backup_dir }}"
state: directory
mode: 0755
- name: Check if encrypt is enabled
set_fact:
vault_encrypt: true
when: encrypt_hc_vault_secret | length > 0
# check if vault is applied
- name: Check if vault is applied
shell: |
source /etc/platform/openrc
system application-show vault --format value --column status
register: vault_applied_exists
- name: Check vault apply for backup
block:
- name: Check if vault is applied
shell: |
source /etc/platform/openrc
system application-show vault --format value --column status
register: vault_applied_exists
- name: Fail if vault is not applied
fail:
msg: "Vault application is not applied"
when: vault_applied_exists.stdout != "applied"
- name: Fail if vault is not applied
fail:
msg: "Vault application is not applied"
when: vault_applied_exists.stdout != "applied"
when: vault_mode == "backup"
- name: Validate vault health for restore.
block:
- name: Set backup file path
set_fact:
backup_filepath: "{{ initial_backup_dir }}/{{ backup_filename }}"
- name: Find backup tarball
shell: |
ls {{ backup_filepath }}
register: backup_tarball
- name: Fail if vault backup tarball not found
fail:
msg: "Vault snapshot tarball: {{ backup_filename }} was not found"
when: backup_tarball.stdout | length == 0
- name: Validate if vault application is ready to be restored
script: validate_recover_vault.sh
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
register: validate_vault_result
failed_when: validate_vault_result.rc != 0
rescue:
- name: Display vault validation script output if exists
debug:
msg: "{{ validate_vault_result.stdout }}"
when: validate_vault_result is defined
when: vault_mode == "restore"
- name: Find vault manager pod
shell: >-
@ -57,17 +90,3 @@
fail:
msg: "Vault manager pod is not found"
when: vault_manager_pod_name.stdout | length == 0
# check vault system health
- name: Check vault system health
shell: >-
kubectl exec -n "vault" "{{ vault_manager_pod_name.stdout }}" --
bash -c "source /opt/script/init.sh; snapshotPreCheck" 2>&1
register: vault_system_health
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
- name: Fail if vault health check returns error
fail:
msg: "Vault system health check returned error"
when: vault_system_health.rc != 0

View File

@ -222,7 +222,7 @@ fi
rndtmp="$( get_unique_string )"
secret="${K8S_SECRET_PREFIX}-$rndtmp"
fname="${OUTPUTDIR}/snapshot-${rndtmp}.tar"
fname="${OUTPUTDIR}/hc-vault-snapshot-${rndtmp}.tar"
metaf="${fname}.metadata"
# get the snapshot

View File

@ -5,6 +5,15 @@
# SPDX-License-Identifier: Apache-2.0
#
- name: Check vault system health
shell: >-
kubectl exec -n "vault" "{{ vault_manager_pod_name.stdout }}" --
bash -c "source /opt/script/init.sh; snapshotPreCheck" 2>&1
register: vault_system_health
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
failed_when: vault_system_health.rc != 0
- name: Create vault snapshot
block:
- name: Create vault snapshot with default encryption
@ -26,3 +35,80 @@
shell: >-
kubectl exec -n "vault" "{{ vault_manager_pod_name.stdout }}" --
bash -c "source /opt/script/init.sh; if [ -f $PAUSEFILE ]; then rm -f $PAUSEFILE; fi" 2>&1
rescue:
- name: Clean up vault subdir if in standalone mode
file:
path: "{{ vault_backup_dir }}"
state: absent
when: op_mode == "standalone"
- name: Package vault if running in standalone mode
block:
- name: Check if pigz package is installed
block:
- name: Issue command to pkg manager
command: "{{ 'rpm -q' if os_release == 'centos' else 'dpkg -l' }} pigz"
args:
warn: false
failed_when: false
register: check
- set_fact:
pigz_check: "{{ 'succeeded' if check.rc == 0 else 'failed' }}"
when: os_release in ["centos", "debian"]
- name: Check if pigz package is installed
package:
name: pigz
state: present
check_mode: true
register: pigz_check
when: os_release not in ["centos", "debian"]
- name: Check number of platform cores
shell: |
source /etc/platform/openrc
system host-cpu-list $(hostname) --nowrap | grep " Platform " | wc -l
register: num_platform_cores
- name: Set compress program for backup tarball
set_fact:
compress_program: >-
"{{ 'pigz' if num_platform_cores.stdout | int >= 4 and
pigz_check is succeeded else 'gzip' }}"
- name: Use current timestamp as backups timestamp
set_fact:
backup_timestamp: "{{ lookup('pipe', 'date +%Y_%m_%d_%H_%M_%S') }}"
- name: Attach timestamp to backups filename
set_fact:
hc_vault_backup_file: "{{ hc_vault_backup_filename_prefix }}_{{ backup_timestamp }}.tgz"
- name: Set backup files absolute path
set_fact:
hc_vault_backup_file_path: "{{ initial_backup_dir }}/{{ hc_vault_backup_file }}"
- name: Create a tgz archive for Hashicorp vault backup
shell: >-
tar
--use-compress-program={{ compress_program }}
-cf {{ hc_vault_backup_file_path }}
$(ls -d
{{ vault_backup_dir }}
2> /dev/null)
args:
warn: false
# Changing the failed_when behavior to prevent the backup from failing on "file changed as we read it", which
# makes tar return 1
register: tar_cmd
failed_when: tar_cmd.rc >= 2 or tar_cmd.rc < 0
- name: Cleanup vault subdir
file:
path: "{{ vault_backup_dir }}"
state: absent
when: op_mode == "standalone"

View File

@ -5,31 +5,46 @@
# SPDX-License-Identifier: Apache-2.0
#
- name: Set backup file path
set_fact:
backup_filepath: "{{ vault_backup_dir }}/{{ backup_filename }}"
- name: Unpackage the backup tarball
command: >-
tar --use-compress-program=pigz -C {{ vault_backup_dir }} -xpf {{ backup_filepath }}
--wildcards --transform='s,.*/,,'
args:
warn: false
- name: Find snapshot
shell: |
ls {{ backup_filepath }}
register: snapshot_tar_name
- name: Find the snapshot file
command: >-
find {{ vault_backup_dir }} -name "hc-vault-snapshot-*.tar"
register: backup_snapshot_file
- name: Fail if vault snapshot tar not found
- name: Fail if snapshot file was not found
fail:
msg: "Vault snapshot tarball: {{ backup_filename }} was not found"
when: snapshot_tar_name.stdout | length == 0
msg: "Backup snapshot was not found in {{ backup_filepath }}"
when: backup_snapshot_file.stdout | length == 0
- name: Change snapshot file permissions
file:
path: "{{ backup_snapshot_file.stdout }}"
mode: 0755
- name: Find vault manager pod
shell: >-
kubectl get pods -n vault | grep "vault-manager" | cut -d " " -f 1
register: vault_manager_pod_name
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
# call vault_restore.sh
- name: Restore vault from the snapshot
block:
- name: Restore vault snapshot with default encryption
script: vault_restore.sh {{ backup_filepath }}
script: vault_restore.sh {{ backup_snapshot_file.stdout }}
when: not vault_encrypt
register: vault_restore_script
failed_when: vault_restore_script.rc != 0
- name: Restore vault snapshot with custom encryption
script: vault_restore.sh {{ backup_filepath }} '--decrypt' "custom_var"
script: vault_restore.sh {{ backup_snapshot_file.stdout }} '--decrypt' "custom_var"
when: vault_encrypt
register: vault_restore_script
failed_when: vault_restore_script.rc != 0
@ -41,3 +56,14 @@
shell: >-
kubectl exec -n "vault" "{{ vault_manager_pod_name.stdout }}" --
bash -c "source /opt/script/init.sh; if [ -f $PAUSEFILE ]; then rm -f $PAUSEFILE; fi" 2>&1
rescue:
- name: Clean up vault subdir if in standalone mode
file:
path: "{{ vault_backup_dir }}"
state: absent
when: op_mode == "standalone"
- name: Cleanup vault subdir
file:
path: "{{ vault_backup_dir }}"
state: absent

View File

@ -4,6 +4,10 @@
#
# SPDX-License-Identifier: Apache-2.0
#
# requires one variable passed:
# initial_backup_dir = The resulting backup package will be found here.
- hosts: all
gather_facts: no
@ -13,7 +17,10 @@
vars:
password_change: false
vault_encrypt: false
# override encrypt_hc_vault_secret when calling the playbook to enable extra encryption
encrypt_hc_vault_secret: ""
vault_mode: "backup"
op_mode: "standalone"
roles:
- role: common/prepare-env

View File

@ -4,6 +4,11 @@
#
# SPDX-License-Identifier: Apache-2.0
#
# requires two variables passed:
# initial_backup_dir = the directory where the vault backup package will be found
# backup_filename = filename for vault backup package
- hosts: all
gather_facts: no
@ -13,10 +18,14 @@
vars:
password_change: false
vault_encrypt: false
# override encrypt_hc_vault_secret when calling the playbook to enable extra encryption
encrypt_hc_vault_secret: ""
vault_mode: "restore"
op_mode: "standalone"
roles:
- role: common/prepare-env
- role: vault/prepare_env
become: yes
- role: vault/vault_restore
become: yes