From 2fa26b2821232b04a91e907491371c88eb1e7d7d Mon Sep 17 00:00:00 2001
From: Stephen Taylor
Date: Wed, 30 Mar 2022 15:01:12 -0600
Subject: [PATCH] [ceph-osd] Add a disruptive OSD restart to the post-apply job

Currently the ceph-osd post-apply job always restarts OSDs without
disruption. This requires waiting for a healthy cluster state in between
failure domain restarts, which isn't possible in some upgrade scenarios.
In those scenarios where disruption is acceptable and a simultaneous
restart of all OSDs is required, the disruptive_osd_restart value now
provides this option.

Change-Id: I64bfc30382e86c22b0f577d85fceef0d5c106d94
---
 ceph-osd/Chart.yaml                       |  2 +-
 ceph-osd/templates/bin/_post-apply.sh.tpl | 26 ++++++++++++++++-------
 ceph-osd/templates/job-post-apply.yaml    |  2 ++
 ceph-osd/values.yaml                      |  5 +++++
 releasenotes/notes/ceph-osd.yaml          |  1 +
 5 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml
index cf8380944..b6c55d547 100644
--- a/ceph-osd/Chart.yaml
+++ b/ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.36
+version: 0.1.37
 home: https://github.com/ceph/ceph
 ...
diff --git a/ceph-osd/templates/bin/_post-apply.sh.tpl b/ceph-osd/templates/bin/_post-apply.sh.tpl
index 59dd7f8e0..fcf43a985 100644
--- a/ceph-osd/templates/bin/_post-apply.sh.tpl
+++ b/ceph-osd/templates/bin/_post-apply.sh.tpl
@@ -188,14 +188,24 @@ echo "Latest revision of the helm chart(s) is : $max_release"
 
 if [[ $max_release -gt 1 ]]; then
   if [[ $require_upgrade -gt 0 ]]; then
-    echo "waiting for inactive pgs and degraded objects before upgrade"
-    wait_for_pgs
-    wait_for_degraded_and_misplaced_objects
-    ceph -s
-    ceph osd "set" noout
-    echo "lets restart the osds rack by rack"
-    restart_by_rack
-    ceph osd "unset" noout
+    if [[ "$DISRUPTIVE_OSD_RESTART" == "true" ]]; then
+      echo "restarting all osds simultaneously"
+      kubectl -n $CEPH_NAMESPACE delete pod -l component=osd
+      sleep 60
+      echo "waiting for pgs to become active and for degraded objects to recover"
+      wait_for_pgs
+      wait_for_degraded_objects
+      ceph -s
+    else
+      echo "waiting for inactive pgs and degraded objects before upgrade"
+      wait_for_pgs
+      wait_for_degraded_and_misplaced_objects
+      ceph -s
+      ceph osd "set" noout
+      echo "lets restart the osds rack by rack"
+      restart_by_rack
+      ceph osd "unset" noout
+    fi
   fi
 
   #lets check all the ceph-osd daemonsets
diff --git a/ceph-osd/templates/job-post-apply.yaml b/ceph-osd/templates/job-post-apply.yaml
index e248def9b..6e9a34707 100644
--- a/ceph-osd/templates/job-post-apply.yaml
+++ b/ceph-osd/templates/job-post-apply.yaml
@@ -102,6 +102,8 @@ spec:
               value: {{ .Release.Name }}
             - name: REQUIRED_PERCENT_OF_OSDS
               value: {{ .Values.conf.ceph.target.required_percent_of_osds | ceil | quote }}
+            - name: DISRUPTIVE_OSD_RESTART
+              value: {{ .Values.conf.storage.disruptive_osd_restart | quote }}
           command:
             - /tmp/post-apply.sh
           volumeMounts:
diff --git a/ceph-osd/values.yaml b/ceph-osd/values.yaml
index e0c386826..09c41e985 100644
--- a/ceph-osd/values.yaml
+++ b/ceph-osd/values.yaml
@@ -288,6 +288,11 @@ conf:
 #          type: directory
 #          location: /var/lib/openstack-helm/ceph/osd/journal-one
 
+    # The post-apply job will restart OSDs without disruption by default. Set
+    # this value to "true" to restart all OSDs at once. This will accomplish
+    # OSD restarts more quickly, with disruption.
+    disruptive_osd_restart: "false"
+
 # NOTE(portdirect): for heterogeneous clusters the overrides section can be used to define
 # OSD pods that will be deployed upon specifc nodes.
 # overrides:
diff --git a/releasenotes/notes/ceph-osd.yaml b/releasenotes/notes/ceph-osd.yaml
index 10af48c58..33b33b4f2 100644
--- a/releasenotes/notes/ceph-osd.yaml
+++ b/releasenotes/notes/ceph-osd.yaml
@@ -37,4 +37,5 @@ ceph-osd:
   - 0.1.34 Remove wait for misplaced objects during OSD restarts
   - 0.1.35 Consolidate mon_endpoints discovery
   - 0.1.36 Add OSD device location pre-check
+  - 0.1.37 Add a disruptive OSD restart to the post-apply job
 ...
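
Usage sketch: the patch only wires the new value through to the post-apply job,
so enabling the disruptive path amounts to overriding conf.storage.disruptive_osd_restart
before upgrading. The override file name, release name (ceph-osd), chart path
(./ceph-osd), and namespace (ceph) below are illustrative assumptions, not part
of the patch; only the value path and the DISRUPTIVE_OSD_RESTART plumbing come
from the change itself.

    # disruptive-restart.yaml (hypothetical override file)
    conf:
      storage:
        # Allow the post-apply job to delete all OSD pods at once instead of
        # restarting them one failure domain at a time.
        disruptive_osd_restart: "true"

    # Apply the override when upgrading the release (names/paths assumed):
    helm upgrade --install ceph-osd ./ceph-osd \
      --namespace ceph \
      --values disruptive-restart.yaml

With this override, the job takes the simultaneous-restart branch added in
_post-apply.sh.tpl above; with the default "false" it keeps the existing
rack-by-rack restart behavior.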