openstack-helm-infra/ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl

499 lines
19 KiB
Smarty

#!/bin/bash
{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
set -ex
source /tmp/osd-common-ceph-volume.sh
: "${OSD_FORCE_REPAIR:=1}"
# We do not want to zap journal disk. Tracking this option seperatly.
: "${JOURNAL_FORCE_ZAP:=0}"
if [ "x${STORAGE_TYPE%-*}" == "xbluestore" ]; then
export OSD_BLUESTORE=1
fi
if [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
export OSD_DEVICE="/var/lib/ceph/osd"
else
export OSD_DEVICE=$(readlink -f ${STORAGE_LOCATION})
fi
if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then
export OSD_JOURNAL="/var/lib/ceph/journal"
else
export OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION})
fi
# Renames a single VG if necessary
function rename_vg {
local physical_disk=$1
local old_vg_name=$(locked pvdisplay ${physical_disk} | awk '/VG Name/{print $3}')
local vg_name=$(get_vg_name_from_device ${physical_disk})
if [[ "${old_vg_name}" ]] && [[ "${vg_name}" != "${old_vg_name}" ]]; then
locked vgrename ${old_vg_name} ${vg_name}
fi
}
# Renames all LVs associated with an OSD as necesasry
function rename_lvs {
local data_disk=$1
local vg_name=$(locked pvdisplay ${data_disk} | awk '/VG Name/{print $3}')
if [[ "${vg_name}" ]]; then
# Rename the OSD volume if necessary
local old_lv_name=$(locked lvdisplay ${vg_name} | awk '/LV Name/{print $3}')
local lv_name=$(get_lv_name_from_device ${data_disk} lv)
if [[ "${old_lv_name}" ]] && [[ "${lv_name}" != "${old_lv_name}" ]]; then
locked lvrename ${vg_name} ${old_lv_name} ${lv_name}
fi
# Rename the OSD's block.db volume if necessary, referenced by UUID
local lv_tag=$(get_lvm_tag_from_device ${data_disk} ceph.db_uuid)
if [[ "${lv_tag}" ]]; then
local lv_device=$(lvdisplay | grep -B4 "${lv_tag}" | awk '/LV Path/{print $3}')
if [[ "${lv_device}" ]]; then
local db_vg=$(echo ${lv_device} | awk -F "/" '{print $3}')
old_lv_name=$(echo ${lv_device} | awk -F "/" '{print $4}')
local db_name=$(get_lv_name_from_device ${data_disk} db)
if [[ "${old_lv_name}" ]] && [[ "${db_name}" != "${old_lv_name}" ]]; then
locked lvrename ${db_vg} ${old_lv_name} ${db_name}
fi
fi
fi
# Rename the OSD's WAL volume if necessary, referenced by UUID
lv_tag=$(get_lvm_tag_from_device ${data_disk} ceph.wal_uuid)
if [[ "${lv_tag}" ]]; then
local lv_device=$(lvdisplay | grep -B4 "${lv_tag}" | awk '/LV Path/{print $3}')
if [[ "${lv_device}" ]]; then
local wal_vg=$(echo ${lv_device} | awk -F "/" '{print $3}')
old_lv_name=$(echo ${lv_device} | awk -F "/" '{print $4}')
local wal_name=$(get_lv_name_from_device ${data_disk} wal)
if [[ "${old_lv_name}" ]] && [[ "${wal_name}" != "${old_lv_name}" ]]; then
locked lvrename ${wal_vg} ${old_lv_name} ${wal_name}
fi
fi
fi
fi
}
# Fixes up the tags that reference block, db, and wal logical_volumes
# NOTE: This updates tags based on current VG and LV names, so any necessary
# renaming should be completed prior to calling this
function update_lv_tags {
local data_disk=$1
local pv_uuid=$(pvdisplay ${data_disk} | awk '/PV UUID/{print $3}')
if [[ "${pv_uuid}" ]]; then
local volumes="$(lvs --no-headings | grep -e "${pv_uuid}")"
local block_device db_device wal_device vg_name
local old_block_device old_db_device old_wal_device
# Build OSD device paths from current VG and LV names
while read lv vg other_stuff; do
if [[ "${lv}" == "$(get_lv_name_from_device ${data_disk} lv)" ]]; then
block_device="/dev/${vg}/${lv}"
old_block_device=$(get_lvm_tag_from_volume ${block_device} ceph.block_device)
fi
if [[ "${lv}" == "$(get_lv_name_from_device ${data_disk} db)" ]]; then
db_device="/dev/${vg}/${lv}"
old_db_device=$(get_lvm_tag_from_volume ${block_device} ceph.db_device)
fi
if [[ "${lv}" == "$(get_lv_name_from_device ${data_disk} wal)" ]]; then
wal_device="/dev/${vg}/${lv}"
old_wal_device=$(get_lvm_tag_from_volume ${block_device} ceph.wal_device)
fi
done <<< ${volumes}
# Set new tags on all of the volumes using paths built above
while read lv vg other_stuff; do
if [[ "${block_device}" ]]; then
if [[ "${old_block_device}" ]]; then
locked lvchange --deltag "ceph.block_device=${old_block_device}" /dev/${vg}/${lv}
fi
locked lvchange --addtag "ceph.block_device=${block_device}" /dev/${vg}/${lv}
fi
if [[ "${db_device}" ]]; then
if [[ "${old_db_device}" ]]; then
locked lvchange --deltag "ceph.db_device=${old_db_device}" /dev/${vg}/${lv}
fi
locked lvchange --addtag "ceph.db_device=${db_device}" /dev/${vg}/${lv}
fi
if [[ "${wal_device}" ]]; then
if [[ "${old_wal_device}" ]]; then
locked lvchange --deltag "ceph.wal_device=${old_wal_device}" /dev/${vg}/${lv}
fi
locked lvchange --addtag "ceph.wal_device=${wal_device}" /dev/${vg}/${lv}
fi
done <<< ${volumes}
fi
}
# Settle LVM changes before inspecting volumes
udev_settle
# Rename VGs first
if [[ "${OSD_DEVICE}" ]]; then
OSD_DEVICE=$(readlink -f ${OSD_DEVICE})
rename_vg ${OSD_DEVICE}
fi
if [[ "${BLOCK_DB}" ]]; then
BLOCK_DB=$(readlink -f ${BLOCK_DB})
rename_vg ${BLOCK_DB}
fi
if [[ "${BLOCK_WAL}" ]]; then
BLOCK_WAL=$(readlink -f ${BLOCK_WAL})
rename_vg ${BLOCK_WAL}
fi
# Rename LVs after VGs are correct
rename_lvs ${OSD_DEVICE}
# Update tags (all VG and LV names should be correct before calling this)
update_lv_tags ${OSD_DEVICE}
# Settle LVM changes again after any changes have been made
udev_settle
function prep_device {
local BLOCK_DEVICE=$1
local BLOCK_DEVICE_SIZE=$2
local device_type=$3
local data_disk=$4
local vg_name lv_name VG DEVICE_OSD_ID logical_devices logical_volume
vg_name=$(get_vg_name_from_device ${BLOCK_DEVICE})
lv_name=$(get_lv_name_from_device ${data_disk} ${device_type})
VG=$(vgs --noheadings -o vg_name -S "vg_name=${vg_name}" | tr -d '[:space:]')
if [[ $VG ]]; then
DEVICE_OSD_ID=$(get_osd_id_from_volume "/dev/${vg_name}/${lv_name}")
CEPH_LVM_PREPARE=1
if [ -n "${OSD_ID}" ]; then
if [ "${DEVICE_OSD_ID}" == "${OSD_ID}" ]; then
CEPH_LVM_PREPARE=0
else
disk_zap "${OSD_DEVICE}"
fi
fi
else
logical_devices=$(get_lvm_path_from_device "pv_name=~${BLOCK_DEVICE},lv_name=~${lv_name}")
if [[ -n "$logical_devices" ]]; then
dmsetup remove $logical_devices
disk_zap "${OSD_DEVICE}"
CEPH_LVM_PREPARE=1
fi
random_uuid=$(uuidgen)
locked vgcreate "ceph-vg-${random_uuid}" "${BLOCK_DEVICE}"
VG=$(get_vg_name_from_device ${BLOCK_DEVICE})
locked vgrename "ceph-vg-${random_uuid}" "${VG}"
fi
logical_volume=$(lvs --noheadings -o lv_name -S "lv_name=${lv_name}" | tr -d '[:space:]')
if [[ $logical_volume != "${lv_name}" ]]; then
locked lvcreate -L "${BLOCK_DEVICE_SIZE}" -n "${lv_name}" "${VG}"
fi
if [[ "${device_type}" == "db" ]]; then
BLOCK_DB="${VG}/${lv_name}"
elif [[ "${device_type}" == "wal" ]]; then
BLOCK_WAL="${VG}/${lv_name}"
fi
}
function osd_disk_prepare {
if [[ -z "${OSD_DEVICE}" ]]; then
echo "ERROR- You must provide a device to build your OSD ie: /dev/sdb"
exit 1
fi
if [[ ! -b "${OSD_DEVICE}" ]]; then
echo "ERROR- The device pointed by OSD_DEVICE ($OSD_DEVICE) doesn't exist !"
exit 1
fi
if [ ! -e $OSD_BOOTSTRAP_KEYRING ]; then
echo "ERROR- $OSD_BOOTSTRAP_KEYRING must exist. You can extract it from your current monitor by running 'ceph auth get client.bootstrap-osd -o $OSD_BOOTSTRAP_KEYRING'"
exit 1
fi
timeout 10 ceph ${CLI_OPTS} --name client.bootstrap-osd --keyring $OSD_BOOTSTRAP_KEYRING health || exit 1
#search for some ceph metadata on the disk based on the status of the disk/lvm in filestore
CEPH_DISK_USED=0
CEPH_LVM_PREPARE=1
udev_settle
OSD_ID=$(get_osd_id_from_device ${OSD_DEVICE})
OSD_FSID=$(get_cluster_fsid_from_device ${OSD_DEVICE})
CLUSTER_FSID=$(ceph-conf --lookup fsid)
DISK_ZAPPED=0
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
if [[ ! -z ${OSD_ID} ]]; then
DM_NUM=$(dmsetup ls | grep $(lsblk -J ${OSD_DEVICE} | jq -r '.blockdevices[].children[].name') | awk '{print $2}' | cut -d':' -f2 | cut -d')' -f1)
DM_DEV="/dev/dm-"${DM_NUM}
elif [[ $(sgdisk --print ${OSD_DEVICE} | grep "F800") ]]; then
DM_DEV=${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
CEPH_DISK_USED=1
else
if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
disk_zap ${OSD_DEVICE}
DISK_ZAPPED=1
else
echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
echo "It would be too dangerous to destroy it without any notification."
echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk."
exit 1
fi
fi
else
if [[ ! -z "${OSD_FSID}" ]]; then
if [[ "${OSD_FSID}" == "${CLUSTER_FSID}" ]]; then
if [[ ! -z "${OSD_ID}" ]]; then
if ceph --name client.bootstrap-osd --keyring $OSD_BOOTSTRAP_KEYRING osd ls |grep -w ${OSD_ID}; then
echo "Running bluestore mode and ${OSD_DEVICE} already bootstrapped"
elif [[ $OSD_FORCE_REPAIR -eq 1 ]]; then
echo "OSD initialized for this cluster, but OSD ID not found in the cluster, reinitializing"
else
echo "OSD initialized for this cluster, but OSD ID not found in the cluster"
fi
fi
else
echo "OSD initialized for a different cluster, zapping it"
disk_zap ${OSD_DEVICE}
udev_settle
fi
elif [[ $(sgdisk --print ${OSD_DEVICE} | grep "F800") ]]; then
DM_DEV=${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
CEPH_DISK_USED=1
else
dm_lv_name="$(get_lv_name_from_device ${OSD_DEVICE} lv | sed 's/-/--/g')"
if [[ ! -z "${dm_lv_name}" ]] && [[ ! -z "$(dmsetup ls | grep ${dm_lv_name})" ]]; then
CEPH_DISK_USED=1
fi
if [[ ${OSD_FORCE_REPAIR} -eq 1 ]] && [ ${CEPH_DISK_USED} -ne 1 ]; then
echo "${OSD_DEVICE} isn't clean, zapping it because OSD_FORCE_REPAIR is enabled"
disk_zap ${OSD_DEVICE}
else
echo "${OSD_DEVICE} isn't clean, but OSD_FORCE_REPAIR isn't enabled."
echo "Please set OSD_FORCE_REPAIR to '1' if you want to zap this disk."
exit 1
fi
fi
fi
if [ ${OSD_FORCE_REPAIR} -eq 1 ] && [ ! -z ${DM_DEV} ]; then
if [ -b $DM_DEV ]; then
local cephFSID=$(ceph-conf --lookup fsid)
if [ ! -z "${cephFSID}" ]; then
local tmpmnt=$(mktemp -d)
mount ${DM_DEV} ${tmpmnt}
if [ "${OSD_BLUESTORE:-0}" -ne 1 ] && [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
# we only care about journals for filestore.
if [ -f "${tmpmnt}/whoami" ]; then
OSD_JOURNAL_DISK=$(readlink -f "${tmpmnt}/journal")
local osd_id=$(cat "${tmpmnt}/whoami")
if [ ! -b "${OSD_JOURNAL_DISK}" ]; then
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
if [ ${jdev} == ${OSD_JOURNAL} ]; then
echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL}."
echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
rm -rf ${tmpmnt}/ceph_fsid
else
echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL_DISK}."
echo "Because OSD_FORCE_REPAIR is set and paritions are manually defined, we will"
echo "attempt to recreate the missing journal device partitions."
osd_journal_create ${OSD_JOURNAL}
ln -sf /dev/disk/by-partuuid/${OSD_JOURNAL_UUID} ${tmpmnt}/journal
echo ${OSD_JOURNAL_UUID} | tee ${tmpmnt}/journal_uuid
chown ceph. ${OSD_JOURNAL}
# During OSD start we will format the journal and set the fsid
touch ${tmpmnt}/run_mkjournal
fi
fi
else
echo "It looks like ${OSD_DEVICE} has a ceph data partition but is missing it's metadata."
echo "The device may contain inconsistent metadata or be corrupted."
echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
rm -rf ${tmpmnt}/ceph_fsid
fi
fi
if [ -f "${tmpmnt}/ceph_fsid" ]; then
osdFSID=$(cat "${tmpmnt}/ceph_fsid")
if [ ${osdFSID} != ${cephFSID} ]; then
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
zap_extra_partitions ${tmpmnt}
umount ${tmpmnt}
disk_zap ${OSD_DEVICE}
else
umount ${tmpmnt}
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
echo "OSD_FORCE_REPAIR is set, but will be ignored and the device will not be zapped."
echo "Moving on, trying to activate the OSD now."
fi
else
echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
zap_extra_partitions ${tmpmnt}
umount ${tmpmnt}
disk_zap ${OSD_DEVICE}
fi
else
echo "Unable to determine the FSID of the current cluster."
echo "OSD_FORCE_REPAIR is set, but this OSD will not be zapped."
echo "Moving on, trying to activate the OSD now."
return
fi
else
echo "parted says ${DM_DEV} should exist, but we do not see it."
echo "We will ignore OSD_FORCE_REPAIR and try to use the device as-is"
echo "Moving on, trying to activate the OSD now."
return
fi
else
echo "INFO- It looks like ${OSD_DEVICE} is an OSD LVM"
echo "Moving on, trying to prepare and activate the OSD LVM now."
fi
if [[ ${CEPH_DISK_USED} -eq 1 ]]; then
CLI_OPTS="${CLI_OPTS} --data ${OSD_DEVICE}"
ceph-volume simple scan --force ${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
elif [[ ${CEPH_LVM_PREPARE} -eq 1 ]] || [[ ${DISK_ZAPPED} -eq 1 ]]; then
udev_settle
vg_name=$(get_vg_name_from_device ${OSD_DEVICE})
if [[ "${vg_name}" ]]; then
OSD_VG=${vg_name}
else
random_uuid=$(uuidgen)
vgcreate ceph-vg-${random_uuid} ${OSD_DEVICE}
vg_name=$(get_vg_name_from_device ${OSD_DEVICE})
vgrename ceph-vg-${random_uuid} ${vg_name}
OSD_VG=${vg_name}
fi
lv_name=$(get_lv_name_from_device ${OSD_DEVICE} lv)
if [[ ! "$(lvdisplay | awk '/LV Name/{print $3}' | grep ${lv_name})" ]]; then
lvcreate --yes -l 100%FREE -n ${lv_name} ${OSD_VG}
fi
OSD_LV=${OSD_VG}/${lv_name}
CLI_OPTS="${CLI_OPTS} --data ${OSD_LV}"
CEPH_LVM_PREPARE=1
udev_settle
fi
if [ "${OSD_BLUESTORE:-0}" -eq 1 ] && [ ${CEPH_DISK_USED} -eq 0 ] ; then
if [[ ${BLOCK_DB} ]]; then
block_db_string=$(echo ${BLOCK_DB} | awk -F "/" '{print $2 "-" $3}')
fi
if [[ ${BLOCK_WAL} ]]; then
block_wal_string=$(echo ${BLOCK_WAL} | awk -F "/" '{print $2 "-" $3}')
fi
if [[ ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
prep_device "${BLOCK_DB}" "${BLOCK_DB_SIZE}" "db" "${OSD_DEVICE}"
prep_device "${BLOCK_WAL}" "${BLOCK_WAL_SIZE}" "wal" "${OSD_DEVICE}"
elif [[ -z ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
prep_device "${BLOCK_WAL}" "${BLOCK_WAL_SIZE}" "wal" "${OSD_DEVICE}"
elif [[ ${BLOCK_DB} && -z ${BLOCK_WAL} ]]; then
prep_device "${BLOCK_DB}" "${BLOCK_DB_SIZE}" "db" "${OSD_DEVICE}"
fi
else
if pvdisplay ${OSD_DEVICE} | awk '/VG Name/{print $3}' | grep "ceph"; then
CEPH_LVM_PREPARE=0
fi
fi
if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
CLI_OPTS="${CLI_OPTS} --bluestore"
if [ ! -z "$BLOCK_DB" ]; then
CLI_OPTS="${CLI_OPTS} --block.db ${BLOCK_DB}"
fi
if [ ! -z "$BLOCK_WAL" ]; then
CLI_OPTS="${CLI_OPTS} --block.wal ${BLOCK_WAL}"
fi
else
# we only care about journals for filestore.
osd_journal_prepare
CLI_OPTS="${CLI_OPTS} --data ${OSD_DEVICE} --journal ${OSD_JOURNAL}"
udev_settle
fi
if [ ! -z "$DEVICE_CLASS" ]; then
CLI_OPTS="${CLI_OPTS} --crush-device-class ${DEVICE_CLASS}"
fi
if [[ CEPH_LVM_PREPARE -eq 1 ]]; then
locked ceph-volume lvm -v prepare ${CLI_OPTS}
udev_settle
fi
}
function osd_journal_create {
local osd_journal=${1}
local osd_journal_partition=$(echo ${osd_journal} | sed 's/[^0-9]//g')
local jdev=$(echo ${osd_journal} | sed 's/[0-9]//g')
if [ -b "${jdev}" ]; then
sgdisk --new=${osd_journal_partition}:0:+${OSD_JOURNAL_SIZE}M \
--change-name='${osd_journal_partition}:ceph journal' \
--partition-guid=${osd_journal_partition}:${OSD_JOURNAL_UUID} \
--typecode=${osd_journal_partition}:45b0969e-9b03-4f30-b4c6-b4b80ceff106 --mbrtogpt -- ${jdev}
OSD_JOURNAL=$(dev_part ${jdev} ${osd_journal_partition})
udev_settle
else
echo "The backing device ${jdev} for ${OSD_JOURNAL} does not exist on this system."
exit 1
fi
}
function osd_journal_prepare {
if [ -n "${OSD_JOURNAL}" ]; then
if [ -b ${OSD_JOURNAL} ]; then
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
OSD_JOURNAL_PARTITION=$(echo ${OSD_JOURNAL} | sed 's/[^0-9]//g')
local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
OSD_JOURNAL=$(dev_part ${jdev} ${OSD_JOURNAL_PARTITION})
else
OSD_JOURNAL=${OSD_JOURNAL}
fi
elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
# The block device exists but doesn't appear to be paritioned, we will proceed with parititioning the device.
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
until [ -b ${OSD_JOURNAL} ]; do
osd_journal_create ${OSD_JOURNAL}
done
fi
chown ceph. ${OSD_JOURNAL};
elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
echo "For better performance on HDD, consider moving your journal to a separate device"
fi
CLI_OPTS="${CLI_OPTS} --filestore"
}
if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
osd_disk_prepare
fi