Remove replace-osd action
At present this action does not work. While looking to repair the functionality, I found a number of issues with the current implementation. For now I suggest we remove this functionality; at some point we may consider replacing it with a `remove-disk` action.

Sync in relevant changes from charms.ceph.

Depends-On: Id61b87927c43d807aacc93cf05ec8f88d91b7a39
Change-Id: Ic71d304ff65a05ab7249f4dd07adc45429a323e9
This commit is contained in:
parent
8b2303e863
commit
2c3eae272f
11
actions.yaml
11
actions.yaml
|
@ -24,17 +24,6 @@ resume:
|
|||
description: |
|
||||
Set the local osd units in the charm to 'in'. Note that the pause option
|
||||
does NOT stop the osd processes.
|
||||
replace-osd:
|
||||
description: Replace a failed osd with a fresh disk
|
||||
params:
|
||||
osd-number:
|
||||
type: integer
|
||||
description: The osd number to operate on. Example 99. Hint you can get this information from `ceph osd tree`.
|
||||
replacement-device:
|
||||
type: string
|
||||
description: The replacement device to use. Example /dev/sdb.
|
||||
required: [osd-number, replacement-device]
|
||||
additionalProperties: false
|
||||
list-disks:
|
||||
description: List the unmounted disk on the specified unit
|
||||
add-disk:
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
replace_osd.py
|
|
@ -1,99 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright 2016 Canonical Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.append('hooks/')
|
||||
sys.path.append('lib/')
|
||||
|
||||
import charmhelpers.core.hookenv as hookenv
|
||||
|
||||
import ceph.utils
|
||||
|
||||
"""
|
||||
Given an OSD number, this script will attempt to turn that back into a mount
|
||||
point and then replace the OSD with a new one.
|
||||
"""
|
||||
|
||||
|
||||
def get_disk_stats():
    """Read the kernel's per-device I/O statistics table.

    The layout of /proc/diskstats is described at
    https://www.kernel.org/doc/Documentation/iostats.txt

    :returns: list of str. The lines of /proc/diskstats, or None when the
              file could not be read. In that case the error is logged
              and the running action is marked as failed.
    """
    try:
        with open('/proc/diskstats', 'rt', encoding='UTF-8') as stats_file:
            lines = stats_file.readlines()
    except IOError as err:
        reason = str(err)
        hookenv.log(
            'Could not open /proc/diskstats. Error: {}'.format(reason))
        hookenv.action_fail(
            'replace-osd failed because /proc/diskstats could not '
            'be opened {}'.format(reason))
        return None
    return lines
|
||||
|
||||
|
||||
def lookup_device_name(major_number, minor_number):
    """Translate a (major, minor) device number into a kernel device name.

    The name is looked up in the lines returned by get_disk_stats(), whose
    first three whitespace-separated columns are the major number, the
    minor number and the device name.

    :param major_number: int. The major device number
    :param minor_number: int. The minor device number
    :return: string. The bare name of the device. Example: sda.
             Returns None when the statistics could not be read or when
             no line matches the requested device number.
    """
    diskstats = get_disk_stats()
    if diskstats is None:
        # Nothing to search; iterating None would raise a TypeError.
        return None

    for line in diskstats:
        parts = line.split()
        if len(parts) <= 3:
            # Skip bogus lines
            continue
        try:
            line_major = int(parts[0])
            line_minor = int(parts[1])
        except ValueError as value_err:
            hookenv.log('Could not convert {} or {} into an integer. Error: {}'
                        .format(parts[0], parts[1], str(value_err)))
            continue
        # Compare by value. Identity (`is`) only appears to work for the
        # small integers CPython happens to cache, and minor numbers can be
        # larger than that.
        if line_major == major_number and line_minor == minor_number:
            # Found our device. Return its name
            return parts[2]
    return None
|
||||
|
||||
|
||||
def get_device_number(osd_number):
    """Return the device number of the filesystem holding an osd.

    The osd directory /var/lib/ceph/osd/ceph-<osd_number> is examined with
    lstat (so a symlink is not followed) and the st_dev of the result is
    split into its two components.

    :param osd_number: int
    :rtype : (major_number,minor_number)
    """
    osd_dir = "/var/lib/ceph/osd/ceph-{}".format(osd_number)
    device_id = os.lstat(osd_dir).st_dev
    return os.major(device_id), os.minor(device_id)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Action entry point: work out which block device backs the failed osd
    # and hand the actual replacement over to ceph.utils.replace_osd().
    dead_osd_number = hookenv.action_get("osd-number")
    replacement_device = hookenv.action_get("replacement-device")
    major, minor = get_device_number(dead_osd_number)
    device_name = lookup_device_name(major, minor)

    if device_name is None:
        # Without this guard the lookup failure would be formatted into the
        # bogus device path "/dev/None" and passed on as the dead device.
        hookenv.action_fail(
            'replace-osd failed because the device backing osd {} could '
            'not be determined'.format(dead_osd_number))
    else:
        osd_format = hookenv.config('osd-format')
        osd_journal = hookenv.config('osd-journal')

        ceph.utils.replace_osd(dead_osd_number=dead_osd_number,
                               dead_osd_device="/dev/{}".format(device_name),
                               new_osd_device=replacement_device,
                               osd_format=osd_format,
                               osd_journal=osd_journal)
|
|
@ -13,8 +13,6 @@
|
|||
# limitations under the License.
|
||||
|
||||
import collections
|
||||
import ctypes
|
||||
import errno
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
|
@ -25,7 +23,6 @@ import socket
|
|||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import shutil
|
||||
import uuid
|
||||
|
||||
from datetime import datetime
|
||||
|
@ -38,7 +35,6 @@ from charmhelpers.core.host import (
|
|||
cmp_pkgrevno,
|
||||
lsb_release,
|
||||
mkdir,
|
||||
mounts,
|
||||
owner,
|
||||
service_restart,
|
||||
service_start,
|
||||
|
@ -835,114 +831,6 @@ CEPH_PARTITIONS = [
|
|||
]
|
||||
|
||||
|
||||
def umount(mount_point):
    """This function unmounts a mounted directory forcibly. This will
    be used for unmounting broken hard drive mounts which may hang.

    A forced unmount (MNT_FORCE) is tried first. If that fails with EBUSY
    a lazy unmount (MNT_DETACH) is tried instead.

    :param mount_point: str. A String representing the filesystem mount point
    :returns: int. Returns 0 on success. errno otherwise.
    :raises: TypeError if mount_point is not a str, bytes or path-like
    """
    # A plain `import ctypes` does not load the ctypes.util submodule, so
    # bring find_library in explicitly instead of relying on some other
    # module having imported it already.
    from ctypes.util import find_library

    mnt_force = 1
    mnt_detach = 2

    libc = ctypes.CDLL(find_library("c"), use_errno=True)

    # The C API takes a char* path. ctypes would marshal a str as wchar_t*,
    # so hand over the filesystem-encoded bytes instead.
    target = os.fsencode(mount_point)

    # umount2() is the call that honours flags; umount() only takes the
    # target, so flags passed to it would be ignored.
    if libc.umount2(target, mnt_force) >= 0:
        return 0

    err = ctypes.get_errno()
    if err != errno.EBUSY:
        return err

    # Detach from the tree. IE lazy umount
    if libc.umount2(target, mnt_detach) >= 0:
        return 0
    return ctypes.get_errno()
|
||||
|
||||
|
||||
def replace_osd(dead_osd_number,
                dead_osd_device,
                new_osd_device,
                osd_format,
                osd_journal,
                reformat_osd=False,
                ignore_errors=False):
    """This function will automate the replacement of a failed osd disk as much
    as possible. It will revoke the keys for the old osd, remove it from the
    crush map and then add a new osd into the cluster.

    A failing ceph or stop command is logged and ends the replacement
    early; it is not re-raised.

    :param dead_osd_number: The osd number found in ceph osd tree. Example: 99
    :param dead_osd_device: The physical device. Example: /dev/sda
    :param new_osd_device: The device to set up as the replacement osd.
    :param osd_format: Passed through to osdize().
    :param osd_journal: Passed through to osdize().
    :param reformat_osd: Passed through to osdize().
    :param ignore_errors: Passed through to osdize().
    :raises: RuntimeError if the old mount point could not be unmounted
    """
    # Find where the dead device is mounted, if it still is. Each entry of
    # mounts() is indexed as [mount point, device].
    mount_point = None
    for mount in mounts():
        if mount[1] == dead_osd_device:
            mount_point = mount[0]

    osd_name = 'osd.{}'.format(dead_osd_number)
    ceph_cmd = ['ceph', '--id', 'osd-upgrade']

    try:
        # Drop this osd out of the cluster. This will begin a
        # rebalance operation
        status_set('maintenance', 'Removing osd {}'.format(dead_osd_number))
        subprocess.check_output(ceph_cmd + ['osd', 'out', osd_name])

        # Kill the osd process if it's not already dead
        if systemd():
            service_stop('ceph-osd@{}'.format(dead_osd_number))
        else:
            subprocess.check_output(['stop', 'ceph-osd', 'id={}'.format(
                dead_osd_number)])

        if mount_point is None:
            # The device is no longer mounted, so there is nothing to
            # unmount or clean up.
            log('No mount point found for {}, skipping umount'.format(
                dead_osd_device))
        else:
            # umount if still mounted. umount() reports failure as a
            # positive errno, so any non-zero value is an error.
            ret = umount(mount_point)
            if ret != 0:
                raise RuntimeError('umount {} failed with error: {}'.format(
                    mount_point, os.strerror(ret)))
            # Clean up the old mount point
            shutil.rmtree(mount_point)

        subprocess.check_output(ceph_cmd + ['osd', 'crush', 'remove',
                                            osd_name])
        # Revoke the OSDs access keys
        subprocess.check_output(ceph_cmd + ['auth', 'del', osd_name])
        subprocess.check_output(ceph_cmd + ['osd', 'rm', osd_name])

        status_set('maintenance', 'Setting up replacement osd {}'.format(
            new_osd_device))
        osdize(new_osd_device,
               osd_format,
               osd_journal,
               reformat_osd,
               ignore_errors)
    except subprocess.CalledProcessError as e:
        # check_output() captures bytes; decode before building the message
        # so the str concatenation cannot raise a TypeError.
        output = e.output
        if isinstance(output, bytes):
            output = output.decode('UTF-8', errors='replace')
        log('replace_osd failed with error: {}'.format(output))
|
||||
|
||||
|
||||
def get_partition_list(dev):
|
||||
"""Lists the partitions of a block device.
|
||||
|
||||
|
@ -2248,19 +2136,14 @@ def wait_on_previous_node(upgrade_key, service, previous_node, version):
|
|||
def get_upgrade_position(osd_sorted_list, match_name):
    """Return the upgrade position for the given osd.

    :param osd_sorted_list: Osds sorted
    :type osd_sorted_list: list of objects with a ``name`` attribute
    :param match_name: The osd name to match
    :type match_name: str
    :returns: The position of the name
    :rtype: int
    :raises: ValueError if name is not found
    """
    for index, item in enumerate(osd_sorted_list):
        if item.name == match_name:
            return index
    # No entry matched: report it explicitly rather than returning None,
    # which a caller could mistake for a usable position.
    raise ValueError("osd name '{}' not found in get_upgrade_position list"
                     .format(match_name))
|
||||
|
||||
|
||||
# Edge cases:
|
||||
|
|
|
@ -1,128 +0,0 @@
|
|||
# Copyright 2016 Canonical Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import errno
|
||||
import posix
|
||||
|
||||
from mock import call, Mock, patch
|
||||
|
||||
import test_utils
|
||||
import ceph.utils as ceph
|
||||
import replace_osd
|
||||
|
||||
# Names in ceph.utils that CharmTestCase.setUp() is asked to patch.
TO_PATCH = ['ctypes', 'status_set']

# Sample /proc/diskstats content: one whole disk followed by two partitions.
# Columns are major number, minor number, device name, then I/O counters.
proc_data = [
    (' 8 0 sda 2291336 263100 108136080 1186276 28844343 28798167 '
     '2145908072 49433216 0 7550032 50630100\n'),
    ' 8 1 sda1 1379 1636 8314 692 75 17 1656 0 0 496 692\n',
    ' 8 2 sda2 1 0 2 0 0 0 0 0 0 0 0\n',
]
|
||||
|
||||
|
||||
def umount_busy(*args):
    """Fake umount call for a mount that is busy.

    The second positional argument is the umount flag: a forced unmount
    (MNT_FORCE, 1) fails with -1, a lazy unmount (MNT_DETACH, 2) succeeds
    with 0. Any other flag yields None.
    """
    flag = args[1]
    if flag == 1:
        # MNT_FORCE
        return -1
    # MNT_DETACH
    return 0 if flag == 2 else None
|
||||
|
||||
|
||||
class ReplaceOsdTestCase(test_utils.CharmTestCase):
    """Unit tests for the replace-osd action and its ceph.utils helpers."""

    def setUp(self):
        # Patch every name listed in TO_PATCH on the ceph.utils module.
        super(ReplaceOsdTestCase, self).setUp(ceph, TO_PATCH)

    def test_umount_ebusy(self):
        """A busy mount is retried with the lazy flag and reports success."""
        osd_mount = '/some/osd/mount'
        libc = Mock()
        libc.umount.side_effect = umount_busy
        self.ctypes.util.find_library.return_value = 'libc.so.6'
        self.ctypes.CDLL.return_value = libc
        self.ctypes.get_errno.return_value = errno.EBUSY

        ret = ceph.umount(osd_mount)

        expected = [call.umount(osd_mount, 1), call.umount(osd_mount, 2)]
        libc.assert_has_calls(expected)
        assert ret == 0

    def test_umount(self):
        """A forced unmount that succeeds needs no second attempt."""
        osd_mount = '/some/osd/mount'
        libc = Mock()
        libc.umount.return_value = 0
        self.ctypes.util.find_library.return_value = 'libc.so.6'
        self.ctypes.CDLL.return_value = libc

        ret = ceph.umount(osd_mount)

        libc.assert_has_calls([call.umount(osd_mount, 1)])
        assert ret == 0

    @patch.object(ceph, 'mounts')
    @patch.object(ceph.subprocess, 'check_output')
    @patch.object(ceph, 'umount')
    @patch.object(ceph, 'osdize')
    @patch.object(ceph, 'shutil')
    @patch.object(ceph, 'systemd')
    @patch.object(ceph, 'ceph_user')
    def test_replace_osd(self, ceph_user, systemd, shutil, osdize, umount,
                         check_output, mounts):
        """The dead osd is taken out, stopped and removed, in that order."""
        ceph_user.return_value = "ceph"
        systemd.return_value = False
        mounts.return_value = [['/var/lib/ceph/osd/ceph-a', '/dev/sda']]
        umount.return_value = 0
        shutil.rmtree.return_value = None
        osdize.return_value = None
        check_output.return_value = True
        self.status_set.return_value = None

        ceph.replace_osd(dead_osd_number=0,
                         dead_osd_device='/dev/sda',
                         new_osd_device='/dev/sdb',
                         osd_format=True,
                         osd_journal=None,
                         reformat_osd=False,
                         ignore_errors=False)

        ceph_cmd = ['ceph', '--id', 'osd-upgrade']
        expected_calls = [
            call(ceph_cmd + ['osd', 'out', 'osd.0']),
            call(['stop', 'ceph-osd', 'id=0']),
            call(ceph_cmd + ['osd', 'crush', 'remove', 'osd.0']),
            call(ceph_cmd + ['auth', 'del', 'osd.0']),
            call(ceph_cmd + ['osd', 'rm', 'osd.0']),
        ]
        check_output.assert_has_calls(expected_calls)

    @patch('replace_osd.get_disk_stats')
    def test_lookup_device_name(self, disk_stats):
        """Device 8:0 in the sample diskstats resolves to sda."""
        disk_stats.return_value = proc_data
        dev_name = replace_osd.lookup_device_name(major_number=8,
                                                  minor_number=0)
        assert dev_name == 'sda', "dev_name: {}".format(dev_name)

    @patch('replace_osd.os.lstat')
    def test_get_device_number(self, lstat):
        """The st_dev of the stat result is split into major and minor."""
        fake_stat = posix.stat_result([
            16877, 16, 51729, 3, 0, 0, 217, 0, 1458086872, 1458086872
        ])
        lstat.return_value = fake_stat
        major, minor = replace_osd.get_device_number(1)
        assert major == 202
        assert minor == 17
|
Loading…
Reference in New Issue