Resync ceph helpers for misc fixes

Pick up fixes for upgrades from Jewel -> Luminous.

Change-Id: Id0694b5116e604efbe1c5259de332ae0c4bae574
Closes-Bug: 1742082
Closes-Bug: 1742083
Closes-Bug: 1742120
Closes-Bug: 1742079
Closes-Bug: 1742408
James Page 2018-01-10 09:12:20 +00:00
parent 4e411761bf
commit 1f0593649c
2 changed files with 57 additions and 34 deletions


@@ -338,7 +338,7 @@ def save_service(service_name, service):
service['groups'] = {}
return monitor_key_set(service='admin',
key="cephx.services.{}".format(service_name),
value=json.dumps(service))
value=json.dumps(service, sort_keys=True))
def save_group(group, group_name):
@@ -346,7 +346,7 @@ def save_group(group, group_name):
group_key = get_group_key(group_name=group_name)
return monitor_key_set(service='admin',
key=group_key,
value=json.dumps(group))
value=json.dumps(group, sort_keys=True))
def get_group_key(group_name):
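
Passing sort_keys=True makes the JSON stored via monitor_key_set deterministic: two logically identical service or group definitions always serialize to the same string regardless of the order their keys were inserted. A standalone illustration of the effect (the dict contents are placeholders, not real charm data):

    import json

    # Two logically identical definitions built with different key order.
    a = {'services': ['rgw'], 'pools': ['data']}
    b = {'pools': ['data'], 'services': ['rgw']}

    json.dumps(a) == json.dumps(b)                                    # False
    json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True)   # True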


@@ -1311,7 +1311,8 @@ def bootstrap_monitor_cluster(secret):
# Ceph >= 0.61.3 needs this for ceph-mon fs creation
mkdir('/var/run/ceph', owner=ceph_user(),
group=ceph_user(), perms=0o755)
mkdir(path, owner=ceph_user(), group=ceph_user())
mkdir(path, owner=ceph_user(), group=ceph_user(),
perms=0o755)
# end changes for Ceph >= 0.61.3
try:
add_keyring_to_ceph(keyring,
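
Adding perms=0o755 ensures the mon data directory ends up owner-writable for the ceph user rather than whatever restrictive mode the helper would otherwise apply. A rough standalone equivalent of that single call, using only the standard library (the hostname in the path and the 'ceph' account are placeholders):

    import grp
    import os
    import pwd

    # Sketch of the effect of mkdir(path, owner=ceph_user(),
    # group=ceph_user(), perms=0o755) above.
    path = '/var/lib/ceph/mon/ceph-myhost'
    os.makedirs(path, exist_ok=True)
    os.chmod(path, 0o755)                      # rwx for owner, r-x for group/other
    os.chown(path,
             pwd.getpwnam('ceph').pw_uid,
             grp.getgrnam('ceph').gr_gid)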
@@ -1673,12 +1674,23 @@ def roll_monitor_cluster(new_version, upgrade_key):
service='mon',
my_name=my_name,
version=new_version)
# NOTE(jamespage):
# Wait until all monitors have upgraded before bootstrapping
# the ceph-mgr daemons due to use of new mgr keyring profiles
if new_version == 'luminous':
wait_for_all_monitors_to_upgrade(new_version=new_version,
upgrade_key=upgrade_key)
bootstrap_manager()
except ValueError:
log("Failed to find {} in list {}.".format(
my_name, mon_sorted_list))
status_set('blocked', 'failed to upgrade monitor')
# TODO(jamespage):
# Mimic support will need to ensure that ceph-mgr daemons are also
# restarted during upgrades - probably through use of one of the
# high level systemd targets shipped by the packaging.
def upgrade_monitor(new_version):
"""Upgrade the current ceph monitor to the new version
@@ -1699,26 +1711,31 @@ def upgrade_monitor(new_version):
sys.exit(1)
try:
if systemd():
for mon_id in get_local_mon_ids():
service_stop('ceph-mon@{}'.format(mon_id))
service_stop('ceph-mon')
else:
service_stop('ceph-mon-all')
apt_install(packages=determine_packages(), fatal=True)
owner = ceph_user()
# Ensure the files and directories under /var/lib/ceph is chowned
# properly as part of the move to the Jewel release, which moved the
# ceph daemons to running as ceph:ceph instead of root:root.
if new_version == 'jewel':
# Ensure the ownership of Ceph's directories is correct
owner = ceph_user()
chownr(path=os.path.join(os.sep, "var", "lib", "ceph"),
owner=owner,
group=owner,
follow_links=True)
# Ensure that mon directory is user writable
hostname = socket.gethostname()
path = '/var/lib/ceph/mon/ceph-{}'.format(hostname)
mkdir(path, owner=ceph_user(), group=ceph_user(),
perms=0o755)
if systemd():
for mon_id in get_local_mon_ids():
service_start('ceph-mon@{}'.format(mon_id))
service_start('ceph-mon')
else:
service_start('ceph-mon-all')
except subprocess.CalledProcessError as err:
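
Under systemd the monitors run as instantiated units (ceph-mon@<mon-id>.service), so stopping or starting a bare 'ceph-mon' name does not reliably touch the running daemons; looping over get_local_mon_ids() addresses each instance directly. A standalone illustration of the same idea (the mon id is a placeholder, and the charm goes through its own service_stop/service_start helpers rather than subprocess):

    import subprocess

    local_mon_ids = ['myhost']        # placeholder; the charm uses get_local_mon_ids()

    for mon_id in local_mon_ids:
        subprocess.check_call(['systemctl', 'stop', 'ceph-mon@{}'.format(mon_id)])
    # ... upgrade the packages here ...
    for mon_id in local_mon_ids:
        subprocess.check_call(['systemctl', 'start', 'ceph-mon@{}'.format(mon_id)])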
@@ -1799,25 +1816,28 @@ def wait_on_previous_node(upgrade_key, service, previous_node, version):
previous_node_start_time = monitor_key_get(
upgrade_key,
"{}_{}_{}_start".format(service, previous_node, version))
if (current_timestamp - (10 * 60)) > previous_node_start_time:
# Previous node is probably dead. Lets move on
if previous_node_start_time is not None:
log(
"Waited 10 mins on node {}. current time: {} > "
"previous node start time: {} Moving on".format(
previous_node,
(current_timestamp - (10 * 60)),
previous_node_start_time))
return
else:
# I have to wait. Sleep a random amount of time and then
# check if I can lock,upgrade and roll.
wait_time = random.randrange(5, 30)
log('waiting for {} seconds'.format(wait_time))
time.sleep(wait_time)
previous_node_finished = monitor_key_exists(
upgrade_key,
"{}_{}_{}_done".format(service, previous_node, version))
if (previous_node_start_time is not None and
((current_timestamp - (10 * 60)) >
float(previous_node_start_time))):
# NOTE(jamespage):
# Previous node is probably dead as we've been waiting
# for 10 minutes - lets move on and upgrade
log("Waited 10 mins on node {}. current time: {} > "
"previous node start time: {} Moving on".format(
previous_node,
(current_timestamp - (10 * 60)),
previous_node_start_time))
return
# NOTE(jamespage)
# Previous node has not started, or started less than
# 10 minutes ago - sleep a random amount of time and
# then check again.
wait_time = random.randrange(5, 30)
log('waiting for {} seconds'.format(wait_time))
time.sleep(wait_time)
previous_node_finished = monitor_key_exists(
upgrade_key,
"{}_{}_{}_done".format(service, previous_node, version))
def get_upgrade_position(osd_sorted_list, match_name):
@@ -1874,7 +1894,7 @@ def roll_osd_cluster(new_version, upgrade_key):
version=new_version)
else:
# Check if the previous node has finished
status_set('blocked',
status_set('waiting',
'Waiting on {} to finish upgrading'.format(
osd_sorted_list[position - 1].name))
wait_on_previous_node(
@@ -1922,7 +1942,10 @@ def upgrade_osd(new_version):
# way to update the code on the node.
if not dirs_need_ownership_update('osd'):
log('Restarting all OSDs to load new binaries', DEBUG)
service_restart('ceph-osd-all')
if systemd():
service_restart('ceph-osd.target')
else:
service_restart('ceph-osd-all')
return
# Need to change the ownership of all directories which are not OSD
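
On systemd hosts the individual ceph-osd@<id> services are grouped under ceph-osd.target, so restarting the target bounces every OSD on the machine to pick up the new binaries, while 'ceph-osd-all' remains the equivalent on upstart-based releases. A standalone sketch of that branch (the charm itself uses its service_restart helper rather than calling out directly):

    import subprocess

    def restart_all_osds(use_systemd):
        """Restart every OSD daemon on this host after a package upgrade."""
        if use_systemd:
            subprocess.check_call(['systemctl', 'restart', 'ceph-osd.target'])
        else:
            subprocess.check_call(['service', 'ceph-osd-all', 'restart'])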
@@ -2148,11 +2171,11 @@ def dirs_need_ownership_update(service):
return False
# A dict of valid ceph upgrade paths. Mapping is old -> new
UPGRADE_PATHS = {
'firefly': 'hammer',
'hammer': 'jewel',
'jewel': 'luminous',
}
UPGRADE_PATHS = collections.OrderedDict([
('firefly', 'hammer'),
('hammer', 'jewel'),
('jewel', 'luminous'),
])
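
Replacing the plain dict with collections.OrderedDict pins iteration over the supported upgrade paths to the declared oldest-to-newest sequence on every Python version; a plain dict only guarantees insertion order from Python 3.7 onward. A quick standalone illustration:

    import collections

    UPGRADE_PATHS = collections.OrderedDict([
        ('firefly', 'hammer'),
        ('hammer', 'jewel'),
        ('jewel', 'luminous'),
    ])

    # Iteration now reliably walks the chain oldest to newest.
    for old, new in UPGRADE_PATHS.items():
        print('{} -> {}'.format(old, new))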
# Map UCA codenames to ceph codenames
UCA_CODENAME_MAP = {