Resync ceph helpers for misc fixes

Pick up fixes for upgrades from Jewel -> Luminous.

Change-Id: Id0694b5116e604efbe1c5259de332ae0c4bae574
Closes-Bug: 1742082
Closes-Bug: 1742083
Closes-Bug: 1742120
Closes-Bug: 1742079
Closes-Bug: 1742408
James Page 2018-01-10 09:12:20 +00:00
parent 4e411761bf
commit 1f0593649c
2 changed files with 57 additions and 34 deletions


@@ -338,7 +338,7 @@ def save_service(service_name, service):
     service['groups'] = {}
     return monitor_key_set(service='admin',
                            key="cephx.services.{}".format(service_name),
-                           value=json.dumps(service))
+                           value=json.dumps(service, sort_keys=True))
 
 
 def save_group(group, group_name):
@@ -346,7 +346,7 @@ def save_group(group, group_name):
     group_key = get_group_key(group_name=group_name)
     return monitor_key_set(service='admin',
                            key=group_key,
-                           value=json.dumps(group))
+                           value=json.dumps(group, sort_keys=True))
 
 
 def get_group_key(group_name):
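Note on the sort_keys=True change above: before CPython 3.7, dict key order is arbitrary, so two equal service/group records could serialize to different JSON strings and needlessly rewrite (or fail comparisons against) the copy held in the monitor KV store. A minimal illustration, not code from this repo:

    import json

    a = {'group_names': {'rwx': ['mygroup']}, 'groups': {}}
    b = {'groups': {}, 'group_names': {'rwx': ['mygroup']}}

    assert a == b                             # equal dicts...
    print(json.dumps(a) == json.dumps(b))     # ...but False: order follows insertion
    # sort_keys gives a canonical representation:
    assert json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True)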


@@ -1311,7 +1311,8 @@ def bootstrap_monitor_cluster(secret):
         # Ceph >= 0.61.3 needs this for ceph-mon fs creation
         mkdir('/var/run/ceph', owner=ceph_user(),
               group=ceph_user(), perms=0o755)
-        mkdir(path, owner=ceph_user(), group=ceph_user())
+        mkdir(path, owner=ceph_user(), group=ceph_user(),
+              perms=0o755)
         # end changes for Ceph >= 0.61.3
         try:
             add_keyring_to_ceph(keyring,
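Note on the added perms=0o755: the charmhelpers mkdir helper used here creates directories with a read-only 0o555 mode unless told otherwise, which can leave a freshly created mon data directory unwritable even by its owner. A rough stdlib sketch of what the fixed call achieves (the real helper also handles force and realpath edge cases):

    import grp
    import os
    import pwd

    def make_owned_dir(path, user='ceph', group='ceph', perms=0o755):
        """Create path (if missing) owned by user:group with an explicit mode."""
        if not os.path.exists(path):
            os.makedirs(path)
        os.chmod(path, perms)   # set mode explicitly; makedirs is umask-filtered
        os.chown(path, pwd.getpwnam(user).pw_uid, grp.getgrnam(group).gr_gid)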
@@ -1673,12 +1674,23 @@ def roll_monitor_cluster(new_version, upgrade_key):
                           service='mon',
                           my_name=my_name,
                           version=new_version)
+        # NOTE(jamespage):
+        # Wait until all monitors have upgraded before bootstrapping
+        # the ceph-mgr daemons due to use of new mgr keyring profiles
+        if new_version == 'luminous':
+            wait_for_all_monitors_to_upgrade(new_version=new_version,
+                                             upgrade_key=upgrade_key)
+            bootstrap_manager()
     except ValueError:
         log("Failed to find {} in list {}.".format(
             my_name, mon_sorted_list))
         status_set('blocked', 'failed to upgrade monitor')
 
 
+# TODO(jamespage):
+# Mimic support will need to ensure that ceph-mgr daemons are also
+# restarted during upgrades - probably through use of one of the
+# high level systemd targets shipped by the packaging.
 def upgrade_monitor(new_version):
     """Upgrade the current ceph monitor to the new version
@@ -1699,26 +1711,31 @@ def upgrade_monitor(new_version):
         sys.exit(1)
     try:
         if systemd():
-            for mon_id in get_local_mon_ids():
-                service_stop('ceph-mon@{}'.format(mon_id))
+            service_stop('ceph-mon')
         else:
             service_stop('ceph-mon-all')
         apt_install(packages=determine_packages(), fatal=True)
 
+        owner = ceph_user()
+
         # Ensure the files and directories under /var/lib/ceph is chowned
         # properly as part of the move to the Jewel release, which moved the
         # ceph daemons to running as ceph:ceph instead of root:root.
         if new_version == 'jewel':
             # Ensure the ownership of Ceph's directories is correct
-            owner = ceph_user()
             chownr(path=os.path.join(os.sep, "var", "lib", "ceph"),
                    owner=owner,
                    group=owner,
                    follow_links=True)
 
+        # Ensure that mon directory is user writable
+        hostname = socket.gethostname()
+        path = '/var/lib/ceph/mon/ceph-{}'.format(hostname)
+        mkdir(path, owner=ceph_user(), group=ceph_user(),
+              perms=0o755)
+
         if systemd():
-            for mon_id in get_local_mon_ids():
-                service_start('ceph-mon@{}'.format(mon_id))
+            service_start('ceph-mon')
         else:
             service_start('ceph-mon-all')
     except subprocess.CalledProcessError as err:
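Note on the systemd branch above: stopping and starting plain ceph-mon, rather than looping over get_local_mon_ids(), avoids missing or mismatched ceph-mon@<id> units mid-upgrade. For context, the service helpers dispatch on the init system roughly like this (a simplification of the charmhelpers versions):

    import os
    import subprocess

    def init_is_systemd():
        # systemd mounts this tmpfs at boot, so it doubles as a probe
        return os.path.isdir('/run/systemd/system')

    def service(action, name):
        if init_is_systemd():
            cmd = ['systemctl', action, name]
        else:
            cmd = ['service', name, action]
        return subprocess.call(cmd) == 0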
@@ -1799,25 +1816,28 @@ def wait_on_previous_node(upgrade_key, service, previous_node, version):
         previous_node_start_time = monitor_key_get(
             upgrade_key,
             "{}_{}_{}_start".format(service, previous_node, version))
-        if (current_timestamp - (10 * 60)) > previous_node_start_time:
-            # Previous node is probably dead. Lets move on
-            if previous_node_start_time is not None:
-                log(
-                    "Waited 10 mins on node {}. current time: {} > "
-                    "previous node start time: {} Moving on".format(
-                        previous_node,
-                        (current_timestamp - (10 * 60)),
-                        previous_node_start_time))
-                return
-        else:
-            # I have to wait. Sleep a random amount of time and then
-            # check if I can lock,upgrade and roll.
-            wait_time = random.randrange(5, 30)
-            log('waiting for {} seconds'.format(wait_time))
-            time.sleep(wait_time)
-            previous_node_finished = monitor_key_exists(
-                upgrade_key,
-                "{}_{}_{}_done".format(service, previous_node, version))
+        if (previous_node_start_time is not None and
+                ((current_timestamp - (10 * 60)) >
+                 float(previous_node_start_time))):
+            # NOTE(jamespage):
+            # Previous node is probably dead as we've been waiting
+            # for 10 minutes - lets move on and upgrade
+            log("Waited 10 mins on node {}. current time: {} > "
+                "previous node start time: {} Moving on".format(
+                    previous_node,
+                    (current_timestamp - (10 * 60)),
+                    previous_node_start_time))
+            return
+        # NOTE(jamespage)
+        # Previous node has not started, or started less than
+        # 10 minutes ago - sleep a random amount of time and
+        # then check again.
+        wait_time = random.randrange(5, 30)
+        log('waiting for {} seconds'.format(wait_time))
+        time.sleep(wait_time)
+        previous_node_finished = monitor_key_exists(
+            upgrade_key,
+            "{}_{}_{}_done".format(service, previous_node, version))
 
 
 def get_upgrade_position(osd_sorted_list, match_name):
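Note on the rewritten guard above: monitor_key_get returns None until the previous node publishes its start key, and the old code compared against the value before the None check; the value also comes back as a string, hence the explicit float(). An illustrative reproduction, not repo code:

    import time

    current_timestamp = time.time()
    previous_node_start_time = None            # start key not published yet

    try:
        (current_timestamp - (10 * 60)) > previous_node_start_time
    except TypeError:
        print('Python 3: comparing float with None raises')

    previous_node_start_time = '1515575540.0'  # monitor keys are strings
    assert (current_timestamp - (10 * 60)) > float(previous_node_start_time)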
@@ -1874,7 +1894,7 @@ def roll_osd_cluster(new_version, upgrade_key):
                           version=new_version)
         else:
             # Check if the previous node has finished
-            status_set('blocked',
+            status_set('waiting',
                        'Waiting on {} to finish upgrading'.format(
                            osd_sorted_list[position - 1].name))
             wait_on_previous_node(
@@ -1922,7 +1942,10 @@ def upgrade_osd(new_version):
         # way to update the code on the node.
         if not dirs_need_ownership_update('osd'):
             log('Restarting all OSDs to load new binaries', DEBUG)
-            service_restart('ceph-osd-all')
+            if systemd():
+                service_restart('ceph-osd.target')
+            else:
+                service_restart('ceph-osd-all')
             return
 
         # Need to change the ownership of all directories which are not OSD
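Note on the restart branch above: ceph-osd-all is the upstart aggregate job and does not exist under systemd; there the Ceph packages ship a ceph-osd.target that the per-OSD ceph-osd@<id> units are part of, so restarting the target cycles all OSDs on the host. An illustrative check of which units that covers (assumes a systemd host):

    import subprocess

    # List the ceph-osd@<id> units a 'systemctl restart ceph-osd.target'
    # would cycle on this host.
    out = subprocess.check_output(
        ['systemctl', 'list-units', '--no-legend', 'ceph-osd@*'],
        universal_newlines=True)
    for line in out.splitlines():
        print(line.split()[0])      # e.g. ceph-osd@0.service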
@@ -2148,11 +2171,11 @@ def dirs_need_ownership_update(service):
     return False
 
 
 # A dict of valid ceph upgrade paths. Mapping is old -> new
-UPGRADE_PATHS = {
-    'firefly': 'hammer',
-    'hammer': 'jewel',
-    'jewel': 'luminous',
-}
+UPGRADE_PATHS = collections.OrderedDict([
+    ('firefly', 'hammer'),
+    ('hammer', 'jewel'),
+    ('jewel', 'luminous'),
+])
 
 # Map UCA codenames to ceph codenames
 UCA_CODENAME_MAP = {
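Note on the OrderedDict change above: dict iteration order is arbitrary before CPython 3.7, so helpers that walk or pretty-print the supported upgrade paths could emit them in any order; an OrderedDict pins iteration to oldest-release-first. A small demonstration, not taken from the charm:

    import collections

    UPGRADE_PATHS = collections.OrderedDict([
        ('firefly', 'hammer'),
        ('hammer', 'jewel'),
        ('jewel', 'luminous'),
    ])

    # Iteration now follows insertion order on every run:
    print(', '.join('{} -> {}'.format(old, new)
                    for old, new in UPGRADE_PATHS.items()))
    # firefly -> hammer, hammer -> jewel, jewel -> luminous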