From c421aa742909f78c8b7c9a4548874795b70dad87 Mon Sep 17 00:00:00 2001
From: Billy Olsen
Date: Mon, 6 Feb 2017 21:59:51 -0700
Subject: [PATCH] Roll osd ownership changes through node

Change the OSD upgrade path so that the file ownership change for the
OSD directories is run one OSD at a time rather than all of the OSDs
at once.

Partial-Bug: #1662591
Change-Id: I3a1cf05207c070a8699e7ba749a0587b619d4679
---
 ceph/__init__.py                    | 294 +++++++++++++++++++++++++---
 unit_tests/test_osd_upgrade_roll.py | 122 ++++++++++--
 2 files changed, 366 insertions(+), 50 deletions(-)

diff --git a/ceph/__init__.py b/ceph/__init__.py
index e87aef9..1f6196a 100644
--- a/ceph/__init__.py
+++ b/ceph/__init__.py
@@ -26,19 +26,27 @@ import errno
 import shutil
 import pyudev
 
+from datetime import datetime
+
 from charmhelpers.core import hookenv
 from charmhelpers.core.host import (
-    mkdir, chownr,
-    service_restart,
+    cmp_pkgrevno,
     lsb_release,
-    cmp_pkgrevno, service_stop, mounts, service_start)
+    mkdir,
+    mounts,
+    owner,
+    service_restart,
+    service_start,
+    service_stop)
 from charmhelpers.core.hookenv import (
-    log,
-    ERROR, cached,
+    cached,
+    config,
+    log,
     status_set,
-    WARNING, DEBUG, config)
+    DEBUG,
+    ERROR,
+    WARNING)
 from charmhelpers.core.services import render_template
 from charmhelpers.fetch import (
     apt_cache,
@@ -55,6 +63,9 @@ from charmhelpers.contrib.storage.linux.utils import (
 from charmhelpers.contrib.openstack.utils import (
     get_os_codename_install_source)
 
+CEPH_BASE_DIR = os.path.join(os.sep, 'var', 'lib', 'ceph')
+OSD_BASE_DIR = os.path.join(CEPH_BASE_DIR, 'osd')
+
 LEADER = 'leader'
 PEON = 'peon'
 QUORUM = [LEADER, PEON]
@@ -556,6 +567,42 @@ def get_osd_tree(service):
         raise
 
 
+def _get_child_dirs(path):
+    """Returns a list of directory names in the specified path.
+
+    :param path: a full path listing of the parent directory to return child
+                 directory names
+    :return: list. A list of child directories under the parent directory
+    :raises: ValueError if the specified path does not exist or is not a
+             directory,
+             OSError if an error occurs reading the directory listing
+    """
+    if not os.path.exists(path):
+        raise ValueError('Specified path "%s" does not exist' % path)
+    if not os.path.isdir(path):
+        raise ValueError('Specified path "%s" is not a directory' % path)
+
+    return [child for child in os.listdir(path)
+            if os.path.isdir(os.path.join(path, child))]
+
+
+def _get_osd_num_from_dirname(dirname):
+    """Parses the dirname and returns the OSD id.
+
+    Parses a string in the form of 'ceph-{osd#}' and returns the osd number
+    from the directory name.
+
+    :param dirname: the directory name to return the OSD number from
+    :return int: the osd number the directory name corresponds to
+    :raises ValueError: if the osd number cannot be parsed from the provided
+                        directory name.
+    """
+    match = re.search(r'ceph-(?P<osd_id>\d+)', dirname)
+    if not match:
+        raise ValueError("dirname not in correct format: %s" % dirname)
+
+    return match.group('osd_id')
+
+
 def get_local_osd_ids():
     """
     This will list the /var/lib/ceph/osd/* directories and try
@@ -1635,42 +1682,198 @@ def upgrade_osd(new_version):
         add_source(config('source'), config('key'))
         apt_update(fatal=True)
     except subprocess.CalledProcessError as err:
-        log("Adding the ceph source failed with message: {}".format(
+        log("Adding the ceph sources failed with message: {}".format(
             err.message))
         status_set("blocked", "Upgrade to {} failed".format(new_version))
         sys.exit(1)
+
     try:
-        if systemd():
-            for osd_id in get_local_osd_ids():
-                service_stop('ceph-osd@{}'.format(osd_id))
-        else:
-            service_stop('ceph-osd-all')
+        # Upgrade the packages before restarting the daemons.
+        status_set('maintenance', 'Upgrading packages to %s' % new_version)
         apt_install(packages=PACKAGES, fatal=True)
-        # Ensure the files and directories under /var/lib/ceph is chowned
-        # properly as part of the move to the Jewel release, which moved the
-        # ceph daemons to running as ceph:ceph instead of root:root. Only do
-        # it when necessary as this is an expensive operation to run.
-        if new_version == 'jewel':
-            owner = ceph_user()
-            status_set('maintenance', 'Updating file ownership for OSDs')
-            chownr(path=os.path.join(os.sep, "var", "lib", "ceph"),
-                   owner=owner,
-                   group=owner,
-                   follow_links=True)
+        # If the upgrade does not need an ownership update of any of the
+        # directories in the osd service directory, then simply restart
+        # all of the OSDs at the same time as this will be the fastest
+        # way to update the code on the node.
+        if not dirs_need_ownership_update('osd'):
+            log('Restarting all OSDs to load new binaries', DEBUG)
+            service_restart('ceph-osd-all')
+            return
 
-        if systemd():
-            for osd_id in get_local_osd_ids():
-                service_start('ceph-osd@{}'.format(osd_id))
-        else:
-            service_start('ceph-osd-all')
-    except subprocess.CalledProcessError as err:
+        # Need to change the ownership of all directories which are not OSD
+        # directories as well.
+        # TODO - this should probably be moved to the general upgrade function
+        #        and done before mon/osd.
+        update_owner(CEPH_BASE_DIR, recurse_dirs=False)
+        non_osd_dirs = filter(lambda x: not x == 'osd',
+                              os.listdir(CEPH_BASE_DIR))
+        non_osd_dirs = map(lambda x: os.path.join(CEPH_BASE_DIR, x),
+                           non_osd_dirs)
+        for path in non_osd_dirs:
+            update_owner(path)
+
+        # Fast service restart wasn't an option because each of the OSD
+        # directories need the ownership updated for all the files on
+        # the OSD. Walk through the OSDs one-by-one upgrading the OSD.
+        for osd_dir in _get_child_dirs(OSD_BASE_DIR):
+            try:
+                osd_num = _get_osd_num_from_dirname(osd_dir)
+                _upgrade_single_osd(osd_num, osd_dir)
+            except ValueError as ex:
+                # Directory could not be parsed - junk directory?
+                log('Could not parse osd directory %s: %s' % (osd_dir, ex),
+                    WARNING)
+                continue
+
+    except (subprocess.CalledProcessError, IOError) as err:
         log("Stopping ceph and upgrading packages failed "
            "with message: {}".format(err.message))
         status_set("blocked", "Upgrade to {} failed".format(new_version))
         sys.exit(1)
 
+
+def _upgrade_single_osd(osd_num, osd_dir):
+    """Upgrades the single OSD directory.
+
+    :param osd_num: the num of the OSD
+    :param osd_dir: the directory of the OSD to upgrade
+    :raises CalledProcessError: if an error occurs in a command issued as part
+                                of the upgrade process
+    :raises IOError: if an error occurs reading/writing to a file as part
+                     of the upgrade process
+    """
+    stop_osd(osd_num)
+    disable_osd(osd_num)
+    update_owner(os.path.join(OSD_BASE_DIR, osd_dir))
+    enable_osd(osd_num)
+    start_osd(osd_num)
+
+
+def stop_osd(osd_num):
+    """Stops the specified OSD number.
+
+    :param osd_num: the osd number to stop
+    """
+    if systemd():
+        service_stop('ceph-osd@{}'.format(osd_num))
+    else:
+        service_stop('ceph-osd', id=osd_num)
+
+
+def start_osd(osd_num):
+    """Starts the specified OSD number.
+
+    :param osd_num: the osd number to start.
+    """
+    if systemd():
+        service_start('ceph-osd@{}'.format(osd_num))
+    else:
+        service_start('ceph-osd', id=osd_num)
+
+
+def disable_osd(osd_num):
+    """Disables the specified OSD number.
+
+    Ensures that the specified osd will not be automatically started at the
+    next reboot of the system. Due to differences between init systems,
+    this method cannot make any guarantees that the specified osd cannot be
+    started manually.
+
+    :param osd_num: the osd id which should be disabled.
+    :raises CalledProcessError: if an error occurs invoking the systemd cmd
+                                to disable the OSD
+    :raises IOError, OSError: if the attempt to read/remove the ready file in
+                              an upstart enabled system fails
+    """
+    if systemd():
+        # When running under systemd, the individual ceph-osd daemons run as
+        # templated units and can be directly addressed by referring to the
+        # templated service name ceph-osd@<osd_num>. Additionally, systemd
+        # allows one to disable a specific templated unit by running the
+        # 'systemctl disable ceph-osd@<osd_num>' command. When disabled, the
+        # OSD should remain disabled until re-enabled via systemd.
+        # Note: disabling an already disabled service in systemd returns 0, so
+        # no need to check whether it is enabled or not.
+        cmd = ['systemctl', 'disable', 'ceph-osd@{}'.format(osd_num)]
+        subprocess.check_call(cmd)
+    else:
+        # Neither upstart nor the ceph-osd upstart script provides for
+        # disabling the starting of an OSD automatically. The specific OSD
+        # cannot be prevented from running manually, however it can be
+        # prevented from running automatically on reboot by removing the
+        # 'ready' file in the OSD's root directory. This is due to the
+        # ceph-osd-all upstart script checking for the presence of this file
+        # before starting the OSD.
+        ready_file = os.path.join(OSD_BASE_DIR, 'ceph-{}'.format(osd_num),
+                                  'ready')
+        if os.path.exists(ready_file):
+            os.unlink(ready_file)
+
+
+def enable_osd(osd_num):
+    """Enables the specified OSD number.
+
+    Ensures that the specified osd_num will be enabled and ready to start
+    automatically in the event of a reboot.
+
+    :param osd_num: the osd id which should be enabled.
+    :raises CalledProcessError: if the call to the systemd command issued
+                                fails when enabling the service
+    :raises IOError: if the attempt to write the ready file in an upstart
+                     enabled system fails
+    """
+    if systemd():
+        cmd = ['systemctl', 'enable', 'ceph-osd@{}'.format(osd_num)]
+        subprocess.check_call(cmd)
+    else:
+        # When running on upstart, the OSDs are started via the ceph-osd-all
+        # upstart script which will only start the osd if it has a 'ready'
+        # file. Make sure that file exists.
+        ready_file = os.path.join(OSD_BASE_DIR, 'ceph-{}'.format(osd_num),
+                                  'ready')
+        with open(ready_file, 'w') as f:
+            f.write('ready')
+
+        # Make sure the correct user owns the file. It shouldn't be necessary
+        # as the upstart script should run with root privileges, but it's
+        # better to have all the files matching ownership.
+        update_owner(ready_file)
+
+
+def update_owner(path, recurse_dirs=True):
+    """Changes the ownership of the specified path.
+
+    Changes the ownership of the specified path to the new ceph daemon user
+    using the system's native chown functionality. This may take a while,
+    so this method will issue a set_status for any changes of ownership which
+    recurses into directory structures.
+
+    :param path: the path to recursively change ownership for
+    :param recurse_dirs: boolean indicating whether to recursively change the
+                         ownership of all the files in a path's subtree or to
+                         simply change the ownership of the path.
+    :raises CalledProcessError: if an error occurs issuing the chown system
+                                command
+    """
+    user = ceph_user()
+    user_group = '{ceph_user}:{ceph_user}'.format(ceph_user=user)
+    cmd = ['chown', user_group, path]
+    if os.path.isdir(path) and recurse_dirs:
+        status_set('maintenance', ('Updating ownership of %s to %s' %
                                    (path.split('/')[-1], user)))
+        cmd.insert(1, '-R')
+
+    log('Changing ownership of {path} to {user}'.format(
+        path=path, user=user_group), DEBUG)
+    start = datetime.now()
+    subprocess.check_call(cmd)
+    elapsed_time = (datetime.now() - start)
+
+    log('Took {secs} seconds to change the ownership of path: {path}'.format(
+        secs=elapsed_time.total_seconds(), path=path), DEBUG)
+
+
 def list_pools(service):
     """
     This will list the current pools that Ceph has
@@ -1689,6 +1892,39 @@ def list_pools(service):
         log("rados lspools failed with error: {}".format(err.output))
         raise
 
+
+def dirs_need_ownership_update(service):
+    """Determines if directories still need change of ownership.
+
+    Examines the set of directories under the /var/lib/ceph/{service}
+    directory and determines if they have the correct ownership or not.
+    This is necessary due to the upgrade from Hammer to Jewel where the
+    daemon user changes from root:root to ceph:ceph.
+
+    :param service: the name of the service folder to check (e.g. osd, mon)
+    :return: boolean. True if the directories need a change of ownership,
+             False otherwise.
+    :raises IOError: if an error occurs reading the file stats from one of
+                     the child directories.
+    :raises OSError: if the specified path does not exist or some other error
+                     occurs reading the directory contents.
+    """
+    expected_owner = expected_group = ceph_user()
+    path = os.path.join(CEPH_BASE_DIR, service)
+    for child in _get_child_dirs(path):
+        child_path = os.path.join(path, child)
+        curr_owner, curr_group = owner(child_path)
+
+        if (curr_owner == expected_owner) and (curr_group == expected_group):
+            continue
+
+        log('Directory "%s" needs its ownership updated' % child_path,
+            DEBUG)
+        return True
+
+    # All child directories had the expected ownership
+    return False
+
+
 # A dict of valid ceph upgrade paths. Mapping is old -> new
 UPGRADE_PATHS = {
     'firefly': 'hammer',
diff --git a/unit_tests/test_osd_upgrade_roll.py b/unit_tests/test_osd_upgrade_roll.py
index 4a878ab..2e58c88 100644
--- a/unit_tests/test_osd_upgrade_roll.py
+++ b/unit_tests/test_osd_upgrade_roll.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 
 import time
+import os
 import unittest
+import sys
 
-from mock import patch, call
+from mock import patch, call, mock_open
 
 import ceph
 from ceph import CrushLocation
@@ -61,10 +63,10 @@ def monitor_key_side_effect(*args):
 
 
 class UpgradeRollingTestCase(unittest.TestCase):
+    @patch('ceph.dirs_need_ownership_update')
     @patch('ceph.apt_install')
     @patch('ceph.chownr')
-    @patch('ceph.service_stop')
-    @patch('ceph.service_start')
+    @patch('ceph.service_restart')
     @patch('ceph.log')
     @patch('ceph.status_set')
     @patch('ceph.apt_update')
@@ -75,16 +77,16 @@ class UpgradeRollingTestCase(unittest.TestCase):
     @patch('ceph.config')
     def test_upgrade_osd_hammer(self, config, get_version, systemd, local_osds,
                                 add_source, apt_update, status_set, log,
-                                service_start, service_stop, chownr,
-                                apt_install):
+                                service_restart, chownr, apt_install,
+                                dirs_need_ownership_update):
         config.side_effect = config_side_effect
         get_version.side_effect = [0.80, 0.94]
         systemd.return_value = False
         local_osds.return_value = [0, 1, 2]
+        dirs_need_ownership_update.return_value = False
 
         ceph.upgrade_osd('hammer')
-        service_stop.assert_called_with('ceph-osd-all')
-        service_start.assert_called_with('ceph-osd-all')
+        service_restart.assert_called_with('ceph-osd-all')
         status_set.assert_has_calls([
             call('maintenance', 'Upgrading osd'),
         ])
@@ -97,10 +99,12 @@ class UpgradeRollingTestCase(unittest.TestCase):
         # Make sure on an Upgrade to Hammer that chownr was NOT called.
         assert not chownr.called
 
+    @patch('ceph._upgrade_single_osd')
+    @patch('ceph.update_owner')
+    @patch('os.listdir')
+    @patch('ceph._get_child_dirs')
+    @patch('ceph.dirs_need_ownership_update')
     @patch('ceph.apt_install')
-    @patch('ceph.chownr')
-    @patch('ceph.service_stop')
-    @patch('ceph.service_start')
     @patch('ceph.log')
     @patch('ceph.status_set')
     @patch('ceph.apt_update')
@@ -111,19 +115,31 @@ class UpgradeRollingTestCase(unittest.TestCase):
     @patch('ceph.config')
     def test_upgrade_osd_jewel(self, config, get_version, systemd, local_osds,
                                add_source, apt_update, status_set,
-                               log, service_start, service_stop, chownr,
-                               apt_install):
+                               log, apt_install, dirs_need_ownership_update,
+                               _get_child_dirs, listdir, update_owner,
+                               _upgrade_single_osd):
         config.side_effect = config_side_effect
         get_version.side_effect = [0.94, 10.1]
         systemd.return_value = False
         local_osds.return_value = [0, 1, 2]
+        listdir.return_value = ['osd', 'mon', 'fs']
+        _get_child_dirs.return_value = ['ceph-0', 'ceph-1', 'ceph-2']
+        dirs_need_ownership_update.return_value = True
 
         ceph.upgrade_osd('jewel')
-        service_stop.assert_called_with('ceph-osd-all')
-        service_start.assert_called_with('ceph-osd-all')
+        update_owner.assert_has_calls([
+            call(ceph.CEPH_BASE_DIR, recurse_dirs=False),
+            call(os.path.join(ceph.CEPH_BASE_DIR, 'mon')),
+            call(os.path.join(ceph.CEPH_BASE_DIR, 'fs')),
+        ])
+        _upgrade_single_osd.assert_has_calls([
+            call('0', 'ceph-0'),
+            call('1', 'ceph-1'),
+            call('2', 'ceph-2'),
+        ])
         status_set.assert_has_calls([
             call('maintenance', 'Upgrading osd'),
-            call('maintenance', 'Updating file ownership for OSDs')
+            call('maintenance', 'Upgrading packages to jewel')
         ])
         log.assert_has_calls(
             [
@@ -131,12 +147,76 @@
                 call('Upgrading to: jewel')
             ]
         )
-        chownr.assert_has_calls(
-            [
-                call(group='ceph', owner='ceph', path='/var/lib/ceph',
-                     follow_links=True)
-            ]
-        )
+
+    @patch.object(ceph, 'stop_osd')
+    @patch.object(ceph, 'disable_osd')
+    @patch.object(ceph, 'update_owner')
+    @patch.object(ceph, 'enable_osd')
+    @patch.object(ceph, 'start_osd')
+    def test_upgrade_single_osd(self, start_osd, enable_osd, update_owner,
+                                disable_osd, stop_osd):
+        ceph._upgrade_single_osd(1, 'ceph-1')
+        stop_osd.assert_called_with(1)
+        disable_osd.assert_called_with(1)
+        update_owner.assert_called_with('/var/lib/ceph/osd/ceph-1')
+        enable_osd.assert_called_with(1)
+        start_osd.assert_called_with(1)
+
+    @patch.object(ceph, 'systemd')
+    @patch.object(ceph, 'service_stop')
+    def test_stop_osd(self, service_stop, systemd):
+        systemd.return_value = False
+        ceph.stop_osd(1)
+        service_stop.assert_called_with('ceph-osd', id=1)
+
+        systemd.return_value = True
+        ceph.stop_osd(2)
+        service_stop.assert_called_with('ceph-osd@2')
+
+    @patch.object(ceph, 'systemd')
+    @patch.object(ceph, 'service_start')
+    def test_start_osd(self, service_start, systemd):
+        systemd.return_value = False
+        ceph.start_osd(1)
+        service_start.assert_called_with('ceph-osd', id=1)
+
+        systemd.return_value = True
+        ceph.start_osd(2)
+        service_start.assert_called_with('ceph-osd@2')
+
+    @patch('subprocess.check_call')
+    @patch('os.path.exists')
+    @patch('os.unlink')
+    @patch('ceph.systemd')
+    def test_disable_osd(self, systemd, unlink, exists, check_call):
+        systemd.return_value = True
+        ceph.disable_osd(4)
+        check_call.assert_called_with(['systemctl', 'disable', 'ceph-osd@4'])
+
+        exists.return_value = True
+        systemd.return_value = False
+        ceph.disable_osd(3)
+        unlink.assert_called_with('/var/lib/ceph/osd/ceph-3/ready')
+
+    @patch('subprocess.check_call')
+    @patch('ceph.update_owner')
+    @patch('ceph.systemd')
+    def test_enable_osd(self, systemd, update_owner, check_call):
+        systemd.return_value = True
+        ceph.enable_osd(5)
+        check_call.assert_called_with(['systemctl', 'enable', 'ceph-osd@5'])
+
+        systemd.return_value = False
+        mo = mock_open()
+        # Detect which builtin open version we need to mock based on
+        # the python version.
+        bs = 'builtins' if sys.version_info > (3, 0) else '__builtin__'
+        with patch('%s.open' % bs, mo):
+            ceph.enable_osd(6)
+            mo.assert_called_once_with('/var/lib/ceph/osd/ceph-6/ready', 'w')
+            handle = mo()
+            handle.write.assert_called_with('ready')
+        update_owner.assert_called_with('/var/lib/ceph/osd/ceph-6/ready')
 
     @patch('ceph.socket')
     @patch('ceph.get_osd_tree')