From 5795ecb94f943eea8bb1cfbf4ce957e144262763 Mon Sep 17 00:00:00 2001
From: Corey Bryant
Date: Wed, 18 Mar 2020 15:18:05 +0000
Subject: [PATCH] Maintain OSD state on upgrade

Ensure each OSD reaches its pre-restart state before proceeding after
restart. This prevents the charm from finalizing the upgrade prior to
OSDs recovering after upgrade. For example, if the state is 'active'
prior to restart, then it must reach 'active' after restart, at which
point the upgrade will be allowed to complete.

Change-Id: I66359963b5e8e792d23b7989f07b7ac67afcd9b0
Closes-Bug: #1821028
(cherry picked from commit 26114b2a544b4b2e3bb2b43839fd18e3d395d95a)
---
 .gitreview                          |   1 +
 ceph/utils.py                       | 112 +++++++++++++++++++++++++---
 tox.ini                             |   5 ++
 unit_tests/test_osd_upgrade_roll.py |  58 +++++++++++++-
 4 files changed, 164 insertions(+), 12 deletions(-)

diff --git a/.gitreview b/.gitreview
index 10a3500..dc88a31 100644
--- a/.gitreview
+++ b/.gitreview
@@ -2,3 +2,4 @@
 host=review.opendev.org
 port=29418
 project=openstack/charms.ceph.git
+defaultbranch=stable/20.02
diff --git a/ceph/utils.py b/ceph/utils.py
index 7c97078..325978c 100644
--- a/ceph/utils.py
+++ b/ceph/utils.py
@@ -25,6 +25,7 @@ import sys
 import time
 import uuid
 
+from contextlib import contextmanager
 from datetime import datetime
 
 from charmhelpers.core import hookenv
@@ -2414,10 +2415,11 @@ def upgrade_osd(new_version):
         # way to update the code on the node.
         if not dirs_need_ownership_update('osd'):
             log('Restarting all OSDs to load new binaries', DEBUG)
-            if systemd():
-                service_restart('ceph-osd.target')
-            else:
-                service_restart('ceph-osd-all')
+            with maintain_all_osd_states():
+                if systemd():
+                    service_restart('ceph-osd.target')
+                else:
+                    service_restart('ceph-osd-all')
             return
 
         # Need to change the ownership of all directories which are not OSD
@@ -2462,11 +2464,12 @@ def _upgrade_single_osd(osd_num, osd_dir):
     :raises IOError: if an error occurs reading/writing to a file as part
                      of the upgrade process
     """
-    stop_osd(osd_num)
-    disable_osd(osd_num)
-    update_owner(osd_dir)
-    enable_osd(osd_num)
-    start_osd(osd_num)
+    with maintain_osd_state(osd_num):
+        stop_osd(osd_num)
+        disable_osd(osd_num)
+        update_owner(osd_dir)
+        enable_osd(osd_num)
+        start_osd(osd_num)
 
 
 def stop_osd(osd_num):
@@ -2593,6 +2596,97 @@ def update_owner(path, recurse_dirs=True):
         secs=elapsed_time.total_seconds(), path=path), DEBUG)
 
 
+def get_osd_state(osd_num, osd_goal_state=None):
+    """Get OSD state or loop until OSD state matches OSD goal state.
+
+    If osd_goal_state is None, just return the current OSD state.
+    If osd_goal_state is not None, loop until the current OSD state matches
+    the OSD goal state.
+
+    :param osd_num: the osd id to get state for
+    :param osd_goal_state: (Optional) string indicating state to wait for
+                           Defaults to None
+    :returns: Returns a str, the OSD state.
+    :rtype: str
+    """
+    while True:
+        asok = "/var/run/ceph/ceph-osd.{}.asok".format(osd_num)
+        cmd = [
+            'ceph',
+            'daemon',
+            asok,
+            'status'
+        ]
+        try:
+            result = json.loads(str(subprocess
+                                    .check_output(cmd)
+                                    .decode('UTF-8')))
+        except (subprocess.CalledProcessError, ValueError) as e:
+            log("{}".format(e), level=DEBUG)
+            continue
+        osd_state = result['state']
+        log("OSD {} state: {}, goal state: {}".format(
+            osd_num, osd_state, osd_goal_state), level=DEBUG)
+        if not osd_goal_state:
+            return osd_state
+        if osd_state == osd_goal_state:
+            return osd_state
+
+
+def get_all_osd_states(osd_goal_states=None):
+    """Get all OSD states or loop until all OSD states match OSD goal states.
+
+    If osd_goal_states is None, just return a dictionary of current OSD states.
+    If osd_goal_states is not None, loop until the current OSD states match
+    the OSD goal states.
+
+    :param osd_goal_states: (Optional) dict indicating states to wait for
+                            Defaults to None
+    :returns: Returns a dictionary of current OSD states.
+    :rtype: dict
+    """
+    osd_states = {}
+    for osd_num in get_local_osd_ids():
+        if not osd_goal_states:
+            osd_states[osd_num] = get_osd_state(osd_num)
+        else:
+            osd_states[osd_num] = get_osd_state(
+                osd_num,
+                osd_goal_state=osd_goal_states[osd_num])
+    return osd_states
+
+
+@contextmanager
+def maintain_osd_state(osd_num):
+    """Ensure the state of an OSD is maintained.
+
+    Ensures the state of an OSD is the same at the end of a block nested
+    in a with statement as it was at the beginning of the block.
+
+    :param osd_num: the osd id to maintain state for
+    """
+    osd_state = get_osd_state(osd_num)
+    try:
+        yield
+    finally:
+        get_osd_state(osd_num, osd_goal_state=osd_state)
+
+
+@contextmanager
+def maintain_all_osd_states():
+    """Ensure all local OSD states are maintained.
+
+    Ensures the states of all local OSDs are the same at the end of a
+    block nested in a with statement as they were at the beginning of
+    the block.
+    """
+    osd_states = get_all_osd_states()
+    try:
+        yield
+    finally:
+        get_all_osd_states(osd_goal_states=osd_states)
+
+
 def list_pools(client='admin'):
     """This will list the current pools that Ceph has
 
diff --git a/tox.ini b/tox.ini
index aeeabd0..3bfc32c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -28,6 +28,11 @@ basepython = python3.7
 deps = -r{toxinidir}/requirements.txt
        -r{toxinidir}/test-requirements.txt
 
+[testenv:py38]
+basepython = python3.8
+deps = -r{toxinidir}/requirements.txt
+       -r{toxinidir}/test-requirements.txt
+
 [testenv:pep8]
 basepython = python3
 deps = -r{toxinidir}/requirements.txt
diff --git a/unit_tests/test_osd_upgrade_roll.py b/unit_tests/test_osd_upgrade_roll.py
index 9c9fffd..369fe68 100644
--- a/unit_tests/test_osd_upgrade_roll.py
+++ b/unit_tests/test_osd_upgrade_roll.py
@@ -15,6 +15,7 @@
 import os
 import sys
 import time
+import subprocess
 import unittest
 
 from mock import patch, call, mock_open
@@ -63,6 +64,7 @@ def monitor_key_side_effect(*args):
 
 
 class UpgradeRollingTestCase(unittest.TestCase):
+    @patch.object(ceph.utils, 'get_osd_state')
     @patch.object(ceph.utils, 'determine_packages')
     @patch.object(ceph.utils, 'dirs_need_ownership_update')
     @patch.object(ceph.utils, 'apt_install')
@@ -80,12 +82,13 @@
                                 add_source, apt_update, status_set, log,
                                 service_restart, chownr, apt_install,
                                 dirs_need_ownership_update,
-                                _determine_packages):
+                                _determine_packages, get_osd_state):
         config.side_effect = config_side_effect
         get_version.side_effect = [0.80, 0.94]
         systemd.return_value = False
         local_osds.return_value = [0, 1, 2]
         dirs_need_ownership_update.return_value = False
+        get_osd_state.side_effect = ['active'] * 6
 
         ceph.utils.upgrade_osd('hammer')
         service_restart.assert_called_with('ceph-osd-all')
@@ -98,6 +101,12 @@
                 call('Upgrading to: hammer')
             ]
         )
+        get_osd_state.assert_has_calls([
+            call(0), call(1), call(2),
+            call(0, osd_goal_state='active'),
+            call(1, osd_goal_state='active'),
+            call(2, osd_goal_state='active'),
+        ])
         # Make sure on an Upgrade to Hammer that chownr was NOT called.
         assert not chownr.called
 
@@ -151,6 +160,7 @@
             ]
         )
 
+    @patch.object(ceph.utils, 'get_osd_state')
     @patch.object(ceph.utils, 'determine_packages')
     @patch.object(ceph.utils, 'service_restart')
     @patch.object(ceph.utils, '_upgrade_single_osd')
@@ -175,7 +185,7 @@
                                   dirs_need_ownership_update, _get_child_dirs,
                                   listdir, update_owner, _upgrade_single_osd,
                                   service_restart,
-                                  _determine_packages):
+                                  _determine_packages, get_osd_state):
         config.side_effect = config_side_effect
         get_version.side_effect = [10.2, 12.2]
         systemd.return_value = True
         local_osds.return_value = [0, 1, 2]
         listdir.return_value = ['osd', 'mon', 'fs']
         _get_child_dirs.return_value = ['ceph-0', 'ceph-1', 'ceph-2']
         dirs_need_ownership_update.return_value = False
+        get_osd_state.side_effect = ['active'] * 6
 
         ceph.utils.upgrade_osd('luminous')
         service_restart.assert_called_with('ceph-osd.target')
@@ -198,20 +209,33 @@
                 call('Upgrading to: luminous')
             ]
         )
+        get_osd_state.assert_has_calls([
+            call(0), call(1), call(2),
+            call(0, osd_goal_state='active'),
+            call(1, osd_goal_state='active'),
+            call(2, osd_goal_state='active'),
+        ])
 
+    @patch.object(ceph.utils, 'get_osd_state')
     @patch.object(ceph.utils, 'stop_osd')
    @patch.object(ceph.utils, 'disable_osd')
     @patch.object(ceph.utils, 'update_owner')
     @patch.object(ceph.utils, 'enable_osd')
     @patch.object(ceph.utils, 'start_osd')
     def test_upgrade_single_osd(self, start_osd, enable_osd, update_owner,
-                                disable_osd, stop_osd):
+                                disable_osd, stop_osd, get_osd_state):
+        get_osd_state.side_effect = ['active'] * 2
+
         ceph.utils._upgrade_single_osd(1, '/var/lib/ceph/osd/ceph-1')
         stop_osd.assert_called_with(1)
         disable_osd.assert_called_with(1)
         update_owner.assert_called_with('/var/lib/ceph/osd/ceph-1')
         enable_osd.assert_called_with(1)
         start_osd.assert_called_with(1)
+        get_osd_state.assert_has_calls([
+            call(1),
+            call(1, osd_goal_state='active'),
+        ])
 
     @patch.object(ceph.utils, 'systemd')
     @patch.object(ceph.utils, 'service_stop')
@@ -269,6 +293,34 @@
         handle.write.assert_called_with('ready')
         update_owner.assert_called_with('/var/lib/ceph/osd/ceph-6/ready')
 
+    @patch('subprocess.check_output')
+    @patch.object(ceph.utils, 'log')
+    def test_get_osd_state(self, log, check_output):
+        check_output.side_effect = [
+            subprocess.CalledProcessError(returncode=2, cmd=["bad"]),
+            ValueError("bad value"),
+            '{"state":"active"}'.encode()] * 2
+
+        osd_state = ceph.utils.get_osd_state(2)
+        check_output.assert_called_with(
+            ['ceph', 'daemon', '/var/run/ceph/ceph-osd.2.asok', 'status'])
+        log.assert_has_calls([
+            call("Command '['bad']' returned non-zero exit status 2.",
+                 level='DEBUG'),
+            call('bad value', level='DEBUG'),
+            call('OSD 2 state: active, goal state: None', level='DEBUG')])
+        self.assertEqual(osd_state, 'active')
+
+        osd_state = ceph.utils.get_osd_state(2, osd_goal_state='active')
+        check_output.assert_called_with(
+            ['ceph', 'daemon', '/var/run/ceph/ceph-osd.2.asok', 'status'])
+        log.assert_has_calls([
+            call("Command '['bad']' returned non-zero exit status 2.",
+                 level='DEBUG'),
+            call('bad value', level='DEBUG'),
+            call('OSD 2 state: active, goal state: None', level='DEBUG')])
+        self.assertEqual(osd_state, 'active')
+
     @patch.object(ceph.utils, 'socket')
     @patch.object(ceph.utils, 'get_osd_tree')
     @patch.object(ceph.utils, 'log')
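Below is a minimal usage sketch (not part of the patch) of the two context
managers the patch introduces. It assumes charm code imports them from this
repo's ceph.utils module and that per-OSD systemd units follow the
'ceph-osd@<id>' naming; service_restart is the existing charmhelpers helper,
standing in for whichever restart path the charm actually takes.

    # Illustrative sketch only; 'ceph-osd@{}' is an assumed unit name.
    from charmhelpers.core.host import service_restart

    from ceph.utils import maintain_all_osd_states, maintain_osd_state


    def restart_single_osd(osd_num):
        # On entry, maintain_osd_state() records the OSD's current state
        # (e.g. 'active') via its admin socket; on exit it polls
        # get_osd_state() until that same state is reported again.
        with maintain_osd_state(osd_num):
            service_restart('ceph-osd@{}'.format(osd_num))


    def restart_all_osds():
        # Captures a {osd_id: state} map for every local OSD, then blocks
        # after the restart until each OSD returns to its recorded state.
        with maintain_all_osd_states():
            service_restart('ceph-osd.target')

Note that the exit path has no timeout: get_osd_state() loops until the goal
state is reached, so an OSD that never recovers holds the upgrade open rather
than letting it finalize in a degraded state.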