From 3eea37b85b1abc72786a2b24baf01141b4d95f08 Mon Sep 17 00:00:00 2001 From: imacdonn Date: Thu, 4 Oct 2018 21:27:18 +0000 Subject: [PATCH] Handle online_data_migrations exceptions When online_data_migrations raise exceptions, nova/cinder-manage catches the exceptions, prints fairly useless "something didn't work" messages, and moves on. Two issues: 1) The user(/admin) has no way to see what actually failed (exception detail is not logged) 2) The command returns exit status 0, as if all possible migrations have been completed successfully - this can cause failures to get missed, especially if automated This change adds logging of the exceptions, and introduces a new exit status of 2, which indicates that no updates took effect in the last batch attempt, but some are (still) failing, which requires intervention. Change-Id: Ib684091af0b19e62396f6becc78c656c49a60504 Closes-Bug: #1796192 --- doc/source/cli/nova-manage.rst | 19 +++++--- doc/source/user/upgrade.rst | 20 +++++---- nova/cmd/manage.py | 27 ++++++++++-- nova/tests/unit/test_nova_manage.py | 43 +++++++++++++++---- ...grations-exit-status-9de5ea7836d0e368.yaml | 13 ++++++ 5 files changed, 96 insertions(+), 26 deletions(-) create mode 100644 releasenotes/notes/nova-manage-online-migrations-exit-status-9de5ea7836d0e368.yaml diff --git a/doc/source/cli/nova-manage.rst b/doc/source/cli/nova-manage.rst index 3874b5e7f554..fae94102c72b 100644 --- a/doc/source/cli/nova-manage.rst +++ b/doc/source/cli/nova-manage.rst @@ -86,16 +86,25 @@ Nova Database Lists and optionally deletes database records where instance_uuid is NULL. ``nova-manage db online_data_migrations [--max-count]`` - Perform data migration to update all live data. Return exit code 0 if - migrations were successful or exit code 1 for partial updates. This command - should be called after upgrading database schema and nova services on all - controller nodes. If the command exits with partial updates (exit code 1) - the command will need to be called again. + Perform data migration to update all live data. ``--max-count`` controls the maximum number of objects to migrate in a given call. If not specified, migration will occur in batches of 50 until fully complete. + Returns exit code 0 if no (further) updates are possible, 1 if the ``--max-count`` + option was used and some updates were completed successfully (even if others generated + errors), 2 if some updates generated errors and no other migrations were able to take + effect in the last batch attempted, or 127 if invalid input is provided (e.g. + non-numeric max-count). + + This command should be called after upgrading database schema and nova services on + all controller nodes. If it exits with partial updates (exit status 1) it should + be called again, even if some updates initially generated errors, because some updates + may depend on others having completed. If it exits with status 2, intervention is + required to resolve the issue causing remaining updates to fail. It should be + considered successfully completed only when the exit status is 0. + ``nova-manage db ironic_flavor_migration [--all] [--host] [--node] [--resource_class]`` Perform the ironic flavor migration process against the database while services are offline. This is `not recommended` for most diff --git a/doc/source/user/upgrade.rst b/doc/source/user/upgrade.rst index 38a32854d3d3..ad1e490a37f0 100644 --- a/doc/source/user/upgrade.rst +++ b/doc/source/user/upgrade.rst @@ -140,14 +140,18 @@ same time. ``nova-manage db online_data_migrations --max-count ``. Note that you can use the ``--max-count`` argument to reduce the load this operation will place on the database, which allows you to run a - small chunk of the migrations until all of the work is done. Each - time it is run, it will show a summary of completed and remaining - records. You run this command until you see completed and - remaining records as zeros. The chunk size you should use depend - on your infrastructure and how much additional load you can - impose on the database. To reduce load, perform smaller batches - with delays between chunks. To reduce time to completion, run - larger batches. + small chunk of the migrations until all of the work is done. The chunk size + you should use depends on your infrastructure and how much additional load + you can impose on the database. To reduce load, perform smaller batches + with delays between chunks. To reduce time to completion, run larger batches. + Each time it is run, the command will show a summary of completed and remaining + records. If using the ``--max-count`` option, the command should be rerun + while it returns exit status 1 (which indicates that some migrations took + effect, and more work may remain to be done), even if some migrations + produce errors. If all possible migrations have completed and some are + still producing errors, exit status 2 will be returned. In this case, the + cause of the errors should be investigated and resolved. Migrations should be + considered successfully completed only when the command returns exit status 0. * At this point, you must also ensure you update the configuration, to stop using any deprecated features or options, and perform any required work diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py index 186bed6c9479..dc87e3f7ee11 100644 --- a/nova/cmd/manage.py +++ b/nova/cmd/manage.py @@ -75,6 +75,8 @@ from nova.virt import ironic CONF = nova.conf.CONF +LOG = logging.getLogger(__name__) + QUOTAS = quota.QUOTAS # Keep this list sorted and one entry per line for readability. @@ -670,14 +672,18 @@ Error: %s""") % six.text_type(e)) def _run_migration(self, ctxt, max_count): ran = 0 + exceptions = False migrations = {} for migration_meth in self.online_migrations: count = max_count - ran try: found, done = migration_meth(ctxt, count) except Exception: - print(_("Error attempting to run %(method)s") % dict( - method=migration_meth)) + msg = (_("Error attempting to run %(method)s") % dict( + method=migration_meth)) + print(msg) + LOG.exception(msg) + exceptions = True found = done = 0 name = migration_meth.__name__ @@ -691,7 +697,7 @@ Error: %s""") % six.text_type(e)) ran += done if ran >= max_count: break - return migrations + return migrations, exceptions @args('--max-count', metavar='', dest='max_count', help='Maximum number of objects to consider') @@ -713,8 +719,9 @@ Error: %s""") % six.text_type(e)) ran = None migration_info = {} + exceptions = False while ran is None or ran != 0: - migrations = self._run_migration(ctxt, max_count) + migrations, exceptions = self._run_migration(ctxt, max_count) ran = 0 for name in migrations: migration_info.setdefault(name, (0, 0)) @@ -734,6 +741,18 @@ Error: %s""") % six.text_type(e)) t.add_row([name, info[0], info[1]]) print(t) + # NOTE(imacdonn): In the "unlimited" case, the loop above will only + # terminate when all possible migrations have been effected. If we're + # still getting exceptions, there's a problem that requires + # intervention. In the max-count case, exceptions are only considered + # fatal if no work was done by any other migrations ("not ran"), + # because otherwise work may still remain to be done, and that work + # may resolve dependencies for the failing migrations. + if exceptions and (unlimited or not ran): + print(_("Some migrations failed unexpectedly. Check log for " + "details.")) + return 2 + # TODO(mriedem): Potentially add another return code for # "there are more migrations, but not completable right now" return ran and 1 or 0 diff --git a/nova/tests/unit/test_nova_manage.py b/nova/tests/unit/test_nova_manage.py index 66dcf6f9c078..6aeeeaa08140 100644 --- a/nova/tests/unit/test_nova_manage.py +++ b/nova/tests/unit/test_nova_manage.py @@ -799,13 +799,38 @@ Running batches of 50 until complete self.assertEqual(0, total[0]) self.assertEqual([50, 50, 50, 50], runs) - def test_online_migrations_error(self): - fake_migration = mock.MagicMock() - fake_migration.side_effect = Exception - fake_migration.__name__ = 'fake' - command_cls = self._fake_db_command((fake_migration,)) + @mock.patch('nova.context.get_admin_context') + def test_online_migrations_error(self, mock_get_context): + good_remaining = [50] + + def good_migration(context, count): + self.assertEqual(mock_get_context.return_value, context) + found = good_remaining[0] + done = min(found, count) + good_remaining[0] -= done + return found, done + + bad_migration = mock.MagicMock() + bad_migration.side_effect = test.TestingException + bad_migration.__name__ = 'bad' + + command_cls = self._fake_db_command((bad_migration, good_migration)) command = command_cls() - command.online_data_migrations(None) + + # bad_migration raises an exception, but it could be because + # good_migration had not completed yet. We should get 1 in this case, + # because some work was done, and the command should be reiterated. + self.assertEqual(1, command.online_data_migrations(max_count=50)) + + # When running this for the second time, there's no work left for + # good_migration to do, but bad_migration still fails - should + # get 2 this time. + self.assertEqual(2, command.online_data_migrations(max_count=50)) + + # When --max-count is not used, we should get 2 if all possible + # migrations completed but some raise exceptions + good_remaining = [125] + self.assertEqual(2, command.online_data_migrations(None)) def test_online_migrations_bad_max(self): self.assertEqual(127, @@ -817,19 +842,19 @@ Running batches of 50 until complete def test_online_migrations_no_max(self): with mock.patch.object(self.commands, '_run_migration') as rm: - rm.return_value = {} + rm.return_value = {}, False self.assertEqual(0, self.commands.online_data_migrations()) def test_online_migrations_finished(self): with mock.patch.object(self.commands, '_run_migration') as rm: - rm.return_value = {} + rm.return_value = {}, False self.assertEqual(0, self.commands.online_data_migrations(max_count=5)) def test_online_migrations_not_finished(self): with mock.patch.object(self.commands, '_run_migration') as rm: - rm.return_value = {'mig': (10, 5)} + rm.return_value = {'mig': (10, 5)}, False self.assertEqual(1, self.commands.online_data_migrations(max_count=5)) diff --git a/releasenotes/notes/nova-manage-online-migrations-exit-status-9de5ea7836d0e368.yaml b/releasenotes/notes/nova-manage-online-migrations-exit-status-9de5ea7836d0e368.yaml new file mode 100644 index 000000000000..5957d603bcb6 --- /dev/null +++ b/releasenotes/notes/nova-manage-online-migrations-exit-status-9de5ea7836d0e368.yaml @@ -0,0 +1,13 @@ +--- +upgrade: + - | + The ``nova-manage db online_data_migrations`` command now returns exit + status 2 in the case where some migrations failed (raised exceptions) and + no others were completed successfully from the last batch attempted. This + should be considered a fatal condition that requires intervention. Exit + status 1 will be returned in the case where the ``--max-count`` option was + used and some migrations failed but others succeeded (updated at least one + row), because more work may remain for the non-failing migrations, and + their completion may be a dependency for the failing ones. The command + should be reiterated while it returns exit status 1, and considered + completed successfully only when it returns exit status 0.