From 83fd8ac0bfd3d9e5dd4ad48bfcb845d80095b845 Mon Sep 17 00:00:00 2001 From: int32bit Date: Mon, 22 Jan 2018 17:05:53 +0800 Subject: [PATCH] Set server status to ERROR if rebuild failed Currently there is no indication that the rebuild was refused, and worse, we may have a wrong imageref for the instance. This patch set the instance to ERROR status if rebuild failed in the scheduling stage. The user can rebuild the instance with valid image to get it out of ERROR state and reset with right instance metadata and properties. Closes-Bug: 1744325 Conflicts: nova/conductor/manager.py nova/tests/functional/regressions/test_bug_1713783.py nova/tests/functional/test_servers.py nova/tests/unit/conductor/test_conductor.py NOTE(melwitt): The conflicts were because of log translation, the fact that the regression test for bug 1713783 doesn't exist in Ocata, there are addtional rebuild functional tests in Pike that don't exist in Ocata, and the select_destinations method has additional parameters in Pike that don't exist in Ocata. Change-Id: Ibb7bee15a3d4ee6f0ef53ba12e8b41f65a1fe999 (cherry picked from commit d03a890a34f632adc9a19a33c8a5aebbccec50e4) (cherry picked from commit 22a39b8f9b76d5f330a3f185e7d40f208a6a47f3) --- nova/conductor/manager.py | 9 +++++++-- nova/tests/functional/test_server_group.py | 8 ++++---- nova/tests/functional/test_servers.py | 4 ++++ nova/tests/unit/conductor/test_conductor.py | 14 +++++++++++--- ...4325-rebuild-error-status-9e2da03f3f81fd6e.yaml | 7 +++++++ 5 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/bug-1744325-rebuild-error-status-9e2da03f3f81fd6e.yaml diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index 6ad52f9454be..3380422eac9a 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -16,6 +16,7 @@ import contextlib import copy +import sys from oslo_config import cfg from oslo_log import log as logging @@ -735,19 +736,23 @@ class ComputeTaskManager(base.Base): with excutils.save_and_reraise_exception(): self._set_vm_state_and_notify(context, instance.uuid, 'rebuild_server', - {'vm_state': instance.vm_state, + {'vm_state': vm_states.ERROR, 'task_state': None}, ex, request_spec) LOG.warning(_LW("No valid host found for rebuild"), instance=instance) + compute_utils.add_instance_fault_from_exc(context, + instance, ex, sys.exc_info()) except exception.UnsupportedPolicyException as ex: with excutils.save_and_reraise_exception(): self._set_vm_state_and_notify(context, instance.uuid, 'rebuild_server', - {'vm_state': instance.vm_state, + {'vm_state': vm_states.ERROR, 'task_state': None}, ex, request_spec) LOG.warning(_LW("Server with unsupported policy " "cannot be rebuilt"), instance=instance) + compute_utils.add_instance_fault_from_exc(context, + instance, ex, sys.exc_info()) try: migration = objects.Migration.get_by_instance_and_status( diff --git a/nova/tests/functional/test_server_group.py b/nova/tests/functional/test_server_group.py index 36c6e288e591..b3db0169b045 100644 --- a/nova/tests/functional/test_server_group.py +++ b/nova/tests/functional/test_server_group.py @@ -455,7 +455,7 @@ class ServerGroupTestV21(ServerGroupTestBase): self.admin_api.post_server_action(servers[1]['id'], post) server_after_failed_evac = self._wait_for_state_change( - self.admin_api, servers[1], 'ACTIVE') + self.admin_api, servers[1], 'ERROR') # assert that after a failed evac the server active on the same host # as before @@ -479,7 +479,7 @@ class ServerGroupTestV21(ServerGroupTestBase): self.admin_api.post_server_action(servers[1]['id'], post) server_after_failed_evac = self._wait_for_state_change( - self.admin_api, servers[1], 'ACTIVE') + self.admin_api, servers[1], 'ERROR') # assert that after a failed evac the server active on the same host # as before @@ -653,7 +653,7 @@ class ServerGroupTestV215(ServerGroupTestV21): self.admin_api.post_server_action(servers[1]['id'], post) server_after_failed_evac = self._wait_for_state_change( - self.admin_api, servers[1], 'ACTIVE') + self.admin_api, servers[1], 'ERROR') # assert that after a failed evac the server active on the same host # as before @@ -677,7 +677,7 @@ class ServerGroupTestV215(ServerGroupTestV21): self.admin_api.post_server_action(servers[1]['id'], post) server_after_failed_evac = self._wait_for_state_change( - self.admin_api, servers[1], 'ACTIVE') + self.admin_api, servers[1], 'ERROR') # assert that after a failed evac the server active on the same host # as before diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index d40ae35dcb24..dfe0fd588360 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -1004,3 +1004,7 @@ class ServerRebuildTestCase(integrated_helpers._IntegratedTestBase, # even though the rebuild should not work. server = self.api.get_server(server['id']) self.assertEqual(rebuild_image_ref, server['image']['id']) + + # The server should be in ERROR state + self.assertEqual('ERROR', server['status']) + self.assertIn('No valid host', server['fault']['message']) diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index b18009778f82..e4882565d3aa 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -1268,8 +1268,10 @@ class _BaseTaskTestCase(object): 'select_destinations', side_effect=exc.NoValidHost(reason='')), mock.patch('nova.scheduler.utils.build_request_spec', - return_value=request_spec) - ) as (rebuild_mock, sig_mock, fp_mock, select_dest_mock, bs_mock): + return_value=request_spec), + mock.patch.object(scheduler_utils, 'set_vm_state_and_notify') + ) as (rebuild_mock, sig_mock, fp_mock, + select_dest_mock, bs_mock, set_vm_state_and_notify_mock): self.assertRaises(exc.NoValidHost, self.conductor_manager.rebuild_instance, context=self.context, instance=inst_obj, @@ -1277,7 +1279,11 @@ class _BaseTaskTestCase(object): fp_mock.assert_called_once_with(self.context, request_spec, filter_properties) select_dest_mock.assert_called_once_with(self.context, fake_spec) + self.assertEqual( + set_vm_state_and_notify_mock.call_args[0][4]['vm_state'], + vm_states.ERROR) self.assertFalse(rebuild_mock.called) + self.assertIn('No valid host', inst_obj.fault.message) @mock.patch.object(conductor_manager.compute_rpcapi.ComputeAPI, 'rebuild_instance') @@ -1309,12 +1315,14 @@ class _BaseTaskTestCase(object): self.context, inst_obj, **rebuild_args) - updates = {'vm_state': vm_states.ACTIVE, 'task_state': None} + updates = {'vm_state': vm_states.ERROR, 'task_state': None} state_mock.assert_called_once_with(self.context, inst_obj.uuid, 'rebuild_server', updates, exception, request_spec) self.assertFalse(select_dest_mock.called) self.assertFalse(rebuild_mock.called) + self.assertIn('ServerGroup policy is not supported', + inst_obj.fault.message) def test_rebuild_instance_evacuate_migration_record(self): inst_obj = self._create_fake_instance_obj() diff --git a/releasenotes/notes/bug-1744325-rebuild-error-status-9e2da03f3f81fd6e.yaml b/releasenotes/notes/bug-1744325-rebuild-error-status-9e2da03f3f81fd6e.yaml new file mode 100644 index 000000000000..086d4094fb4d --- /dev/null +++ b/releasenotes/notes/bug-1744325-rebuild-error-status-9e2da03f3f81fd6e.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + If scheduling fails during rebuild the server instance will go to ERROR + state and a fault will be recorded. `Bug 1744325`_ + + .. _Bug 1744325: https://bugs.launchpad.net/nova/+bug/1744325