From 53bcf0b1eeb4d34cbbec6200026c9bac5921db97 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Tue, 1 Oct 2019 12:03:17 -0400 Subject: [PATCH] Handle get_host_availability_zone error during reschedule If a build fails and reschedules to a cell conductor which does not have access to the API DB, the call to get_host_availability_zone will fail with a CantStartEngineError because it's trying to do an "up-call" to the API DB for host aggregate info. The reschedule fails and the instance is stuck in BUILD status without a fault injected for determining what went wrong. This change simply handles the failure and cleans up so the instance is put into a terminal (ERROR) state. NOTE(mriedem): The fill_provider_mapping mock on the unit test is removed since that method did not exist in Stein, it was introduced in Train: I76f777e4f354b92c55dbd52a20039e504434b3a1 Change-Id: I6bfa6fa767403fb936a6ae340b8687eb161732fc Partial-Bug: #1781286 (cherry picked from commit 38fb7f82abd7fffc00ebc050ee5230f1137e76d8) (cherry picked from commit b5e6c389d733d4dbd94380add7e3fa6c4d1e3fa8) --- nova/conductor/manager.py | 16 ++++++-- nova/tests/unit/conductor/test_conductor.py | 42 +++++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index 4069487e9cd8..e3e6cabc302a 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -737,9 +737,19 @@ class ComputeTaskManager(base.Base): context, instance, exc, legacy_request_spec, requested_networks) return - instance.availability_zone = ( - availability_zones.get_host_availability_zone(context, - host.service_host)) + + try: + instance.availability_zone = ( + availability_zones.get_host_availability_zone(context, + host.service_host)) + except Exception as exc: + # Put the instance into ERROR state, set task_state to None, + # inject a fault, etc. + self._cleanup_when_reschedule_fails( + context, instance, exc, legacy_request_spec, + requested_networks) + continue + try: # NOTE(danms): This saves the az change above, refreshes our # instance, and tells us if it has been deleted underneath us diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index e8dd919a8f60..418955f3a024 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -3149,6 +3149,48 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): tb = mock_notify.call_args_list[0][0][6] self.assertIn('Traceback (most recent call last):', tb) + @mock.patch('nova.scheduler.utils.claim_resources', return_value=True) + @mock.patch('nova.availability_zones.get_host_availability_zone', + side_effect=db_exc.CantStartEngineError) + @mock.patch('nova.conductor.manager.ComputeTaskManager.' + '_cleanup_when_reschedule_fails') + @mock.patch('nova.objects.Instance.save') + def test_build_reschedule_get_az_error(self, mock_save, mock_cleanup, + mock_get_az, mock_claim): + """Tests a scenario where rescheduling during a build fails trying to + get the AZ for the selected host will put the instance into a terminal + (ERROR) state. + """ + instance = fake_instance.fake_instance_obj(self.context) + image = objects.ImageMeta() + requested_networks = objects.NetworkRequestList() + request_spec = fake_request_spec.fake_spec_obj() + host_lists = copy.deepcopy(fake_host_lists_alt) + filter_props = {} + # Pre-populate the filter properties with the initial host we tried to + # build on which failed and triggered a reschedule. + host1 = host_lists[0].pop(0) + scheduler_utils.populate_filter_properties(filter_props, host1) + # We have to save off the first alternate we try since build_instances + # modifies the host_lists list. + host2 = host_lists[0][0] + + self.conductor.build_instances( + self.context, [instance], image, filter_props, + mock.sentinel.admin_password, mock.sentinel.injected_files, + requested_networks, mock.sentinel.security_groups, + request_spec=request_spec, host_lists=host_lists) + + mock_claim.assert_called_once() + mock_get_az.assert_called_once_with(self.context, host2.service_host) + mock_cleanup.assert_called_once_with( + self.context, instance, + test.MatchType(db_exc.CantStartEngineError), test.MatchType(dict), + requested_networks) + # Assert that we did not continue processing the instance once we + # handled the error. + mock_save.assert_not_called() + def test_cleanup_allocated_networks_none_requested(self): # Tests that we don't deallocate networks if 'none' were specifically # requested.