Handle get_host_availability_zone error during reschedule

If a build fails and reschedules to a cell conductor which does
not have access to the API DB, the call to get_host_availability_zone
will fail with a CantStartEngineError because it's trying to do an
"up-call" to the API DB for host aggregate info. The reschedule
fails and the instance is stuck in BUILD status without a fault
injected for determining what went wrong.
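For context on the "up-call": the AZ lookup reads host aggregate metadata,
and aggregates live in the API database. A simplified sketch of what
nova.availability_zones.get_host_availability_zone does (not the exact
Rocky code):

    def get_host_availability_zone(context, host):
        # Aggregates are stored in the API DB; a cell conductor without
        # [api_database]/connection configured cannot create an engine
        # for this query, so oslo.db raises CantStartEngineError.
        aggregates = objects.AggregateList.get_by_host(
            context, host, key='availability_zone')
        if aggregates:
            return aggregates[0].metadata['availability_zone']
        return CONF.default_availability_zone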

This change simply handles the failure and cleans up so the instance
is put into a terminal (ERROR) state.

Conflicts:
      nova/tests/unit/conductor/test_conductor.py

NOTE(mriedem): The conflict is due to not having change
Ibfb0a6db5920d921c4fc7cabf3f4d2838ea7f421 in Rocky.
Also note that the call to _cleanup_when_reschedule_fails does not
pass a "legacy_request_spec" variable since change
If8a13f74d2b3c99f05365eb49dcfa01d9042fefa is not in Rocky.

Change-Id: I6bfa6fa767403fb936a6ae340b8687eb161732fc
Partial-Bug: #1781286
(cherry picked from commit 38fb7f82ab)
(cherry picked from commit b5e6c389d7)
(cherry picked from commit 53bcf0b1ee)
Author: Matt Riedemann
Date:   2019-10-01 12:03:17 -04:00
Commit: 5a98135650 (parent 20de81c7c0)
2 changed files with 55 additions and 3 deletions

--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -698,9 +698,19 @@ class ComputeTaskManager(base.Base):
                     context, instance, exc, request_spec,
                     requested_networks)
                 return
-            instance.availability_zone = (
-                availability_zones.get_host_availability_zone(context,
-                    host.service_host))
+            try:
+                instance.availability_zone = (
+                    availability_zones.get_host_availability_zone(context,
+                        host.service_host))
+            except Exception as exc:
+                # Put the instance into ERROR state, set task_state to None,
+                # inject a fault, etc.
+                self._cleanup_when_reschedule_fails(
+                    context, instance, exc, request_spec,
+                    requested_networks)
+                continue
             try:
                 # NOTE(danms): This saves the az change above, refreshes our
                 # instance, and tells us if it has been deleted underneath us
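For reference, a simplified sketch of what the _cleanup_when_reschedule_fails
helper (added earlier in this series) does; the real implementation lives in
nova/conductor/manager.py and differs in detail:

    def _cleanup_when_reschedule_fails(self, context, instance, exception,
                                       request_spec, requested_networks):
        # Flip the instance to ERROR with no task state, notify and record
        # a fault for the exception, and release any allocated networks.
        updates = {'vm_state': vm_states.ERROR, 'task_state': None}
        self._set_vm_state_and_notify(
            context, instance.uuid, 'build_instances', updates, exception,
            request_spec)
        self._cleanup_allocated_networks(
            context, instance, requested_networks)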

--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -2842,6 +2842,48 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
                 requested_networks, mock.sentinel.secgroups)
         mock_save.assert_called_once_with()
 
+    @mock.patch('nova.scheduler.utils.claim_resources', return_value=True)
+    @mock.patch('nova.availability_zones.get_host_availability_zone',
+                side_effect=db_exc.CantStartEngineError)
+    @mock.patch('nova.conductor.manager.ComputeTaskManager.'
+                '_cleanup_when_reschedule_fails')
+    @mock.patch('nova.objects.Instance.save')
+    def test_build_reschedule_get_az_error(self, mock_save, mock_cleanup,
+                                           mock_get_az, mock_claim):
+        """Tests that when rescheduling during a build fails while getting
+        the AZ for the selected host, the instance is put into a terminal
+        (ERROR) state.
+        """
+        instance = fake_instance.fake_instance_obj(self.context)
+        image = objects.ImageMeta()
+        requested_networks = objects.NetworkRequestList()
+        request_spec = fake_request_spec.fake_spec_obj()
+        host_lists = copy.deepcopy(fake_host_lists_alt)
+        filter_props = {}
+        # Pre-populate the filter properties with the initial host we tried
+        # to build on, which failed and triggered a reschedule.
+        host1 = host_lists[0].pop(0)
+        scheduler_utils.populate_filter_properties(filter_props, host1)
+        # We have to save off the first alternate we try since
+        # build_instances modifies the host_lists list.
+        host2 = host_lists[0][0]
+        self.conductor.build_instances(
+            self.context, [instance], image, filter_props,
+            mock.sentinel.admin_password, mock.sentinel.injected_files,
+            requested_networks, mock.sentinel.security_groups,
+            request_spec=request_spec, host_lists=host_lists)
+        mock_claim.assert_called_once()
+        mock_get_az.assert_called_once_with(self.context, host2.service_host)
+        mock_cleanup.assert_called_once_with(
+            self.context, instance,
+            test.MatchType(db_exc.CantStartEngineError), test.MatchType(dict),
+            requested_networks)
+        # Assert that we did not continue processing the instance once we
+        # handled the error.
+        mock_save.assert_not_called()
+
     def test_cleanup_allocated_networks_none_requested(self):
         # Tests that we don't deallocate networks if 'none' were specifically
         # requested.
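
The test.MatchType matcher used above asserts only on the argument's type,
which is useful when the exact exception instance is created inside the code
under test. A minimal sketch of such a matcher (nova ships its own in
nova/test.py):

    class MatchType(object):
        # Equality matches any object of the wanted type, so an instance
        # of this class can stand in for "some CantStartEngineError" in
        # mock call assertions.
        def __init__(self, wanted_type):
            self.wanted_type = wanted_type

        def __eq__(self, other):
            return type(other) == self.wanted_type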