From fcc2b9e33ed2b3d9f469b458e0f46011fe9883ac Mon Sep 17 00:00:00 2001
From: Erik Olof Gunnar Andersson <eandersson@blizzard.com>
Date: Thu, 25 Jul 2019 20:19:40 -0700
Subject: [PATCH] Cleanup when hitting MaxRetriesExceeded from no
 host_available

Prior to this patch there was a condition when no
host_available was true and an exception would get
raised without first cleaning up the instance.
This causes instances to get indefinitely stuck in
a scheduling state.

This patch fixes this by calling the clean up function
and then exits build_instances using a return statement.

The related functional regression recreate test is updated
to show this fixes the bug.

Change-Id: I6a2c63a4c33e783100208fd3f45eb52aad49e3d6
Closes-bug: #1837955
(cherry picked from commit b98d4ba6d54f5ca2999a8fe6b6d7dfcc134061df)
---
 nova/conductor/manager.py                     |  6 +++-
 .../regressions/test_bug_1837955.py           | 30 ++++------------
 nova/tests/unit/conductor/test_conductor.py   | 34 ++++++++++++-------
 3 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index 39cbf428e541..ed7531e34ce2 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -720,7 +720,11 @@ class ComputeTaskManager(base.Base):
                     msg = ("Exhausted all hosts available for retrying build "
                            "failures for instance %(instance_uuid)s." %
                            {"instance_uuid": instance.uuid})
-                    raise exception.MaxRetriesExceeded(reason=msg)
+                    exc = exception.MaxRetriesExceeded(reason=msg)
+                    self._cleanup_when_reschedule_fails(
+                        context, instance, exc, legacy_request_spec,
+                        requested_networks)
+                    return
             instance.availability_zone = (
                 availability_zones.get_host_availability_zone(context,
                         host.service_host))
diff --git a/nova/tests/functional/regressions/test_bug_1837955.py b/nova/tests/functional/regressions/test_bug_1837955.py
index 278e0366bea9..f52ba9b817f8 100644
--- a/nova/tests/functional/regressions/test_bug_1837955.py
+++ b/nova/tests/functional/regressions/test_bug_1837955.py
@@ -84,31 +84,15 @@ class BuildRescheduleClaimFailsTestCase(
             image_uuid=fake_image.get_valid_image_id(),
             networks=[{'port': self.neutron.port_1['id']}])
         server = self.api.post_server({'server': server})
-        # FIXME(mriedem): This is bug 1837955 where the status is stuck in
-        # BUILD rather than the vm_state being set to error and the task_state
-        # being set to None. Uncomment this when the bug is fixed.
-        # server = self._wait_for_state_change(self.api, server, 'ERROR')
+        server = self._wait_for_state_change(self.api, server, 'ERROR')
 
         # Wait for the MaxRetriesExceeded fault to be recorded.
         # set_vm_state_and_notify sets the vm_state to ERROR before the fault
         # is recorded but after the notification is sent. So wait for the
         # unversioned notification to show up and then get the fault.
-        # FIXME(mriedem): Uncomment this when bug 1837955 is fixed.
-        # self._wait_for_unversioned_notification(
-        #     'compute_task.build_instances')
-        # server = self.api.get_server(server['id'])
-        # self.assertIn('fault', server)
-        # self.assertIn('Exceeded maximum number of retries',
-        #               server['fault']['message'])
-
-        # TODO(mriedem): Remove this when the bug is fixed. We need to assert
-        # something before the bug is fixed to show the failure so check the
-        # logs.
-        for x in range(20):
-            logs = self.stdlog.logger.output
-            if 'MaxRetriesExceeded' in logs:
-                break
-            time.sleep(.5)
-        else:
-            self.fail('Timed out waiting for MaxRetriesExceeded to show up '
-                      'in the logs.')
+        self._wait_for_unversioned_notification(
+            'compute_task.build_instances')
+        server = self.api.get_server(server['id'])
+        self.assertIn('fault', server)
+        self.assertIn('Exceeded maximum number of retries',
+                      server['fault']['message'])
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index e19d0b19da9a..ea6cf64dcd1a 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -693,28 +693,36 @@ class _BaseTaskTestCase(object):
             mock.call(self.context, instances[1].uuid)])
         self.assertFalse(mock_get_by_host.called)
 
-    @mock.patch("nova.scheduler.utils.claim_resources", return_value=False)
+    @mock.patch('nova.compute.utils.notify_about_compute_task_error')
     @mock.patch.object(objects.Instance, 'save')
-    def test_build_instances_exhaust_host_list(self, _mock_save, mock_claim):
+    def test_build_instances_exhaust_host_list(self, _mock_save, mock_notify):
         # A list of three alternate hosts for one instance
         host_lists = copy.deepcopy(fake_host_lists_alt)
         instance = fake_instance.fake_instance_obj(self.context)
         image = {'fake-data': 'should_pass_silently'}
-        expected_claim_count = len(host_lists[0])
 
         # build_instances() is a cast, we need to wait for it to complete
         self.useFixture(cast_as_call.CastAsCall(self))
+
+        self.conductor.build_instances(
+            context=self.context,
+            instances=[instance], image=image,
+            filter_properties={},
+            admin_password='admin_password',
+            injected_files='injected_files',
+            requested_networks=None,
+            security_groups='security_groups',
+            block_device_mapping=None,
+            legacy_bdm=None,
+            host_lists=host_lists
+        )
+
         # Since claim_resources() is mocked to always return False, we will run
-        # out of alternate hosts, and MaxRetriesExceeded should be raised.
-        self.assertRaises(exc.MaxRetriesExceeded,
-                self.conductor.build_instances, context=self.context,
-                instances=[instance], image=image, filter_properties={},
-                admin_password='admin_password',
-                injected_files='injected_files', requested_networks=None,
-                security_groups='security_groups',
-                block_device_mapping=None, legacy_bdm=None,
-                host_lists=host_lists)
-        self.assertEqual(expected_claim_count, mock_claim.call_count)
+        # out of alternate hosts, and complain about MaxRetriesExceeded.
+        mock_notify.assert_called_once_with(
+            self.context, 'build_instances',
+            instance.uuid, test.MatchType(dict), 'error',
+            test.MatchType(exc.MaxRetriesExceeded), test.MatchType(str))
 
     @mock.patch.object(conductor_manager.ComputeTaskManager,
             '_destroy_build_request')