Remove silent failure to find a node on rebuild

We have been ignoring the case where a rebuild or evacuate is triggered
and we fail to find *any* node for our host. This appears to be a very
old behavior, which I traced back ten years to this:

https://review.opendev.org/c/openstack/nova/+/35851

which was merely fixing the failure to reset instance.node during an
evacuate (which re-uses rebuild, which before that was a single-host
operation). That patch intended to make a failure to find a node for
our host a non-fatal error, but in practice we just fall through that
check with no node selected, which means we never update instance.node
*and* we go on to run ResourceTracker code that will fail to find the
node later.
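
To make the failure mode concrete, the pre-change control flow looked
roughly like this (a condensed sketch, not the verbatim code; the names
follow the diff below):

    # Condensed sketch of ComputeManager.rebuild_instance() before this
    # change: on ComputeHostNotFound we logged and kept going, leaving
    # scheduled_node as None.
    if not scheduled_node:
        if evacuate:
            try:
                compute_node = self._get_compute_info(context, self.host)
                scheduled_node = compute_node.hypervisor_hostname
            except exception.ComputeHostNotFound:
                # Swallowed: instance.node is never updated, and the
                # ResourceTracker later fails to find the node.
                LOG.exception('Failed to get compute_info for %s',
                              self.host)
        else:
            scheduled_node = instance.node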

So, this makes it an explicit error, where we stop further processing,
set the migration for the evacuation to 'failed', and send a
notification for it. This is the same behavior that happens further
down if we find that the instance has been deleted underneath us.

Change-Id: I88b962aaeaa0554da4ab00906ac4d9e6deb43589
Author: Dan Smith
Date: 2023-04-14 12:58:56 -07:00
Commit: fbf2515b4c (parent: 2f86a8a088)
2 changed files with 26 additions and 7 deletions

@@ -3791,9 +3791,21 @@ class ComputeManager(manager.Manager):
                 try:
                     compute_node = self._get_compute_info(context, self.host)
                     scheduled_node = compute_node.hypervisor_hostname
-                except exception.ComputeHostNotFound:
+                except exception.ComputeHostNotFound as e:
+                    # This means we were asked to rebuild one of our own
+                    # instances, or another instance as a target of an
+                    # evacuation, but we are unable to find a matching compute
+                    # node.
                     LOG.exception('Failed to get compute_info for %s',
                                   self.host)
+                    self._set_migration_status(migration, 'failed')
+                    self._notify_instance_rebuild_error(context, instance, e,
+                                                        bdms)
+                    raise exception.InstanceFaultRollback(
+                        inner_exception=exception.BuildAbortException(
+                            instance_uuid=instance.uuid,
+                            reason=e.format_message()))
             else:
                 scheduled_node = instance.node
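
The InstanceFaultRollback raised above is not what the caller ultimately
sees: the compute manager's error-handling context manager (Nova's
_error_out_instance_on_exception) catches it, marks the instance as
faulted, and re-raises the wrapped exception, here the
BuildAbortException. A minimal self-contained sketch of that unwrapping
pattern, using simplified stand-ins rather than the real Nova classes:

    import contextlib

    class InstanceFaultRollback(Exception):
        # Simplified stand-in for nova.exception.InstanceFaultRollback.
        def __init__(self, inner_exception):
            super().__init__(str(inner_exception))
            self.inner_exception = inner_exception

    @contextlib.contextmanager
    def error_out_instance_on_exception(instance):
        # Simplified analogue of the manager's
        # _error_out_instance_on_exception wrapper.
        try:
            yield
        except InstanceFaultRollback as error:
            # The real code records a fault and puts the instance into
            # ERROR before surfacing the wrapped exception.
            instance['vm_state'] = 'error'
            raise error.inner_exception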

@@ -13513,7 +13513,8 @@ class EvacuateHostTestCase(BaseTestCase):
         super(EvacuateHostTestCase, self).tearDown()

     def _rebuild(self, on_shared_storage=True, migration=None,
-                 send_node=False, vm_states_is_stopped=False):
+                 send_node=False, vm_states_is_stopped=False,
+                 expect_error=False):
         network_api = self.compute.network_api
         ctxt = context.get_admin_context()

@@ -13560,6 +13561,11 @@
                           action='power_off', phase='start'),
                 mock.call(ctxt, self.inst, self.inst.host,
                           action='power_off', phase='end')])
+        elif expect_error:
+            mock_notify_rebuild.assert_has_calls([
+                mock.call(ctxt, self.inst, self.compute.host,
+                          phase='error', exception=mock.ANY, bdms=bdms)])
+            return
         else:
             mock_notify_rebuild.assert_has_calls([
                 mock.call(ctxt, self.inst, self.inst.host, phase='start',
@@ -13614,14 +13620,15 @@
             mock.patch.object(self.compute, '_get_compute_info',
                               side_effect=fake_get_compute_info)
         ) as (mock_inst, mock_get):
-            self._rebuild()
+            self.assertRaises(exception.InstanceFaultRollback,
+                              self._rebuild, expect_error=True)

             # Should be on destination host
             instance = db.instance_get(self.context, self.inst.id)
-            self.assertEqual(instance['host'], self.compute.host)
-            self.assertIsNone(instance['node'])
-            self.assertTrue(mock_inst.called)
-            self.assertTrue(mock_get.called)
+            self.assertEqual('fake_host_2', instance['host'])
+            self.assertEqual('fakenode2', instance['node'])
+            mock_inst.assert_not_called()
+            mock_get.assert_called_once_with(mock.ANY, self.compute.host)

     def test_rebuild_on_host_node_passed(self):
         patch_get_info = mock.patch.object(self.compute, '_get_compute_info')