Remove silent failure to find a node on rebuild

We have been ignoring the case where a rebuild or evacuate is triggered
and we fail to find *any* node for our host. This appears to be a very
old behavior, which I traced back ten years to this:

https://review.opendev.org/c/openstack/nova/+/35851

which was merely fixing the failure to reset instance.node during an
evacuate (which re-uses rebuild, which before that was a single-host
operation). That patch intended to make a failure to find a node for
our host a non-fatal error, but in practice we just fall through that
check with no node selected, which means we never update instance.node
*and* we go on to run ResourceTracker code that will fail to find the
node later.
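
To make the failure mode concrete, the pre-change control flow looked
roughly like this (a condensed sketch, not the verbatim code; the names
follow the diff below):

    # Condensed sketch of ComputeManager.rebuild_instance() before this
    # change: on ComputeHostNotFound we logged and kept going, leaving
    # scheduled_node as None.
    if not scheduled_node:
        if evacuate:
            try:
                compute_node = self._get_compute_info(context, self.host)
                scheduled_node = compute_node.hypervisor_hostname
            except exception.ComputeHostNotFound:
                # Swallowed: instance.node is never updated, and the
                # ResourceTracker later fails to find the node.
                LOG.exception('Failed to get compute_info for %s',
                              self.host)
        else:
            scheduled_node = instance.node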

So, this makes it an explicit error, where we stop further processing,
set the migration for the evacuation to 'failed', and send a
notification for it. This is the same behavior that happens further
down if we find that the instance has been deleted underneath us.

Change-Id: I88b962aaeaa0554da4ab00906ac4d9e6deb43589
Author: Dan Smith
Date: 2023-04-14 12:58:56 -07:00
Commit: fbf2515b4c (parent: 2f86a8a088)
2 changed files with 26 additions and 7 deletions

@@ -3791,9 +3791,21 @@ class ComputeManager(manager.Manager):
                 try:
                     compute_node = self._get_compute_info(context, self.host)
                     scheduled_node = compute_node.hypervisor_hostname
-                except exception.ComputeHostNotFound:
+                except exception.ComputeHostNotFound as e:
+                    # This means we were asked to rebuild one of our own
+                    # instances, or another instance as a target of an
+                    # evacuation, but we are unable to find a matching compute
+                    # node.
                     LOG.exception('Failed to get compute_info for %s',
                                   self.host)
+                    self._set_migration_status(migration, 'failed')
+                    self._notify_instance_rebuild_error(context, instance, e,
+                                                        bdms)
+                    raise exception.InstanceFaultRollback(
+                        inner_exception=exception.BuildAbortException(
+                            instance_uuid=instance.uuid,
+                            reason=e.format_message()))
             else:
                 scheduled_node = instance.node
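
The InstanceFaultRollback raised above is not what the caller ultimately
sees: the compute manager's error-handling context manager (Nova's
_error_out_instance_on_exception) catches it, marks the instance as
faulted, and re-raises the wrapped exception, here the
BuildAbortException. A minimal self-contained sketch of that unwrapping
pattern, using simplified stand-ins rather than the real Nova classes:

    import contextlib

    class InstanceFaultRollback(Exception):
        # Simplified stand-in for nova.exception.InstanceFaultRollback.
        def __init__(self, inner_exception):
            super().__init__(str(inner_exception))
            self.inner_exception = inner_exception

    @contextlib.contextmanager
    def error_out_instance_on_exception(instance):
        # Simplified analogue of the manager's
        # _error_out_instance_on_exception wrapper.
        try:
            yield
        except InstanceFaultRollback as error:
            # The real code records a fault and puts the instance into
            # ERROR before surfacing the wrapped exception.
            instance['vm_state'] = 'error'
            raise error.inner_exception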

@@ -13513,7 +13513,8 @@ class EvacuateHostTestCase(BaseTestCase):
         super(EvacuateHostTestCase, self).tearDown()

     def _rebuild(self, on_shared_storage=True, migration=None,
-                 send_node=False, vm_states_is_stopped=False):
+                 send_node=False, vm_states_is_stopped=False,
+                 expect_error=False):
         network_api = self.compute.network_api
         ctxt = context.get_admin_context()

@@ -13560,6 +13561,11 @@
                           action='power_off', phase='start'),
                 mock.call(ctxt, self.inst, self.inst.host,
                           action='power_off', phase='end')])
+        elif expect_error:
+            mock_notify_rebuild.assert_has_calls([
+                mock.call(ctxt, self.inst, self.compute.host,
+                          phase='error', exception=mock.ANY, bdms=bdms)])
+            return
         else:
             mock_notify_rebuild.assert_has_calls([
                 mock.call(ctxt, self.inst, self.inst.host, phase='start',
@@ -13614,14 +13620,15 @@
             mock.patch.object(self.compute, '_get_compute_info',
                               side_effect=fake_get_compute_info)
         ) as (mock_inst, mock_get):
-            self._rebuild()
+            self.assertRaises(exception.InstanceFaultRollback,
+                              self._rebuild, expect_error=True)

             # Should be on destination host
             instance = db.instance_get(self.context, self.inst.id)
-            self.assertEqual(instance['host'], self.compute.host)
-            self.assertIsNone(instance['node'])
-            self.assertTrue(mock_inst.called)
-            self.assertTrue(mock_get.called)
+            self.assertEqual('fake_host_2', instance['host'])
+            self.assertEqual('fakenode2', instance['node'])
+            mock_inst.assert_not_called()
+            mock_get.assert_called_once_with(mock.ANY, self.compute.host)

     def test_rebuild_on_host_node_passed(self):
         patch_get_info = mock.patch.object(self.compute, '_get_compute_info')