Remove silent failure to find a node on rebuild
We have been ignoring the case where a rebuild or evacuate is triggered and we fail to find *any* node for our host. This appears to be a very old behavior, which I traced back ten years to this:

https://review.opendev.org/c/openstack/nova/+/35851

which was merely fixing the failure to reset instance.node during an evacuate (which re-uses rebuild, which before that was a single-host operation). That patch intended to make a failure to find a node for our host a non-fatal error, but it just means we fall through that check with no node selected, which means we never update instance.node *and* run ResourceTracker code that will fail to find the node later.

So, this makes it an explicit error, where we stop further processing, set the migration for the evacuation to 'failed', and send a notification for it. This is the same behavior as happens further down if we find that the instance has been deleted underneath us.

Change-Id: I88b962aaeaa0554da4ab00906ac4d9e6deb43589
commit fbf2515b4c (parent 2f86a8a088)
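For context, here is a minimal, standalone sketch of the control flow the commit message describes, using simplified stand-in exception and helper classes (this is not Nova code; the real classes live in nova.exception and ComputeManager): instead of logging and falling through with no node selected, the failure now marks the migration 'failed', emits an error notification, and raises a rollback exception that wraps a build-abort error.

# Minimal, standalone sketch of the new "fail loudly" path (not Nova code).
# The exception and helper names below are simplified stand-ins that mirror
# the diff further down.

class ComputeHostNotFound(Exception):
    pass


class BuildAbortException(Exception):
    def __init__(self, instance_uuid, reason):
        super().__init__(f'Build of instance {instance_uuid} aborted: {reason}')


class InstanceFaultRollback(Exception):
    def __init__(self, inner_exception):
        super().__init__(str(inner_exception))
        self.inner_exception = inner_exception


class FakeComputeManager:
    def __init__(self, host, nodes_by_host):
        self.host = host
        self._nodes_by_host = nodes_by_host  # host -> hypervisor_hostname

    def _get_compute_info(self, host):
        try:
            return self._nodes_by_host[host]
        except KeyError:
            raise ComputeHostNotFound(host)

    def _set_migration_status(self, migration, status):
        migration['status'] = status

    def _notify_instance_rebuild_error(self, instance, error):
        print(f"notify error for {instance['uuid']}: {error}")

    def pick_rebuild_node(self, instance, migration):
        """Old behavior: swallow the lookup error and continue with no node.

        New behavior (sketched here): mark the migration failed, notify,
        and raise so the caller stops instead of proceeding node-less.
        """
        try:
            return self._get_compute_info(self.host)
        except ComputeHostNotFound as e:
            self._set_migration_status(migration, 'failed')
            self._notify_instance_rebuild_error(instance, e)
            raise InstanceFaultRollback(
                inner_exception=BuildAbortException(
                    instance_uuid=instance['uuid'], reason=str(e)))


if __name__ == '__main__':
    mgr = FakeComputeManager(host='dest-host', nodes_by_host={})  # no node known
    migration = {'status': 'accepted'}
    try:
        mgr.pick_rebuild_node({'uuid': 'fake-uuid'}, migration)
    except InstanceFaultRollback as e:
        print('migration status:', migration['status'])  # failed
        print('inner exception:', e.inner_exception)

Wrapping the abort in a rollback-style exception presumably lets the caller record the inner error against the instance while still unwinding the rebuild, which is what the commit message means by matching the behavior for an instance deleted underneath us.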
@@ -3791,9 +3791,21 @@ class ComputeManager(manager.Manager):
                 try:
                     compute_node = self._get_compute_info(context, self.host)
                     scheduled_node = compute_node.hypervisor_hostname
-                except exception.ComputeHostNotFound:
+                except exception.ComputeHostNotFound as e:
+                    # This means we were asked to rebuild one of our own
+                    # instances, or another instance as a target of an
+                    # evacuation, but we are unable to find a matching compute
+                    # node.
                     LOG.exception('Failed to get compute_info for %s',
                                   self.host)
+                    self._set_migration_status(migration, 'failed')
+                    self._notify_instance_rebuild_error(context, instance, e,
+                                                        bdms)
+                    raise exception.InstanceFaultRollback(
+                        inner_exception=exception.BuildAbortException(
+                            instance_uuid=instance.uuid,
+                            reason=e.format_message()))
             else:
                 scheduled_node = instance.node
@@ -13513,7 +13513,8 @@ class EvacuateHostTestCase(BaseTestCase):
         super(EvacuateHostTestCase, self).tearDown()

     def _rebuild(self, on_shared_storage=True, migration=None,
-                 send_node=False, vm_states_is_stopped=False):
+                 send_node=False, vm_states_is_stopped=False,
+                 expect_error=False):
         network_api = self.compute.network_api
         ctxt = context.get_admin_context()
@@ -13560,6 +13561,11 @@ class EvacuateHostTestCase(BaseTestCase):
                           action='power_off', phase='start'),
                 mock.call(ctxt, self.inst, self.inst.host,
                           action='power_off', phase='end')])
+        elif expect_error:
+            mock_notify_rebuild.assert_has_calls([
+                mock.call(ctxt, self.inst, self.compute.host,
+                          phase='error', exception=mock.ANY, bdms=bdms)])
+            return
         else:
             mock_notify_rebuild.assert_has_calls([
                 mock.call(ctxt, self.inst, self.inst.host, phase='start',
@@ -13614,14 +13620,15 @@ class EvacuateHostTestCase(BaseTestCase):
             mock.patch.object(self.compute, '_get_compute_info',
                               side_effect=fake_get_compute_info)
         ) as (mock_inst, mock_get):
-            self._rebuild()
+            self.assertRaises(exception.InstanceFaultRollback,
+                              self._rebuild, expect_error=True)

             # Should be on destination host
             instance = db.instance_get(self.context, self.inst.id)
-            self.assertEqual(instance['host'], self.compute.host)
-            self.assertIsNone(instance['node'])
-            self.assertTrue(mock_inst.called)
-            self.assertTrue(mock_get.called)
+            self.assertEqual('fake_host_2', instance['host'])
+            self.assertEqual('fakenode2', instance['node'])
+            mock_inst.assert_not_called()
+            mock_get.assert_called_once_with(mock.ANY, self.compute.host)

     def test_rebuild_on_host_node_passed(self):
         patch_get_info = mock.patch.object(self.compute, '_get_compute_info')
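For readers unfamiliar with the test pattern above, the following self-contained sketch (illustrative names only, not the Nova test harness) shows the same idea in isolation: patch the node lookup to raise, assert that the wrapper exception propagates, and verify that the rebuild path was never entered.

import unittest
from unittest import mock


# Simplified stand-ins for the exceptions used in the diff above.
class ComputeHostNotFound(Exception):
    pass


class InstanceFaultRollback(Exception):
    pass


class Manager:
    def _get_compute_info(self, host):
        return 'node-for-%s' % host

    def _do_rebuild(self):
        raise AssertionError('should not be reached when no node is found')

    def rebuild(self, host):
        # Fail loudly if no node can be found for the host.
        try:
            node = self._get_compute_info(host)
        except ComputeHostNotFound as e:
            raise InstanceFaultRollback(e)
        self._do_rebuild()
        return node


class RebuildNodeNotFoundTest(unittest.TestCase):
    def test_missing_node_is_an_explicit_error(self):
        mgr = Manager()
        with mock.patch.object(
                mgr, '_get_compute_info',
                side_effect=ComputeHostNotFound('dest')) as mock_get, \
             mock.patch.object(mgr, '_do_rebuild') as mock_rebuild:
            # The rebuild must now fail loudly instead of silently continuing.
            self.assertRaises(InstanceFaultRollback, mgr.rebuild, 'dest')
            mock_get.assert_called_once_with('dest')
            mock_rebuild.assert_not_called()


if __name__ == '__main__':
    unittest.main()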