Ironic: retry when node not available

After a baremetal instance is deleted, and its allocation is removed
in placement, the ironic node might start cleaning. Eventually nova
will notice and update the inventory to be reserved.
During this window, a new instance may have already picked this
ironic node.

When that race happens today, the build fails with an error:
"Failed to reserve node ..."

This change tries to ensure the remaining alternative hosts are
attempted before aborting the build.
Clearly the race is still there, but this makes it less painful.

Related-Bug: #1974070
Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d
This commit is contained in:
John Garbutt 2022-05-18 19:06:36 +01:00 committed by John Garbutt
parent 2f4feeabc2
commit 8a476061c5
4 changed files with 70 additions and 3 deletions

View File

@ -2736,7 +2736,8 @@ class ComputeManager(manager.Manager):
block_device_mapping)
resources['block_device_info'] = block_device_info
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError):
exception.UnexpectedDeletingTaskStateError,
exception.ComputeResourcesUnavailable):
with excutils.save_and_reraise_exception():
self._build_resources_cleanup(instance, network_info)
except (exception.UnexpectedTaskStateError,

View File

@ -7925,6 +7925,42 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_prepspawn.assert_called_once_with(self.instance)
mock_failedspawn.assert_called_once_with(self.instance)
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
@mock.patch.object(virt_driver.ComputeDriver,
                   'prepare_networks_before_block_device_mapping')
@mock.patch.object(virt_driver.ComputeDriver,
                   'clean_networks_preparation')
def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet,
                                  mock_prepspawn, mock_failedspawn):
    """ComputeResourcesUnavailable from prepare_for_spawn is re-raised.

    The driver's ``prepare_for_spawn`` is mocked to raise
    ``ComputeResourcesUnavailable``. ``_build_resources`` must let that
    exception propagate to the caller, and the mocks must show that the
    networks were built and cleaned up, that ``failed_spawn_cleanup`` ran,
    and that ``prepare_networks_before_block_device_mapping`` was never
    reached.
    """
    mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable(
        reason="asdf")
    with mock.patch.object(self.compute,
                           '_build_networks_for_instance',
                           return_value=self.network_info
                           ) as _build_networks_for_instance:
        # assertRaises fails the test when nothing is raised, unlike a
        # try/except that only inspects an exception if one happens.
        with self.assertRaises(exception.ComputeResourcesUnavailable):
            with self.compute._build_resources(
                    self.context, self.instance,
                    self.requested_networks, self.security_groups,
                    self.image, self.block_device_mapping,
                    self.resource_provider_mapping, self.accel_uuids):
                pass

        _build_networks_for_instance.assert_has_calls(
            [mock.call(self.context, self.instance,
                       self.requested_networks, self.security_groups,
                       self.resource_provider_mapping,
                       self.network_arqs)])

        mock_prepnet.assert_not_called()
        mock_clean.assert_called_once_with(self.instance, self.network_info)
        mock_prepspawn.assert_called_once_with(self.instance)
        mock_failedspawn.assert_called_once_with(self.instance)
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')

View File

@ -2542,7 +2542,10 @@ class IronicDriverTestCase(test.NoDBTestCase):
@mock.patch.object(cw.IronicClientWrapper, 'call')
def test_prepare_for_spawn(self, mock_call):
node = ironic_utils.get_test_node(driver='fake')
node = ironic_utils.get_test_node(
driver='fake', instance_uuid=None,
provision_state=ironic_states.AVAILABLE,
power_state=ironic_states.POWER_OFF)
self.mock_conn.get_node.return_value = node
instance = fake_instance.fake_instance_obj(self.ctx,
node=node.uuid)
@ -2574,7 +2577,10 @@ class IronicDriverTestCase(test.NoDBTestCase):
instance)
def test_prepare_for_spawn_conflict(self):
node = ironic_utils.get_test_node(driver='fake')
node = ironic_utils.get_test_node(
driver='fake', instance_uuid=None,
provision_state=ironic_states.AVAILABLE,
power_state=ironic_states.POWER_OFF)
self.mock_conn.get_node.return_value = node
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
@ -2582,6 +2588,18 @@ class IronicDriverTestCase(test.NoDBTestCase):
self.driver.prepare_for_spawn,
instance)
def test_prepare_for_spawn_not_available(self):
    """prepare_for_spawn rejects a node that is still cleaning.

    The fake node has no instance but is in the CLEANWAIT provision
    state. ``prepare_for_spawn`` must raise
    ``ComputeResourcesUnavailable`` and must do so before any attempt to
    update the node.
    """
    node = ironic_utils.get_test_node(
        driver='fake', instance_uuid=None,
        provision_state=ironic_states.CLEANWAIT,
        power_state=ironic_states.POWER_OFF)
    self.mock_conn.get_node.return_value = node
    instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)

    self.assertRaises(exception.ComputeResourcesUnavailable,
                      self.driver.prepare_for_spawn,
                      instance)
    # The availability check must fire before the node is reserved for
    # this instance, so no node update may have been issued.
    self.mock_conn.update_node.assert_not_called()
@mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
def test_failed_spawn_cleanup(self, mock_cleanup):
node = ironic_utils.get_test_node(driver='fake')

View File

@ -397,6 +397,18 @@ class IronicDriver(virt_driver.ComputeDriver):
_("Ironic node uuid not supplied to "
"driver for instance %s.") % instance.uuid)
node = self._get_node(node_uuid)
# It's possible this node has just moved from deleting
# to cleaning. Placement will update the inventory
# as all reserved, but this instance might have got here
# before that happened, yet after the previous allocation
# was deleted. We trigger a re-schedule to another node.
if (self._node_resources_used(node) or
self._node_resources_unavailable(node)):
msg = "Chosen ironic node %s is not available" % node_uuid
LOG.info(msg, instance=instance)
raise exception.ComputeResourcesUnavailable(reason=msg)
self._set_instance_id(node, instance)
def failed_spawn_cleanup(self, instance):