Ironic: retry when node not available
After a baremetal instance is deleted and its allocation is removed in placement, the ironic node might start cleaning. Eventually nova will notice and update the inventory to be reserved. During this window, a new instance may have already picked this ironic node.

When that race happens today, the build fails with an error: "Failed to reserve node ...".

This change tries to ensure that the remaining alternative hosts are attempted before aborting the build. Clearly the race is still there, but this makes it less painful.

Related-Bug: #1974070
Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d
This commit is contained in:
parent
2f4feeabc2
commit
8a476061c5
|
@ -2736,7 +2736,8 @@ class ComputeManager(manager.Manager):
|
|||
block_device_mapping)
|
||||
resources['block_device_info'] = block_device_info
|
||||
except (exception.InstanceNotFound,
|
||||
exception.UnexpectedDeletingTaskStateError):
|
||||
exception.UnexpectedDeletingTaskStateError,
|
||||
exception.ComputeResourcesUnavailable):
|
||||
with excutils.save_and_reraise_exception():
|
||||
self._build_resources_cleanup(instance, network_info)
|
||||
except (exception.UnexpectedTaskStateError,
|
||||
|
|
|
@ -7925,6 +7925,42 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
|
|||
mock_prepspawn.assert_called_once_with(self.instance)
|
||||
mock_failedspawn.assert_called_once_with(self.instance)
|
||||
|
||||
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
|
||||
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
|
||||
@mock.patch.object(virt_driver.ComputeDriver,
|
||||
'prepare_networks_before_block_device_mapping')
|
||||
@mock.patch.object(virt_driver.ComputeDriver,
|
||||
'clean_networks_preparation')
|
||||
def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet,
|
||||
mock_prepspawn, mock_failedspawn):
|
||||
mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable(
|
||||
reason="asdf")
|
||||
with mock.patch.object(self.compute,
|
||||
'_build_networks_for_instance',
|
||||
return_value=self.network_info
|
||||
) as _build_networks_for_instance:
|
||||
|
||||
try:
|
||||
with self.compute._build_resources(self.context, self.instance,
|
||||
self.requested_networks, self.security_groups,
|
||||
self.image, self.block_device_mapping,
|
||||
self.resource_provider_mapping, self.accel_uuids):
|
||||
pass
|
||||
except Exception as e:
|
||||
self.assertIsInstance(e,
|
||||
exception.ComputeResourcesUnavailable)
|
||||
|
||||
_build_networks_for_instance.assert_has_calls(
|
||||
[mock.call(self.context, self.instance,
|
||||
self.requested_networks, self.security_groups,
|
||||
self.resource_provider_mapping,
|
||||
self.network_arqs)])
|
||||
|
||||
mock_prepnet.assert_not_called()
|
||||
mock_clean.assert_called_once_with(self.instance, self.network_info)
|
||||
mock_prepspawn.assert_called_once_with(self.instance)
|
||||
mock_failedspawn.assert_called_once_with(self.instance)
|
||||
|
||||
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
|
||||
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
|
||||
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
|
||||
|
|
|
@ -2542,7 +2542,10 @@ class IronicDriverTestCase(test.NoDBTestCase):
|
|||
|
||||
@mock.patch.object(cw.IronicClientWrapper, 'call')
|
||||
def test_prepare_for_spawn(self, mock_call):
|
||||
node = ironic_utils.get_test_node(driver='fake')
|
||||
node = ironic_utils.get_test_node(
|
||||
driver='fake', instance_uuid=None,
|
||||
provision_state=ironic_states.AVAILABLE,
|
||||
power_state=ironic_states.POWER_OFF)
|
||||
self.mock_conn.get_node.return_value = node
|
||||
instance = fake_instance.fake_instance_obj(self.ctx,
|
||||
node=node.uuid)
|
||||
|
@ -2574,7 +2577,10 @@ class IronicDriverTestCase(test.NoDBTestCase):
|
|||
instance)
|
||||
|
||||
def test_prepare_for_spawn_conflict(self):
|
||||
node = ironic_utils.get_test_node(driver='fake')
|
||||
node = ironic_utils.get_test_node(
|
||||
driver='fake', instance_uuid=None,
|
||||
provision_state=ironic_states.AVAILABLE,
|
||||
power_state=ironic_states.POWER_OFF)
|
||||
self.mock_conn.get_node.return_value = node
|
||||
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
|
||||
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
|
||||
|
@ -2582,6 +2588,18 @@ class IronicDriverTestCase(test.NoDBTestCase):
|
|||
self.driver.prepare_for_spawn,
|
||||
instance)
|
||||
|
||||
def test_prepare_for_spawn_not_available(self):
|
||||
node = ironic_utils.get_test_node(
|
||||
driver='fake', instance_uuid=None,
|
||||
provision_state=ironic_states.CLEANWAIT,
|
||||
power_state=ironic_states.POWER_OFF)
|
||||
self.mock_conn.get_node.return_value = node
|
||||
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
|
||||
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
|
||||
self.assertRaises(exception.ComputeResourcesUnavailable,
|
||||
self.driver.prepare_for_spawn,
|
||||
instance)
|
||||
|
||||
@mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
|
||||
def test_failed_spawn_cleanup(self, mock_cleanup):
|
||||
node = ironic_utils.get_test_node(driver='fake')
|
||||
|
|
|
@ -397,6 +397,18 @@ class IronicDriver(virt_driver.ComputeDriver):
|
|||
_("Ironic node uuid not supplied to "
|
||||
"driver for instance %s.") % instance.uuid)
|
||||
node = self._get_node(node_uuid)
|
||||
|
||||
# It's possible this node has just moved from deleting
|
||||
# to cleaning. Placement will update the inventory
|
||||
# as all reserved, but this instance might have got here
|
||||
# before that happened, but after the previous allocation
|
||||
# got deleted. We trigger a re-schedule to another node.
|
||||
if (self._node_resources_used(node) or
|
||||
self._node_resources_unavailable(node)):
|
||||
msg = "Chosen ironic node %s is not available" % node_uuid
|
||||
LOG.info(msg, instance=instance)
|
||||
raise exception.ComputeResourcesUnavailable(reason=msg)
|
||||
|
||||
self._set_instance_id(node, instance)
|
||||
|
||||
def failed_spawn_cleanup(self, instance):
|
||||
|
|
Loading…
Reference in New Issue