Ironic: Workaround to mitigate bug #1341420

The bug #1341420 can cause the driver to try to deploy onto a node which
already has an instance associated with it, the way Ironic reserve a
node to a specific instance is associating the UUID of that instance
with the node via the instance_uuid field. This patch just makes the
request to associate that field non-retriable when the api returns HTTP
409 (Conflict) (that's the HTTP error indicating that the node is
already associated with another instance), so it can fail fast in
case the node is already pick and the RetryFilter could then pick
another node for that instance.

Partial-Bug: #1341420
Change-Id: I697c5129893b18562182bbb3ceb8cb160e84c312
This commit is contained in:
Lucas Alvares Gomes 2015-09-21 16:36:04 +01:00
parent 7bc2171869
commit 608af87541
4 changed files with 45 additions and 14 deletions

View File

@ -45,7 +45,7 @@ class IronicClientWrapperTestCase(test.NoDBTestCase):
def test_call_good_no_args(self, mock_get_client, mock_multi_getattr):
mock_get_client.return_value = FAKE_CLIENT
self.ironicclient.call("node.list")
mock_get_client.assert_called_once_with()
mock_get_client.assert_called_once_with(retry_on_conflict=True)
mock_multi_getattr.assert_called_once_with(FAKE_CLIENT, "node.list")
mock_multi_getattr.return_value.assert_called_once_with()
@ -54,7 +54,7 @@ class IronicClientWrapperTestCase(test.NoDBTestCase):
def test_call_good_with_args(self, mock_get_client, mock_multi_getattr):
mock_get_client.return_value = FAKE_CLIENT
self.ironicclient.call("node.list", 'test', associated=True)
mock_get_client.assert_called_once_with()
mock_get_client.assert_called_once_with(retry_on_conflict=True)
mock_multi_getattr.assert_called_once_with(FAKE_CLIENT, "node.list")
mock_multi_getattr.return_value.assert_called_once_with(
'test', associated=True)

View File

@ -56,7 +56,7 @@ FAKE_CLIENT = ironic_utils.FakeClient()
class FakeClientWrapper(cw.IronicClientWrapper):
def _get_client(self):
def _get_client(self, retry_on_conflict=True):
return FAKE_CLIENT
@ -897,8 +897,7 @@ class IronicDriverTestCase(test.NoDBTestCase):
self.driver.spawn, self.ctx, instance, None, [], None)
self.assertEqual(0, mock_destroy.call_count)
@mock.patch.object(FAKE_CLIENT.node, 'update')
def test__add_driver_fields_good(self, mock_update):
def _test_add_driver_fields(self, mock_update=None, mock_call=None):
node = ironic_utils.get_test_node(driver='fake')
instance = fake_instance.fake_instance_obj(self.ctx,
node=node.uuid)
@ -921,7 +920,23 @@ class IronicDriverTestCase(test.NoDBTestCase):
'value': str(node.properties.get('local_gb', 0))},
{'path': '/instance_uuid', 'op': 'add',
'value': instance.uuid}]
mock_update.assert_called_once_with(node.uuid, expected_patch)
if mock_call is not None:
# assert call() is invoked with retry_on_conflict False to
# avoid bug #1341420
mock_call.assert_called_once_with('node.update', node.uuid,
expected_patch,
retry_on_conflict=False)
if mock_update is not None:
mock_update.assert_called_once_with(node.uuid, expected_patch)
@mock.patch.object(FAKE_CLIENT.node, 'update')
def test__add_driver_fields_mock_update(self, mock_update):
self._test_add_driver_fields(mock_update=mock_update)
@mock.patch.object(cw.IronicClientWrapper, 'call')
def test__add_driver_fields_mock_call(self, mock_call):
self._test_add_driver_fields(mock_call=mock_call)
@mock.patch.object(FAKE_CLIENT.node, 'update')
def test__add_driver_fields_fail(self, mock_update):

View File

@ -56,10 +56,14 @@ class IronicClientWrapper(object):
"""Tell the wrapper to invalidate the cached ironic-client."""
self._cached_client = None
def _get_client(self):
def _get_client(self, retry_on_conflict=True):
max_retries = CONF.ironic.api_max_retries if retry_on_conflict else 1
retry_interval = (CONF.ironic.api_retry_interval
if retry_on_conflict else 0)
# If we've already constructed a valid, authed client, just return
# that.
if self._cached_client is not None:
if retry_on_conflict and self._cached_client is not None:
return self._cached_client
auth_token = CONF.ironic.admin_auth_token
@ -76,13 +80,14 @@ class IronicClientWrapper(object):
'ironic_url': CONF.ironic.api_endpoint}
# Retries for Conflict exception
kwargs['max_retries'] = CONF.ironic.api_max_retries
kwargs['retry_interval'] = CONF.ironic.api_retry_interval
kwargs['max_retries'] = max_retries
kwargs['retry_interval'] = retry_interval
try:
cli = ironic.client.get_client(CONF.ironic.api_version, **kwargs)
# Cache the client so we don't have to reconstruct and
# reauthenticate it every time we need it.
self._cached_client = cli
if retry_on_conflict:
self._cached_client = cli
except ironic.exc.Unauthorized:
msg = _("Unable to authenticate Ironic client.")
@ -110,19 +115,25 @@ class IronicClientWrapper(object):
:param method: Name of the client method to call as a string.
:param args: Client method arguments.
:param kwargs: Client method keyword arguments.
:param retry_on_conflict: Boolean value. Whether the request should be
retried in case of a conflict error
(HTTP 409) or not. If retry_on_conflict is
False the cached instance of the client
won't be used. Defaults to True.
:raises: NovaException if all retries failed.
"""
# TODO(dtantsur): drop these once ironicclient 0.8.0 is out and used in
# global-requirements.
retry_excs = (ironic.exc.ServiceUnavailable,
ironic.exc.ConnectionRefused)
retry_on_conflict = kwargs.pop('retry_on_conflict', True)
# num_attempts should be the times of retry + 1
# eg. retry==0 just means run once and no retry
num_attempts = max(0, CONF.ironic.api_max_retries) + 1
for attempt in range(1, num_attempts + 1):
client = self._get_client()
client = self._get_client(retry_on_conflict=retry_on_conflict)
try:
return self._multi_getattr(client, method)(*args, **kwargs)

View File

@ -392,7 +392,12 @@ class IronicDriver(virt_driver.ComputeDriver):
patch.append({'path': '/instance_uuid', 'op': 'add',
'value': instance.uuid})
try:
self.ironicclient.call('node.update', node.uuid, patch)
# FIXME(lucasagomes): The "retry_on_conflict" parameter was added
# to basically causes the deployment to fail faster in case the
# node picked by the scheduler is already associated with another
# instance due bug #1341420.
self.ironicclient.call('node.update', node.uuid, patch,
retry_on_conflict=False)
except ironic.exc.BadRequest:
msg = (_("Failed to add deploy parameters on node %(node)s "
"when provisioning the instance %(instance)s")