Add retry logic mechanism

Added retry logic mechanism and added them to the API calls
where ironic presently attempts to pull an exclusive
lock on the baremetal node.

Change-Id: Ia6fbc9eec612793b3214d7883e0552913a088d5d
This commit is contained in:
Julia Kreger 2017-12-03 22:47:39 -05:00
parent faaf5f024b
commit 44bafb8cda
3 changed files with 209 additions and 11 deletions

View File

@ -523,6 +523,60 @@ def safe_dict_max(key, data):
return max_value
def _call_client_and_retry(client, url, retry_on=None,
call_retries=3, retry_wait=2,
**kwargs):
"""Method to provide retry operations.
Some APIs utilize HTTP errors on certian operations to indicate that
the resource is presently locked, and as such this mechanism provides
the ability to retry upon known error codes.
:param object client: The client method, such as:
``self.baremetal_client.post``
:param string url: The URL to perform the operation upon.
:param integer retry_on: A list of error codes that can be retried on.
The method also supports a single integer to be
defined.
:param integer call_retries: The number of times to retry the call upon
the error code defined by the 'retry_on'
parameter. Default: 3
:param integer retry_wait: The time in seconds to wait between retry
attempts. Default: 2
:returns: The object returned by the client call.
"""
# NOTE(TheJulia): This method, as of this note, does not have direct
# unit tests, although is fairly well tested by the tests checking
# retry logic in test_baremetal_node.py.
log = _log.setup_logging('shade.http')
if isinstance(retry_on, int):
retry_on = [retry_on]
count = 0
while (count < call_retries):
count += 1
try:
ret_val = client(url, **kwargs)
except exc.OpenStackCloudHTTPError as e:
if (retry_on is not None and
e.response.status_code in retry_on):
log.debug('Received retryable error {err}, waiting '
'{wait} seconds to retry', {
'err': e.response.status_code,
'wait': retry_wait
})
time.sleep(retry_wait)
continue
else:
raise
# Break out of the loop, since the loop should only continue
# when we encounter a known connection error.
return ret_val
def parse_range(value):
"""Parse a numerical range string.

View File

@ -9166,7 +9166,9 @@ class OpenStackCloud(
port = self._baremetal_client.get(port_url, microversion=1.6,
error_message=port_msg)
port_url = '/ports/{uuid}'.format(uuid=port['ports'][0]['uuid'])
self._baremetal_client.delete(port_url, error_message=port_msg)
_utils._call_client_and_retry(self._baremetal_client.delete,
port_url, retry_on=[409, 503],
error_message=port_msg)
with _utils.shade_exceptions(
"Error unregistering machine {node_id} from the baremetal "
@ -9178,10 +9180,11 @@ class OpenStackCloud(
# microversions in future releases, as such, we explicitly set
# the version to what we have been using with the client library..
version = "1.6"
msg = "Baremetal machine failed to be deleted."
msg = "Baremetal machine failed to be deleted"
url = '/nodes/{node_id}'.format(
node_id=uuid)
self._baremetal_client.delete(url,
_utils._call_client_and_retry(self._baremetal_client.delete,
url, retry_on=[409, 503],
error_message=msg,
microversion=version)
@ -9407,10 +9410,12 @@ class OpenStackCloud(
if configdrive:
payload['configdrive'] = configdrive
machine = self._baremetal_client.put(url,
json=payload,
error_message=msg,
microversion=version)
machine = _utils._call_client_and_retry(self._baremetal_client.put,
url,
retry_on=[409, 503],
json=payload,
error_message=msg,
microversion=version)
if wait:
for count in _utils._iterate_timeout(
timeout,
@ -9511,10 +9516,12 @@ class OpenStackCloud(
else:
desired_state = 'power {state}'.format(state=state)
payload = {'target': desired_state}
self._baremetal_client.put(url,
json=payload,
error_message=msg,
microversion="1.6")
_utils._call_client_and_retry(self._baremetal_client.put,
url,
retry_on=[409, 503],
json=payload,
error_message=msg,
microversion="1.6")
return None
def set_machine_power_on(self, name_or_id):

View File

@ -553,6 +553,40 @@ class TestBaremetalNode(base.IronicTestCase):
self.assert_calls()
def test_set_machine_power_on_with_retires(self):
# NOTE(TheJulia): This logic ends up testing power on/off and reboot
# as they all utilize the same helper method.
self.register_uris([
dict(
method='PUT',
status_code=503,
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid'],
'states', 'power']),
validate=dict(json={'target': 'power on'})),
dict(
method='PUT',
status_code=409,
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid'],
'states', 'power']),
validate=dict(json={'target': 'power on'})),
dict(
method='PUT',
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid'],
'states', 'power']),
validate=dict(json={'target': 'power on'})),
])
return_value = self.op_cloud.set_machine_power_on(
self.fake_baremetal_node['uuid'])
self.assertIsNone(return_value)
self.assert_calls()
def test_set_machine_power_off(self):
self.register_uris([
dict(
@ -630,6 +664,51 @@ class TestBaremetalNode(base.IronicTestCase):
self.assert_calls()
def test_node_set_provision_state_with_retries(self):
deploy_node = self.fake_baremetal_node.copy()
deploy_node['provision_state'] = 'deploying'
active_node = self.fake_baremetal_node.copy()
active_node['provision_state'] = 'active'
self.register_uris([
dict(
method='PUT',
status_code=409,
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid'],
'states', 'provision']),
validate=dict(json={'target': 'active',
'configdrive': 'http://host/file'})),
dict(
method='PUT',
status_code=503,
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid'],
'states', 'provision']),
validate=dict(json={'target': 'active',
'configdrive': 'http://host/file'})),
dict(
method='PUT',
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid'],
'states', 'provision']),
validate=dict(json={'target': 'active',
'configdrive': 'http://host/file'})),
dict(method='GET',
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid']]),
json=self.fake_baremetal_node),
])
self.op_cloud.node_set_provision_state(
self.fake_baremetal_node['uuid'],
'active',
configdrive='http://host/file')
self.assert_calls()
def test_node_set_provision_state_wait_timeout(self):
deploy_node = self.fake_baremetal_node.copy()
deploy_node['provision_state'] = 'deploying'
@ -1408,6 +1487,64 @@ class TestBaremetalNode(base.IronicTestCase):
timeout=0.001)
self.assert_calls()
def test_unregister_machine_retries(self):
mac_address = self.fake_baremetal_port['address']
nics = [{'mac': mac_address}]
port_uuid = self.fake_baremetal_port['uuid']
# NOTE(TheJulia): The two values below should be the same.
port_node_uuid = self.fake_baremetal_port['node_uuid']
port_url_address = 'detail?address=%s' % mac_address
self.fake_baremetal_node['provision_state'] = 'available'
self.register_uris([
dict(
method='GET',
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid']]),
json=self.fake_baremetal_node),
dict(
method='GET',
uri=self.get_mock_url(
resource='ports',
append=[port_url_address]),
json={'ports': [{'address': mac_address,
'node_uuid': port_node_uuid,
'uuid': port_uuid}]}),
dict(
method='DELETE',
status_code=503,
uri=self.get_mock_url(
resource='ports',
append=[self.fake_baremetal_port['uuid']])),
dict(
method='DELETE',
status_code=409,
uri=self.get_mock_url(
resource='ports',
append=[self.fake_baremetal_port['uuid']])),
dict(
method='DELETE',
uri=self.get_mock_url(
resource='ports',
append=[self.fake_baremetal_port['uuid']])),
dict(
method='DELETE',
status_code=409,
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid']])),
dict(
method='DELETE',
uri=self.get_mock_url(
resource='nodes',
append=[self.fake_baremetal_node['uuid']])),
])
self.op_cloud.unregister_machine(nics,
self.fake_baremetal_node['uuid'])
self.assert_calls()
def test_unregister_machine_unavailable(self):
# This is a list of invalid states that the method
# should fail on.