diff --git a/shade/_utils.py b/shade/_utils.py index 750f07ab8..e5117b368 100644 --- a/shade/_utils.py +++ b/shade/_utils.py @@ -523,6 +523,60 @@ def safe_dict_max(key, data): return max_value +def _call_client_and_retry(client, url, retry_on=None, + call_retries=3, retry_wait=2, + **kwargs): + """Method to provide retry operations. + + Some APIs utilize HTTP errors on certian operations to indicate that + the resource is presently locked, and as such this mechanism provides + the ability to retry upon known error codes. + + :param object client: The client method, such as: + ``self.baremetal_client.post`` + :param string url: The URL to perform the operation upon. + :param integer retry_on: A list of error codes that can be retried on. + The method also supports a single integer to be + defined. + :param integer call_retries: The number of times to retry the call upon + the error code defined by the 'retry_on' + parameter. Default: 3 + :param integer retry_wait: The time in seconds to wait between retry + attempts. Default: 2 + + :returns: The object returned by the client call. + """ + + # NOTE(TheJulia): This method, as of this note, does not have direct + # unit tests, although is fairly well tested by the tests checking + # retry logic in test_baremetal_node.py. + log = _log.setup_logging('shade.http') + + if isinstance(retry_on, int): + retry_on = [retry_on] + + count = 0 + while (count < call_retries): + count += 1 + try: + ret_val = client(url, **kwargs) + except exc.OpenStackCloudHTTPError as e: + if (retry_on is not None and + e.response.status_code in retry_on): + log.debug('Received retryable error {err}, waiting ' + '{wait} seconds to retry', { + 'err': e.response.status_code, + 'wait': retry_wait + }) + time.sleep(retry_wait) + continue + else: + raise + # Break out of the loop, since the loop should only continue + # when we encounter a known connection error. + return ret_val + + def parse_range(value): """Parse a numerical range string. diff --git a/shade/openstackcloud.py b/shade/openstackcloud.py index 28ed2e543..31763a6f0 100644 --- a/shade/openstackcloud.py +++ b/shade/openstackcloud.py @@ -9166,7 +9166,9 @@ class OpenStackCloud( port = self._baremetal_client.get(port_url, microversion=1.6, error_message=port_msg) port_url = '/ports/{uuid}'.format(uuid=port['ports'][0]['uuid']) - self._baremetal_client.delete(port_url, error_message=port_msg) + _utils._call_client_and_retry(self._baremetal_client.delete, + port_url, retry_on=[409, 503], + error_message=port_msg) with _utils.shade_exceptions( "Error unregistering machine {node_id} from the baremetal " @@ -9178,10 +9180,11 @@ class OpenStackCloud( # microversions in future releases, as such, we explicitly set # the version to what we have been using with the client library.. version = "1.6" - msg = "Baremetal machine failed to be deleted." + msg = "Baremetal machine failed to be deleted" url = '/nodes/{node_id}'.format( node_id=uuid) - self._baremetal_client.delete(url, + _utils._call_client_and_retry(self._baremetal_client.delete, + url, retry_on=[409, 503], error_message=msg, microversion=version) @@ -9407,10 +9410,12 @@ class OpenStackCloud( if configdrive: payload['configdrive'] = configdrive - machine = self._baremetal_client.put(url, - json=payload, - error_message=msg, - microversion=version) + machine = _utils._call_client_and_retry(self._baremetal_client.put, + url, + retry_on=[409, 503], + json=payload, + error_message=msg, + microversion=version) if wait: for count in _utils._iterate_timeout( timeout, @@ -9511,10 +9516,12 @@ class OpenStackCloud( else: desired_state = 'power {state}'.format(state=state) payload = {'target': desired_state} - self._baremetal_client.put(url, - json=payload, - error_message=msg, - microversion="1.6") + _utils._call_client_and_retry(self._baremetal_client.put, + url, + retry_on=[409, 503], + json=payload, + error_message=msg, + microversion="1.6") return None def set_machine_power_on(self, name_or_id): diff --git a/shade/tests/unit/test_baremetal_node.py b/shade/tests/unit/test_baremetal_node.py index c0f54818b..9a8795745 100644 --- a/shade/tests/unit/test_baremetal_node.py +++ b/shade/tests/unit/test_baremetal_node.py @@ -553,6 +553,40 @@ class TestBaremetalNode(base.IronicTestCase): self.assert_calls() + def test_set_machine_power_on_with_retires(self): + # NOTE(TheJulia): This logic ends up testing power on/off and reboot + # as they all utilize the same helper method. + self.register_uris([ + dict( + method='PUT', + status_code=503, + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid'], + 'states', 'power']), + validate=dict(json={'target': 'power on'})), + dict( + method='PUT', + status_code=409, + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid'], + 'states', 'power']), + validate=dict(json={'target': 'power on'})), + dict( + method='PUT', + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid'], + 'states', 'power']), + validate=dict(json={'target': 'power on'})), + ]) + return_value = self.op_cloud.set_machine_power_on( + self.fake_baremetal_node['uuid']) + self.assertIsNone(return_value) + + self.assert_calls() + def test_set_machine_power_off(self): self.register_uris([ dict( @@ -630,6 +664,51 @@ class TestBaremetalNode(base.IronicTestCase): self.assert_calls() + def test_node_set_provision_state_with_retries(self): + deploy_node = self.fake_baremetal_node.copy() + deploy_node['provision_state'] = 'deploying' + active_node = self.fake_baremetal_node.copy() + active_node['provision_state'] = 'active' + self.register_uris([ + dict( + method='PUT', + status_code=409, + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid'], + 'states', 'provision']), + validate=dict(json={'target': 'active', + 'configdrive': 'http://host/file'})), + dict( + method='PUT', + status_code=503, + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid'], + 'states', 'provision']), + validate=dict(json={'target': 'active', + 'configdrive': 'http://host/file'})), + dict( + method='PUT', + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid'], + 'states', 'provision']), + validate=dict(json={'target': 'active', + 'configdrive': 'http://host/file'})), + dict(method='GET', + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid']]), + json=self.fake_baremetal_node), + ]) + self.op_cloud.node_set_provision_state( + self.fake_baremetal_node['uuid'], + 'active', + configdrive='http://host/file') + + self.assert_calls() + def test_node_set_provision_state_wait_timeout(self): deploy_node = self.fake_baremetal_node.copy() deploy_node['provision_state'] = 'deploying' @@ -1408,6 +1487,64 @@ class TestBaremetalNode(base.IronicTestCase): timeout=0.001) self.assert_calls() + def test_unregister_machine_retries(self): + mac_address = self.fake_baremetal_port['address'] + nics = [{'mac': mac_address}] + port_uuid = self.fake_baremetal_port['uuid'] + # NOTE(TheJulia): The two values below should be the same. + port_node_uuid = self.fake_baremetal_port['node_uuid'] + port_url_address = 'detail?address=%s' % mac_address + self.fake_baremetal_node['provision_state'] = 'available' + self.register_uris([ + dict( + method='GET', + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid']]), + json=self.fake_baremetal_node), + dict( + method='GET', + uri=self.get_mock_url( + resource='ports', + append=[port_url_address]), + json={'ports': [{'address': mac_address, + 'node_uuid': port_node_uuid, + 'uuid': port_uuid}]}), + dict( + method='DELETE', + status_code=503, + uri=self.get_mock_url( + resource='ports', + append=[self.fake_baremetal_port['uuid']])), + dict( + method='DELETE', + status_code=409, + uri=self.get_mock_url( + resource='ports', + append=[self.fake_baremetal_port['uuid']])), + dict( + method='DELETE', + uri=self.get_mock_url( + resource='ports', + append=[self.fake_baremetal_port['uuid']])), + dict( + method='DELETE', + status_code=409, + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid']])), + dict( + method='DELETE', + uri=self.get_mock_url( + resource='nodes', + append=[self.fake_baremetal_node['uuid']])), + ]) + + self.op_cloud.unregister_machine(nics, + self.fake_baremetal_node['uuid']) + + self.assert_calls() + def test_unregister_machine_unavailable(self): # This is a list of invalid states that the method # should fail on.