From 6030b02bb3296058036b187fdfedc9496a0087a9 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Mon, 28 May 2018 16:25:31 +0200 Subject: [PATCH] Separate wait_for_provision_state method Change-Id: I27d976162c503ff494d5786649e273a6385207d2 --- metalsmith/_os_api.py | 8 +- metalsmith/_provisioner.py | 100 ++++++++++++++++-- metalsmith/exceptions.py | 11 ++ metalsmith/test/test_provisioner.py | 152 +++++++++++++++++++++++----- 4 files changed, 232 insertions(+), 39 deletions(-) diff --git a/metalsmith/_os_api.py b/metalsmith/_os_api.py index c534105..71d9d33 100644 --- a/metalsmith/_os_api.py +++ b/metalsmith/_os_api.py @@ -25,7 +25,8 @@ from metalsmith import _utils LOG = logging.getLogger(__name__) NODE_FIELDS = ['name', 'uuid', 'instance_info', 'instance_uuid', 'maintenance', - 'maintenance_reason', 'properties', 'provision_state', 'extra'] + 'maintenance_reason', 'properties', 'provision_state', 'extra', + 'last_error'] HOSTNAME_FIELD = 'metalsmith_hostname' @@ -195,11 +196,6 @@ class API(object): if not result['result']: raise RuntimeError('%s: %s' % (iface, result['reason'])) - def wait_for_node_state(self, node, state, timeout): - self.ironic.node.wait_for_provision_state(_node_id(node), - state, - timeout=timeout) - def _node_id(node): if isinstance(node, six.string_types): diff --git a/metalsmith/_provisioner.py b/metalsmith/_provisioner.py index 07b053f..b209597 100644 --- a/metalsmith/_provisioner.py +++ b/metalsmith/_provisioner.py @@ -16,6 +16,7 @@ import logging import random import sys +import time import six @@ -237,9 +238,8 @@ class Provisioner(object): try: LOG.error('Deploy attempt failed on node %s, cleaning up', _utils.log_node(node)) - self._delete_ports(node, created_ports, attached_ports) - LOG.debug('Releasing lock on node %s', _utils.log_node(node)) - self._api.release_node(node) + self._clean_up(node, created_ports=created_ports, + attached_ports=attached_ports) except Exception: LOG.exception('Clean up failed') @@ -251,11 +251,12 @@ class Provisioner(object): LOG.debug('Waiting for node %(node)s to reach state active ' 'with timeout %(timeout)s', {'node': _utils.log_node(node), 'timeout': wait}) - self._api.wait_for_node_state(node, 'active', timeout=wait) + node = self.wait_for_provisioning([node], timeout=wait)[0] LOG.info('Deploy succeeded on node %s', _utils.log_node(node)) + else: + # Update the node to return it's latest state + node = self._api.get_node(node, refresh=True) - # Update the node to return it's latest state - node = self._api.get_node(node, refresh=True) return _instance.Instance(self._api, node) def _get_nics(self, nics): @@ -346,6 +347,87 @@ class Provisioner(object): LOG.debug('Failed to clear node %(node)s extra: %(exc)s', {'node': _utils.log_node(node), 'exc': exc}) + def wait_for_provisioning(self, nodes, timeout=None, delay=15): + """Wait for nodes to be provisioned. + + Loops until all nodes finish provisioning. + + :param nodes: List of nodes (UUID, name, `Node` object or + :py:class:`metalsmith.Instance`). + :param timeout: How much time (in seconds) to wait for all nodes + to finish provisioning. If ``None`` (the default), wait forever + (more precisely, until the operation times out on server side). + :param delay: Delay (in seconds) between two provision state checks. + :return: List of updated nodes if all succeeded. + :raises: :py:class:`metalsmith.exceptions.DeploymentFailure` + if the deployment failed or timed out for any nodes. + """ + return self._wait_for_state(nodes, 'active', + timeout=timeout, delay=delay) + + def _wait_for_state(self, nodes, state, timeout, delay=15): + if timeout is not None and timeout <= 0: + raise ValueError("The timeout argument must be a positive int") + if delay < 0: + raise ValueError("The delay argument must be a non-negative int") + + failed_nodes = [] + finished_nodes = [] + + deadline = time.time() + timeout if timeout is not None else None + while timeout is None or time.time() < deadline: + remaining_nodes = [] + for node in nodes: + node = self._api.get_node(node, refresh=True, + accept_hostname=True) + if node.provision_state == state: + LOG.debug('Node %(node)s reached state %(state)s', + {'node': _utils.log_node(node), 'state': state}) + finished_nodes.append(node) + elif (node.provision_state == 'error' or + node.provision_state.endswith(' failed')): + LOG.error('Node %(node)s failed deployment: %(error)s', + {'node': _utils.log_node(node), + 'error': node.last_error}) + failed_nodes.append(node) + else: + remaining_nodes.append(node) + + if remaining_nodes: + nodes = remaining_nodes + else: + nodes = [] + break + + LOG.debug('Still waiting for the following nodes to reach state ' + '%(state)s: %(nodes)s', + {'state': state, + 'nodes': ', '.join(_utils.log_node(n) for n in nodes)}) + time.sleep(delay) + + messages = [] + if failed_nodes: + messages.append('the following nodes failed deployment: %s' % + ', '.join('%s (%s)' % (_utils.log_node(node), + node.last_error) + for node in failed_nodes)) + if nodes: + messages.append('deployment timed out for nodes %s' % + ', '.join(_utils.log_node(node) for node in nodes)) + + if messages: + raise exceptions.DeploymentFailure( + 'Deployment failed: %s' % '; '.join(messages), + failed_nodes + nodes) + else: + LOG.debug('All nodes reached state %s', state) + return finished_nodes + + def _clean_up(self, node, created_ports=None, attached_ports=None): + self._delete_ports(node, created_ports, attached_ports) + LOG.debug('Releasing lock on node %s', _utils.log_node(node)) + self._api.release_node(node) + def unprovision_node(self, node, wait=None): """Unprovision a previously provisioned node. @@ -360,15 +442,13 @@ class Provisioner(object): LOG.warning("Dry run, not unprovisioning") return - self._delete_ports(node) - LOG.debug('Releasing lock on node %s', _utils.log_node(node)) - self._api.release_node(node) + self._clean_up(node) self._api.node_action(node, 'deleted') LOG.info('Deleting started for node %s', _utils.log_node(node)) if wait is not None: - self._api.wait_for_node_state(node, 'available', timeout=wait) + self._wait_for_state([node], 'available', timeout=wait) LOG.info('Node %s undeployed successfully', _utils.log_node(node)) return self._api.get_node(node, refresh=True) diff --git a/metalsmith/exceptions.py b/metalsmith/exceptions.py index 7f3d84d..1a935eb 100644 --- a/metalsmith/exceptions.py +++ b/metalsmith/exceptions.py @@ -71,3 +71,14 @@ class UnknownRootDiskSize(Error): class InvalidNode(Error): """This node cannot be deployed onto.""" + + +class DeploymentFailure(Error): + """One or more nodes have failed the deployment. + + :ivar nodes: List of failed nodes. + """ + + def __init__(self, message, nodes): + self.nodes = nodes + super(DeploymentFailure, self).__init__(message) diff --git a/metalsmith/test/test_provisioner.py b/metalsmith/test/test_provisioner.py index e687d0d..56eeb7a 100644 --- a/metalsmith/test/test_provisioner.py +++ b/metalsmith/test/test_provisioner.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import fixtures import mock import testtools @@ -114,6 +115,10 @@ class TestProvisionNode(Base): ], '/instance_info/%s' % _os_api.HOSTNAME_FIELD: 'control-0' } + self.wait_fixture = self.useFixture( + fixtures.MockPatchObject(_provisioner.Provisioner, + '_wait_for_state', autospec=True)) + self.wait_mock = self.wait_fixture.mock def test_ok(self): inst = self.pr.provision_node(self.node, 'image', @@ -131,7 +136,7 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -155,7 +160,7 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -178,7 +183,7 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -200,7 +205,7 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -220,7 +225,7 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -244,7 +249,7 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -266,7 +271,7 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -285,7 +290,7 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -306,9 +311,11 @@ class TestProvisionNode(Base): validate_deploy=True) self.api.node_action.assert_called_once_with(self.node, 'active', configdrive=mock.ANY) - self.api.wait_for_node_state.assert_called_once_with(self.node, - 'active', - timeout=3600) + self.wait_mock.assert_called_once_with(self.pr, + [self.node], + 'active', + delay=15, + timeout=3600) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -320,7 +327,7 @@ class TestProvisionNode(Base): self.assertFalse(self.api.attach_port_to_node.called) self.assertFalse(self.api.update_node.called) self.assertFalse(self.api.node_action.called) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -335,7 +342,7 @@ class TestProvisionNode(Base): self.assertFalse(self.api.attach_port_to_node.called) self.assertFalse(self.api.update_node.called) self.assertFalse(self.api.node_action.called) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) @@ -347,7 +354,7 @@ class TestProvisionNode(Base): wait=3600) self.api.update_node.assert_any_call(self.node, CLEAN_UP) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.api.release_node.assert_called_once_with(self.node) self.api.delete_port.assert_called_once_with( self.api.create_port.return_value.id) @@ -395,7 +402,7 @@ class TestProvisionNode(Base): 'image', [{'network': 'network'}], wait=3600) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.api.release_node.assert_called_once_with(self.node) self.api.delete_port.assert_called_once_with( self.api.create_port.return_value.id) @@ -412,7 +419,7 @@ class TestProvisionNode(Base): 'image', [{'network': 'network'}], wait=3600) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.api.release_node.assert_called_once_with(self.node) self.api.delete_port.assert_called_once_with( self.api.create_port.return_value.id) @@ -420,7 +427,7 @@ class TestProvisionNode(Base): self.node, self.api.create_port.return_value.id) def test_wait_failure(self): - self.api.wait_for_node_state.side_effect = RuntimeError('boom') + self.wait_mock.side_effect = RuntimeError('boom') self.assertRaisesRegex(RuntimeError, 'boom', self.pr.provision_node, self.node, 'image', [{'network': 'network'}], wait=3600) @@ -579,6 +586,13 @@ class TestProvisionNode(Base): class TestUnprovisionNode(Base): + def setUp(self): + super(TestUnprovisionNode, self).setUp() + self.wait_fixture = self.useFixture( + fixtures.MockPatchObject(_provisioner.Provisioner, + '_wait_for_state', autospec=True)) + self.wait_mock = self.wait_fixture.mock + def test_ok(self): self.node.extra['metalsmith_created_ports'] = ['port1'] result = self.pr.unprovision_node(self.node) @@ -589,7 +603,7 @@ class TestUnprovisionNode(Base): 'port1') self.api.node_action.assert_called_once_with(self.node, 'deleted') self.api.release_node.assert_called_once_with(self.node) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.api.update_node.assert_called_once_with(self.node, CLEAN_UP) def test_with_attached(self): @@ -602,7 +616,7 @@ class TestUnprovisionNode(Base): self.api.detach_port_from_node.assert_has_calls(calls, any_order=True) self.api.node_action.assert_called_once_with(self.node, 'deleted') self.api.release_node.assert_called_once_with(self.node) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.api.update_node.assert_called_once_with(self.node, CLEAN_UP) def test_with_wait(self): @@ -615,9 +629,10 @@ class TestUnprovisionNode(Base): 'port1') self.api.node_action.assert_called_once_with(self.node, 'deleted') self.api.release_node.assert_called_once_with(self.node) - self.api.wait_for_node_state.assert_called_once_with(self.node, - 'available', - timeout=3600) + self.wait_mock.assert_called_once_with(self.pr, + [self.node], + 'available', + timeout=3600) def test_dry_run(self): self.pr._dry_run = True @@ -628,7 +643,7 @@ class TestUnprovisionNode(Base): self.assertFalse(self.api.release_node.called) self.assertFalse(self.api.delete_port.called) self.assertFalse(self.api.detach_port_from_node.called) - self.assertFalse(self.api.wait_for_node_state.called) + self.assertFalse(self.wait_mock.called) self.assertFalse(self.api.update_node.called) @@ -656,3 +671,94 @@ class TestShowInstance(Base): self.assertIs(result[0].node, self.node) self.assertIs(result[0].uuid, self.node.uuid) self.api.cache_node_list_for_lookup.assert_called_once_with() + + +@mock.patch('time.sleep', autospec=True) +class TestWaitForState(Base): + def test_invalid_timeout(self, mock_sleep): + for invalid in (0, -42): + self.assertRaisesRegex(ValueError, + 'timeout argument must be a positive', + self.pr.wait_for_provisioning, + ['uuid1'], timeout=invalid) + + def test_invalid_delay(self, mock_sleep): + self.assertRaisesRegex(ValueError, + 'delay argument must be a non-negative', + self.pr.wait_for_provisioning, + ['uuid1'], delay=-42) + + def test_success_one_node(self, mock_sleep): + nodes = [ + mock.Mock(spec=_os_api.NODE_FIELDS, provision_state=state) + for state in ('deploying', 'deploy wait', 'deploying', 'active') + ] + self.api.get_node.side_effect = nodes + + result = self.pr.wait_for_provisioning(['uuid1']) + self.assertEqual(nodes[-1:], result) + + mock_sleep.assert_called_with(15) + self.assertEqual(3, mock_sleep.call_count) + + def test_success_several_nodes(self, mock_sleep): + nodes = [ + mock.Mock(spec=_os_api.NODE_FIELDS, provision_state=state) + for state in ('deploying', 'deploy wait', # iteration 1 + 'deploying', 'active', # iteration 2 + 'active') # iteration 3 + ] + self.api.get_node.side_effect = nodes + + result = self.pr.wait_for_provisioning(['uuid1', 'uuid2']) + self.assertEqual(nodes[-2:], result) + + mock_sleep.assert_called_with(15) + self.assertEqual(2, mock_sleep.call_count) + + def test_one_node_failed(self, mock_sleep): + nodes = [ + mock.Mock(spec=_os_api.NODE_FIELDS, provision_state=state) + for state in ('deploying', 'deploy wait', # iteration 1 + 'deploying', 'deploy failed', # iteration 2 + 'active') # iteration 3 + ] + self.api.get_node.side_effect = nodes + + exc = self.assertRaises(exceptions.DeploymentFailure, + self.pr.wait_for_provisioning, + ['uuid1', 'uuid2']) + # The exception contains the failed node + self.assertEqual(exc.nodes, [nodes[-2]]) + + mock_sleep.assert_called_with(15) + self.assertEqual(2, mock_sleep.call_count) + + def test_timeout(self, mock_sleep): + def _fake_get(*args, **kwargs): + while True: + yield mock.Mock(spec=_os_api.NODE_FIELDS, + provision_state='deploying') + + self.api.get_node.side_effect = _fake_get() + + exc = self.assertRaises(exceptions.DeploymentFailure, + self.pr.wait_for_provisioning, + ['uuid1', 'uuid2'], + timeout=0.001) + self.assertEqual(2, len(exc.nodes)) + + mock_sleep.assert_called_with(15) + + def test_custom_delay(self, mock_sleep): + nodes = [ + mock.Mock(spec=_os_api.NODE_FIELDS, provision_state=state) + for state in ('deploying', 'deploy wait', 'deploying', 'active') + ] + self.api.get_node.side_effect = nodes + + result = self.pr.wait_for_provisioning(['uuid1'], delay=1) + self.assertEqual(nodes[-1:], result) + + mock_sleep.assert_called_with(1) + self.assertEqual(3, mock_sleep.call_count)