From 8dc3863e102828ce8bf3111d24ad74acd2dfd405 Mon Sep 17 00:00:00 2001 From: Alyson Rosa Date: Wed, 2 Sep 2015 13:28:30 -0300 Subject: [PATCH] Adds retry function to HNAS driver Multiple requests sent to HNAS can cause concurrency problems and that ends up with 'SSC failed connection' errors. This patch adds a retry decorator to _execute in HDS HNAS Manila driver to fix this problem. The current retry functionality in Manila always uses fixed numbers to define the time to wait before performing the next attempts. This behavior can make the retries of multiple requests to collide in each attempt as they will wait the same amount of time and try to use the same resource together again. So additionally, this patch changes the behavior of manila.utils.retry() to receive a parameter that allows the function to implement randomly generated wait intervals. Change-Id: Ib862f62517fcc8816781204b902119e9b20121e0 Closes-bug: 1491550 --- manila/exception.py | 4 ++ manila/share/drivers/hitachi/ssh.py | 21 ++++++--- .../tests/share/drivers/hitachi/test_ssh.py | 30 ++++++++++++- manila/tests/test_utils.py | 43 +++++++++++++++++++ manila/utils.py | 39 ++++++++++------- 5 files changed, 113 insertions(+), 24 deletions(-) diff --git a/manila/exception.py b/manila/exception.py index 24091baa05..c74894f82f 100644 --- a/manila/exception.py +++ b/manila/exception.py @@ -659,6 +659,10 @@ class HNASBackendException(ManilaException): message = _("HNAS Backend Exception: %(msg)s") +class HNASConnException(ManilaException): + message = _("HNAS Connection Exception: %(msg)s") + + # ConsistencyGroup class ConsistencyGroupNotFound(NotFound): message = _("ConsistencyGroup %(consistency_group_id)s could not be " diff --git a/manila/share/drivers/hitachi/ssh.py b/manila/share/drivers/hitachi/ssh.py index a8ab4a8402..45178a0293 100644 --- a/manila/share/drivers/hitachi/ssh.py +++ b/manila/share/drivers/hitachi/ssh.py @@ -414,6 +414,7 @@ class HNASSSHBackend(object): msg = (_("Share %s was not created.") % share['id']) raise exception.HNASBackendException(msg=msg) + @mutils.retry(exception=exception.HNASConnException, wait_random=True) def _execute(self, commands): command = ['ssc', '127.0.0.1'] if self.admin_ip0 is not None: @@ -442,13 +443,19 @@ class HNASSSHBackend(object): 'out': out, 'err': err}) return out, err except processutils.ProcessExecutionError as e: - LOG.debug("Command %(cmd)s result: out = %(out)s - err = " - "%(err)s - exit = %(exit)s.", {'cmd': e.cmd, - 'out': e.stdout, - 'err': e.stderr, - 'exit': e.exit_code}) - LOG.error(_LE("Error running SSH command.")) - raise + if 'Failed to establish SSC connection' in e.stderr: + LOG.debug("SSC connection error!") + msg = _("Failed to establish SSC connection.") + raise exception.HNASConnException(msg=msg) + else: + LOG.debug("Command %(cmd)s result: out = %(out)s - err = " + "%(err)s - exit = %(exit)s.", {'cmd': e.cmd, + 'out': e.stdout, + 'err': e.stderr, + 'exit': + e.exit_code}) + LOG.error(_LE("Error running SSH command.")) + raise def _check_fs_mounted(self, fs_name): self._check_fs() diff --git a/manila/tests/share/drivers/hitachi/test_ssh.py b/manila/tests/share/drivers/hitachi/test_ssh.py index 98bcbcfab3..f2809097a2 100644 --- a/manila/tests/share/drivers/hitachi/test_ssh.py +++ b/manila/tests/share/drivers/hitachi/test_ssh.py @@ -23,6 +23,7 @@ import paramiko from manila import exception from manila.share.drivers.hitachi import ssh from manila import test +from manila import utils CONF = cfg.CONF @@ -69,7 +70,6 @@ file_system 1055 fake_span Umount 2 4 5 file_system2 1050 fake_span2 NoEVS - 10 0 1 fake_fs 1051 fake_span Umount 2 100 1024 """ - HNAS_RESULT_one_fs = """ \ Instance name Dev On span State EVS Cap/GiB Confined Flag ----------------- ---- ------- ------ --- ------- -------- ---- @@ -1170,6 +1170,31 @@ class HNASSSHTestCase(test.TestCase): port=self.port) self.assertIn('Request submitted successfully.', output) + def test__execute_retry(self): + commands = ['tree-clone-job-submit', '-e', '/src', '/dst'] + concat_command = ('ssc --smuauth fake console-context --evs 2 ' + 'tree-clone-job-submit -e /src /dst') + msg = 'Failed to establish SSC connection' + + item_mock = mock.Mock() + self.mock_object(utils.pools.Pool, 'item', + mock.Mock(return_value=item_mock)) + setattr(item_mock, '__enter__', mock.Mock()) + setattr(item_mock, '__exit__', mock.Mock()) + + self.mock_object(paramiko.SSHClient, 'connect') + # testing retrying 3 times + self.mock_object(putils, 'ssh_execute', mock.Mock( + side_effect=[putils.ProcessExecutionError(stderr=msg), + putils.ProcessExecutionError(stderr=msg), + putils.ProcessExecutionError(stderr=msg), + (HNAS_RESULT_job, '')])) + + self._driver._execute(commands) + + putils.ssh_execute.assert_called_with(mock.ANY, concat_command, + check_exit_code=True) + def test__execute_ssh_exception(self): key = self.ssh_private_key commands = ['tree-clone-job-submit', '-e', '/src', '/dst'] @@ -1177,7 +1202,8 @@ class HNASSSHTestCase(test.TestCase): 'tree-clone-job-submit -e /src /dst') self.mock_object(paramiko.SSHClient, 'connect') self.mock_object(putils, 'ssh_execute', - mock.Mock(side_effect=putils.ProcessExecutionError)) + mock.Mock(side_effect=putils.ProcessExecutionError + (stderr='Error'))) self.assertRaises(putils.ProcessExecutionError, self._driver._execute, commands) diff --git a/manila/tests/test_utils.py b/manila/tests/test_utils.py index 5756d8accd..5db267977b 100644 --- a/manila/tests/test_utils.py +++ b/manila/tests/test_utils.py @@ -694,6 +694,49 @@ class TestRetryDecorator(test.TestCase): self.assertEqual('success', ret) self.assertEqual(1, self.counter) + def test_no_retry_required_random(self): + self.counter = 0 + + with mock.patch.object(time, 'sleep') as mock_sleep: + @utils.retry(exception.ManilaException, + interval=2, + retries=3, + backoff_rate=2, + wait_random=True) + def succeeds(): + self.counter += 1 + return 'success' + + ret = succeeds() + self.assertFalse(mock_sleep.called) + self.assertEqual('success', ret) + self.assertEqual(1, self.counter) + + def test_retries_once_random(self): + self.counter = 0 + interval = 2 + backoff_rate = 2 + retries = 3 + + with mock.patch.object(time, 'sleep') as mock_sleep: + @utils.retry(exception.ManilaException, + interval, + retries, + backoff_rate, + wait_random=True) + def fails_once(): + self.counter += 1 + if self.counter < 2: + raise exception.ManilaException(data='fake') + else: + return 'success' + + ret = fails_once() + self.assertEqual('success', ret) + self.assertEqual(2, self.counter) + self.assertEqual(1, mock_sleep.call_count) + self.assertTrue(mock_sleep.called) + def test_retries_once(self): self.counter = 0 interval = 2 diff --git a/manila/utils.py b/manila/utils.py index e736573700..f729991371 100644 --- a/manila/utils.py +++ b/manila/utils.py @@ -22,6 +22,7 @@ import errno import inspect import os import pyclbr +import random import re import shutil import socket @@ -540,23 +541,25 @@ class ComparableMixin(object): return self._compare(other, lambda s, o: s != o) -def retry(exception, interval=1, retries=10, backoff_rate=2): +def retry(exception, interval=1, retries=10, backoff_rate=2, + wait_random=False): """A wrapper around retrying library. - This decorator allows to log and to check 'retries' input param. - Time interval between retries is calculated in the following way: - interval * backoff_rate ^ previous_attempt_number + This decorator allows to log and to check 'retries' input param. + Time interval between retries is calculated in the following way: + interval * backoff_rate ^ previous_attempt_number - :param exception: expected exception type. When wrapped function - raises an exception of this type,the function - execution is retried. - :param interval: param 'interval' is used to calculate time interval - between retries: + :param exception: expected exception type. When wrapped function + raises an exception of this type, the function + execution is retried. + :param interval: param 'interval' is used to calculate time interval + between retries: + interval * backoff_rate ^ previous_attempt_number + :param retries: number of retries. + :param backoff_rate: param 'backoff_rate' is used to calculate time + interval between retries: interval * backoff_rate ^ previous_attempt_number - :param retries: number of retries - :param backoff_rate: param 'backoff_rate' is used to calculate time - interval between retries: - interval * backoff_rate ^ previous_attempt_number + :param wait_random: boolean value to enable retry with random wait timer. """ def _retry_on_exception(e): @@ -565,8 +568,14 @@ def retry(exception, interval=1, retries=10, backoff_rate=2): def _backoff_sleep(previous_attempt_number, delay_since_first_attempt_ms): exp = backoff_rate ** previous_attempt_number wait_for = max(0, interval * exp) - LOG.debug("Sleeping for %s seconds", wait_for) - return wait_for * 1000.0 + + if wait_random: + wait_val = random.randrange(interval * 1000.0, wait_for * 1000.0) + else: + wait_val = wait_for * 1000.0 + + LOG.debug("Sleeping for %s seconds.", (wait_val / 1000.0)) + return wait_val def _print_stop(previous_attempt_number, delay_since_first_attempt_ms): delay_since_first_attempt = delay_since_first_attempt_ms / 1000.0