Bring over the 'url_helper' module from bzr

This module is quite useful for fetching urls in a manner
that supports custom headers, retries and ssl_args,
so let's ensure we have it available for use under these
circumstances.

Also adds some basic sanity tests so that it will continue
to work as expected.

Change-Id: I4d5e777cf530e822bbea69c76846ebebb48a7927
This commit is contained in:
Joshua Harlow 2015-04-02 12:04:50 -07:00
parent 3e391d6032
commit 911ada7bba
6 changed files with 436 additions and 1 deletions

21
cloudinit/test.py Normal file
View File

@ -0,0 +1,21 @@
# Copyright 2015 Canonical Ltd.
# This file is part of cloud-init. See LICENCE file for license information.
#
# vi: ts=4 expandtab
import httpretty
import testtools
class TestCase(testtools.TestCase):
    """Base class shared by all cloud-init test cases.

    Each test runs with real network access disabled, so any code path
    that tries to reach the network fails loudly instead of silently
    depending on an external service.
    """

    def setUp(self):
        super(TestCase, self).setUp()
        # Refuse any connection that httpretty has not mocked out.
        httpretty.HTTPretty.allow_net_connect = False

    def tearDown(self):
        super(TestCase, self).tearDown()
        # Re-enable real connections once the test has finished.
        httpretty.HTTPretty.allow_net_connect = True

View File

@ -0,0 +1,102 @@
# Copyright 2015 Canonical Ltd.
# This file is part of cloud-init. See LICENCE file for license information.
#
# vi: ts=4 expandtab
import httpretty
from cloudinit import test
from cloudinit import url_helper
class UrlHelperWaitForUrlsTest(test.TestCase):
    """Tests for ``url_helper.wait_any_url``."""

    @httpretty.activate
    def test_url_wait_for(self):
        # Each url gets a scripted sequence of outcomes: True means a
        # successful 200 response, False means a 400 failure.
        scripted = [
            ("http://www.yahoo.com", (False, False, True)),
            ("http://www.google.com", (False, False, False)),
        ]
        urls = [scripted_url for (scripted_url, _outcomes) in scripted]
        for (scripted_url, outcomes) in scripted:
            for succeeded in outcomes:
                if succeeded:
                    httpretty.register_uri(httpretty.GET,
                                           scripted_url,
                                           body=b'it worked!')
                else:
                    httpretty.register_uri(httpretty.GET,
                                           scripted_url,
                                           body=b'no worky',
                                           status=400)
        found = url_helper.wait_any_url(urls)
        self.assertEqual("http://www.yahoo.com", found)

    @httpretty.activate
    def test_url_wait_for_no_work(self):
        def always_fail(request, uri, headers):
            # Every request fails, so waiting can never succeed.
            return (400, headers, b"no worky")

        urls = [
            "http://www.yahoo.com",
            "http://www.google.com",
        ]
        for failing_url in urls:
            httpretty.register_uri(httpretty.GET,
                                   failing_url, body=always_fail)
        self.assertIsNone(url_helper.wait_any_url(urls, max_wait=1))
class UrlHelperFetchTest(test.TestCase):
    """Tests for ``url_helper.read_url``."""

    @httpretty.activate
    def test_url_fetch(self):
        # A single successful GET returns the body and an OK status.
        httpretty.register_uri(httpretty.GET,
                               "http://www.yahoo.com",
                               body=b'it worked!')
        resp = url_helper.read_url("http://www.yahoo.com")
        self.assertEqual(b"it worked!", resp.contents)
        self.assertEqual(url_helper.OK, resp.status_code)

    @httpretty.activate
    def test_retry_url_fetch(self):
        # The first attempt fails with a 400; the retry then succeeds.
        scripted_responses = [
            httpretty.Response(body=b"no worky", status=400),
            httpretty.Response(body=b"it worked!", status=200),
        ]
        httpretty.register_uri(httpretty.GET,
                               "http://www.yahoo.com",
                               responses=scripted_responses)
        resp = url_helper.read_url("http://www.yahoo.com", retries=2)
        self.assertEqual(b"it worked!", resp.contents)
        self.assertEqual(url_helper.OK, resp.status_code)

    @httpretty.activate
    def test_failed_url_fetch(self):
        # A 400 response with no retries raises UrlError.
        httpretty.register_uri(httpretty.GET,
                               "http://www.yahoo.com",
                               body=b'no worky', status=400)
        self.assertRaises(url_helper.UrlError,
                          url_helper.read_url, "http://www.yahoo.com")

    @httpretty.activate
    def test_failed_retry_url_fetch(self):
        # Every attempt (original plus both retries) fails, so the
        # fetch must raise UrlError.
        failures = [httpretty.Response(body=b"no worky", status=400)
                    for _ in range(3)]
        httpretty.register_uri(httpretty.GET,
                               "http://www.yahoo.com",
                               responses=failures)
        self.assertRaises(url_helper.UrlError,
                          url_helper.read_url, "http://www.yahoo.com",
                          retries=2)

297
cloudinit/url_helper.py Normal file
View File

@ -0,0 +1,297 @@
# Copyright 2015 Canonical Ltd.
# This file is part of cloud-init. See LICENCE file for license information.
#
# vi: ts=4 expandtab
import logging
import time
try:
from time import monotonic as now
except ImportError:
from time import time as now
import requests
from requests import adapters
from requests import exceptions
from requests import structures
# Arg, why does requests vendorize urllib3....
from requests.packages.urllib3 import util as urllib3_util
from six.moves.urllib.parse import quote as urlquote # noqa
from six.moves.urllib.parse import urlparse # noqa
from six.moves.urllib.parse import urlunparse # noqa
from six.moves.http_client import BAD_REQUEST as _BAD_REQUEST
from six.moves.http_client import MULTIPLE_CHOICES as _MULTIPLE_CHOICES
from six.moves.http_client import OK
from cloudinit import version
SSL_ENABLED = True
try:
import ssl as _ssl # noqa
except ImportError:
SSL_ENABLED = False
LOG = logging.getLogger(__name__)
def _get_base_url(url):
parsed_url = list(urlparse(url, scheme='http'))
parsed_url[2] = parsed_url[3] = parsed_url[4] = parsed_url[5] = ''
return urlunparse(parsed_url)
def _clean_url(url):
parsed_url = list(urlparse(url, scheme='http'))
if not parsed_url[1] and parsed_url[2]:
# Swap these since this seems to be a common
# occurrence when given urls like 'www.google.com'
parsed_url[1] = parsed_url[2]
parsed_url[2] = ''
return urlunparse(parsed_url)
class _Retry(urllib3_util.Retry):
    """Retry policy that forces retries on 4xx/5xx and logs its sleeps."""

    def is_forced_retry(self, method, status_code):
        # Any status of 400 or above is considered retryable.
        return status_code >= _BAD_REQUEST

    def sleep(self):
        # Re-implemented purely so the backoff delay is visible in the
        # logs before we actually sleep; the base class sleeps silently.
        delay = self.get_backoff_time()
        if delay > 0:
            LOG.debug("Please wait %s seconds while we wait to try again",
                      delay)
            time.sleep(delay)
class RequestsResponse(object):
    """Wrap a ``requests`` response behind a small common interface.

    This exists so that things like StringResponse or FileResponse can
    also exist, but with a different source for their response (aka not
    just from the requests library).
    """

    def __init__(self, response):
        self._response = response

    @property
    def contents(self):
        # Raw body bytes of the response.
        return self._response.content

    @property
    def headers(self):
        return self._response.headers

    @property
    def status_code(self):
        return self._response.status_code

    @property
    def url(self):
        return self._response.url

    def ok(self, redirects_ok=False):
        """Return True when the status code indicates success.

        With ``redirects_ok`` set, 3xx redirect codes also count.
        """
        limit = _BAD_REQUEST if redirects_ok else _MULTIPLE_CHOICES
        return OK <= self.status_code < limit

    def __str__(self):
        return self._response.text
class UrlError(IOError):
    """Raised when fetching a url fails.

    Carries the originating exception plus, when known, the HTTP status
    code and response headers of the failed request.
    """

    def __init__(self, cause, code=None, headers=None):
        # The IOError message is the stringified underlying cause.
        super(UrlError, self).__init__(str(cause))
        self.headers = headers or {}
        self.status_code = code
        self.cause = cause
def _get_ssl_args(url, ssl_details):
ssl_args = {}
scheme = urlparse(url).scheme
if scheme == 'https' and ssl_details:
if not SSL_ENABLED:
LOG.warn("SSL is not supported, "
"cert. verification can not occur!")
else:
if 'ca_certs' in ssl_details and ssl_details['ca_certs']:
ssl_args['verify'] = ssl_details['ca_certs']
else:
ssl_args['verify'] = True
if 'cert_file' in ssl_details and 'key_file' in ssl_details:
ssl_args['cert'] = [ssl_details['cert_file'],
ssl_details['key_file']]
elif 'cert_file' in ssl_details:
ssl_args['cert'] = str(ssl_details['cert_file'])
return ssl_args
def read_url(url, data=None, timeout=None, retries=0,
             headers=None, ssl_details=None,
             check_status=True, allow_redirects=True):
    """Fetch a url (or post to one) with the given options.

    url: url to fetch
    data: any data to POST (this switches the request method to POST
          instead of GET)
    timeout: the timeout (in seconds) to wait for a response
    headers: any headers to provide (and send along) in the request
    ssl_details: a dictionary containing any ssl settings, cert_file,
                 ca_certs and verify are valid entries (and they are only
                 used when the url provided is https)
    check_status: checks that the response status is OK after fetching (this
                  ensures a exception is raised on non-OK status codes)
    allow_redirects: enables redirects (or disables them)
    retries: maximum number of retries to attempt when fetching the url and
             the fetch fails

    Returns a RequestsResponse wrapping the response; raises UrlError on
    any request failure (the error keeps the status code and headers of
    the failed response, when one was received).
    """
    url = _clean_url(url)
    request_args = {
        'url': url,
    }
    request_args.update(_get_ssl_args(url, ssl_details))
    request_args['allow_redirects'] = allow_redirects
    request_args['method'] = 'GET'
    if timeout is not None:
        # Negative timeouts make no sense; clamp to zero.
        request_args['timeout'] = max(float(timeout), 0)
    if data:
        # Providing a body switches the request into a POST.
        request_args['method'] = 'POST'
        request_args['data'] = data
    # Case-insensitive headers so the User-Agent check below matches
    # regardless of the caller's capitalization.
    if not headers:
        headers = structures.CaseInsensitiveDict()
    else:
        headers = structures.CaseInsensitiveDict(headers)
    if 'User-Agent' not in headers:
        # Identify ourselves unless the caller supplied an agent string.
        headers['User-Agent'] = 'Cloud-Init/%s' % (version.version_string())
    request_args['headers'] = headers
    session = requests.Session()
    if retries:
        # Mount a retrying adapter for this url's scheme+host so that
        # failing requests (status >= 400, see _Retry) are retried with
        # backoff before giving up.
        retry = _Retry(total=max(int(retries), 0),
                       raise_on_redirect=not allow_redirects)
        session.mount(_get_base_url(url),
                      adapters.HTTPAdapter(max_retries=retry))
    try:
        with session:
            response = session.request(**request_args)
            if check_status:
                # Raises requests.exceptions.HTTPError on non-OK codes,
                # converted to UrlError below.
                response.raise_for_status()
    except exceptions.RequestException as e:
        if e.response is not None:
            # Preserve the status and headers from the failed response.
            raise UrlError(e, code=e.response.status_code,
                           headers=e.response.headers)
        else:
            raise UrlError(e)
    else:
        LOG.debug("Read from %s (%s, %sb)", url, response.status_code,
                  len(response.content))
        return RequestsResponse(response)
def wait_any_url(urls, max_wait=None, timeout=None,
                 status_cb=None, sleep_time=1,
                 exception_cb=None):
    """Wait for one of many urls to respond correctly.

    urls: a list of urls to try
    max_wait: roughly the maximum time to wait before giving up (when
              None or <= 0, each url is tried only once)
    timeout: the timeout provided to ``read_url``
    status_cb: call method with string message when a url is not available
    exception_cb: call method with 2 arguments 'msg' (per status_cb) and
                  'exception', the exception that occurred.
    sleep_time: how long to sleep before trying each url again (NOTE: the
                backoff computation below currently overrides this value
                each loop iteration)

    The idea of this routine is to wait for the EC2 metadata service to
    come up.  On both Eucalyptus and EC2 we have seen the case where
    the instance hit the MD before the MD service was up.  EC2 seems
    to have permanently fixed this, though.

    In openstack, the metadata service might be painfully slow, and
    unable to avoid hitting a timeout of even up to 10 seconds or more
    (LP: #894279) for a simple GET.

    Offset those needs with the need to not hang forever (and block boot)
    on a system where cloud-init is configured to look for EC2 Metadata
    service but is not going to find one.  It is possible that the instance
    data host (169.254.169.254) may be firewalled off entirely for a system,
    meaning that the connection will block forever unless a timeout is set.

    Returns the first url that responded correctly, or None if no url
    responded in time.
    """
    start_time = now()

    def log_status_cb(msg, exc=None):
        LOG.debug(msg)

    if not status_cb:
        status_cb = log_status_cb

    def timeup(max_wait, start_time):
        # Fixed: check for None *before* comparing; 'None <= 0' raises
        # TypeError on python 3.
        if max_wait is None or max_wait <= 0:
            return True
        return (now() - start_time) > max_wait

    loop_n = 0
    while True:
        # This makes a backoff with the following graph:
        #
        # https://www.desmos.com/calculator/c8pwjy6wmt
        sleep_time = int(loop_n / 5) + 1
        for url in urls:
            current_time = now()
            if loop_n != 0:
                if timeup(max_wait, start_time):
                    break
                # Fixed: guard on max_wait so a None max_wait does not
                # TypeError in the arithmetic below.
                if (timeout and max_wait is not None and
                        (current_time + timeout > (start_time + max_wait))):
                    # shorten timeout to not run way over max_wait
                    timeout = int((start_time + max_wait) - current_time)
            reason = ""
            url_exc = None
            try:
                response = read_url(url, timeout=timeout, check_status=False)
                if not response.contents:
                    # Fixed: RequestsResponse exposes 'status_code', not
                    # 'code'; the old attribute raised AttributeError and
                    # misreported these cases as "unexpected error".
                    reason = "empty response [%s]" % (response.status_code)
                    url_exc = UrlError(ValueError(reason),
                                       code=response.status_code,
                                       headers=response.headers)
                elif not response.ok():
                    reason = "bad status code [%s]" % (response.status_code)
                    url_exc = UrlError(ValueError(reason),
                                       code=response.status_code,
                                       headers=response.headers)
                else:
                    return url
            except UrlError as e:
                reason = "request error [%s]" % e
                url_exc = e
            except Exception as e:
                reason = "unexpected error [%s]" % e
                url_exc = e
            current_time = now()
            time_taken = int(current_time - start_time)
            status_msg = "Calling '%s' failed [%s/%ss]: %s" % (url,
                                                               time_taken,
                                                               max_wait,
                                                               reason)
            status_cb(status_msg)
            if exception_cb:
                exception_cb(msg=status_msg, exception=url_exc)
        if timeup(max_wait, start_time):
            break
        loop_n = loop_n + 1
        LOG.debug("Please wait %s seconds while we wait to try again",
                  sleep_time)
        time.sleep(sleep_time)
    return None

14
cloudinit/version.py Normal file
View File

@ -0,0 +1,14 @@
# Copyright 2015 Canonical Ltd.
# This file is part of cloud-init. See LICENCE file for license information.
#
# vi: ts=4 expandtab
import pkg_resources
try:
    # Prefer pbr-generated version metadata when pbr is installed.
    from pbr import version as pbr_version
    _version_info = pbr_version.VersionInfo('cloudinit')
    version_string = _version_info.version_string
except ImportError:
    # Fall back to the setuptools/pkg_resources distribution metadata.
    _version_info = pkg_resources.get_distribution('cloudinit')

    def version_string():
        """Return the installed cloudinit version as a string."""
        # Fixed: assigning a lambda to a name (PEP 8 E731) replaced
        # with a proper def; behavior is unchanged.
        return _version_info.version

View File

@ -6,5 +6,5 @@ pbr>=0.6,!=0.7,<1.0
six>=1.7.0
pyyaml
jsonpatch
requests
requests>=1.0
requests-oauthlib

View File

@ -5,6 +5,7 @@
httpretty>=0.7.1
mock
nose
testtools
# For doc building
sphinx>=1.1.2,!=1.2.0,!=1.3b1,<1.3