Document misc and abnf_regexp submodules

Add UseExisting and begin using it in the API. Also rename some now public attributes in rfc3986.abnf_regexp. Refs #24
2017-05-06 20:18:38 -05:00 · 2017-05-06 20:18:38 -05:00 · 18a689de0d
parent 6b79edcb72
commit 18a689de0d
5 changed files with 257 additions and 20 deletions
--- a/docs/source/api-ref/index.rst
+++ b/docs/source/api-ref/index.rst
@ -13,3 +13,4 @@ can be utilized, please see :ref:`narrative` instead.
    builder
    uri
    validators
+    miscellaneous
--- a/docs/source/api-ref/miscellaneous.rst
+++ b/docs/source/api-ref/miscellaneous.rst
@ -0,0 +1,231 @@
+==========================
+ Miscellaneous Submodules
+==========================
+
+There are several submodules in |rfc3986| that are not meant to be exposed to 
+users directly but which are valuable to document, regardless.
+
+.. data:: rfc3986.misc.UseExisting
+
+    A sentinel object to make certain APIs simpler for users.
+
+.. module:: rfc3986.abnf_regexp
+
+The :mod:`rfc3986.abnf_regexp` module contains the regular expressions written
+from the RFC's ABNF. The :mod:`rfc3986.misc` module contains compiled versions
+of the expressions in :mod:`rfc3986.abnf_regexp`; it previously defined those
+regular expressions itself.
+
+.. data:: rfc3986.abnf_regexp.GEN_DELIMS
+.. data:: rfc3986.abnf_regexp.GENERIC_DELIMITERS
+
+    The string containing all of the generic delimiters as defined on
+    `page 13 <https://tools.ietf.org/html/rfc3986#page-13>`__.
+
+.. data:: rfc3986.abnf_regexp.GENERIC_DELIMITERS_SET
+
+    :data:`rfc3986.abnf_regexp.GEN_DELIMS` represented as a :class:`set`.
+
+.. data:: rfc3986.abnf_regexp.SUB_DELIMS
+.. data:: rfc3986.abnf_regexp.SUB_DELIMITERS
+
+    The string containing all of the 'sub' delimiters as defined on
+    `page 13 <https://tools.ietf.org/html/rfc3986#page-13>`__.
+
+.. data:: rfc3986.abnf_regexp.SUB_DELIMITERS_SET
+
+    :data:`rfc3986.abnf_regexp.SUB_DELIMS` represented as a :class:`set`.
+
+.. data:: rfc3986.abnf_regexp.SUB_DELIMITERS_RE
+
+    :data:`rfc3986.abnf_regexp.SUB_DELIMS` with the ``*`` escaped for use in
+    regular expressions.
+
+.. data:: rfc3986.abnf_regexp.RESERVED_CHARS_SET
+
+    A :class:`set` constructed of :data:`GEN_DELIMS` and :data:`SUB_DELIMS`.
+    This union is defined on `page 13
+    <https://tools.ietf.org/html/rfc3986#page-13>`__.
+
+.. data:: rfc3986.abnf_regexp.ALPHA
+
+    The string of upper- and lower-case letters in USASCII.
+
+.. data:: rfc3986.abnf_regexp.DIGIT
+
+    The string of digits 0 through 9.
+
+.. data:: rfc3986.abnf_regexp.UNRESERVED
+.. data:: rfc3986.abnf_regexp.UNRESERVED_CHARS
+
+    The string of unreserved characters defined in :rfc:`3986#section-2.3`.
+
+.. data:: rfc3986.abnf_regexp.UNRESERVED_CHARS_SET
+
+    :data:`rfc3986.abnf_regexp.UNRESERVED_CHARS` represented as a
+    :class:`set`.
+
+.. data:: rfc3986.abnf_regexp.NON_PCT_ENCODED_SET
+
+    The non-percent encoded characters represented as a :class:`set`.
+
+.. data:: rfc3986.abnf_regexp.UNRESERVED_RE
+
+    Optimized regular expression for unreserved characters.
+
+.. data:: rfc3986.abnf_regexp.SCHEME_RE
+
+    Stricter regular expression to match and validate the scheme part
+    of a URI.
+
+.. data:: rfc3986.abnf_regexp.COMPONENT_PATTERN_DICT
+
+    Dictionary with regular expressions to match various components in
+    a URI. Except for :data:`rfc3986.abnf_regexp.SCHEME_RE`, all patterns
+    are from :rfc:`3986#appendix-B`.
+
+.. data:: rfc3986.abnf_regexp.URL_PARSING_RE
+
+    Regular expression composed from the components in
+    :data:`rfc3986.abnf_regexp.COMPONENT_PATTERN_DICT`.
+
+.. data:: rfc3986.abnf_regexp.HEXDIG_RE
+
+    Hexadecimal characters used in each piece of an IPv6 address.
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.LS32_RE
+
+    Least significant 32 bits of an IPv6 address.
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.REG_NAME
+.. data:: rfc3986.abnf_regexp.REGULAR_NAME_RE
+
+    The pattern for a regular name, e.g., ``www.google.com``,
+    ``api.github.com``.
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.IPv4_RE
+
+    The pattern for an IPv4 address, e.g., ``192.168.255.255``.
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.IPv6_RE
+
+    The pattern for an IPv6 address, e.g., ``::1``.
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.IPv_FUTURE_RE
+
+    A regular expression to parse out IPv Futures.
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.IP_LITERAL_RE
+
+    Pattern to match IPv6 addresses and IPv Future addresses.
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.HOST_RE
+.. data:: rfc3986.abnf_regexp.HOST_PATTERN
+
+    Pattern to match and validate the host piece of an authority.
+    This is composed of
+
+    - :data:`rfc3986.abnf_regexp.REG_NAME`
+    - :data:`rfc3986.abnf_regexp.IPv4_RE`
+    - :data:`rfc3986.abnf_regexp.IP_LITERAL_RE`
+
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.USERINFO_RE
+
+    Pattern to match and validate the user information portion of
+    an authority component.
+
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.PORT_RE
+
+    Pattern to match and validate the port portion of an authority
+    component.
+
+    See :rfc:`3986#section-3.2.2`.
+
+.. data:: rfc3986.abnf_regexp.PCT_ENCODED
+.. data:: rfc3986.abnf_regexp.PERCENT_ENCODED
+
+    Regular expression to match percent encoded character values.
+
+.. data:: rfc3986.abnf_regexp.PCHAR
+
+    Regular expression to match path characters (``pchar``).
+
+.. data:: rfc3986.abnf_regexp.PATH_RE
+
+    Regular expression to match and validate the path component of a URI.
+
+    See :rfc:`3986#section-3.3`.
+
+.. data:: rfc3986.abnf_regexp.PATH_EMPTY
+.. data:: rfc3986.abnf_regexp.PATH_ROOTLESS
+.. data:: rfc3986.abnf_regexp.PATH_NOSCHEME
+.. data:: rfc3986.abnf_regexp.PATH_ABSOLUTE
+.. data:: rfc3986.abnf_regexp.PATH_ABEMPTY
+
+    Components of the :data:`rfc3986.abnf_regexp.PATH_RE`.
+
+    See :rfc:`3986#section-3.3`.
+
+.. data:: rfc3986.abnf_regexp.QUERY_RE
+
+    Regular expression to parse and validate the query component of a URI.
+
+.. data:: rfc3986.abnf_regexp.FRAGMENT_RE
+
+    Regular expression to parse and validate the fragment component of a URI.
+
+.. data:: rfc3986.abnf_regexp.RELATIVE_PART_RE
+
+    Regular expression to parse the relative URI when resolving URIs.
+
+.. data:: rfc3986.abnf_regexp.HIER_PART_RE
+
+    The hierarchical part of a URI. This regular expression is used when
+    resolving relative URIs.
+
+    See :rfc:`3986#section-3`.
+
+.. module:: rfc3986.misc
+
+.. data:: rfc3986.misc.URI_MATCHER
+
+    Compiled version of :data:`rfc3986.abnf_regexp.URL_PARSING_RE`.
+
+.. data:: rfc3986.misc.SUBAUTHORITY_MATCHER
+
+    Compiled combination of :data:`rfc3986.abnf_regexp.USERINFO_RE`,
+    :data:`rfc3986.abnf_regexp.HOST_PATTERN`,
+    :data:`rfc3986.abnf_regexp.PORT_RE`.
+
+.. data:: rfc3986.misc.SCHEME_MATCHER
+
+    Compiled version of :data:`rfc3986.abnf_regexp.SCHEME_RE`.
+
+.. data:: rfc3986.misc.IPv4_MATCHER
+
+    Compiled version of :data:`rfc3986.abnf_regexp.IPv4_RE`.
+
+.. data:: rfc3986.misc.PATH_MATCHER
+
+    Compiled version of :data:`rfc3986.abnf_regexp.PATH_RE`.
+
+.. data:: rfc3986.misc.QUERY_MATCHER
+
+    Compiled version of :data:`rfc3986.abnf_regexp.QUERY_RE`.
+
+.. data:: rfc3986.misc.RELATIVE_REF_MATCHER
+
+    Compiled combination of :data:`rfc3986.abnf_regexp.SCHEME_RE`,
+    :data:`rfc3986.abnf_regexp.HIER_PART_RE`,
+    :data:`rfc3986.abnf_regexp.QUERY_RE`.
--- a/src/rfc3986/abnf_regexp.py
+++ b/src/rfc3986/abnf_regexp.py
@ -20,7 +20,7 @@ GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
 SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
 SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
 # Escape the '*' for use in regular expressions
-RE_SUB_DELIMITERS = "!$&'()\*+,;="
+SUB_DELIMITERS_RE = "!$&'()\*+,;="
 RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
 ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
 DIGIT = '0123456789'
@ -29,24 +29,24 @@ UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + '._!-'
 UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
 NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET).union('%')
 # We need to escape the '-' in this case:
-RE_UNRESERVED = 'A-Za-z0-9._~\-'
+UNRESERVED_RE = 'A-Za-z0-9._~\-'

 # NOTE(sigmavirus24): We're going to use more strict regular expressions
 # than appear in Appendix B for scheme. This will prevent over-eager
 # consuming of items that aren't schemes.
 SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*'
-AUTHORITY_RE = '[^/?#]*'
-PATH_RE = '[^?#]*'
-QUERY_RE = '[^#]*'
-FRAGMENT_RE = '.*'
+_AUTHORITY_RE = '[^/?#]*'
+_PATH_RE = '[^?#]*'
+_QUERY_RE = '[^#]*'
+_FRAGMENT_RE = '.*'

 # Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
 COMPONENT_PATTERN_DICT = {
    'scheme': SCHEME_RE,
-    'authority': AUTHORITY_RE,
-    'path': PATH_RE,
-    'query': QUERY_RE,
-    'fragment': FRAGMENT_RE,
+    'authority': _AUTHORITY_RE,
+    'path': _PATH_RE,
+    'query': _QUERY_RE,
+    'fragment': _FRAGMENT_RE,
 }

 # See http://tools.ietf.org/html/rfc3986#appendix-B
@ -68,7 +68,7 @@ URL_PARSING_RE = (
 # Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
 # The pattern for a regular name, e.g.,  www.google.com, api.github.com
 REGULAR_NAME_RE = REG_NAME = '(({0})*|[{1}]*)'.format(
-    '%[0-9A-Fa-f]{2}', RE_SUB_DELIMITERS + RE_UNRESERVED
+    '%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + UNRESERVED_RE
 )
 # The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
 IPv4_RE = '([0-9]{1,3}.){3}[0-9]{1,3}'
@ -108,7 +108,7 @@ IPv6_RE = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7}))'.format(
 )

 IPv_FUTURE_RE = 'v[0-9A-Fa-f]+.[%s]+' % (
-    RE_UNRESERVED + RE_SUB_DELIMITERS + ':'
+    UNRESERVED_RE + SUB_DELIMITERS_RE + ':'
 )

 IP_LITERAL_RE = '\[({0}|{1})\]'.format(IPv6_RE, IPv_FUTURE_RE)
@ -131,7 +131,7 @@ PORT_RE = '[0-9]{1,5}'

 # Percent encoded character values
 PERCENT_ENCODED = PCT_ENCODED = '%[A-Fa-f0-9]{2}'
-PCHAR = '([' + RE_UNRESERVED + RE_SUB_DELIMITERS + ':@]|%s)' % PCT_ENCODED
+PCHAR = '([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':@]|%s)' % PCT_ENCODED
 segments = {
    'segment': PCHAR + '*',
    # Non-zero length segment
@ -151,7 +151,7 @@ PATH_RE = '^(%s|%s|%s|%s|%s)$' % (
 )

 FRAGMENT_RE = QUERY_RE = (
-    '^([/?:@' + RE_UNRESERVED + RE_SUB_DELIMITERS + ']|%s)*$' % PCT_ENCODED
+    '^([/?:@' + UNRESERVED_RE + SUB_DELIMITERS_RE + ']|%s)*$' % PCT_ENCODED
 )

 # ##########################
--- a/src/rfc3986/misc.py
+++ b/src/rfc3986/misc.py
@ -31,11 +31,12 @@ important_characters = {
    'generic_delimiters': abnf_regexp.GENERIC_DELIMITERS,
    'sub_delimiters': abnf_regexp.SUB_DELIMITERS,
    # We need to escape the '*' in this case
-    're_sub_delimiters': abnf_regexp.RE_SUB_DELIMITERS,
+    're_sub_delimiters': abnf_regexp.SUB_DELIMITERS_RE,
    'unreserved_chars': abnf_regexp.UNRESERVED_CHARS,
    # We need to escape the '-' in this case:
-    're_unreserved': abnf_regexp.RE_UNRESERVED,
-    }
+    're_unreserved': abnf_regexp.UNRESERVED_RE,
+}
+
 # For details about delimiters and reserved characters, see:
 # http://tools.ietf.org/html/rfc3986#section-2.2
 GENERIC_DELIMITERS = abnf_regexp.GENERIC_DELIMITERS_SET
@ -96,3 +97,6 @@ def merge_paths(base_uri, relative_path):
        path = base_uri.path or ''
        index = path.rfind('/')
        return path[:index] + '/' + relative_path
+
+
+UseExisting = object()
--- a/src/rfc3986/uri.py
+++ b/src/rfc3986/uri.py
@ -412,8 +412,9 @@ class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS)):
            result_list.extend(['#', self.fragment])
        return ''.join(result_list)

-    def copy_with(self, scheme=None, authority=None, path=None, query=None,
-                  fragment=None):
+    def copy_with(self, scheme=misc.UseExisting, authority=misc.UseExisting,
+                  path=misc.UseExisting, query=misc.UseExisting,
+                  fragment=misc.UseExisting):
        """Create a copy of this reference with the new components.

        :param str scheme:
@ -439,7 +440,7 @@ class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS)):
            'fragment': fragment,
        }
        for key, value in list(attributes.items()):
-            if value is None:
+            if value is misc.UseExisting:
                del attributes[key]
        uri = self._replace(**attributes)
        uri.encoding = self.encoding