Move regular expressions into separate module

2017-04-29 17:29:55 -05:00 · 2017-04-29 17:29:55 -05:00 · cd453de7dd
parent ac06bd3159
commit cd453de7dd
2 changed files with 207 additions and 148 deletions
--- a/src/rfc3986/abnf_regexp.py
+++ b/src/rfc3986/abnf_regexp.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module for the regular expressions crafted from ABNF."""
+
+# https://tools.ietf.org/html/rfc3986#page-13
+GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
+GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
+# https://tools.ietf.org/html/rfc3986#page-13
+SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
+SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
+# Escape the '*' for use in regular expressions
+RE_SUB_DELIMITERS = r"!$&'()\*+,;="
+RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
+ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+DIGIT = '0123456789'
+# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" (RFC 3986 section 2.3)
+UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + '._~-'
+UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
+NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET).union('%')
+# We need to escape the '-' in this case:
+RE_UNRESERVED = r'A-Za-z0-9._~\-'
+
+# NOTE(sigmavirus24): We're going to use more strict regular expressions
+# than appear in Appendix B for scheme. This will prevent over-eager
+# consuming of items that aren't schemes.
+SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*'
+AUTHORITY_RE = '[^/?#]*'
+PATH_RE = '[^?#]*'
+QUERY_RE = '[^#]*'
+FRAGMENT_RE = '.*'
+
+# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
+COMPONENT_PATTERN_DICT = {
+    'scheme': SCHEME_RE,
+    'authority': AUTHORITY_RE,
+    'path': PATH_RE,
+    'query': QUERY_RE,
+    'fragment': FRAGMENT_RE,
+}
+
+# See http://tools.ietf.org/html/rfc3986#appendix-B
+# In this case, we name each of the important matches so we can use
+# SRE_Match#groupdict to parse the values out if we so choose. This is also
+# modified to ignore other matches that are not important to the parsing of
+# the reference so we can also simply use SRE_Match#groups.
+URL_PARSING_RE = (
+    '(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
+    r'(?P<path>{path})(?:\?(?P<query>{query}))?'
+    '(?:#(?P<fragment>{fragment}))?'
+).format(**COMPONENT_PATTERN_DICT)
+
+
+# #########################
+# Authority Matcher Section
+# #########################
+
+# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
+# reg-name = *( unreserved / pct-encoded / sub-delims ), e.g., api.github.com
+REGULAR_NAME_RE = REG_NAME = '(({0}|[{1}])*)'.format(
+    '%[0-9A-Fa-f]{2}', RE_SUB_DELIMITERS + RE_UNRESERVED
+)
+# The pattern for an IPv4 address, e.g., 192.168.255.255 (dots are literal)
+IPv4_RE = r'([0-9]{1,3}\.){3}[0-9]{1,3}'
+# Hexadecimal characters used in each piece of an IPv6 address
+HEXDIG_RE = '[0-9A-Fa-f]{1,4}'
+# Least-significant 32 bits of an IPv6 address
+LS32_RE = '({hex}:{hex}|{ipv4})'.format(hex=HEXDIG_RE, ipv4=IPv4_RE)
+# Substitutions into the following patterns for IPv6 patterns defined
+# http://tools.ietf.org/html/rfc3986#page-20
+_subs = {'hex': HEXDIG_RE, 'ls32': LS32_RE}
+
+# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
+# about ABNF (Augmented Backus-Naur Form) use in the comments
+variations = [
+    #                            6( h16 ":" ) ls32
+    '(%(hex)s:){6}%(ls32)s' % _subs,
+    #                       "::" 5( h16 ":" ) ls32
+    '::(%(hex)s:){5}%(ls32)s' % _subs,
+    # [               h16 ] "::" 4( h16 ":" ) ls32
+    '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % _subs,
+    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+    '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % _subs,
+    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+    '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % _subs,
+    # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+    '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % _subs,
+    # [ *4( h16 ":" ) h16 ] "::"              ls32
+    '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % _subs,
+    # [ *5( h16 ":" ) h16 ] "::"              h16
+    '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % _subs,
+    # [ *6( h16 ":" ) h16 ] "::"  (all nine forms must appear in IPv6_RE)
+    '((%(hex)s:){0,6}%(hex)s)?::' % _subs,
+]
+
+IPv6_RE = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7})|({8}))'.format(
+    *variations
+)
+
+IPv_FUTURE_RE = r'v[0-9A-Fa-f]+\.[%s]+' % (
+    RE_UNRESERVED + RE_SUB_DELIMITERS + ':'
+)
+# IP-literal = "[" ( IPv6address / IPvFuture ) "]"; IPvFuture's "." is literal
+IP_LITERAL_RE = r'\[({0}|{1})\]'.format(IPv6_RE, IPv_FUTURE_RE)
+
+# Pattern for matching the host piece of the authority
+HOST_RE = HOST_PATTERN = '({0}|{1}|{2})'.format(
+    REG_NAME,
+    IPv4_RE,
+    IP_LITERAL_RE,
+)
+USERINFO_RE = r'^[A-Za-z0-9_.~\-%:]+'  # note: carries its own '^' anchor
+PORT_RE = '[0-9]{1,5}'  # one to five decimal digits
+
+# ####################
+# Path Matcher Section
+# ####################
+
+# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
+# about the path patterns defined below.
+
+# Percent encoded character values
+PERCENT_ENCODED = PCT_ENCODED = '%[A-Fa-f0-9]{2}'
+PCHAR = '([' + RE_UNRESERVED + RE_SUB_DELIMITERS + ':@]|%s)' % PCT_ENCODED
+segments = {
+    'segment': PCHAR + '*',
+    # Non-zero length segment
+    'segment-nz': PCHAR + '+',
+    # Non-zero length segment without ":"
+    'segment-nz-nc': PCHAR.replace(':', '') + '+'
+}
+
+# Path types taken from Section 3.3 (linked above)
+PATH_EMPTY = '^$'
+PATH_ROOTLESS = '%(segment-nz)s(/%(segment)s)*' % segments
+PATH_NOSCHEME = '%(segment-nz-nc)s(/%(segment)s)*' % segments
+PATH_ABSOLUTE = '/(%s)?' % PATH_ROOTLESS
+PATH_ABEMPTY = '(/%(segment)s)*' % segments
+PATH_RE = '^(%s|%s|%s|%s|%s)$' % (
+    PATH_ABEMPTY, PATH_ABSOLUTE, PATH_NOSCHEME, PATH_ROOTLESS, PATH_EMPTY
+)
+
+FRAGMENT_RE = QUERY_RE = (
+    '^([/?:@' + RE_UNRESERVED + RE_SUB_DELIMITERS + ']|%s)*$' % PCT_ENCODED
+)
+
+# ##########################
+# Relative reference matcher
+# ##########################
+
+# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
+RELATIVE_PART_RE = '(//%s%s|%s|%s|%s)' % (
+    COMPONENT_PATTERN_DICT['authority'],
+    PATH_ABEMPTY,
+    PATH_ABSOLUTE,
+    PATH_NOSCHEME,
+    PATH_EMPTY,
+)
+
+# See http://tools.ietf.org/html/rfc3986#section-3 for definition
+HIER_PART_RE = '(//%s%s|%s|%s|%s)' % (
+    COMPONENT_PATTERN_DICT['authority'],
+    PATH_ABEMPTY,
+    PATH_ABSOLUTE,
+    PATH_ROOTLESS,
+    PATH_EMPTY,
+)
--- a/src/rfc3986/misc.py
+++ b/src/rfc3986/misc.py
@@ -21,188 +21,70 @@ expressions for parsing and validating URIs and their components.

 import re

+from . import abnf_regexp
+
 # These are enumerated for the named tuple used as a superclass of
 # URIReference
 URI_COMPONENTS = ['scheme', 'authority', 'path', 'query', 'fragment']

 important_characters = {
-    'generic_delimiters': ":/?#[]@",
-    'sub_delimiters': "!$&'()*+,;=",
+    'generic_delimiters': abnf_regexp.GENERIC_DELIMITERS,
+    'sub_delimiters': abnf_regexp.SUB_DELIMITERS,
    # We need to escape the '*' in this case
-    're_sub_delimiters': "!$&'()\*+,;=",
-    'unreserved_chars': ('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
-                         '0123456789._~-'),
+    're_sub_delimiters': abnf_regexp.RE_SUB_DELIMITERS,
+    'unreserved_chars': abnf_regexp.UNRESERVED_CHARS,
    # We need to escape the '-' in this case:
-    're_unreserved': 'A-Za-z0-9._~\-',
+    're_unreserved': abnf_regexp.RE_UNRESERVED,
    }
 # For details about delimiters and reserved characters, see:
 # http://tools.ietf.org/html/rfc3986#section-2.2
-GENERIC_DELIMITERS = set(important_characters['generic_delimiters'])
-SUB_DELIMITERS = set(important_characters['sub_delimiters'])
-RESERVED_CHARS = GENERIC_DELIMITERS.union(SUB_DELIMITERS)
+GENERIC_DELIMITERS = abnf_regexp.GENERIC_DELIMITERS_SET
+SUB_DELIMITERS = abnf_regexp.SUB_DELIMITERS_SET
+RESERVED_CHARS = abnf_regexp.RESERVED_CHARS_SET
 # For details about unreserved characters, see:
 # http://tools.ietf.org/html/rfc3986#section-2.3
-UNRESERVED_CHARS = set(important_characters['unreserved_chars'])
-NON_PCT_ENCODED = RESERVED_CHARS.union(UNRESERVED_CHARS).union('%')
+UNRESERVED_CHARS = abnf_regexp.UNRESERVED_CHARS_SET
+NON_PCT_ENCODED = abnf_regexp.NON_PCT_ENCODED_SET

-# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
-component_pattern_dict = {
-    # NOTE(sigmavirus24): We're going to use more strict regular expressions
-    # than appear in Appendix B for scheme. This will prevent over-eager
-    # consuming of items that aren't schemes.
-    'scheme': '[a-zA-Z][a-zA-Z0-9+.-]*',
-    'authority': '[^/?#]*',
-    'path': '[^?#]*',
-    'query': '[^#]*',
-    'fragment': '.*',
-    }
-
-# See http://tools.ietf.org/html/rfc3986#appendix-B
-# In this case, we name each of the important matches so we can use
-# SRE_Match#groupdict to parse the values out if we so choose. This is also
-# modified to ignore other matches that are not important to the parsing of
-# the reference so we can also simply use SRE_Match#groups.
-expression = ('(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
-              '(?P<path>{path})(?:\?(?P<query>{query}))?'
-              '(?:#(?P<fragment>{fragment}))?'
-              ).format(**component_pattern_dict)
-
-URI_MATCHER = re.compile(expression)
-
-# #########################
-# Authority Matcher Section
-# #########################
-
-# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
-# The pattern for a regular name, e.g.,  www.google.com, api.github.com
-reg_name = '(({0})*|[{1}]*)'.format(
-    '%[0-9A-Fa-f]{2}',
-    important_characters['re_sub_delimiters'] +
-    important_characters['re_unreserved']
-    )
-# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
-ipv4 = '([0-9]{1,3}.){3}[0-9]{1,3}'
-# Hexadecimal characters used in each piece of an IPv6 address
-hexdig = '[0-9A-Fa-f]{1,4}'
-# Least-significant 32 bits of an IPv6 address
-ls32 = '({hex}:{hex}|{ipv4})'.format(hex=hexdig, ipv4=ipv4)
-# Substitutions into the following patterns for IPv6 patterns defined
-# http://tools.ietf.org/html/rfc3986#page-20
-subs = {'hex': hexdig, 'ls32': ls32}
-
-# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
-# about ABNF (Augmented Backus-Naur Form) use in the comments
-variations = [
-    #                            6( h16 ":" ) ls32
-    '(%(hex)s:){6}%(ls32)s' % subs,
-    #                       "::" 5( h16 ":" ) ls32
-    '::(%(hex)s:){5}%(ls32)s' % subs,
-    # [               h16 ] "::" 4( h16 ":" ) ls32
-    '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % subs,
-    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
-    '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % subs,
-    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
-    '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % subs,
-    # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
-    '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % subs,
-    # [ *4( h16 ":" ) h16 ] "::"              ls32
-    '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % subs,
-    # [ *5( h16 ":" ) h16 ] "::"              h16
-    '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % subs,
-    # [ *6( h16 ":" ) h16 ] "::"
-    '((%(hex)s:){0,6}%(hex)s)?::' % subs,
-    ]
-
-ipv6 = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7}))'.format(*variations)
-
-ipv_future = 'v[0-9A-Fa-f]+.[%s]+' % (
-    important_characters['re_unreserved'] +
-    important_characters['re_sub_delimiters'] +
-    ':')
-
-ip_literal = '\[({0}|{1})\]'.format(ipv6, ipv_future)
-
-# Pattern for matching the host piece of the authority
-HOST_PATTERN = '({0}|{1}|{2})'.format(reg_name, ipv4, ip_literal)
+URI_MATCHER = re.compile(abnf_regexp.URL_PARSING_RE)

 SUBAUTHORITY_MATCHER = re.compile((
-    '^(?:(?P<userinfo>[A-Za-z0-9_.~\-%:]+)@)?'  # userinfo
-    '(?P<host>{0}?)'  # host
-    ':?(?P<port>[0-9]+)?$'  # port
-    ).format(HOST_PATTERN))
-
-IPv4_MATCHER = re.compile('^' + ipv4 + '$')
+    '^(?:(?P<userinfo>{0})@)?'  # userinfo
+    '(?P<host>{1}?)'  # host
+    ':?(?P<port>{2})?$'  # port
+    ).format(abnf_regexp.USERINFO_RE,
+             abnf_regexp.HOST_PATTERN,
+             abnf_regexp.PORT_RE))


-# ####################
-# Path Matcher Section
-# ####################
-
-# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
-# about the path patterns defined below.
-
-# Percent encoded character values
-pct_encoded = '%[A-Fa-f0-9]{2}'
-pchar = ('([' + important_characters['re_unreserved']
-         + important_characters['re_sub_delimiters']
-         + ':@]|%s)' % pct_encoded)
-segments = {
-    'segment': pchar + '*',
-    # Non-zero length segment
-    'segment-nz': pchar + '+',
-    # Non-zero length segment without ":"
-    'segment-nz-nc': pchar.replace(':', '') + '+'
-    }
-
-# Path types taken from Section 3.3 (linked above)
-path_empty = '^$'
-path_rootless = '%(segment-nz)s(/%(segment)s)*' % segments
-path_noscheme = '%(segment-nz-nc)s(/%(segment)s)*' % segments
-path_absolute = '/(%s)?' % path_rootless
-path_abempty = '(/%(segment)s)*' % segments
+IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$')

 # Matcher used to validate path components
-PATH_MATCHER = re.compile('^(%s|%s|%s|%s|%s)$' % (
-    path_abempty, path_absolute, path_noscheme, path_rootless, path_empty
-    ))
+PATH_MATCHER = re.compile(abnf_regexp.PATH_RE)


 # ##################################
 # Query and Fragment Matcher Section
 # ##################################

-QUERY_MATCHER = re.compile(
-    '^([/?:@' + important_characters['re_unreserved']
-    + important_characters['re_sub_delimiters']
-    + ']|%s)*$' % pct_encoded)
+QUERY_MATCHER = re.compile(abnf_regexp.QUERY_RE)

 FRAGMENT_MATCHER = QUERY_MATCHER

 # Scheme validation, see: http://tools.ietf.org/html/rfc3986#section-3.1
-SCHEME_MATCHER = re.compile('^[A-Za-z][A-Za-z0-9+.\-]*$')
-
-# Relative reference matcher
-
-# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
-relative_part = '(//%s%s|%s|%s|%s)' % (
-    component_pattern_dict['authority'], path_abempty, path_absolute,
-    path_noscheme, path_empty
-    )
+SCHEME_MATCHER = re.compile('^{0}$'.format(abnf_regexp.SCHEME_RE))

 RELATIVE_REF_MATCHER = re.compile('^%s(\?%s)?(#%s)?$' % (
-    relative_part, QUERY_MATCHER.pattern, FRAGMENT_MATCHER.pattern
-    ))
-
-# See http://tools.ietf.org/html/rfc3986#section-3 for definition
-hier_part = '(//%s%s|%s|%s|%s)' % (
-    component_pattern_dict['authority'], path_abempty, path_absolute,
-    path_rootless, path_empty
-    )
+    abnf_regexp.RELATIVE_PART_RE, abnf_regexp.QUERY_RE[1:-1],
+    abnf_regexp.FRAGMENT_RE[1:-1],  # [1:-1] drops the embedded '^' and '$'
+))

 # See http://tools.ietf.org/html/rfc3986#section-4.3
 ABSOLUTE_URI_MATCHER = re.compile('^%s:%s(\?%s)?$' % (
-    component_pattern_dict['scheme'], hier_part, QUERY_MATCHER.pattern[1:-1]
-    ))
+    abnf_regexp.COMPONENT_PATTERN_DICT['scheme'],
+    abnf_regexp.HIER_PART_RE,
+    abnf_regexp.QUERY_RE[1:-1],
+))


 # Path merger as defined in http://tools.ietf.org/html/rfc3986#section-5.2.3