Move regular expressions into separate module
This commit is contained in:
parent
ac06bd3159
commit
cd453de7dd
|
@ -0,0 +1,177 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
# implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Module for the regular expressions crafted from ABNF."""
|
||||
|
||||
# https://tools.ietf.org/html/rfc3986#page-13
|
||||
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
|
||||
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
|
||||
# https://tools.ietf.org/html/rfc3986#page-13
|
||||
SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
|
||||
SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
|
||||
# Escape the '*' for use in regular expressions
|
||||
RE_SUB_DELIMITERS = "!$&'()\*+,;="
|
||||
RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
|
||||
ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
|
||||
DIGIT = '0123456789'
|
||||
# https://tools.ietf.org/html/rfc3986#section-2.3
|
||||
UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + '._!-'
|
||||
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
|
||||
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET).union('%')
|
||||
# We need to escape the '-' in this case:
|
||||
RE_UNRESERVED = 'A-Za-z0-9._~\-'
|
||||
|
||||
# NOTE(sigmavirus24): We're going to use more strict regular expressions
|
||||
# than appear in Appendix B for scheme. This will prevent over-eager
|
||||
# consuming of items that aren't schemes.
|
||||
SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*'
|
||||
AUTHORITY_RE = '[^/?#]*'
|
||||
PATH_RE = '[^?#]*'
|
||||
QUERY_RE = '[^#]*'
|
||||
FRAGMENT_RE = '.*'
|
||||
|
||||
# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
|
||||
COMPONENT_PATTERN_DICT = {
|
||||
'scheme': SCHEME_RE,
|
||||
'authority': AUTHORITY_RE,
|
||||
'path': PATH_RE,
|
||||
'query': QUERY_RE,
|
||||
'fragment': FRAGMENT_RE,
|
||||
}
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#appendix-B
|
||||
# In this case, we name each of the important matches so we can use
|
||||
# SRE_Match#groupdict to parse the values out if we so choose. This is also
|
||||
# modified to ignore other matches that are not important to the parsing of
|
||||
# the reference so we can also simply use SRE_Match#groups.
|
||||
URL_PARSING_RE = (
|
||||
'(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
|
||||
'(?P<path>{path})(?:\?(?P<query>{query}))?'
|
||||
'(?:#(?P<fragment>{fragment}))?'
|
||||
).format(**COMPONENT_PATTERN_DICT)
|
||||
|
||||
|
||||
# #########################
|
||||
# Authority Matcher Section
|
||||
# #########################
|
||||
|
||||
# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
|
||||
# The pattern for a regular name, e.g., www.google.com, api.github.com
|
||||
REGULAR_NAME_RE = REG_NAME = '(({0})*|[{1}]*)'.format(
|
||||
'%[0-9A-Fa-f]{2}', RE_SUB_DELIMITERS + RE_UNRESERVED
|
||||
)
|
||||
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
|
||||
IPv4_RE = '([0-9]{1,3}.){3}[0-9]{1,3}'
|
||||
# Hexadecimal characters used in each piece of an IPv6 address
|
||||
HEXDIG_RE = '[0-9A-Fa-f]{1,4}'
|
||||
# Least-significant 32 bits of an IPv6 address
|
||||
LS32_RE = '({hex}:{hex}|{ipv4})'.format(hex=HEXDIG_RE, ipv4=IPv4_RE)
|
||||
# Substitutions into the following patterns for IPv6 patterns defined
|
||||
# http://tools.ietf.org/html/rfc3986#page-20
|
||||
_subs = {'hex': HEXDIG_RE, 'ls32': LS32_RE}
|
||||
|
||||
# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
|
||||
# about ABNF (Augmented Backus-Naur Form) use in the comments
|
||||
variations = [
|
||||
# 6( h16 ":" ) ls32
|
||||
'(%(hex)s:){6}%(ls32)s' % _subs,
|
||||
# "::" 5( h16 ":" ) ls32
|
||||
'::(%(hex)s:){5}%(ls32)s' % _subs,
|
||||
# [ h16 ] "::" 4( h16 ":" ) ls32
|
||||
'(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % _subs,
|
||||
# [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
|
||||
'((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % _subs,
|
||||
# [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
|
||||
'((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % _subs,
|
||||
# [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
|
||||
'((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % _subs,
|
||||
# [ *4( h16 ":" ) h16 ] "::" ls32
|
||||
'((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % _subs,
|
||||
# [ *5( h16 ":" ) h16 ] "::" h16
|
||||
'((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % _subs,
|
||||
# [ *6( h16 ":" ) h16 ] "::"
|
||||
'((%(hex)s:){0,6}%(hex)s)?::' % _subs,
|
||||
]
|
||||
|
||||
IPv6_RE = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7}))'.format(
|
||||
*variations
|
||||
)
|
||||
|
||||
IPv_FUTURE_RE = 'v[0-9A-Fa-f]+.[%s]+' % (
|
||||
RE_UNRESERVED + RE_SUB_DELIMITERS + ':'
|
||||
)
|
||||
|
||||
IP_LITERAL_RE = '\[({0}|{1})\]'.format(IPv6_RE, IPv_FUTURE_RE)
|
||||
|
||||
# Pattern for matching the host piece of the authority
|
||||
HOST_RE = HOST_PATTERN = '({0}|{1}|{2})'.format(
|
||||
REG_NAME,
|
||||
IPv4_RE,
|
||||
IP_LITERAL_RE,
|
||||
)
|
||||
USERINFO_RE = '^[A-Za-z0-9_.~\-%:]+'
|
||||
PORT_RE = '[0-9]{1,5}'
|
||||
|
||||
# ####################
|
||||
# Path Matcher Section
|
||||
# ####################
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
|
||||
# about the path patterns defined below.
|
||||
|
||||
# Percent encoded character values
|
||||
PERCENT_ENCODED = PCT_ENCODED = '%[A-Fa-f0-9]{2}'
|
||||
PCHAR = '([' + RE_UNRESERVED + RE_SUB_DELIMITERS + ':@]|%s)' % PCT_ENCODED
|
||||
segments = {
|
||||
'segment': PCHAR + '*',
|
||||
# Non-zero length segment
|
||||
'segment-nz': PCHAR + '+',
|
||||
# Non-zero length segment without ":"
|
||||
'segment-nz-nc': PCHAR.replace(':', '') + '+'
|
||||
}
|
||||
|
||||
# Path types taken from Section 3.3 (linked above)
|
||||
PATH_EMPTY = '^$'
|
||||
PATH_ROOTLESS = '%(segment-nz)s(/%(segment)s)*' % segments
|
||||
PATH_NOSCHEME = '%(segment-nz-nc)s(/%(segment)s)*' % segments
|
||||
PATH_ABSOLUTE = '/(%s)?' % PATH_ROOTLESS
|
||||
PATH_ABEMPTY = '(/%(segment)s)*' % segments
|
||||
PATH_RE = '^(%s|%s|%s|%s|%s)$' % (
|
||||
PATH_ABEMPTY, PATH_ABSOLUTE, PATH_NOSCHEME, PATH_ROOTLESS, PATH_EMPTY
|
||||
)
|
||||
|
||||
FRAGMENT_RE = QUERY_RE = (
|
||||
'^([/?:@' + RE_UNRESERVED + RE_SUB_DELIMITERS + ']|%s)*$' % PCT_ENCODED
|
||||
)
|
||||
|
||||
# ##########################
|
||||
# Relative reference matcher
|
||||
# ##########################
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
|
||||
RELATIVE_PART_RE = '(//%s%s|%s|%s|%s)' % (
|
||||
COMPONENT_PATTERN_DICT['authority'],
|
||||
PATH_ABEMPTY,
|
||||
PATH_ABSOLUTE,
|
||||
PATH_NOSCHEME,
|
||||
PATH_EMPTY,
|
||||
)
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#section-3 for definition
|
||||
HIER_PART_RE = '(//%s%s|%s|%s|%s)' % (
|
||||
COMPONENT_PATTERN_DICT['authority'],
|
||||
PATH_ABEMPTY,
|
||||
PATH_ABSOLUTE,
|
||||
PATH_ROOTLESS,
|
||||
PATH_EMPTY,
|
||||
)
|
|
@ -21,188 +21,70 @@ expressions for parsing and validating URIs and their components.
|
|||
|
||||
import re
|
||||
|
||||
from . import abnf_regexp
|
||||
|
||||
# These are enumerated for the named tuple used as a superclass of
|
||||
# URIReference
|
||||
URI_COMPONENTS = ['scheme', 'authority', 'path', 'query', 'fragment']
|
||||
|
||||
important_characters = {
|
||||
'generic_delimiters': ":/?#[]@",
|
||||
'sub_delimiters': "!$&'()*+,;=",
|
||||
'generic_delimiters': abnf_regexp.GENERIC_DELIMITERS,
|
||||
'sub_delimiters': abnf_regexp.SUB_DELIMITERS,
|
||||
# We need to escape the '*' in this case
|
||||
're_sub_delimiters': "!$&'()\*+,;=",
|
||||
'unreserved_chars': ('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
|
||||
'0123456789._~-'),
|
||||
're_sub_delimiters': abnf_regexp.RE_SUB_DELIMITERS,
|
||||
'unreserved_chars': abnf_regexp.UNRESERVED_CHARS,
|
||||
# We need to escape the '-' in this case:
|
||||
're_unreserved': 'A-Za-z0-9._~\-',
|
||||
're_unreserved': abnf_regexp.RE_UNRESERVED,
|
||||
}
|
||||
# For details about delimiters and reserved characters, see:
|
||||
# http://tools.ietf.org/html/rfc3986#section-2.2
|
||||
GENERIC_DELIMITERS = set(important_characters['generic_delimiters'])
|
||||
SUB_DELIMITERS = set(important_characters['sub_delimiters'])
|
||||
RESERVED_CHARS = GENERIC_DELIMITERS.union(SUB_DELIMITERS)
|
||||
GENERIC_DELIMITERS = abnf_regexp.GENERIC_DELIMITERS_SET
|
||||
SUB_DELIMITERS = abnf_regexp.SUB_DELIMITERS_SET
|
||||
RESERVED_CHARS = abnf_regexp.RESERVED_CHARS_SET
|
||||
# For details about unreserved characters, see:
|
||||
# http://tools.ietf.org/html/rfc3986#section-2.3
|
||||
UNRESERVED_CHARS = set(important_characters['unreserved_chars'])
|
||||
NON_PCT_ENCODED = RESERVED_CHARS.union(UNRESERVED_CHARS).union('%')
|
||||
UNRESERVED_CHARS = abnf_regexp.UNRESERVED_CHARS_SET
|
||||
NON_PCT_ENCODED = abnf_regexp.NON_PCT_ENCODED_SET
|
||||
|
||||
# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
|
||||
component_pattern_dict = {
|
||||
# NOTE(sigmavirus24): We're going to use more strict regular expressions
|
||||
# than appear in Appendix B for scheme. This will prevent over-eager
|
||||
# consuming of items that aren't schemes.
|
||||
'scheme': '[a-zA-Z][a-zA-Z0-9+.-]*',
|
||||
'authority': '[^/?#]*',
|
||||
'path': '[^?#]*',
|
||||
'query': '[^#]*',
|
||||
'fragment': '.*',
|
||||
}
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#appendix-B
|
||||
# In this case, we name each of the important matches so we can use
|
||||
# SRE_Match#groupdict to parse the values out if we so choose. This is also
|
||||
# modified to ignore other matches that are not important to the parsing of
|
||||
# the reference so we can also simply use SRE_Match#groups.
|
||||
expression = ('(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
|
||||
'(?P<path>{path})(?:\?(?P<query>{query}))?'
|
||||
'(?:#(?P<fragment>{fragment}))?'
|
||||
).format(**component_pattern_dict)
|
||||
|
||||
URI_MATCHER = re.compile(expression)
|
||||
|
||||
# #########################
|
||||
# Authority Matcher Section
|
||||
# #########################
|
||||
|
||||
# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
|
||||
# The pattern for a regular name, e.g., www.google.com, api.github.com
|
||||
reg_name = '(({0})*|[{1}]*)'.format(
|
||||
'%[0-9A-Fa-f]{2}',
|
||||
important_characters['re_sub_delimiters'] +
|
||||
important_characters['re_unreserved']
|
||||
)
|
||||
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
|
||||
ipv4 = '([0-9]{1,3}.){3}[0-9]{1,3}'
|
||||
# Hexadecimal characters used in each piece of an IPv6 address
|
||||
hexdig = '[0-9A-Fa-f]{1,4}'
|
||||
# Least-significant 32 bits of an IPv6 address
|
||||
ls32 = '({hex}:{hex}|{ipv4})'.format(hex=hexdig, ipv4=ipv4)
|
||||
# Substitutions into the following patterns for IPv6 patterns defined
|
||||
# http://tools.ietf.org/html/rfc3986#page-20
|
||||
subs = {'hex': hexdig, 'ls32': ls32}
|
||||
|
||||
# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
|
||||
# about ABNF (Augmented Backus-Naur Form) use in the comments
|
||||
variations = [
|
||||
# 6( h16 ":" ) ls32
|
||||
'(%(hex)s:){6}%(ls32)s' % subs,
|
||||
# "::" 5( h16 ":" ) ls32
|
||||
'::(%(hex)s:){5}%(ls32)s' % subs,
|
||||
# [ h16 ] "::" 4( h16 ":" ) ls32
|
||||
'(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % subs,
|
||||
# [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
|
||||
'((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % subs,
|
||||
# [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
|
||||
'((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % subs,
|
||||
# [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
|
||||
'((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % subs,
|
||||
# [ *4( h16 ":" ) h16 ] "::" ls32
|
||||
'((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % subs,
|
||||
# [ *5( h16 ":" ) h16 ] "::" h16
|
||||
'((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % subs,
|
||||
# [ *6( h16 ":" ) h16 ] "::"
|
||||
'((%(hex)s:){0,6}%(hex)s)?::' % subs,
|
||||
]
|
||||
|
||||
ipv6 = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7}))'.format(*variations)
|
||||
|
||||
ipv_future = 'v[0-9A-Fa-f]+.[%s]+' % (
|
||||
important_characters['re_unreserved'] +
|
||||
important_characters['re_sub_delimiters'] +
|
||||
':')
|
||||
|
||||
ip_literal = '\[({0}|{1})\]'.format(ipv6, ipv_future)
|
||||
|
||||
# Pattern for matching the host piece of the authority
|
||||
HOST_PATTERN = '({0}|{1}|{2})'.format(reg_name, ipv4, ip_literal)
|
||||
URI_MATCHER = re.compile(abnf_regexp.URL_PARSING_RE)
|
||||
|
||||
SUBAUTHORITY_MATCHER = re.compile((
|
||||
'^(?:(?P<userinfo>[A-Za-z0-9_.~\-%:]+)@)?' # userinfo
|
||||
'(?P<host>{0}?)' # host
|
||||
':?(?P<port>[0-9]+)?$' # port
|
||||
).format(HOST_PATTERN))
|
||||
|
||||
IPv4_MATCHER = re.compile('^' + ipv4 + '$')
|
||||
'^(?:(?P<userinfo>{0})@)?' # userinfo
|
||||
'(?P<host>{1}?)' # host
|
||||
':?(?P<port>{2})?$' # port
|
||||
).format(abnf_regexp.USERINFO_RE,
|
||||
abnf_regexp.HOST_PATTERN,
|
||||
abnf_regexp.PORT_RE))
|
||||
|
||||
|
||||
# ####################
|
||||
# Path Matcher Section
|
||||
# ####################
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
|
||||
# about the path patterns defined below.
|
||||
|
||||
# Percent encoded character values
|
||||
pct_encoded = '%[A-Fa-f0-9]{2}'
|
||||
pchar = ('([' + important_characters['re_unreserved']
|
||||
+ important_characters['re_sub_delimiters']
|
||||
+ ':@]|%s)' % pct_encoded)
|
||||
segments = {
|
||||
'segment': pchar + '*',
|
||||
# Non-zero length segment
|
||||
'segment-nz': pchar + '+',
|
||||
# Non-zero length segment without ":"
|
||||
'segment-nz-nc': pchar.replace(':', '') + '+'
|
||||
}
|
||||
|
||||
# Path types taken from Section 3.3 (linked above)
|
||||
path_empty = '^$'
|
||||
path_rootless = '%(segment-nz)s(/%(segment)s)*' % segments
|
||||
path_noscheme = '%(segment-nz-nc)s(/%(segment)s)*' % segments
|
||||
path_absolute = '/(%s)?' % path_rootless
|
||||
path_abempty = '(/%(segment)s)*' % segments
|
||||
IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$')
|
||||
|
||||
# Matcher used to validate path components
|
||||
PATH_MATCHER = re.compile('^(%s|%s|%s|%s|%s)$' % (
|
||||
path_abempty, path_absolute, path_noscheme, path_rootless, path_empty
|
||||
))
|
||||
PATH_MATCHER = re.compile(abnf_regexp.PATH_RE)
|
||||
|
||||
|
||||
# ##################################
|
||||
# Query and Fragment Matcher Section
|
||||
# ##################################
|
||||
|
||||
QUERY_MATCHER = re.compile(
|
||||
'^([/?:@' + important_characters['re_unreserved']
|
||||
+ important_characters['re_sub_delimiters']
|
||||
+ ']|%s)*$' % pct_encoded)
|
||||
QUERY_MATCHER = re.compile(abnf_regexp.QUERY_RE)
|
||||
|
||||
FRAGMENT_MATCHER = QUERY_MATCHER
|
||||
|
||||
# Scheme validation, see: http://tools.ietf.org/html/rfc3986#section-3.1
|
||||
SCHEME_MATCHER = re.compile('^[A-Za-z][A-Za-z0-9+.\-]*$')
|
||||
|
||||
# Relative reference matcher
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
|
||||
relative_part = '(//%s%s|%s|%s|%s)' % (
|
||||
component_pattern_dict['authority'], path_abempty, path_absolute,
|
||||
path_noscheme, path_empty
|
||||
)
|
||||
SCHEME_MATCHER = re.compile('^{0}$'.format(abnf_regexp.SCHEME_RE))
|
||||
|
||||
RELATIVE_REF_MATCHER = re.compile('^%s(\?%s)?(#%s)?$' % (
|
||||
relative_part, QUERY_MATCHER.pattern, FRAGMENT_MATCHER.pattern
|
||||
))
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#section-3 for definition
|
||||
hier_part = '(//%s%s|%s|%s|%s)' % (
|
||||
component_pattern_dict['authority'], path_abempty, path_absolute,
|
||||
path_rootless, path_empty
|
||||
)
|
||||
abnf_regexp.RELATIVE_PART_RE, abnf_regexp.QUERY_RE,
|
||||
abnf_regexp.FRAGMENT_RE,
|
||||
))
|
||||
|
||||
# See http://tools.ietf.org/html/rfc3986#section-4.3
|
||||
ABSOLUTE_URI_MATCHER = re.compile('^%s:%s(\?%s)?$' % (
|
||||
component_pattern_dict['scheme'], hier_part, QUERY_MATCHER.pattern[1:-1]
|
||||
))
|
||||
abnf_regexp.COMPONENT_PATTERN_DICT['scheme'],
|
||||
abnf_regexp.HIER_PART_RE,
|
||||
abnf_regexp.QUERY_RE[1:-1],
|
||||
))
|
||||
|
||||
|
||||
# Path merger as defined in http://tools.ietf.org/html/rfc3986#section-5.2.3
|
||||
|
|
Loading…
Reference in New Issue