fix: URI template field parsing bugs (#1019)

Fix the following bugs:

    * Only the '.' special character is properly handled, even though
      some URL schemas, such as OData, make use of other characters,
      like parens.
    * Using duplicate field names leads to a nasty re error being
      raised in the case of them being contained to the same
      path segment, and a silent overwriting of param values in the
      case of field values being duplicated across multiple segments.
This commit is contained in:
Kurt Griffiths 2017-04-20 08:41:55 -06:00 committed by John Vrbanac
parent 082dca5991
commit b1517e8104
3 changed files with 153 additions and 43 deletions

View File

@ -341,9 +341,11 @@ class API(object):
pass
Individual path segments may contain one or more field
expressions::
expressions, and fields need not span the entire path
segment. For example::
/repos/{org}/{repo}/compare/{usr0}:{branch0}...{usr1}:{branch1}
/serviceRoot/People('{name}')
Note:
Because field names correspond to argument names in responder

View File

@ -18,7 +18,8 @@ import keyword
import re
TAB_STR = ' ' * 4
_FIELD_REGEX = re.compile('{([^}]*)}')
_TAB_STR = ' ' * 4
class CompiledRouter(object):
@ -57,13 +58,27 @@ class CompiledRouter(object):
raise ValueError('URI templates may not include whitespace.')
# NOTE(kgriffs): Ensure fields are valid Python identifiers,
# since they will be passed as kwargs to responders.
fields = re.findall('{([^}]*)}', uri_template)
for field in fields:
is_identifier = re.match('[A-Za-z_][A-Za-z0-9_]*$', field)
if not is_identifier or field in keyword.kwlist:
# since they will be passed as kwargs to responders. Also
# ensure there are no duplicate names, since that causes the
# following problems:
#
# 1. For simple nodes, values from deeper nodes overwrite
# values from more shallow nodes.
# 2. For complex nodes, re.compile() raises a nasty error
#
fields = _FIELD_REGEX.findall(uri_template)
used_names = set()
for name in fields:
is_identifier = re.match('[A-Za-z_][A-Za-z0-9_]*$', name)
if not is_identifier or name in keyword.kwlist:
raise ValueError('Field names must be valid identifiers '
'(`{}` invalid).'.format(field))
"('{}' is not valid).".format(name))
if name in used_names:
raise ValueError('Field names may not be duplicated '
"('{}' was used more than once)".format(name))
used_names.add(name)
path = uri_template.strip('/').split('/')
@ -141,7 +156,7 @@ class CompiledRouter(object):
"""Generates Python code for a routing tree or subtree."""
def line(text, indent_offset=0):
pad = TAB_STR * (indent + indent_offset)
pad = _TAB_STR * (indent + indent_offset)
self._code_lines.append(pad + text)
# NOTE(kgriffs): Base case
@ -246,14 +261,14 @@ class CompiledRouter(object):
self._expressions = []
self._code_lines = [
'def find(path, return_values, expressions, params):',
TAB_STR + 'path_len = len(path)',
_TAB_STR + 'path_len = len(path)',
]
self._compile_tree(self._roots)
self._code_lines.append(
# PERF(kgriffs): Explicit return of None is faster than implicit
TAB_STR + 'return None'
_TAB_STR + 'return None'
)
self._src = '\n'.join(self._code_lines)
@ -267,8 +282,6 @@ class CompiledRouter(object):
class CompiledRouterNode(object):
"""Represents a single URI segment in a URI."""
_regex_vars = re.compile('{([-_a-zA-Z0-9]+)}')
def __init__(self, raw_segment,
method_map=None, resource=None, uri_template=None):
self.children = []
@ -282,39 +295,50 @@ class CompiledRouterNode(object):
self.is_complex = False
self.var_name = None
seg = raw_segment.replace('.', '\\.')
# NOTE(kgriffs): CompiledRouter.add_route validates field names,
# so here we can just assume they are OK and use the simple
# _FIELD_REGEX to match them.
matches = list(_FIELD_REGEX.finditer(raw_segment))
matches = list(self._regex_vars.finditer(seg))
if matches:
if not matches:
self.is_var = False
else:
self.is_var = True
# NOTE(richardolsson): if there is a single variable and it spans
# the entire segment, the segment is uncomplex and the variable
# name is simply the string contained within curly braces.
if len(matches) == 1 and matches[0].span() == (0, len(seg)):
if len(matches) == 1 and matches[0].span() == (0, len(raw_segment)):
# NOTE(richardolsson): if there is a single variable and
# it spans the entire segment, the segment is not
# complex and the variable name is simply the string
# contained within curly braces.
self.is_complex = False
self.var_name = raw_segment[1:-1]
else:
# NOTE(richardolsson): Complex segments need to be converted
# into regular expressions will be used to match and extract
# variable values. The regular expressions contain both
# literal spans and named group expressions for the variables.
# NOTE(richardolsson): Complex segments need to be
# converted into regular expressions in order to match
# and extract variable values. The regular expressions
# contain both literal spans and named group expressions
# for the variables.
# NOTE(kgriffs): Don't use re.escape() since we do not
# want to escape '{' or '}', and we don't want to
# introduce any unexpected side-effects by escaping
# non-ASCII characters (it is probably safe, but let's
# not take that chance in a minor point release).
#
# NOTE(kgriffs): The substitution template parser in the
# re library does not look ahead when collapsing '\\':
# therefore in the case of r'\\g<0>' the first r'\\'
# would be consumed and collapsed to r'\', and then the
# parser would examine 'g<0>' and not realize it is a
# group-escape sequence. So we add an extra backslash to
# trick the parser into doing the right thing.
escaped_segment = re.sub(r'[\.\(\)\[\]\?\$\*\+\^\|]', r'\\\g<0>', raw_segment)
seg_pattern = _FIELD_REGEX.sub(r'(?P<\1>.+)', escaped_segment)
seg_pattern = '^' + seg_pattern + '$'
self.is_complex = True
seg_fields = []
prev_end_idx = 0
for match in matches:
var_start_idx, var_end_idx = match.span()
seg_fields.append(seg[prev_end_idx:var_start_idx])
var_name = match.groups()[0].replace('-', '_')
seg_fields.append('(?P<%s>[^/]+)' % var_name)
prev_end_idx = var_end_idx
seg_fields.append(seg[prev_end_idx:])
seg_pattern = ''.join(seg_fields)
self.var_regex = re.compile(seg_pattern)
else:
self.is_var = False
def matches(self, segment):
"""Returns True if this node matches the supplied template segment."""
@ -365,8 +389,8 @@ class CompiledRouterNode(object):
#
if self.is_complex:
if other.is_complex:
return (self._regex_vars.sub('v', self.raw_segment) ==
self._regex_vars.sub('v', segment))
return (_FIELD_REGEX.sub('v', self.raw_segment) ==
_FIELD_REGEX.sub('v', segment))
return False
else:

View File

@ -105,7 +105,7 @@ class ResourceWithId(object):
# =====================================================================
def test_regression_versioned_url():
def test_user_regression_versioned_url():
router = DefaultRouter()
router.add_route('/{version}/messages', {}, ResourceWithId(2))
@ -127,7 +127,7 @@ def test_regression_versioned_url():
assert route is None
def test_regression_case_recipes():
def test_user_regression_recipes():
router = DefaultRouter()
router.add_route(
'/recipes/{activity}/{type_id}', {}, ResourceWithId(1))
@ -144,6 +144,63 @@ def test_regression_case_recipes():
assert route is None
@pytest.mark.parametrize('uri_template,path,expected_params', [
('/serviceRoot/People|{field}', '/serviceRoot/People|susie', {'field': 'susie'}),
('/serviceRoot/People[{field}]', "/serviceRoot/People['calvin']", {'field': "'calvin'"}),
('/serviceRoot/People({field})', "/serviceRoot/People('hobbes')", {'field': "'hobbes'"}),
('/serviceRoot/People({field})', "/serviceRoot/People('hob)bes')", {'field': "'hob)bes'"}),
('/serviceRoot/People({field})(z)', '/serviceRoot/People(hobbes)(z)', {'field': 'hobbes'}),
("/serviceRoot/People('{field}')", "/serviceRoot/People('rosalyn')", {'field': 'rosalyn'}),
('/^{field}', '/^42', {'field': '42'}),
('/+{field}', '/+42', {'field': '42'}),
(
'/foo/{first}_{second}/bar',
'/foo/abc_def_ghijk/bar',
# NOTE(kgriffs): The regex pattern is greedy, so this is
# expected. We can not change this behavior in a minor
# release, since it would be a breaking change. If there
# is enough demand for it, we could introduce an option
# to toggle this behavior.
{'first': 'abc_def', 'second': 'ghijk'},
),
# NOTE(kgriffs): Why someone would use a question mark like this
# I have no idea (esp. since it would have to be encoded to avoid
# being mistaken for the query string separator). Including it only
# for completeness.
('/items/{x}?{y}', '/items/1080?768', {'x': '1080', 'y': '768'}),
('/items/{x}|{y}', '/items/1080|768', {'x': '1080', 'y': '768'}),
('/items/{x},{y}', '/items/1080,768', {'x': '1080', 'y': '768'}),
('/items/{x}^^{y}', '/items/1080^^768', {'x': '1080', 'y': '768'}),
('/items/{x}*{y}*', '/items/1080*768*', {'x': '1080', 'y': '768'}),
('/thing-2/something+{field}+', '/thing-2/something+42+', {'field': '42'}),
('/thing-2/something*{field}/notes', '/thing-2/something*42/notes', {'field': '42'}),
(
'/thing-2/something+{field}|{q}/notes',
'/thing-2/something+else|z/notes',
{'field': 'else', 'q': 'z'},
),
(
"serviceRoot/$metadata#Airports('{field}')/Name",
"serviceRoot/$metadata#Airports('KSFO')/Name",
{'field': 'KSFO'},
),
])
def test_user_regression_special_chars(uri_template, path, expected_params):
router = DefaultRouter()
router.add_route(uri_template, {}, ResourceWithId(1))
route = router.find(path)
assert route is not None
resource, __, params, __ = route
assert resource.resource_id == 1
assert params == expected_params
# =====================================================================
# Other tests
# =====================================================================
@ -168,6 +225,33 @@ def test_root_path():
assert resource.resource_id == 42
@pytest.mark.parametrize('uri_template', [
'/{field}{field}',
'/{field}...{field}',
'/{field}/{another}/{field}',
'/{field}/something/something/{field}/something',
])
def test_duplicate_field_names(uri_template):
router = DefaultRouter()
with pytest.raises(ValueError):
router.add_route(uri_template, {}, ResourceWithId(1))
@pytest.mark.parametrize('uri_template,path', [
('/items/thing', '/items/t'),
('/items/{x}|{y}|', '/items/1080|768'),
('/items/{x}*{y}foo', '/items/1080*768foobar'),
('/items/{x}*768*', '/items/1080*768***'),
])
def test_match_entire_path(uri_template, path):
router = DefaultRouter()
router.add_route(uri_template, {}, ResourceWithId(1))
route = router.find(path)
assert route is None
@pytest.mark.parametrize('uri_template', [
'/teams/{collision}', # simple vs simple
'/emojis/signs/{id_too}', # another simple vs simple