fix: URI template field parsing bugs (#1019)

Fix the following bugs: * Only the '.' special character is properly handled, even though some URL schemas, such as OData, make use of other characters, like parens. * Using duplicate field names leads to a nasty re error being raised in the case of them being contained to the same path segment, and a silent overwriting of param values in the case of field values being duplicated across multiple segments.
2017-04-20 08:41:55 -06:00 · 2017-04-20 08:41:55 -06:00 · b1517e8104
parent 082dca5991
commit b1517e8104
3 changed files with 153 additions and 43 deletions
--- a/falcon/api.py
+++ b/falcon/api.py
@ -341,9 +341,11 @@ class API(object):
                pass

        Individual path segments may contain one or more field
-        expressions::
+        expressions, and fields need not span the entire path
+        segment. For example::

            /repos/{org}/{repo}/compare/{usr0}:{branch0}...{usr1}:{branch1}
+            /serviceRoot/People('{name}')

        Note:
            Because field names correspond to argument names in responder
--- a/falcon/routing/compiled.py
+++ b/falcon/routing/compiled.py
@ -18,7 +18,8 @@ import keyword
 import re


-TAB_STR = ' ' * 4
+_FIELD_REGEX = re.compile('{([^}]*)}')
+_TAB_STR = ' ' * 4


 class CompiledRouter(object):
@ -57,13 +58,27 @@ class CompiledRouter(object):
            raise ValueError('URI templates may not include whitespace.')

        # NOTE(kgriffs): Ensure fields are valid Python identifiers,
-        # since they will be passed as kwargs to responders.
-        fields = re.findall('{([^}]*)}', uri_template)
-        for field in fields:
-            is_identifier = re.match('[A-Za-z_][A-Za-z0-9_]*$', field)
-            if not is_identifier or field in keyword.kwlist:
+        # since they will be passed as kwargs to responders. Also
+        # ensure there are no duplicate names, since that causes the
+        # following problems:
+        #
+        #   1. For simple nodes, values from deeper nodes overwrite
+        #      values from more shallow nodes.
+        #   2. For complex nodes, re.compile() raises a nasty error
+        #
+        fields = _FIELD_REGEX.findall(uri_template)
+        used_names = set()
+        for name in fields:
+            is_identifier = re.match('[A-Za-z_][A-Za-z0-9_]*$', name)
+            if not is_identifier or name in keyword.kwlist:
                raise ValueError('Field names must be valid identifiers '
-                                 '(`{}` invalid).'.format(field))
+                                 "('{}' is not valid).".format(name))
+
+            if name in used_names:
+                raise ValueError('Field names may not be duplicated '
+                                 "('{}' was used more than once)".format(name))
+
+            used_names.add(name)

        path = uri_template.strip('/').split('/')

@ -141,7 +156,7 @@ class CompiledRouter(object):
        """Generates Python code for a routing tree or subtree."""

        def line(text, indent_offset=0):
-            pad = TAB_STR * (indent + indent_offset)
+            pad = _TAB_STR * (indent + indent_offset)
            self._code_lines.append(pad + text)

        # NOTE(kgriffs): Base case
@ -246,14 +261,14 @@ class CompiledRouter(object):
        self._expressions = []
        self._code_lines = [
            'def find(path, return_values, expressions, params):',
-            TAB_STR + 'path_len = len(path)',
+            _TAB_STR + 'path_len = len(path)',
        ]

        self._compile_tree(self._roots)

        self._code_lines.append(
            # PERF(kgriffs): Explicit return of None is faster than implicit
-            TAB_STR + 'return None'
+            _TAB_STR + 'return None'
        )

        self._src = '\n'.join(self._code_lines)
@ -267,8 +282,6 @@ class CompiledRouter(object):
 class CompiledRouterNode(object):
    """Represents a single URI segment in a URI."""

-    _regex_vars = re.compile('{([-_a-zA-Z0-9]+)}')
-
    def __init__(self, raw_segment,
                 method_map=None, resource=None, uri_template=None):
        self.children = []
@ -282,39 +295,50 @@ class CompiledRouterNode(object):
        self.is_complex = False
        self.var_name = None

-        seg = raw_segment.replace('.', '\\.')
+        # NOTE(kgriffs): CompiledRouter.add_route validates field names,
+        # so here we can just assume they are OK and use the simple
+        # _FIELD_REGEX to match them.
+        matches = list(_FIELD_REGEX.finditer(raw_segment))

-        matches = list(self._regex_vars.finditer(seg))
-        if matches:
+        if not matches:
+            self.is_var = False
+        else:
            self.is_var = True
-            # NOTE(richardolsson): if there is a single variable and it spans
-            # the entire segment, the segment is uncomplex and the variable
-            # name is simply the string contained within curly braces.
-            if len(matches) == 1 and matches[0].span() == (0, len(seg)):
+
+            if len(matches) == 1 and matches[0].span() == (0, len(raw_segment)):
+                # NOTE(richardolsson): if there is a single variable and
+                # it spans the entire segment, the segment is not
+                # complex and the variable name is simply the string
+                # contained within curly braces.
                self.is_complex = False
                self.var_name = raw_segment[1:-1]
            else:
-                # NOTE(richardolsson): Complex segments need to be converted
-                # into regular expressions will be used to match and extract
-                # variable values. The regular expressions contain both
-                # literal spans and named group expressions for the variables.
+                # NOTE(richardolsson): Complex segments need to be
+                # converted into regular expressions in order to match
+                # and extract variable values. The regular expressions
+                # contain both literal spans and named group expressions
+                # for the variables.
+
+                # NOTE(kgriffs): Don't use re.escape() since we do not
+                # want to escape '{' or '}', and we don't want to
+                # introduce any unexpected side-effects by escaping
+                # non-ASCII characters (it is probably safe, but let's
+                # not take that chance in a minor point release).
+                #
+                # NOTE(kgriffs): The substitution template parser in the
+                # re library does not look ahead when collapsing '\\':
+                # therefore in the case of r'\\g<0>' the first r'\\'
+                # would be consumed and collapsed to r'\', and then the
+                # parser would examine 'g<0>' and not realize it is a
+                # group-escape sequence. So we add an extra backslash to
+                # trick the parser into doing the right thing.
+                escaped_segment = re.sub(r'[\.\(\)\[\]\?\$\*\+\^\|]', r'\\\g<0>', raw_segment)
+
+                seg_pattern = _FIELD_REGEX.sub(r'(?P<\1>.+)', escaped_segment)
+                seg_pattern = '^' + seg_pattern + '$'
+
                self.is_complex = True
-                seg_fields = []
-                prev_end_idx = 0
-                for match in matches:
-                    var_start_idx, var_end_idx = match.span()
-                    seg_fields.append(seg[prev_end_idx:var_start_idx])
-
-                    var_name = match.groups()[0].replace('-', '_')
-                    seg_fields.append('(?P<%s>[^/]+)' % var_name)
-
-                    prev_end_idx = var_end_idx
-
-                seg_fields.append(seg[prev_end_idx:])
-                seg_pattern = ''.join(seg_fields)
                self.var_regex = re.compile(seg_pattern)
-        else:
-            self.is_var = False

    def matches(self, segment):
        """Returns True if this node matches the supplied template segment."""
@ -365,8 +389,8 @@ class CompiledRouterNode(object):
            #
            if self.is_complex:
                if other.is_complex:
-                    return (self._regex_vars.sub('v', self.raw_segment) ==
-                            self._regex_vars.sub('v', segment))
+                    return (_FIELD_REGEX.sub('v', self.raw_segment) ==
+                            _FIELD_REGEX.sub('v', segment))

                return False
            else:
--- a/tests/test_default_router.py
+++ b/tests/test_default_router.py
@ -105,7 +105,7 @@ class ResourceWithId(object):
 # =====================================================================


-def test_regression_versioned_url():
+def test_user_regression_versioned_url():
    router = DefaultRouter()
    router.add_route('/{version}/messages', {}, ResourceWithId(2))

@ -127,7 +127,7 @@ def test_regression_versioned_url():
    assert route is None


-def test_regression_case_recipes():
+def test_user_regression_recipes():
    router = DefaultRouter()
    router.add_route(
        '/recipes/{activity}/{type_id}', {}, ResourceWithId(1))
@ -144,6 +144,63 @@ def test_regression_case_recipes():
    assert route is None


+@pytest.mark.parametrize('uri_template,path,expected_params', [
+    ('/serviceRoot/People|{field}', '/serviceRoot/People|susie', {'field': 'susie'}),
+    ('/serviceRoot/People[{field}]', "/serviceRoot/People['calvin']", {'field': "'calvin'"}),
+    ('/serviceRoot/People({field})', "/serviceRoot/People('hobbes')", {'field': "'hobbes'"}),
+    ('/serviceRoot/People({field})', "/serviceRoot/People('hob)bes')", {'field': "'hob)bes'"}),
+    ('/serviceRoot/People({field})(z)', '/serviceRoot/People(hobbes)(z)', {'field': 'hobbes'}),
+    ("/serviceRoot/People('{field}')", "/serviceRoot/People('rosalyn')", {'field': 'rosalyn'}),
+    ('/^{field}', '/^42', {'field': '42'}),
+    ('/+{field}', '/+42', {'field': '42'}),
+    (
+        '/foo/{first}_{second}/bar',
+        '/foo/abc_def_ghijk/bar',
+
+        # NOTE(kgriffs): The regex pattern is greedy, so this is
+        # expected. We can not change this behavior in a minor
+        # release, since it would be a breaking change. If there
+        # is enough demand for it, we could introduce an option
+        # to toggle this behavior.
+        {'first': 'abc_def', 'second': 'ghijk'},
+    ),
+
+    # NOTE(kgriffs): Why someone would use a question mark like this
+    # I have no idea (esp. since it would have to be encoded to avoid
+    # being mistaken for the query string separator). Including it only
+    # for completeness.
+    ('/items/{x}?{y}', '/items/1080?768', {'x': '1080', 'y': '768'}),
+
+    ('/items/{x}|{y}', '/items/1080|768', {'x': '1080', 'y': '768'}),
+    ('/items/{x},{y}', '/items/1080,768', {'x': '1080', 'y': '768'}),
+    ('/items/{x}^^{y}', '/items/1080^^768', {'x': '1080', 'y': '768'}),
+    ('/items/{x}*{y}*', '/items/1080*768*', {'x': '1080', 'y': '768'}),
+    ('/thing-2/something+{field}+', '/thing-2/something+42+', {'field': '42'}),
+    ('/thing-2/something*{field}/notes', '/thing-2/something*42/notes', {'field': '42'}),
+    (
+        '/thing-2/something+{field}|{q}/notes',
+        '/thing-2/something+else|z/notes',
+        {'field': 'else', 'q': 'z'},
+    ),
+    (
+        "serviceRoot/$metadata#Airports('{field}')/Name",
+        "serviceRoot/$metadata#Airports('KSFO')/Name",
+        {'field': 'KSFO'},
+    ),
+])
+def test_user_regression_special_chars(uri_template, path, expected_params):
+    router = DefaultRouter()
+
+    router.add_route(uri_template, {}, ResourceWithId(1))
+
+    route = router.find(path)
+    assert route is not None
+
+    resource, __, params, __ = route
+    assert resource.resource_id == 1
+    assert params == expected_params
+
+
 # =====================================================================
 # Other tests
 # =====================================================================
@ -168,6 +225,33 @@ def test_root_path():
    assert resource.resource_id == 42


+@pytest.mark.parametrize('uri_template', [
+    '/{field}{field}',
+    '/{field}...{field}',
+    '/{field}/{another}/{field}',
+    '/{field}/something/something/{field}/something',
+])
+def test_duplicate_field_names(uri_template):
+    router = DefaultRouter()
+    with pytest.raises(ValueError):
+        router.add_route(uri_template, {}, ResourceWithId(1))
+
+
+@pytest.mark.parametrize('uri_template,path', [
+    ('/items/thing', '/items/t'),
+    ('/items/{x}|{y}|', '/items/1080|768'),
+    ('/items/{x}*{y}foo', '/items/1080*768foobar'),
+    ('/items/{x}*768*', '/items/1080*768***'),
+])
+def test_match_entire_path(uri_template, path):
+    router = DefaultRouter()
+
+    router.add_route(uri_template, {}, ResourceWithId(1))
+
+    route = router.find(path)
+    assert route is None
+
+
@pytest.mark.parametrize('uri_template', [
    '/teams/{collision}',  # simple vs simple
    '/emojis/signs/{id_too}',  # another simple vs simple