build smaller name regexes for validation

The previous name regexes were built by walking the entire utf8 4 byte space and building out a regex with all characters that were printable. When dumped over the wire on a validation error this creates a nearly 2 MB text string (which is also pushed to the logs). We can be smarter and assemble character ranges because a-z in regex means all character values between those 2 character positions. This makes a validate regex approximately 4% the size of the previous one (len 1920 vs. len 54263). It thus mitigates the data dump (though it does not really give a more clear message). Tests were added to ensure we were building the ranges correctly, as this is sufficiently tricky. There will be follow up to create a better error message. A few additional low level validation tests of the regex themselves were added in the process of debugging the exclusion support for cells. Change-Id: I1d3cc40928d80b4397756846ab749b8bacf17fc9 Partial-Bug: #1541691
2016-02-16 11:45:47 -05:00 · 2016-02-16 11:45:47 -05:00 · f467af7c55
parent dbdf27021c
commit f467af7c55
2 changed files with 124 additions and 24 deletions
--- a/nova/api/validation/parameter_types.py
+++ b/nova/api/validation/parameter_types.py
@ -48,31 +48,72 @@ def _get_all_chars():
 # empty string is tested. Otherwise it is not deterministic which
 # constraint fails and this causes issues for some unittests when
 # PYTHONHASHSEED is set randomly.
-def _get_printable(exclude=None):
+
+def _build_regex_range(ws=True, invert=False, exclude=None):
+    """Build a range regex for a set of characters in utf8.
+
+    This builds a valid range regex for characters in utf8 by
+    iterating the entire space and building up a set of x-y ranges for
+    all the characters we find which are valid.
+
+    :param ws: should we include whitespace in this range.
+    :param exclude: any characters we want to exclude
+    :param invert: invert the logic
+
+    The inversion is useful when we want to generate a set of ranges
+    which is everything that's not a certain class. For instance,
+    produce all all the non printable characters as a set of ranges.
+    """
    if exclude is None:
        exclude = []
-    return ''.join(c for c in _get_all_chars()
-                       if _is_printable(c) and c not in exclude)
+    regex = ""
+    # are we currently in a range
+    in_range = False
+    # last character we found, for closing ranges
+    last = None
+    # last character we added to the regex, this lets us know that we
+    # already have B in the range, which means we don't need to close
+    # it out with B-B. While the later seems to work, it's kind of bad form.
+    last_added = None

+    def valid_char(char):
+        if char in exclude:
+            result = False
+        elif ws:
+            result = _is_printable(char)
+        else:
+            # Zs is the unicode class for space characters, of which
+            # there are about 10 in this range.
+            result = (_is_printable(char) and
+                      unicodedata.category(char) != "Zs")
+        if invert is True:
+            return not result
+        return result

-_printable_ws = ''.join(c for c in _get_all_chars()
-                        if unicodedata.category(c) == "Zs")
-
-
-def _get_printable_no_ws(exclude=None):
-    if exclude is None:
-        exclude = []
-    return ''.join(c for c in _get_all_chars()
-                   if _is_printable(c) and
-                       unicodedata.category(c) != "Zs" and
-                       c not in exclude)
+    # iterate through the entire character range. in_
+    for c in _get_all_chars():
+        if valid_char(c):
+            if not in_range:
+                regex += re.escape(c)
+                last_added = c
+            in_range = True
+        else:
+            if in_range and last != last_added:
+                regex += "-" + re.escape(last)
+            in_range = False
+        last = c
+    else:
+        if in_range:
+            regex += "-" + re.escape(c)
+    return regex

 valid_name_regex_base = '^(?![%s])[%s]*(?<![%s])$'


 valid_name_regex = valid_name_regex_base % (
-    re.escape(_printable_ws), re.escape(_get_printable()),
-    re.escape(_printable_ws))
+    _build_regex_range(ws=False, invert=True),
+    _build_regex_range(),
+    _build_regex_range(ws=False, invert=True))


 # This regex allows leading/trailing whitespace
@ -82,22 +123,22 @@ valid_name_leading_trailing_spaces_regex_base = (


 valid_cell_name_regex = valid_name_regex_base % (
-    re.escape(_printable_ws),
-    re.escape(_get_printable(exclude=['!', '.', '@'])),
-    re.escape(_printable_ws))
+    _build_regex_range(ws=False, invert=True),
+    _build_regex_range(exclude=['!', '.', '@']),
+    _build_regex_range(ws=False, invert=True))


 # cell's name disallow '!',  '.' and '@'.
 valid_cell_name_leading_trailing_spaces_regex = (
    valid_name_leading_trailing_spaces_regex_base % {
-        'ws': re.escape(_printable_ws),
-        'no_ws': re.escape(_get_printable_no_ws(exclude=['!', '.', '@']))})
+        'ws': _build_regex_range(exclude=['!', '.', '@']),
+        'no_ws': _build_regex_range(ws=False, exclude=['!', '.', '@'])})


 valid_name_leading_trailing_spaces_regex = (
    valid_name_leading_trailing_spaces_regex_base % {
-        'ws': re.escape(_printable_ws),
-        'no_ws': re.escape(_get_printable_no_ws())})
+        'ws': _build_regex_range(),
+        'no_ws': _build_regex_range(ws=False)})


 valid_name_regex_obj = re.compile(valid_name_regex, re.UNICODE)
@ -107,7 +148,7 @@ valid_description_regex_base = '^[%s]*$'


 valid_description_regex = valid_description_regex_base % (
-    re.escape(_get_printable()))
+    _build_regex_range())


 boolean = {
--- a/nova/tests/unit/test_api_validation.py
+++ b/nova/tests/unit/test_api_validation.py
@ -15,6 +15,9 @@
 import copy
 import re

+import fixtures
+import six
+
 from nova.api.openstack import api_version_request as api_version
 from nova.api import validation
 from nova.api.validation import parameter_types
@ -31,6 +34,62 @@ class FakeRequest(object):
        return self.legacy_v2


+class ValidationRegex(test.NoDBTestCase):
+    def test_cell_names(self):
+        cellre = re.compile(parameter_types.valid_cell_name_regex)
+        self.assertTrue(cellre.search('foo'))
+        self.assertFalse(cellre.search('foo.bar'))
+        self.assertFalse(cellre.search('foo@bar'))
+        self.assertFalse(cellre.search('foo!bar'))
+        self.assertFalse(cellre.search(' foo!bar'))
+        self.assertFalse(cellre.search('\nfoo!bar'))
+
+    def test_build_regex_range(self):
+        # this is much easier to think about if we only use the ascii
+        # subset because it's a printable range we can think
+        # about. The algorithm works for all ranges.
+        def _get_all_chars():
+            for i in range(0x7F):
+                yield six.unichr(i)
+
+        self.useFixture(fixtures.MonkeyPatch(
+            'nova.api.validation.parameter_types._get_all_chars',
+            _get_all_chars))
+
+        r = parameter_types._build_regex_range(ws=False)
+        self.assertEqual(r, re.escape('!') + '-' + re.escape('~'))
+
+        # if we allow whitespace the range starts earlier
+        r = parameter_types._build_regex_range(ws=True)
+        self.assertEqual(r, re.escape(' ') + '-' + re.escape('~'))
+
+        # excluding a character will give us 2 ranges
+        r = parameter_types._build_regex_range(ws=True, exclude=['A'])
+        self.assertEqual(r,
+                         re.escape(' ') + '-' + re.escape('@') +
+                         'B' + '-' + re.escape('~'))
+
+        # inverting which gives us all the initial unprintable characters.
+        r = parameter_types._build_regex_range(ws=False, invert=True)
+        self.assertEqual(r,
+                         re.escape('\x00') + '-' + re.escape(' '))
+
+        # excluding characters that create a singleton. Naively this would be:
+        # ' -@B-BD-~' which seems to work, but ' -@BD-~' is more natural.
+        r = parameter_types._build_regex_range(ws=True, exclude=['A', 'C'])
+        self.assertEqual(r,
+                         re.escape(' ') + '-' + re.escape('@') +
+                         'B' + 'D' + '-' + re.escape('~'))
+
+        # ws=True means the positive regex has printable whitespaces,
+        # so the inverse will not. The inverse will include things we
+        # exclude.
+        r = parameter_types._build_regex_range(
+            ws=True, exclude=['A', 'B', 'C', 'Z'], invert=True)
+        self.assertEqual(r,
+                         re.escape('\x00') + '-' + re.escape('\x1f') + 'A-CZ')
+
+
 class APIValidationTestCase(test.NoDBTestCase):

    def check_validation_error(self, method, body, expected_detail, req=None):