build smaller name regexes for validation

The previous name regexes were built by walking the entire utf8 4 byte
space and building out a regex with all characters that were
printable. When dumped over the wire on a validation error this
creates a nearly 2 MB text string (which is also pushed to the logs).

We can be smarter and assemble character ranges because a-z in regex
means all character values between those 2 character positions. This
makes a validate regex approximately 4% the size of the previous one
(len 1920 vs. len 54263). It thus mitigates the data dump (though it
does not really give a more clear message).

Tests were added to ensure we were building the ranges correctly, as
this is sufficiently tricky.

There will be follow up to create a better error message. A few
additional low level validation tests of the regex themselves were
added in the process of debugging the exclusion support for cells.

Change-Id: I1d3cc40928d80b4397756846ab749b8bacf17fc9
Partial-Bug: #1541691
This commit is contained in:
Sean Dague 2016-02-16 11:45:47 -05:00
parent dbdf27021c
commit f467af7c55
2 changed files with 124 additions and 24 deletions

View File

@ -48,31 +48,72 @@ def _get_all_chars():
# empty string is tested. Otherwise it is not deterministic which
# constraint fails and this causes issues for some unittests when
# PYTHONHASHSEED is set randomly.
def _get_printable(exclude=None):
def _build_regex_range(ws=True, invert=False, exclude=None):
"""Build a range regex for a set of characters in utf8.
This builds a valid range regex for characters in utf8 by
iterating the entire space and building up a set of x-y ranges for
all the characters we find which are valid.
:param ws: should we include whitespace in this range.
:param exclude: any characters we want to exclude
:param invert: invert the logic
The inversion is useful when we want to generate a set of ranges
which is everything that's not a certain class. For instance,
produce all all the non printable characters as a set of ranges.
"""
if exclude is None:
exclude = []
return ''.join(c for c in _get_all_chars()
if _is_printable(c) and c not in exclude)
regex = ""
# are we currently in a range
in_range = False
# last character we found, for closing ranges
last = None
# last character we added to the regex, this lets us know that we
# already have B in the range, which means we don't need to close
# it out with B-B. While the later seems to work, it's kind of bad form.
last_added = None
def valid_char(char):
if char in exclude:
result = False
elif ws:
result = _is_printable(char)
else:
# Zs is the unicode class for space characters, of which
# there are about 10 in this range.
result = (_is_printable(char) and
unicodedata.category(char) != "Zs")
if invert is True:
return not result
return result
_printable_ws = ''.join(c for c in _get_all_chars()
if unicodedata.category(c) == "Zs")
def _get_printable_no_ws(exclude=None):
if exclude is None:
exclude = []
return ''.join(c for c in _get_all_chars()
if _is_printable(c) and
unicodedata.category(c) != "Zs" and
c not in exclude)
# iterate through the entire character range. in_
for c in _get_all_chars():
if valid_char(c):
if not in_range:
regex += re.escape(c)
last_added = c
in_range = True
else:
if in_range and last != last_added:
regex += "-" + re.escape(last)
in_range = False
last = c
else:
if in_range:
regex += "-" + re.escape(c)
return regex
valid_name_regex_base = '^(?![%s])[%s]*(?<![%s])$'
valid_name_regex = valid_name_regex_base % (
re.escape(_printable_ws), re.escape(_get_printable()),
re.escape(_printable_ws))
_build_regex_range(ws=False, invert=True),
_build_regex_range(),
_build_regex_range(ws=False, invert=True))
# This regex allows leading/trailing whitespace
@ -82,22 +123,22 @@ valid_name_leading_trailing_spaces_regex_base = (
valid_cell_name_regex = valid_name_regex_base % (
re.escape(_printable_ws),
re.escape(_get_printable(exclude=['!', '.', '@'])),
re.escape(_printable_ws))
_build_regex_range(ws=False, invert=True),
_build_regex_range(exclude=['!', '.', '@']),
_build_regex_range(ws=False, invert=True))
# cell's name disallow '!', '.' and '@'.
valid_cell_name_leading_trailing_spaces_regex = (
valid_name_leading_trailing_spaces_regex_base % {
'ws': re.escape(_printable_ws),
'no_ws': re.escape(_get_printable_no_ws(exclude=['!', '.', '@']))})
'ws': _build_regex_range(exclude=['!', '.', '@']),
'no_ws': _build_regex_range(ws=False, exclude=['!', '.', '@'])})
valid_name_leading_trailing_spaces_regex = (
valid_name_leading_trailing_spaces_regex_base % {
'ws': re.escape(_printable_ws),
'no_ws': re.escape(_get_printable_no_ws())})
'ws': _build_regex_range(),
'no_ws': _build_regex_range(ws=False)})
valid_name_regex_obj = re.compile(valid_name_regex, re.UNICODE)
@ -107,7 +148,7 @@ valid_description_regex_base = '^[%s]*$'
valid_description_regex = valid_description_regex_base % (
re.escape(_get_printable()))
_build_regex_range())
boolean = {

View File

@ -15,6 +15,9 @@
import copy
import re
import fixtures
import six
from nova.api.openstack import api_version_request as api_version
from nova.api import validation
from nova.api.validation import parameter_types
@ -31,6 +34,62 @@ class FakeRequest(object):
return self.legacy_v2
class ValidationRegex(test.NoDBTestCase):
def test_cell_names(self):
cellre = re.compile(parameter_types.valid_cell_name_regex)
self.assertTrue(cellre.search('foo'))
self.assertFalse(cellre.search('foo.bar'))
self.assertFalse(cellre.search('foo@bar'))
self.assertFalse(cellre.search('foo!bar'))
self.assertFalse(cellre.search(' foo!bar'))
self.assertFalse(cellre.search('\nfoo!bar'))
def test_build_regex_range(self):
# this is much easier to think about if we only use the ascii
# subset because it's a printable range we can think
# about. The algorithm works for all ranges.
def _get_all_chars():
for i in range(0x7F):
yield six.unichr(i)
self.useFixture(fixtures.MonkeyPatch(
'nova.api.validation.parameter_types._get_all_chars',
_get_all_chars))
r = parameter_types._build_regex_range(ws=False)
self.assertEqual(r, re.escape('!') + '-' + re.escape('~'))
# if we allow whitespace the range starts earlier
r = parameter_types._build_regex_range(ws=True)
self.assertEqual(r, re.escape(' ') + '-' + re.escape('~'))
# excluding a character will give us 2 ranges
r = parameter_types._build_regex_range(ws=True, exclude=['A'])
self.assertEqual(r,
re.escape(' ') + '-' + re.escape('@') +
'B' + '-' + re.escape('~'))
# inverting which gives us all the initial unprintable characters.
r = parameter_types._build_regex_range(ws=False, invert=True)
self.assertEqual(r,
re.escape('\x00') + '-' + re.escape(' '))
# excluding characters that create a singleton. Naively this would be:
# ' -@B-BD-~' which seems to work, but ' -@BD-~' is more natural.
r = parameter_types._build_regex_range(ws=True, exclude=['A', 'C'])
self.assertEqual(r,
re.escape(' ') + '-' + re.escape('@') +
'B' + 'D' + '-' + re.escape('~'))
# ws=True means the positive regex has printable whitespaces,
# so the inverse will not. The inverse will include things we
# exclude.
r = parameter_types._build_regex_range(
ws=True, exclude=['A', 'B', 'C', 'Z'], invert=True)
self.assertEqual(r,
re.escape('\x00') + '-' + re.escape('\x1f') + 'A-CZ')
class APIValidationTestCase(test.NoDBTestCase):
def check_validation_error(self, method, body, expected_detail, req=None):