build smaller name regexes for validation
The previous name regexes were built by walking the entire utf8 4 byte space and building out a regex with all characters that were printable. When dumped over the wire on a validation error this creates a nearly 2 MB text string (which is also pushed to the logs). We can be smarter and assemble character ranges because a-z in regex means all character values between those 2 character positions. This makes a validate regex approximately 4% the size of the previous one (len 1920 vs. len 54263). It thus mitigates the data dump (though it does not really give a more clear message). Tests were added to ensure we were building the ranges correctly, as this is sufficiently tricky. There will be follow up to create a better error message. A few additional low level validation tests of the regex themselves were added in the process of debugging the exclusion support for cells. Change-Id: I1d3cc40928d80b4397756846ab749b8bacf17fc9 Partial-Bug: #1541691
This commit is contained in:
parent
dbdf27021c
commit
f467af7c55
|
@ -48,31 +48,72 @@ def _get_all_chars():
|
|||
# empty string is tested. Otherwise it is not deterministic which
|
||||
# constraint fails and this causes issues for some unittests when
|
||||
# PYTHONHASHSEED is set randomly.
|
||||
def _get_printable(exclude=None):
|
||||
|
||||
def _build_regex_range(ws=True, invert=False, exclude=None):
|
||||
"""Build a range regex for a set of characters in utf8.
|
||||
|
||||
This builds a valid range regex for characters in utf8 by
|
||||
iterating the entire space and building up a set of x-y ranges for
|
||||
all the characters we find which are valid.
|
||||
|
||||
:param ws: should we include whitespace in this range.
|
||||
:param exclude: any characters we want to exclude
|
||||
:param invert: invert the logic
|
||||
|
||||
The inversion is useful when we want to generate a set of ranges
|
||||
which is everything that's not a certain class. For instance,
|
||||
produce all all the non printable characters as a set of ranges.
|
||||
"""
|
||||
if exclude is None:
|
||||
exclude = []
|
||||
return ''.join(c for c in _get_all_chars()
|
||||
if _is_printable(c) and c not in exclude)
|
||||
regex = ""
|
||||
# are we currently in a range
|
||||
in_range = False
|
||||
# last character we found, for closing ranges
|
||||
last = None
|
||||
# last character we added to the regex, this lets us know that we
|
||||
# already have B in the range, which means we don't need to close
|
||||
# it out with B-B. While the later seems to work, it's kind of bad form.
|
||||
last_added = None
|
||||
|
||||
def valid_char(char):
|
||||
if char in exclude:
|
||||
result = False
|
||||
elif ws:
|
||||
result = _is_printable(char)
|
||||
else:
|
||||
# Zs is the unicode class for space characters, of which
|
||||
# there are about 10 in this range.
|
||||
result = (_is_printable(char) and
|
||||
unicodedata.category(char) != "Zs")
|
||||
if invert is True:
|
||||
return not result
|
||||
return result
|
||||
|
||||
_printable_ws = ''.join(c for c in _get_all_chars()
|
||||
if unicodedata.category(c) == "Zs")
|
||||
|
||||
|
||||
def _get_printable_no_ws(exclude=None):
|
||||
if exclude is None:
|
||||
exclude = []
|
||||
return ''.join(c for c in _get_all_chars()
|
||||
if _is_printable(c) and
|
||||
unicodedata.category(c) != "Zs" and
|
||||
c not in exclude)
|
||||
# iterate through the entire character range. in_
|
||||
for c in _get_all_chars():
|
||||
if valid_char(c):
|
||||
if not in_range:
|
||||
regex += re.escape(c)
|
||||
last_added = c
|
||||
in_range = True
|
||||
else:
|
||||
if in_range and last != last_added:
|
||||
regex += "-" + re.escape(last)
|
||||
in_range = False
|
||||
last = c
|
||||
else:
|
||||
if in_range:
|
||||
regex += "-" + re.escape(c)
|
||||
return regex
|
||||
|
||||
valid_name_regex_base = '^(?![%s])[%s]*(?<![%s])$'
|
||||
|
||||
|
||||
valid_name_regex = valid_name_regex_base % (
|
||||
re.escape(_printable_ws), re.escape(_get_printable()),
|
||||
re.escape(_printable_ws))
|
||||
_build_regex_range(ws=False, invert=True),
|
||||
_build_regex_range(),
|
||||
_build_regex_range(ws=False, invert=True))
|
||||
|
||||
|
||||
# This regex allows leading/trailing whitespace
|
||||
|
@ -82,22 +123,22 @@ valid_name_leading_trailing_spaces_regex_base = (
|
|||
|
||||
|
||||
valid_cell_name_regex = valid_name_regex_base % (
|
||||
re.escape(_printable_ws),
|
||||
re.escape(_get_printable(exclude=['!', '.', '@'])),
|
||||
re.escape(_printable_ws))
|
||||
_build_regex_range(ws=False, invert=True),
|
||||
_build_regex_range(exclude=['!', '.', '@']),
|
||||
_build_regex_range(ws=False, invert=True))
|
||||
|
||||
|
||||
# cell's name disallow '!', '.' and '@'.
|
||||
valid_cell_name_leading_trailing_spaces_regex = (
|
||||
valid_name_leading_trailing_spaces_regex_base % {
|
||||
'ws': re.escape(_printable_ws),
|
||||
'no_ws': re.escape(_get_printable_no_ws(exclude=['!', '.', '@']))})
|
||||
'ws': _build_regex_range(exclude=['!', '.', '@']),
|
||||
'no_ws': _build_regex_range(ws=False, exclude=['!', '.', '@'])})
|
||||
|
||||
|
||||
valid_name_leading_trailing_spaces_regex = (
|
||||
valid_name_leading_trailing_spaces_regex_base % {
|
||||
'ws': re.escape(_printable_ws),
|
||||
'no_ws': re.escape(_get_printable_no_ws())})
|
||||
'ws': _build_regex_range(),
|
||||
'no_ws': _build_regex_range(ws=False)})
|
||||
|
||||
|
||||
valid_name_regex_obj = re.compile(valid_name_regex, re.UNICODE)
|
||||
|
@ -107,7 +148,7 @@ valid_description_regex_base = '^[%s]*$'
|
|||
|
||||
|
||||
valid_description_regex = valid_description_regex_base % (
|
||||
re.escape(_get_printable()))
|
||||
_build_regex_range())
|
||||
|
||||
|
||||
boolean = {
|
||||
|
|
|
@ -15,6 +15,9 @@
|
|||
import copy
|
||||
import re
|
||||
|
||||
import fixtures
|
||||
import six
|
||||
|
||||
from nova.api.openstack import api_version_request as api_version
|
||||
from nova.api import validation
|
||||
from nova.api.validation import parameter_types
|
||||
|
@ -31,6 +34,62 @@ class FakeRequest(object):
|
|||
return self.legacy_v2
|
||||
|
||||
|
||||
class ValidationRegex(test.NoDBTestCase):
|
||||
def test_cell_names(self):
|
||||
cellre = re.compile(parameter_types.valid_cell_name_regex)
|
||||
self.assertTrue(cellre.search('foo'))
|
||||
self.assertFalse(cellre.search('foo.bar'))
|
||||
self.assertFalse(cellre.search('foo@bar'))
|
||||
self.assertFalse(cellre.search('foo!bar'))
|
||||
self.assertFalse(cellre.search(' foo!bar'))
|
||||
self.assertFalse(cellre.search('\nfoo!bar'))
|
||||
|
||||
def test_build_regex_range(self):
|
||||
# this is much easier to think about if we only use the ascii
|
||||
# subset because it's a printable range we can think
|
||||
# about. The algorithm works for all ranges.
|
||||
def _get_all_chars():
|
||||
for i in range(0x7F):
|
||||
yield six.unichr(i)
|
||||
|
||||
self.useFixture(fixtures.MonkeyPatch(
|
||||
'nova.api.validation.parameter_types._get_all_chars',
|
||||
_get_all_chars))
|
||||
|
||||
r = parameter_types._build_regex_range(ws=False)
|
||||
self.assertEqual(r, re.escape('!') + '-' + re.escape('~'))
|
||||
|
||||
# if we allow whitespace the range starts earlier
|
||||
r = parameter_types._build_regex_range(ws=True)
|
||||
self.assertEqual(r, re.escape(' ') + '-' + re.escape('~'))
|
||||
|
||||
# excluding a character will give us 2 ranges
|
||||
r = parameter_types._build_regex_range(ws=True, exclude=['A'])
|
||||
self.assertEqual(r,
|
||||
re.escape(' ') + '-' + re.escape('@') +
|
||||
'B' + '-' + re.escape('~'))
|
||||
|
||||
# inverting which gives us all the initial unprintable characters.
|
||||
r = parameter_types._build_regex_range(ws=False, invert=True)
|
||||
self.assertEqual(r,
|
||||
re.escape('\x00') + '-' + re.escape(' '))
|
||||
|
||||
# excluding characters that create a singleton. Naively this would be:
|
||||
# ' -@B-BD-~' which seems to work, but ' -@BD-~' is more natural.
|
||||
r = parameter_types._build_regex_range(ws=True, exclude=['A', 'C'])
|
||||
self.assertEqual(r,
|
||||
re.escape(' ') + '-' + re.escape('@') +
|
||||
'B' + 'D' + '-' + re.escape('~'))
|
||||
|
||||
# ws=True means the positive regex has printable whitespaces,
|
||||
# so the inverse will not. The inverse will include things we
|
||||
# exclude.
|
||||
r = parameter_types._build_regex_range(
|
||||
ws=True, exclude=['A', 'B', 'C', 'Z'], invert=True)
|
||||
self.assertEqual(r,
|
||||
re.escape('\x00') + '-' + re.escape('\x1f') + 'A-CZ')
|
||||
|
||||
|
||||
class APIValidationTestCase(test.NoDBTestCase):
|
||||
|
||||
def check_validation_error(self, method, body, expected_detail, req=None):
|
||||
|
|
Loading…
Reference in New Issue