Merge "Fix codec error when format=csv"

This commit is contained in:
Zuul 2017-12-04 15:48:15 +00:00 committed by Gerrit Code Review
commit e14f905a4f
4 changed files with 122 additions and 7 deletions

View File

@ -118,10 +118,10 @@ class App(object):
stdin = codecs.getreader(encoding)(sys.stdin)
if not (stdout or isinstance(sys.stdout, codecs.StreamWriter)):
stdout = codecs.getwriter(encoding)(sys.stdout)
stdout = utils.getwriter(encoding)(sys.stdout)
if not (stderr or isinstance(sys.stderr, codecs.StreamWriter)):
stderr = codecs.getwriter(encoding)(sys.stderr)
stderr = utils.getwriter(encoding)(sys.stderr)
self.stdin = stdin or sys.stdin
self.stdout = stdout or sys.stdout

View File

@ -47,11 +47,24 @@ class CSVLister(ListFormatter):
)
def emit_list(self, column_names, data, stdout, parsed_args):
writer = csv.writer(stdout,
quoting=self.QUOTE_MODES[parsed_args.quote_mode],
lineterminator=os.linesep,
escapechar='\\',
)
writer_kwargs = dict(
quoting=self.QUOTE_MODES[parsed_args.quote_mode],
lineterminator=os.linesep,
escapechar='\\',
)
# In Py2 we replace the csv module with unicodecsv because the
# Py2 csv module cannot handle unicode. unicodecsv encodes
# unicode objects based on the value of its encoding keyword,
# with the result that unicodecsv emits encoded bytes in a str
# object. The utils.getwriter ensures no attempt is made to
# re-encode the encoded bytes in the str object.
if six.PY2:
writer_kwargs['encoding'] = (getattr(stdout, 'encoding', None)
or 'utf-8')
writer = csv.writer(stdout, **writer_kwargs)
writer.writerow(column_names)
for row in data:
writer.writerow(

View File

@ -498,3 +498,53 @@ class TestIO(base.TestBase):
self.assertIs(sys.stdin, app.stdin)
self.assertIs(sys.stdout, app.stdout)
self.assertIs(io, app.stderr)
def test_writer_encoding(self):
    # Sample payload: the word "test" with its "e" swapped for
    # U+00E9 (latin small letter e with acute), whose utf-8
    # encoding is the byte pair 0xC3 0xA9.
    text = u't\u00E9st'
    text_utf8 = text.encode('utf-8')

    if not six.PY2:
        # In PY3 text functions require text and encoded bytes cannot
        # be written to a text writer, so a TypeError is expected for
        # both the unicode payload and the already-encoded payload.
        for payload in (text, text_utf8):
            buf = six.StringIO()
            stream_writer = utils.getwriter('utf-8')(buf)
            self.assertRaises(TypeError, stream_writer.write, payload)
        return

    # In PY2 the stock StreamWriter cannot accept non-ASCII encoded
    # bytes: it first has to promote the byte string to unicode before
    # encoding it, and since the encoding of those bytes is unknown the
    # default ASCII codec is used, which cannot decode a non-ASCII
    # character.
    buf = six.StringIO()
    stream_writer = codecs.getwriter('utf-8')(buf)
    self.assertRaises(UnicodeDecodeError, stream_writer.write, text_utf8)

    # Our override of codecs.getwriter only encodes unicode objects and
    # leaves bytes held in a str object alone, so either payload must
    # come out as the utf-8 encoded byte sequence.
    for payload in (text, text_utf8):
        buf = six.StringIO()
        stream_writer = utils.getwriter('utf-8')(buf)
        stream_writer.write(payload)
        self.assertEqual(text_utf8, buf.getvalue())

View File

@ -11,11 +11,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import ctypes
import os
import struct
import sys
import six
# Each edit operation is assigned different cost, such as:
# 'w' means swap operation, the cost is 0;
# 's' means substitution operation, the cost is 2;
@ -153,3 +156,52 @@ def _get_terminal_width_ioctl(stdout):
return columns
except IOError:
return None
if six.PY2:
    def getwriter(encoding):
        '''Override codecs.getwriter() to prevent codec errors.

        The StreamWriter returned by codecs.getwriter has an unfortunate
        property: it will attempt to encode every object presented to its
        write() function. Normally we only want unicode objects to be
        encoded to a byte stream. If bytes are presented (e.g. str in
        Python2) we make the assumption those bytes represent an already
        encoded text stream or they are indeed binary bytes and hence
        should not be encoded.

        When the core StreamWriter attempts to encode a str object Python
        will first promote the str object to a unicode object. The
        promotion of str to unicode requires the str bytes to be
        decoded. However the encoding associated with the str object is
        not known therefore Python applies the default-encoding which is
        ASCII. In the case where the str object contains utf-8 encoded
        non-ASCII characters a decoding error is raised. By not attempting
        to encode a byte stream we avoid this error.

        It really does not make much sense to try and encode a byte
        stream. First of all a byte stream should not be encoded if it's
        not text (e.g. binary data). If the byte stream is encoded text
        the only way to re-encode it is if we know its encoding so we
        can decode it into a canonical form (e.g. unicode). Thus to
        re-encode it we encode from the canonical form (e.g. unicode) to
        the new binary encoding. The problem in Python2 is we never know
        if the bytes in a str object are text or binary data and if it's
        text which encoding it is, hence we should not try to apply
        an encoding to a str object.

        :param encoding: name of the codec used to encode unicode objects
        :returns: a codecs.StreamWriter subclass; instantiate it with the
                  stream to wrap (and optionally an errors policy)
        '''
        class _StreamWriter(codecs.StreamWriter):

            def encode(self, msg, errors='strict'):
                # Only unicode objects are encoded; a str (bytes) object
                # is passed through untouched, reported as fully consumed.
                if isinstance(msg, six.text_type):
                    return self.encoder(msg, errors)
                return msg, len(msg)

        # Wrap the encoder in staticmethod so that it is never bound to
        # the instance. Some codecs (e.g. utf-8-sig) expose a pure-Python
        # function as their encoder; stored as a plain class attribute it
        # would become a method in Python2 and receive ``self`` as an
        # unexpected extra first argument.
        _StreamWriter.encoder = staticmethod(codecs.getencoder(encoding))
        _StreamWriter.encoding = encoding
        return _StreamWriter
else:
    getwriter = codecs.getwriter