Merge "Fix codec error when format=csv"
This commit is contained in:
commit
e14f905a4f
|
@ -118,10 +118,10 @@ class App(object):
|
|||
stdin = codecs.getreader(encoding)(sys.stdin)
|
||||
|
||||
if not (stdout or isinstance(sys.stdout, codecs.StreamWriter)):
|
||||
stdout = codecs.getwriter(encoding)(sys.stdout)
|
||||
stdout = utils.getwriter(encoding)(sys.stdout)
|
||||
|
||||
if not (stderr or isinstance(sys.stderr, codecs.StreamWriter)):
|
||||
stderr = codecs.getwriter(encoding)(sys.stderr)
|
||||
stderr = utils.getwriter(encoding)(sys.stderr)
|
||||
|
||||
self.stdin = stdin or sys.stdin
|
||||
self.stdout = stdout or sys.stdout
|
||||
|
|
|
@ -47,11 +47,24 @@ class CSVLister(ListFormatter):
|
|||
)
|
||||
|
||||
def emit_list(self, column_names, data, stdout, parsed_args):
|
||||
writer = csv.writer(stdout,
|
||||
quoting=self.QUOTE_MODES[parsed_args.quote_mode],
|
||||
lineterminator=os.linesep,
|
||||
escapechar='\\',
|
||||
)
|
||||
writer_kwargs = dict(
|
||||
quoting=self.QUOTE_MODES[parsed_args.quote_mode],
|
||||
lineterminator=os.linesep,
|
||||
escapechar='\\',
|
||||
)
|
||||
|
||||
# In Py2 we replace the csv module with unicodecsv because the
|
||||
# Py2 csv module cannot handle unicode. unicodecsv encodes
|
||||
# unicode objects based on the value of its encoding keyword
|
||||
# with the result unicodecsv emits encoded bytes in a str
|
||||
# object. The utils.getwriter assures no attempt is made to
|
||||
# re-encode the encoded bytes in the str object.
|
||||
|
||||
if six.PY2:
|
||||
writer_kwargs['encoding'] = (getattr(stdout, 'encoding', None)
|
||||
or 'utf-8')
|
||||
|
||||
writer = csv.writer(stdout, **writer_kwargs)
|
||||
writer.writerow(column_names)
|
||||
for row in data:
|
||||
writer.writerow(
|
||||
|
|
|
@ -498,3 +498,53 @@ class TestIO(base.TestBase):
|
|||
self.assertIs(sys.stdin, app.stdin)
|
||||
self.assertIs(sys.stdout, app.stdout)
|
||||
self.assertIs(io, app.stderr)
|
||||
|
||||
def test_writer_encoding(self):
    """Check how utils.getwriter treats unicode text and encoded bytes."""
    # Sample value: the word "test" with the e replaced by U+00E9
    # (latin small letter e with acute), which utf-8 encodes as the
    # two bytes 0xC3 0xA9.
    text = u't\u00E9st'
    text_utf8 = text.encode('utf-8')

    if not six.PY2:
        # On PY3 this test expects a TypeError from the writer for both
        # the text value and the already-encoded bytes when the
        # underlying stream is a six.StringIO.
        for payload in (text, text_utf8):
            sink = six.StringIO()
            writer = utils.getwriter('utf-8')(sink)
            self.assertRaises(TypeError,
                              writer.write,
                              payload)
        return

    # On PY2 the stock StreamWriter cannot accept non-ASCII encoded
    # bytes: it first promotes the str to unicode using the default
    # ASCII encoding, and a non-ASCII character cannot be decoded as
    # ASCII.
    sink = six.StringIO()
    stock_writer = codecs.getwriter('utf-8')(sink)
    self.assertRaises(UnicodeDecodeError,
                      stock_writer.write,
                      text_utf8)

    # With the utils.getwriter override only unicode objects are
    # encoded; bytes held in a str are passed through untouched. Either
    # way the stream should end up holding the utf-8 byte sequence.
    for payload in (text, text_utf8):
        sink = six.StringIO()
        writer = utils.getwriter('utf-8')(sink)
        writer.write(payload)
        self.assertEqual(text_utf8, sink.getvalue())
|
||||
|
|
|
@ -11,11 +11,14 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import codecs
|
||||
import ctypes
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
|
||||
import six
|
||||
|
||||
# Each edit operation is assigned a different cost, such as:
|
||||
# 'w' means swap operation, the cost is 0;
|
||||
# 's' means substitution operation, the cost is 2;
|
||||
|
@ -153,3 +156,52 @@ def _get_terminal_width_ioctl(stdout):
|
|||
return columns
|
||||
except IOError:
|
||||
return None
|
||||
|
||||
|
||||
if six.PY2:
    def getwriter(encoding):
        '''Override codecs.getwriter() to prevent codec errors.

        The StreamWriter returned by codecs.getwriter has an unfortunate
        property, it will attempt to encode every object presented to its
        write() function. Normally we only want unicode objects to be
        encoded to a byte stream. If bytes are presented (e.g. str in
        Python2) we make the assumption those bytes represent an already
        encoded text stream or they are indeed binary bytes and hence
        should not be encoded.

        When the core StreamWriter attempts to encode a str object Python
        will first promote the str object to a unicode object. The
        promotion of str to unicode requires the str bytes to be
        decoded. However the encoding associated with the str object is
        not known therefore Python applies the default-encoding which is
        ASCII. In the case where the str object contains utf-8 encoded
        non-ASCII characters a decoding error is raised. By not attempting
        to encode a byte stream we avoid this error.

        It really does not make much sense to try and encode a byte
        stream. First of all a byte stream should not be encoded if it's
        not text (e.g. binary data). If the byte stream is encoded text
        the only way to re-encode it is if we know its encoding so we
        can decode it into a canonical form (e.g. unicode). Thus to
        re-encode it we encode from the canonical form (e.g. unicode) to
        the new binary encoding. The problem in Python2 is we never know
        if the bytes in a str object are text or binary data and if it's
        text which encoding it is, hence we should not try to apply
        an encoding to a str object.

        :param encoding: name of the codec used to encode unicode objects
        :returns: a codecs.StreamWriter subclass; call it with a stream
                  (and optionally an errors mode) to get a writer
        '''
        class _StreamWriter(codecs.StreamWriter):
            def encode(self, msg, errors='strict'):
                # Only unicode objects go through the codec; anything
                # else is handed back unchanged with its length, in the
                # (output, length consumed) form encode() must return.
                if isinstance(msg, six.text_type):
                    return self.encoder(msg, errors)
                return msg, len(msg)

        # staticmethod stops an encoder implemented as a plain Python
        # function from being turned into a method, which would pass the
        # writer instance as the encoder's first argument.
        _StreamWriter.encoder = staticmethod(codecs.getencoder(encoding))
        _StreamWriter.encoding = encoding
        return _StreamWriter

else:
    # Outside Python 2 the stock implementation is used as is.
    getwriter = codecs.getwriter
|
||||
|
|
Loading…
Reference in New Issue