Fix codec error when format=csv

The Py2 version of the csv module cannot accept unicode text. In Py2
we replaced the csv module with the unicodecsv module, which encodes
unicode text prior to calling the standard Py2 csv module. Thus when
the csv formatted output is emitted on stdout it is presented as an
encoded byte stream. But stdout has been replaced with a StreamWriter
which encodes to the desired encoding. The problem is that the
StreamWriter attempts to encode all objects passed to its write function,
including str objects. Instead it should only encode unicode text objects
and allow bytes to pass through unmodified.

This patch adds an override of the codecs.getwriter function which
only encodes unicode text objects. In addition we pass the encoding
value obtained from the stream to the unicodecsv writer.

The patch fixes the codec error when outputting csv formatted text that
contains a non-ASCII character. The unicodecsv implementation will emit
byte encoded str objects to the stream. When the core StreamWriter
attempts to encode a str object Python will first promote the str
object to a unicode object. The promotion of str to unicode requires
the str bytes to be decoded. However the encoding associated with the
str object is not known, therefore Python applies the default-encoding,
which is ASCII. In the case where the str object contains utf-8
encoded non-ASCII characters a decoding error is raised. By not
attempting to encode a byte stream we avoid this error.

A more complete discussion of the above issues can be found here:

https://github.com/fedora-infra/kitchen/blob/develop/kitchen2/docs/unicode-frustrations.rst#frustration-4-now-it-doesnt-take-byte-strings

Conflicts:
    cliff/tests/test_app.py

Change-Id: I22b5ad8bf0e227ec75a2a36986f0487191f7cbc2
Closes-Bug: 1720115
Signed-off-by: John Dennis <jdennis@redhat.com>
(cherry picked from commit c61cc30060)
This commit is contained in:
John Dennis 2017-10-01 11:03:08 -04:00 committed by Julie Pichon
parent 616dde46e2
commit 132c948aed
4 changed files with 122 additions and 7 deletions

View File

@ -118,10 +118,10 @@ class App(object):
stdin = codecs.getreader(encoding)(sys.stdin)
if not (stdout or isinstance(sys.stdout, codecs.StreamWriter)):
stdout = codecs.getwriter(encoding)(sys.stdout)
stdout = utils.getwriter(encoding)(sys.stdout)
if not (stderr or isinstance(sys.stderr, codecs.StreamWriter)):
stderr = codecs.getwriter(encoding)(sys.stderr)
stderr = utils.getwriter(encoding)(sys.stderr)
self.stdin = stdin or sys.stdin
self.stdout = stdout or sys.stdout

View File

@ -47,11 +47,24 @@ class CSVLister(ListFormatter):
)
def emit_list(self, column_names, data, stdout, parsed_args):
writer = csv.writer(stdout,
quoting=self.QUOTE_MODES[parsed_args.quote_mode],
lineterminator=os.linesep,
escapechar='\\',
)
writer_kwargs = dict(
quoting=self.QUOTE_MODES[parsed_args.quote_mode],
lineterminator=os.linesep,
escapechar='\\',
)
# In Py2 we replace the csv module with unicodecsv because the
# Py2 csv module cannot handle unicode. unicodecsv encodes
# unicode objects based on the value of its encoding keyword
# with the result unicodecsv emits encoded bytes in a str
# object. The utils.getwriter assures no attempt is made to
# re-encode the encoded bytes in the str object.
if six.PY2:
writer_kwargs['encoding'] = (getattr(stdout, 'encoding', None)
or 'utf-8')
writer = csv.writer(stdout, **writer_kwargs)
writer.writerow(column_names)
for row in data:
writer.writerow(

View File

@ -496,3 +496,53 @@ def test_io_streams():
assert app.stdin is sys.stdin
assert app.stdout is sys.stdout
assert app.stderr is io
def test_writer_encoding(self):
    """Check how utils.getwriter() treats text versus encoded bytes."""
    # The word "test" with the e replaced by U+00E9 (latin small
    # letter e with acute), which utf-8 encodes as 0xC3 0xA9.
    text = u't\u00E9st'
    text_utf8 = text.encode('utf-8')

    if not six.PY2:
        # In PY3 encoded bytes cannot be written to a text writer;
        # text functions require text, so both writes are rejected.
        stream = six.StringIO()
        writer = utils.getwriter('utf-8')(stream)
        with self.assertRaises(TypeError):
            writer.write(text)

        stream = six.StringIO()
        writer = utils.getwriter('utf-8')(stream)
        with self.assertRaises(TypeError):
            writer.write(text_utf8)
        return

    # In PY2 the stock StreamWriter cannot accept non-ASCII encoded
    # bytes: it first promotes the byte string to unicode so it can
    # encode it, and since the byte string's encoding is unknown the
    # ASCII default-encoding is used, which cannot decode a non-ASCII
    # character.
    stream = six.StringIO()
    writer = codecs.getwriter('utf-8')(stream)
    with self.assertRaises(UnicodeDecodeError):
        writer.write(text_utf8)

    # With our override of codecs.getwriter only unicode objects are
    # encoded, so writing text yields the utf-8 encoded byte sequence.
    stream = six.StringIO()
    writer = utils.getwriter('utf-8')(stream)
    writer.write(text)
    self.assertEqual(text_utf8, stream.getvalue())

    # Bytes held in a str object pass through without being re-encoded.
    stream = six.StringIO()
    writer = utils.getwriter('utf-8')(stream)
    writer.write(text_utf8)
    self.assertEqual(text_utf8, stream.getvalue())

View File

@ -11,11 +11,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import ctypes
import os
import struct
import sys
import six
# Each edit operation is assigned different cost, such as:
# 'w' means swap operation, the cost is 0;
# 's' means substitution operation, the cost is 2;
@ -153,3 +156,52 @@ def _get_terminal_width_ioctl(stdout):
return columns
except IOError:
return None
if six.PY2:
    def getwriter(encoding):
        '''Override codecs.getwriter() to prevent codec errors.

        The StreamWriter returned by codecs.getwriter has an unfortunate
        property: it will attempt to encode every object presented to its
        write() function. Normally we only want unicode objects to be
        encoded to a byte stream. If bytes are presented (e.g. str in
        Python2) we make the assumption those bytes represent an already
        encoded text stream or they are indeed binary bytes and hence
        should not be encoded.

        When the core StreamWriter attempts to encode a str object Python
        will first promote the str object to a unicode object. The
        promotion of str to unicode requires the str bytes to be
        decoded. However the encoding associated with the str object is
        not known, therefore Python applies the default-encoding, which is
        ASCII. In the case where the str object contains utf-8 encoded
        non-ASCII characters a decoding error is raised. By not attempting
        to encode a byte stream we avoid this error.

        It really does not make much sense to try and encode a byte
        stream. First of all a byte stream should not be encoded if it's
        not text (e.g. binary data). If the byte stream is encoded text
        the only way to re-encode it is if we know its encoding so we
        can decode it into a canonical form (e.g. unicode). Thus to
        re-encode it we encode from the canonical form (e.g. unicode) to
        the new binary encoding. The problem in Python2 is we never know
        if the bytes in a str object are text or binary data and if it's
        text which encoding it is, hence we should not try to apply
        an encoding to a str object.

        :param encoding: name of the codec used to encode unicode objects
        :returns: a codecs.StreamWriter subclass; instantiate it with the
                  stream to wrap, exactly like the class returned by
                  codecs.getwriter()
        :raises LookupError: if codecs.getencoder() cannot find the
                             requested encoding
        '''
        class _StreamWriter(codecs.StreamWriter):
            def encode(self, msg, errors='strict'):
                # Only unicode text is encoded; a str is returned
                # untouched together with its length, matching the
                # (output, length consumed) contract of codec encoders.
                if isinstance(msg, six.text_type):
                    return self.encoder(msg, errors)
                return msg, len(msg)

        # Wrap the encoder in staticmethod: some codecs (e.g. utf-8-sig)
        # expose a pure-Python function as their encoder, which would
        # otherwise be turned into a method and receive ``self`` as an
        # unexpected first argument when called as self.encoder(...).
        _StreamWriter.encoder = staticmethod(codecs.getencoder(encoding))
        _StreamWriter.encoding = encoding
        return _StreamWriter
else:
    getwriter = codecs.getwriter