Merge "Fix codec error when format=csv"

2017-12-04 15:48:15 +00:00 · 2017-12-04 15:48:15 +00:00 · e14f905a4f
parent 10963e0aba c61cc30060
commit e14f905a4f
4 changed files with 122 additions and 7 deletions
--- a/cliff/app.py
+++ b/cliff/app.py
@ -118,10 +118,10 @@ class App(object):
                    stdin = codecs.getreader(encoding)(sys.stdin)

                if not (stdout or isinstance(sys.stdout, codecs.StreamWriter)):
-                    stdout = codecs.getwriter(encoding)(sys.stdout)
+                    stdout = utils.getwriter(encoding)(sys.stdout)

                if not (stderr or isinstance(sys.stderr, codecs.StreamWriter)):
-                    stderr = codecs.getwriter(encoding)(sys.stderr)
+                    stderr = utils.getwriter(encoding)(sys.stderr)

        self.stdin = stdin or sys.stdin
        self.stdout = stdout or sys.stdout
--- a/cliff/formatters/commaseparated.py
+++ b/cliff/formatters/commaseparated.py
@ -47,11 +47,24 @@ class CSVLister(ListFormatter):
        )

    def emit_list(self, column_names, data, stdout, parsed_args):
-        writer = csv.writer(stdout,
-                            quoting=self.QUOTE_MODES[parsed_args.quote_mode],
-                            lineterminator=os.linesep,
-                            escapechar='\\',
-                            )
+        writer_kwargs = dict(
+            quoting=self.QUOTE_MODES[parsed_args.quote_mode],
+            lineterminator=os.linesep,
+            escapechar='\\',
+        )
+
+        # In Py2 we replace the csv module with unicodecsv because the
+        # Py2 csv module cannot handle unicode. unicodecsv encodes
+        # unicode objects based on the value of its encoding keyword,
+        # with the result that unicodecsv emits encoded bytes in a str
+        # object. utils.getwriter ensures no attempt is made to
+        # re-encode the encoded bytes in the str object.
+
+        if six.PY2:
+            writer_kwargs['encoding'] = (getattr(stdout, 'encoding', None)
+                                         or 'utf-8')
+
+        writer = csv.writer(stdout, **writer_kwargs)
        writer.writerow(column_names)
        for row in data:
            writer.writerow(
--- a/cliff/tests/test_app.py
+++ b/cliff/tests/test_app.py
@ -498,3 +498,53 @@ class TestIO(base.TestBase):
            self.assertIs(sys.stdin, app.stdin)
            self.assertIs(sys.stdout, app.stdout)
            self.assertIs(io, app.stderr)
+
+    def test_writer_encoding(self):
+        # The word "test" with the e replaced by
+        # Unicode latin small letter e with acute,
+        # U+00E9, utf-8 encoded as 0xC3 0xA9
+        text = u't\u00E9st'
+        text_utf8 = text.encode('utf-8')
+
+        if six.PY2:
+            # In PY2 StreamWriter can't accept non-ASCII encoded characters
+            # because it must first promote the encoded byte stream to
+            # unicode in order to encode it in the desired encoding.
+            # Because the encoding of the byte stream is not known at this
+            # point the default-encoding of ASCII is utilized, but you can't
+            # decode a non-ASCII character as ASCII.
+            io = six.StringIO()
+            writer = codecs.getwriter('utf-8')(io)
+            self.assertRaises(UnicodeDecodeError,
+                              writer.write,
+                              text_utf8)
+
+            # In PY2 with our override of codecs.getwriter we do not
+            # attempt to encode bytes in a str object (only unicode
+            # objects) therefore the final output string should be the
+            # utf-8 encoded byte sequence
+            io = six.StringIO()
+            writer = utils.getwriter('utf-8')(io)
+            writer.write(text)
+            output = io.getvalue()
+            self.assertEqual(text_utf8, output)
+
+            io = six.StringIO()
+            writer = utils.getwriter('utf-8')(io)
+            writer.write(text_utf8)
+            output = io.getvalue()
+            self.assertEqual(text_utf8, output)
+        else:
+            # In PY3 you can't write encoded bytes to a text writer;
+            # instead, text functions require text.
+            io = six.StringIO()
+            writer = utils.getwriter('utf-8')(io)
+            self.assertRaises(TypeError,
+                              writer.write,
+                              text)
+
+            io = six.StringIO()
+            writer = utils.getwriter('utf-8')(io)
+            self.assertRaises(TypeError,
+                              writer.write,
+                              text_utf8)
--- a/cliff/utils.py
+++ b/cliff/utils.py
@ -11,11 +11,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import codecs
 import ctypes
 import os
 import struct
 import sys

+import six
+
 # Each edit operation is assigned different cost, such as:
 #  'w' means swap operation, the cost is 0;
 #  's' means substitution operation, the cost is 2;
@ -153,3 +156,52 @@ def _get_terminal_width_ioctl(stdout):
        return columns
    except IOError:
        return None
+
+
+if six.PY2:
+    def getwriter(encoding):
+        '''Override codecs.getwriter() to prevent codec errors.
+
+        The StreamWriter returned by codecs.getwriter has an unfortunate
+        property: it will attempt to encode every object presented to its
+        write() function. Normally we only want unicode objects to be
+        encoded to a byte stream. If bytes are presented (e.g. str in
+        Python2) we make the assumption those bytes represent an already
+        encoded text stream or they are indeed binary bytes and hence
+        should not be encoded.
+
+        When the core StreamWriter attempts to encode a str object Python
+        will first promote the str object to a unicode object. The
+        promotion of str to unicode requires the str bytes to be
+        decoded. However the encoding associated with the str object is
+        not known therefore Python applies the default-encoding which is
+        ASCII. In the case where the str object contains utf-8 encoded
+        non-ASCII characters a decoding error is raised. By not attempting
+        to encode a byte stream we avoid this error.
+
+        It really does not make much sense to try and encode a byte
+        stream. First of all a byte stream should not be encoded if it's
+        not text (e.g. binary data). If the byte stream is encoded text
+        the only way to re-encode it is if we know its encoding so we
+        can decode it into a canonical form (e.g. unicode). Thus to
+        re-encode it we encode from the canonical form (e.g. unicode) to
+        the new binary encoding. The problem in Python2 is we never know
+        if the bytes in a str object are text or binary data and if it's
+        text which encoding it is, hence we should not try to apply
+        an encoding to a str object.
+        '''
+        class _StreamWriter(codecs.StreamWriter):
+            def __init__(self, stream, errors='strict'):
+                codecs.StreamWriter.__init__(self, stream, errors)
+
+            def encode(self, msg, errors='strict'):
+                if isinstance(msg, six.text_type):
+                    return self.encoder(msg, errors)
+                return msg, len(msg)
+
+        _StreamWriter.encoder = codecs.getencoder(encoding)
+        _StreamWriter.encoding = encoding
+        return _StreamWriter
+
+else:
+    getwriter = codecs.getwriter