Detect the encoding using chardet

Instead of always assuming utf-8 (which is typically
ok), switch to using the chardet library and use it to
detect the incoming file's encoding in a more accurate
manner.

Change-Id: I907c8f4e1a6682138b220b49fcfcff6eef85a0b3
This commit is contained in:
Joshua Harlow 2014-08-16 23:42:42 -07:00
parent 8ea62aff22
commit 687addf9b0
2 changed files with 17 additions and 15 deletions

View File

@ -17,6 +17,7 @@
import errno
import os
import chardet
from docutils import frontend
from docutils import parsers as docutils_parser
from docutils import utils
@ -25,25 +26,13 @@ import six
class ParsedFile(object):
def __init__(self, filename, encoding='utf8'):
def __init__(self, filename, encoding=None):
self._filename = filename
self._content = None
self._raw_content = None
self._encoding = encoding
self._doc = None
self._errors = None
self._defaults = {
'input_encoding': self._encoding,
'halt_level': 5,
'report_level': 5,
'quiet': True,
'file_insertion_enabled': False,
'traceback': True,
# Development use only.
'dump_settings': False,
'dump_internals': False,
'dump_transforms': False,
}
@property
def errors(self):
@ -61,7 +50,17 @@ class ParsedFile(object):
# mature).
parser_cls = docutils_parser.get_parser_class("rst")
parser = parser_cls()
defaults = dict(self._defaults)
defaults = {
'halt_level': 5,
'report_level': 5,
'quiet': True,
'file_insertion_enabled': False,
'traceback': True,
# Development use only.
'dump_settings': False,
'dump_internals': False,
'dump_transforms': False,
}
opt = frontend.OptionParser(components=[parser], defaults=defaults)
doc = utils.new_document(source_path=self.filename,
settings=opt.get_default_values())
@ -83,6 +82,8 @@ class ParsedFile(object):
@property
def encoding(self):
if self._encoding is None:
self._encoding = chardet.detect(self.raw_contents)['encoding']
return self._encoding
@property
@ -100,7 +101,7 @@ class ParsedFile(object):
return self._content
def parse(filename, encoding="utf8"):
def parse(filename, encoding=None):
if not os.path.isfile(filename):
raise IOError(errno.ENOENT, 'File not found', filename)
return ParsedFile(filename, encoding=encoding)

View File

@ -1,4 +1,5 @@
argparse
chardet
docutils
restructuredtext-lint>=0.7
six