Allow overriding file encoding

Chardet doesn't always detect a file's encoding correctly in
all circumstances. So that a user can specify the exact encoding
of their files, add a new config option and a new CLI option
that allow manually overriding the encoding that chardet would
otherwise try to determine.

If enabled, chardet detection will no longer run.

Fixes bug 1384463

Change-Id: Ie8baf3f79083e1495c7420a9d0569390cad2115e
This commit is contained in:
Joshua Harlow 2014-10-22 16:13:33 -07:00
parent 8b8f22329b
commit 04a710c687
3 changed files with 35 additions and 1 deletions

View File

@ -59,6 +59,10 @@ Command line usage
--default-extension extension
Default file extension to use when a file is found
without a file extension.
--file-encoding encoding
Override encoding to use when attempting to determine
an input files text encoding (providing this avoids
using `chardet` to automatically detect encoding/s)
--max-line-length int
maximum allowed line length (default: 79)
-e extension, --extension extension
@ -110,6 +114,7 @@ Option Overrides Merges
``ignore-path`` No Yes
``ignore`` No Yes
``max-line-length`` Yes No
``file-encoding`` Yes No
``sphinx`` Yes No
===================== =========== ========

View File

@ -109,6 +109,10 @@ def extract_config(args):
cfg['verbose'] = parser.getboolean("doc8", "verbose")
except (configparser.NoSectionError, configparser.NoOptionError):
pass
try:
cfg['file_encoding'] = parser.get("doc8", "file-encoding")
except (configparser.NoSectionError, configparser.NoOptionError):
pass
try:
cfg['default_extension'] = parser.get("doc8", "default-extension")
except (configparser.NoSectionError, configparser.NoOptionError):
@ -160,6 +164,7 @@ def scan(cfg):
file_iter = utils.find_files(cfg.get('paths', []),
cfg.get('extension', []), ignored_paths)
default_extension = cfg.get('default_extension')
file_encoding = cfg.get('file_encoding')
for filename, ignoreable in file_iter:
if ignoreable:
files_ignored += 1
@ -167,7 +172,8 @@ def scan(cfg):
print(" Ignoring '%s'" % (filename))
else:
f = file_parser.parse(filename,
default_extension=default_extension)
default_extension=default_extension,
encoding=file_encoding)
files.append(f)
if cfg.get('verbose'):
print(" Selecting '%s'" % (filename))
@ -275,6 +281,13 @@ def main():
" found without a file extension.",
default='', dest='default_extension',
metavar='extension')
parser.add_argument("--file-encoding", action="store",
help="Override encoding to use when attempting"
" to determine an input files text encoding "
"(providing this avoids using `chardet` to"
" automatically detect encoding/s)",
default='', dest='file_encoding',
metavar='encoding')
parser.add_argument("--max-line-length", action="store", metavar="int",
type=int,
help="Maximum allowed line"

View File

@ -89,6 +89,22 @@ test
(line, code, msg) = errors[0]
self.assertIn(code, check.REPORTS)
def test_correct_length(self):
    """Check that an explicitly UTF-8 decoded line is not reported as too long.

    The line written below is 80 bytes long but only 78 characters once
    decoded as UTF-8, because the en dash (``\\xe2\\x80\\x93``) occupies
    three bytes. With ``max_line_length`` set to 79 the test expects no
    errors from the max-line-length check when the file is parsed with
    ``encoding='utf-8'``.
    """
    conf = {
        'max_line_length': 79,
        'allow_long_titles': True,
    }
    with tempfile.NamedTemporaryFile(suffix='.rst') as fh:
        # Every fragment must be a bytes literal: Python 3 refuses to
        # concatenate adjacent bytes and str literals (SyntaxError), and
        # the temporary file is opened in binary mode by default.
        fh.write(b'known exploit in the wild, for example'
                 b' \xe2\x80\x93 the time'
                 b' between advance notification')
        # Flush so the content is on disk before it is read back by name.
        fh.flush()

        parsed_file = parser.ParsedFile(fh.name, encoding='utf-8')
        check = checks.CheckMaxLineLength(conf)
        errors = list(check.report_iter(parsed_file))
        self.assertEqual(0, len(errors))
def test_unsplittable_length(self):
content = """
===