A multitude of adjustments

- Use docutils to do the rst file parsing and use the
  results of its parsing to do the max line length analysis
  on. This ensures that we do not create our own rst parser
  in doc8 but use a more hardened and mature one instead.
- Allow long titles to be excluded from the max line length
  checks by allowing a CLI or config option to be provided
  that specifies this.
- Allow the ignored errors to be provided on the CLI.
This commit is contained in:
Joshua Harlow 2014-05-18 01:57:29 -07:00
parent 0e23284187
commit 93cb0c61ee
2 changed files with 132 additions and 68 deletions

View File

@ -22,22 +22,24 @@
What is checked:
- lines should not be longer than 79 characters - D001
- exception: line with no whitespace except maybe in the beginning
- exception: line that starts with '..' -- longer directives are allowed,
including footnotes
- exception: line with no whitespace except in the beginning
- exception: lines with http or https urls
- exception: doctest and literal blocks
- exception: rst directives
- no trailing whitespace - D002
- no tabulation for indentation - D003
- no carriage returns (use unix newlines) - D004
"""
import argparse
import collections
import fnmatch
import functools
import os
import re
import sys
from docutils import core
from docutils import nodes as doc_nodes
import six
from six.moves import configparser
@ -54,67 +56,106 @@ CONFIG_FILENAMES = [
]
def check_max_length(max_line_length, contents):
def starting_whitespace(line):
    """Return the number of leading whitespace characters in *line* (0 if none)."""
    return len(line) - len(line.lstrip())
def all_whitespace(line):
    """Return True when *line* is empty or contains only whitespace."""
    return not line.strip()
def check_max_length(cfg, contents):
def contains_url(line):
    """Return True when *line* embeds a plain http or https url."""
    return "http://" in line or "https://" in line
def find_directive_end(start, lines):
    """Return the index of the last line belonging to the directive at *start*.

    A directive's body is every following line that is blank or indented by
    at least one whitespace character; the first flush-left, non-blank line
    terminates it.
    """
    end = start
    for candidate in lines[start + 1:]:
        if not (all_whitespace(candidate) or starting_whitespace(candidate) >= 1):
            break
        end += 1
    return end
doc = core.publish_doctree(
source=contents,
settings_overrides={'traceback': True, 'report': 5,
'quiet': True, 'input_encoding': 'utf-8',
'dump_settings': False, 'report_level': 5,
'dump_transforms': False, 'dump_internals': False})
# Find where directives start & end so that we can exclude content in
# these directive regions.
lines = contents.split("\n")
directives = []
for i, line in enumerate(lines):
if re.match(r"^..\s(.*?)::\s*", line):
directives.append((i, find_directive_end(i, lines)))
elif re.match(r"^::\s*$", line):
directives.append((i, find_directive_end(i, lines)))
def extract_lines(node, start_line):
    """Return the source line numbers covered by *node*, anchored at *start_line*.

    docutils reports the line *after* a literal block or title, so for those
    node types the line where the raw source began is included as well.
    """
    multi_line_types = (doc_nodes.literal_block, doc_nodes.title)
    if not isinstance(node, multi_line_types):
        return [start_line]
    height = len(node.rawsource.splitlines())
    return [start_line, start_line - height]
for i, line in enumerate(lines):
in_directive = False
for (start, end) in directives:
if i >= start and i <= end:
in_directive = True
break
if in_directive:
def gather_lines(node):
    """Collect every source line number spanned by *node* and its descendants."""
    return [line
            for sub_node in node.traverse(include_self=True)
            for line in extract_lines(sub_node, find_line(sub_node))]
def find_line(node):
    """Return the source line of *node*.

    Falls back to the nearest ancestor that knows its line number; returns
    None when neither the node nor any ancestor does.
    """
    current = node
    while current is not None:
        if current.line is not None:
            return current.line
        current = current.parent
    return None
node_lines = []
first_line = -1
for n in doc.traverse(include_self=True):
line = find_line(n)
if line is None:
continue
if first_line == -1:
first_line = line
contained_lines = []
contained_lines.extend(gather_lines(n))
node_lines.append((n, (min(contained_lines),
max(contained_lines))))
def find_node(num):
    """Return the list of node(s) whose line span most tightly contains *num*.

    Lines before the first known node are attributed to the root node.  On
    ties (several nodes with the same smallest span) all tied nodes are
    returned, in document order.

    Fix: the early-return path previously returned a bare node instead of a
    list; callers iterate the result, and iterating a docutils node walks
    its children rather than treating it as a single node.  It now returns
    a one-element list so both paths have the same shape.
    """
    if num < first_line:
        return [node_lines[0][0]]
    # All nodes whose (min, max) line range contains this line, paired
    # with the size of that range.
    matches = [(line_max - line_min, n)
               for (n, (line_min, line_max)) in node_lines
               if line_min <= num <= line_max]
    if not matches:
        return []
    smallest = min(span for (span, _) in matches)
    return [n for (span, n) in matches if span == smallest]
skip_types = (
doc_nodes.target,
doc_nodes.literal_block,
)
title_types = (
doc_nodes.title,
)
max_line_length = cfg['max_line_length']
allow_long = cfg['allow_long_titles']
for i, line in enumerate(contents.split("\n")):
if len(line) > max_line_length:
stripped = line.strip()
# line can't be split
if ' ' not in stripped:
continue
if contains_url(stripped):
continue
nodes = find_node(i + 1)
if any([isinstance(n, skip_types) for n in nodes]):
continue
if allow_long and any([isinstance(n, title_types) for n in nodes]):
continue
yield (i + 1, 'D001', 'Line too long')
def check_trailing_whitespace(line):
def check_trailing_whitespace(cfg, line):
    """Yield a D002 error when *line* ends in whitespace (*cfg* is unused)."""
    if TRAILING_WHITESPACE_REGEX.search(line) is not None:
        yield ('D002', 'Trailing whitespace')
def check_indentation_no_tab(line):
def check_indentation_no_tab(cfg, line):
match = STARTING_WHITESPACE_REGEX.search(line)
if match:
spaces = match.group(1)
@ -122,29 +163,29 @@ def check_indentation_no_tab(line):
yield ('D003', 'Tabulation used for indentation')
def check_carriage_return(line):
def check_carriage_return(cfg, line):
    """Yield a D004 error when *line* contains a literal carriage return."""
    if line.find("\r") >= 0:
        yield ('D004', 'Found literal carriage return')
def check_lines(lines, line_checks):
def check_lines(cfg, lines, line_checks):
    """Run every check in *line_checks* over *lines*.

    Yields (line_number, code, message) tuples; line numbers start at 1.
    Each raw line is decoded from utf8 and stripped of its trailing newline
    before being handed to the checks.
    """
    for line_num, raw_line in enumerate(lines, 1):
        text = six.text_type(raw_line, encoding='utf8').rstrip('\n')
        for line_check in line_checks:
            for code, message in line_check(cfg, text):
                yield line_num, code, message
def check_files(filenames, line_checks, content_checks):
def check_files(cfg, filenames, line_checks, content_checks):
for fn in filenames:
with open(fn, 'rb') as f:
content = six.text_type(f.read(), encoding='utf8')
with open(fn, 'rb') as fh:
content = six.text_type(fh.read(), encoding='utf8')
for content_check in content_checks:
for line_num, code, message in content_check(content):
for line_num, code, message in content_check(cfg, content):
yield fn, line_num, code, message
f.seek(0)
for line_num, code, message in check_lines(f, line_checks):
fh.seek(0)
for line_num, code, message in check_lines(cfg, fh, line_checks):
yield fn, line_num, code, message
@ -162,7 +203,17 @@ def find_files(pathes, patterns):
print('Invalid path: %s' % path)
def extract_config(args):
def split_uniq_string(text):
    """Split comma-separated *text* into a set of stripped, non-empty items."""
    stripped = (piece.strip() for piece in text.split(","))
    return set(piece for piece in stripped if piece)
def extract_config(args, default_cfg):
if args.config:
parser = configparser.RawConfigParser()
for fn in list(args.config):
@ -171,23 +222,22 @@ def extract_config(args):
else:
parser = configparser.RawConfigParser()
parser.read(CONFIG_FILENAMES)
cfg = {}
cfg = dict(default_cfg)
try:
cfg['max_line_length'] = parser.getint("doc8", "max-line-length")
except (configparser.NoSectionError, configparser.NoOptionError):
cfg['max_line_length'] = MAX_LINE_LENGTH
pass
try:
ignores = parser.get("doc8", "ignore")
except (configparser.NoSectionError, configparser.NoOptionError):
cfg['ignore'] = set()
pass
else:
ignoreables = set()
for i in ignores.split(","):
i = i.strip()
if not i:
continue
ignoreables.add(i)
cfg['ignore'] = ignoreables
cfg['ignore'].update(split_uniq_string(ignores))
try:
cfg['allow_long_titles'] = parser.getboolean("doc8",
"allow-long-titles")
except (configparser.NoSectionError, configparser.NoOptionError):
pass
return cfg
@ -213,19 +263,32 @@ def main():
parser.add_argument("--config", metavar='path', action="append",
help="user config file location"
" (default: %s)" % default_configs)
parser.add_argument("--allow-long-titles", action="store_true",
help="allow long section titles (default: False)",
default=False)
parser.add_argument("--ignore", action="append", metavar="code",
help="ignore the given errors code/codes",
default=[])
args = parser.parse_args()
dirs = list(unique_itr(args.paths))
cfg = extract_config(args)
default_cfg = {
'max_line_length': MAX_LINE_LENGTH,
'ignore': set(),
'allow_long_titles': args.allow_long_titles,
}
for c in args.ignore:
default_cfg['ignore'].update(split_uniq_string(c))
cfg = extract_config(args, default_cfg)
line_checks = [
check_trailing_whitespace,
check_indentation_no_tab,
check_carriage_return,
]
content_checks = [
functools.partial(check_max_length, cfg['max_line_length']),
check_max_length,
]
ok = True
for error in check_files(find_files(dirs, FILE_PATTERNS),
paths = unique_itr(args.paths)
for error in check_files(cfg, find_files(paths, FILE_PATTERNS),
line_checks, content_checks):
if error[2] in cfg['ignore']:
continue

View File

@ -41,6 +41,7 @@ setup(name='doc8',
license="ASL 2.0",
install_requires=[
'argparse',
'docutils',
'six',
],
classifiers=[