Move over the rest of the checks

Get the rest of the checks working (the
max line length check now works).

Remove the old script now that it is no
longer needed.
Joshua Harlow 2014-05-18 18:12:03 -07:00
parent 50528d3741
commit 51cbe4f914
4 changed files with 201 additions and 364 deletions
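
For orientation, here is a minimal sketch (not part of the diff) of the two check interfaces this commit moves to. The shapes are inferred from the isinstance() dispatch in main() further down, plus the assumptions that every check is constructed with a cfg dict (as fetch_checks() does) and stores it as self._cfg (as CheckMaxLineLength does):

# Inferred sketch of the doc8.checks base classes (assumptions noted above).
import abc

import six


@six.add_metaclass(abc.ABCMeta)
class LineCheck(object):
    def __init__(self, cfg):
        self._cfg = cfg

    @abc.abstractmethod
    def report_iter(self, line):
        """Yield (code, message) tuples for a single line."""


@six.add_metaclass(abc.ABCMeta)
class ContentCheck(object):
    def __init__(self, cfg):
        self._cfg = cfg

    @abc.abstractmethod
    def report_iter(self, parsed_file):
        """Yield (line_num, code, message) tuples for a parsed file."""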

View File

@@ -17,10 +17,14 @@
# under the License.
import abc
import collections
import re
from docutils import nodes as docutils_nodes
import six
from doc8 import utils
@six.add_metaclass(abc.ABCMeta)
class ContentCheck(object):
@@ -74,5 +78,142 @@ class CheckCarriageReturn(LineCheck):
class CheckMaxLineLength(ContentCheck):
    REPORTS = frozenset(["D001"])

    def _extract_node_lines(self, doc):

        def extract_lines(node, start_line):
            lines = [start_line]
            if isinstance(node, (docutils_nodes.title)):
                start = start_line - len(node.rawsource.splitlines())
                if start >= 0:
                    lines.append(start)
            if isinstance(node, (docutils_nodes.literal_block)):
                end = start_line + len(node.rawsource.splitlines()) - 1
                lines.append(end)
            return lines

        def gather_lines(node):
            lines = []
            for n in node.traverse(include_self=True):
                lines.extend(extract_lines(n, find_line(n)))
            return lines

        def find_line(node):
            n = node
            while n is not None:
                if n.line is not None:
                    return n.line
                n = n.parent
            return None

        def filter_systems(node):
            if utils.has_any_node_type(node, (docutils_nodes.system_message,)):
                return False
            return True

        nodes_lines = []
        first_line = -1
        for n in utils.filtered_traverse(doc, filter_systems):
            line = find_line(n)
            if line is None:
                continue
            if first_line == -1:
                first_line = line
            contained_lines = set(gather_lines(n))
            nodes_lines.append((n, (min(contained_lines),
                                    max(contained_lines))))
        return (nodes_lines, first_line)
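
    # In short: _extract_node_lines() returns (nodes_lines, first_line),
    # where nodes_lines pairs each non-system-message node with the
    # (min, max) span of source lines covered by it and its children.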
    def _extract_directives(self, lines):

        def starting_whitespace(line):
            m = re.match(r"^(\s+)(.*)$", line)
            if not m:
                return 0
            return len(m.group(1))

        def all_whitespace(line):
            return bool(re.match(r"^(\s*)$", line))

        def find_directive_end(start, lines):
            after_lines = collections.deque(lines[start + 1:])
            k = 0
            while after_lines:
                line = after_lines.popleft()
                if all_whitespace(line) or starting_whitespace(line) >= 1:
                    k += 1
                else:
                    break
            return start + k

        # Find where directives start & end so that we can exclude content in
        # these directive regions (the rst parser may not handle this correctly
        # for unknown directives, so we have to do it manually).
        directives = []
        for i, line in enumerate(lines):
            if re.match(r"^..\s(.*?)::\s*", line):
                directives.append((i, find_directive_end(i, lines)))
            elif re.match(r"^::\s*$", line):
                directives.append((i, find_directive_end(i, lines)))
        return directives
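
    # For example, given the three input lines below this method returns
    # [(0, 2)]: line 0 starts a directive, and the two indented lines that
    # follow are folded into the same excluded region:
    #
    #   .. note::
    #      Long lines inside this directive body will not
    #      be reported by the length check below.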
    def report_iter(self, parsed_file):
        doc = parsed_file.document
        lines = list(parsed_file.lines_iter())
        nodes_lines, first_line = self._extract_node_lines(doc)
        directives = self._extract_directives(lines)
        def find_containing_nodes(num):
            if num < first_line and len(nodes_lines):
                return [nodes_lines[0][0]]
            contained_in = []
            for (n, (line_min, line_max)) in nodes_lines:
                if num >= line_min and num <= line_max:
                    contained_in.append((n, (line_min, line_max)))
            smallest_span = None
            best_nodes = []
            for (n, (line_min, line_max)) in contained_in:
                span = line_max - line_min
                if smallest_span is None:
                    smallest_span = span
                    best_nodes = [n]
                elif span < smallest_span:
                    smallest_span = span
                    best_nodes = [n]
                elif span == smallest_span:
                    best_nodes.append(n)
            return best_nodes

        def any_types(nodes, types):
            return any([isinstance(n, types) for n in nodes])

        skip_types = (
            docutils_nodes.target,
            docutils_nodes.literal_block,
        )
        title_types = (
            docutils_nodes.title,
        )
        max_line_length = self._cfg['max_line_length']
        allow_long_titles = self._cfg['allow_long_titles']
        for i, line in enumerate(lines):
            if len(line) > max_line_length:
                in_directive = False
                for (start, end) in directives:
                    if i >= start and i <= end:
                        in_directive = True
                        break
                if in_directive:
                    continue
                stripped = line.lstrip()
                if ' ' not in stripped:
                    # No room to split even if we could.
                    continue
                if utils.contains_url(stripped):
                    continue
                nodes = find_containing_nodes(i + 1)
                if any_types(nodes, skip_types):
                    continue
                if allow_long_titles and any_types(nodes, title_types):
                    continue
                yield (i + 1, 'D001', 'Line too long')
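
Putting the pieces together, a minimal driver for the new content check could look like the sketch below; it mirrors the main() loop in the next file, and 'example.rst' is a hypothetical input path:

from doc8 import checks
from doc8 import parser as file_parser

cfg = {'max_line_length': 79, 'allow_long_titles': True}
parsed = file_parser.parse('example.rst')  # hypothetical input file
check = checks.CheckMaxLineLength(cfg)
for line_num, code, message in check.report_iter(parsed):
    print('%s:%s: %s %s' % (parsed.filename, line_num, code, message))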

View File

@@ -35,6 +35,7 @@ import os
from six.moves import configparser
from doc8 import checks
from doc8 import parser as file_parser
from doc8 import utils
@@ -88,6 +89,15 @@ def extract_config(args):
    return cfg


def fetch_checks(cfg):
    return [
        checks.CheckTrailingWhitespace(cfg),
        checks.CheckIndentationNoTab(cfg),
        checks.CheckCarriageReturn(cfg),
        checks.CheckMaxLineLength(cfg),
    ]


def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
@@ -121,5 +131,36 @@ def main():
    args.update(cfg)

    files = []
    for filename in utils.find_files(args.pop('paths', []), FILE_PATTERNS):
        files.append(file_parser.parse(filename))
    ignoreables = frozenset(args.pop('ignore', []))
    errors = 0
    for f in files:
        for c in fetch_checks(args):
            try:
                reports = set(c.REPORTS)
            except AttributeError:
                pass
            else:
                reports = reports - ignoreables
                if not reports:
                    continue
            if isinstance(c, checks.ContentCheck):
                for line_num, code, message in c.report_iter(f):
                    print('%s:%s: %s %s'
                          % (f.filename, line_num, code, message))
                    errors += 1
            elif isinstance(c, checks.LineCheck):
                for line_num, line in enumerate(f.lines_iter(), 1):
                    for code, message in c.report_iter(line):
                        print('%s:%s: %s %s'
                              % (f.filename, line_num, code, message))
                        errors += 1
            else:
                raise TypeError("Unknown check type: %s, %s"
                                % (type(c), c))
    if errors:
        return 1
    else:
        return 0
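
The body of extract_config() is collapsed in the hunk above; assuming it still reads the same [doc8] options as the old script's extract_config() below (max-line-length, ignore, allow-long-titles), a config file it would pick up looks roughly like:

# Hypothetical doc8.ini; option names taken from the old script below.
[doc8]
max-line-length = 100
ignore = D002,D004
allow-long-titles = 1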

View File

@@ -34,7 +34,22 @@ def find_files(paths, patterns):
        raise IOError('Invalid path: %s' % path)
def filtered_traverse(document, filter_func):
    for n in document.traverse(include_self=True):
        if filter_func(n):
            yield n


def contains_url(line):
    if "http://" in line or "https://" in line:
        return True
    return False


def has_any_node_type(node, node_types):
    n = node
    while n is not None:
        if isinstance(n, node_types):
            return True
        n = n.parent
    return False
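
As a quick usage sketch of the new helpers, the snippet below builds a docutils document the same way the old script's check_max_length() does, then traverses it while skipping anything under a system_message node; only the sample text and source_path are made up:

from docutils import frontend
from docutils import nodes as docutils_nodes
from docutils.parsers import rst
from docutils import utils as docutils_utils

from doc8 import utils

parser = rst.Parser()
opt = frontend.OptionParser(components=[parser])
doc = docutils_utils.new_document(source_path='<sample>',
                                  settings=opt.get_default_values())
parser.parse("Title\n=====\n\nSome body text.\n", doc)


def keep(node):
    # Filter out parser error/warning subtrees, as filter_systems does above.
    return not utils.has_any_node_type(node, (docutils_nodes.system_message,))


for n in utils.filtered_traverse(doc, keep):
    print(type(n).__name__)  # e.g. document, section, title, Text, paragraph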

View File

@@ -1,360 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2014 Ivan Melnikov <iv at altlinux dot org>
#
# Author: Joshua Harlow <harlowja@yahoo-inc.com>
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Check documentation for simple style requirements.
What is checked:
- lines should not be longer than 79 characters - D001
- exception: line with no whitespace except in the beginning
- exception: lines with http or https urls
- exception: literal blocks
- exception: rst target directives
- no trailing whitespace - D002
- no tabulation for indentation - D003
- no carriage returns (use unix newlines) - D004
"""
import argparse
import collections
import fnmatch
import os
import re
import sys
from docutils import frontend
from docutils import nodes as doc_nodes
from docutils.parsers import rst
from docutils import utils
import six
from six.moves import configparser
FILE_PATTERNS = ['*.rst', '*.txt']
MAX_LINE_LENGTH = 79
TRAILING_WHITESPACE_REGEX = re.compile('\s$')
STARTING_WHITESPACE_REGEX = re.compile('^(\s+)')
CONFIG_FILENAMES = [
    "doc8.ini",
    "tox.ini",
    "pep8.ini",
    "setup.cfg",
]
def check_max_length(fn, cfg, contents):
    def contains_url(line):
        if "http://" in line or "https://" in line:
            return True
        return False

    def any_node_type(node, node_types):
        n = node
        node_types = tuple(node_types)
        while n is not None:
            if isinstance(n, node_types):
                return True
            n = n.parent
        return False

    def extract_lines(node, start_line):
        lines = [start_line]
        if isinstance(node, (doc_nodes.title)):
            start = start_line - len(node.rawsource.splitlines())
            if start >= 0:
                lines.append(start)
        if isinstance(node, (doc_nodes.literal_block)):
            end = start_line + len(node.rawsource.splitlines()) - 1
            lines.append(end)
        return lines

    def gather_lines(node):
        lines = []
        for n in node.traverse(include_self=True):
            lines.extend(extract_lines(n, find_line(n)))
        return lines

    def find_line(node):
        n = node
        while n is not None:
            if n.line is not None:
                return n.line
            n = n.parent
        return None

    def find_containing_nodes(num, node_lines, first_line):
        if num < first_line and len(node_lines):
            return [node_lines[0][0]]
        contained_in = []
        for (n, (line_min, line_max)) in node_lines:
            if num >= line_min and num <= line_max:
                contained_in.append((n, (line_min, line_max)))
        smallest_span = None
        best_nodes = []
        for (n, (line_min, line_max)) in contained_in:
            span = line_max - line_min
            if smallest_span is None:
                smallest_span = span
                best_nodes = [n]
            elif span < smallest_span:
                smallest_span = span
                best_nodes = [n]
            elif span == smallest_span:
                best_nodes.append(n)
        return best_nodes

    def find_directive_end(start, lines):
        def starting_whitespace(line):
            m = re.match(r"^(\s+)(.*)$", line)
            if not m:
                return 0
            return len(m.group(1))

        def all_whitespace(line):
            return bool(re.match(r"^(\s*)$", line))

        after_lines = collections.deque(lines[start + 1:])
        k = 0
        while after_lines:
            line = after_lines.popleft()
            if all_whitespace(line) or starting_whitespace(line) >= 1:
                k += 1
            else:
                break
        return start + k

    # Use the rst parsers document output to do as much of the validation
    # as we can without resorting to custom logic (this parser is what sphinx
    # and others use anyway so it's very mature).
    parser = rst.Parser()
    defaults = {
        'input_encoding': 'utf8',
        'halt_level': 5,
        'report_level': 5,
        'quiet': True,
        'file_insertion_enabled': False,
        'traceback': True,
    }
    opt = frontend.OptionParser(components=[parser], defaults=defaults)
    doc = utils.new_document(source_path=fn, settings=opt.get_default_values())
    parser.parse(contents, doc)
    node_lines = []
    first_line = -1
    for n in doc.traverse(include_self=True):
        line = find_line(n)
        if line is None:
            continue
        if any_node_type(n, [doc_nodes.system_message]):
            # These are failures, and their node content isn't correct,
            # so skip them; we should work on making it so that the parser
            # stops doing this custom parent creation in the first place.
            continue
        if first_line == -1:
            first_line = line
        contained_lines = set(gather_lines(n))
        node_lines.append((n, (min(contained_lines), max(contained_lines))))
    # Find where directives start & end so that we can exclude content in
    # these directive regions (the rst parser may not handle this correctly
    # for unknown directives, so we have to do it manually).
    lines = contents.split("\n")
    directives = []
    for i, line in enumerate(lines):
        if re.match(r"^..\s(.*?)::\s*", line):
            directives.append((i, find_directive_end(i, lines)))
        elif re.match(r"^::\s*$", line):
            directives.append((i, find_directive_end(i, lines)))
    skip_types = (
        doc_nodes.target,
        doc_nodes.literal_block,
    )
    title_types = (
        doc_nodes.title,
    )
    max_line_length = cfg['max_line_length']
    allow_long = cfg['allow_long_titles']
    for i, line in enumerate(lines):
        if len(line) > max_line_length:
            in_directive = False
            for (start, end) in directives:
                if i >= start and i <= end:
                    in_directive = True
                    break
            if in_directive:
                continue
            stripped = line.strip()
            if ' ' not in stripped:
                continue
            if contains_url(stripped):
                continue
            nodes = find_containing_nodes(i + 1, node_lines, first_line)
            if any([isinstance(n, skip_types) for n in nodes]):
                continue
            if allow_long and any([isinstance(n, title_types) for n in nodes]):
                continue
            yield (i + 1, 'D001', 'Line too long')
def check_trailing_whitespace(fn, cfg, line):
    if TRAILING_WHITESPACE_REGEX.search(line):
        yield ('D002', 'Trailing whitespace')


def check_indentation_no_tab(fn, cfg, line):
    match = STARTING_WHITESPACE_REGEX.search(line)
    if match:
        spaces = match.group(1)
        if '\t' in spaces:
            yield ('D003', 'Tabulation used for indentation')


def check_carriage_return(fn, cfg, line):
    if "\r" in line:
        yield ('D004', 'Found literal carriage return')


def check_lines(fn, cfg, lines, line_checks):
    for idx, line in enumerate(lines, 1):
        line = six.text_type(line, encoding='utf8')
        line = line.rstrip('\n')
        for check in line_checks:
            for code, message in check(fn, cfg, line):
                yield idx, code, message


def check_files(cfg, filenames, line_checks, content_checks):
    for fn in filenames:
        with open(fn, 'rb') as fh:
            content = six.text_type(fh.read(), encoding='utf8')
            for content_check in content_checks:
                for line_num, code, message in content_check(fn, cfg, content):
                    yield fn, line_num, code, message
            fh.seek(0)
            for line_num, code, message in check_lines(fn, cfg,
                                                       fh, line_checks):
                yield fn, line_num, code, message
def find_files(pathes, patterns):
    for path in pathes:
        if os.path.isfile(path):
            yield path
        elif os.path.isdir(path):
            for root, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    if any(fnmatch.fnmatch(filename, pattern)
                           for pattern in patterns):
                        yield os.path.join(root, filename)
        else:
            print('Invalid path: %s' % path)


def split_string(text):
    return [i.strip() for i in text.split(",") if i.strip()]
def extract_config(args, default_cfg):
    if args.config:
        parser = configparser.RawConfigParser()
        for fn in list(args.config):
            with open(fn, 'r') as fh:
                parser.readfp(fh, filename=fn)
    else:
        parser = configparser.RawConfigParser()
        parser.read(CONFIG_FILENAMES)
    cfg = dict(default_cfg)
    try:
        cfg['max_line_length'] = parser.getint("doc8", "max-line-length")
    except (configparser.NoSectionError, configparser.NoOptionError):
        pass
    try:
        ignores = parser.get("doc8", "ignore")
    except (configparser.NoSectionError, configparser.NoOptionError):
        pass
    else:
        cfg['ignore'].update(split_string(ignores))
    try:
        cfg['allow_long_titles'] = parser.getboolean("doc8",
                                                     "allow-long-titles")
    except (configparser.NoSectionError, configparser.NoOptionError):
        pass
    return cfg


def unique_itr(itr):
    seen = set()
    for i in itr:
        if i in seen:
            continue
        yield i
        seen.add(i)
def main():
    file_types = ", ".join(FILE_PATTERNS)
    default_configs = ", ".join(CONFIG_FILENAMES)
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("paths", metavar='path', type=str, nargs='*',
                        help=("path to scan for %s files"
                              " (default: os.getcwd())") % file_types,
                        default=[os.getcwd()])
    parser.add_argument("--config", metavar='path', action="append",
                        help="user config file location"
                             " (default: %s)" % default_configs)
    parser.add_argument("--allow-long-titles", action="store_true",
                        help="allow long section titles (default: False)",
                        default=False)
    parser.add_argument("--ignore", action="append", metavar="code",
                        help="ignore the given errors code/codes",
                        default=[])
    args = parser.parse_args()
    default_cfg = {
        'max_line_length': MAX_LINE_LENGTH,
        'ignore': set(),
        'allow_long_titles': args.allow_long_titles,
    }
    for c in args.ignore:
        default_cfg['ignore'].update(split_string(c))
    cfg = extract_config(args, default_cfg)
    line_checks = [
        check_trailing_whitespace,
        check_indentation_no_tab,
        check_carriage_return,
    ]
    content_checks = [
        check_max_length,
    ]
    ok = True
    paths = unique_itr(args.paths)
    for error in check_files(cfg, find_files(paths, FILE_PATTERNS),
                             line_checks, content_checks):
        if error[2] in cfg['ignore']:
            continue
        ok = False
        print('%s:%s: %s %s' % error)
    sys.exit(0 if ok else 1)


if __name__ == '__main__':
    main()