#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2014 Ivan Melnikov <iv at altlinux dot org>
#
# Author: Joshua Harlow <harlowja@yahoo-inc.com>
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Check documentation for simple style requirements.
What is checked:
- lines should not be longer than 79 characters - D001
- exception: line with no whitespace except in the beginning
- exception: lines with http or https urls
- exception: literal blocks
- exception: rst target directives
- no trailing whitespace - D002
- no tabulation for indentation - D003
- no carriage returns (use unix newlines) - D004
"""

import argparse
import collections
import fnmatch
import os
import re
import sys

from docutils import frontend
from docutils import nodes as doc_nodes
from docutils.parsers import rst
from docutils import utils
import six
from six.moves import configparser


FILE_PATTERNS = ['*.rst', '*.txt']
MAX_LINE_LENGTH = 79
TRAILING_WHITESPACE_REGEX = re.compile(r'\s$')
STARTING_WHITESPACE_REGEX = re.compile(r'^(\s+)')
CONFIG_FILENAMES = [
    "doc8.ini",
    "tox.ini",
    "pep8.ini",
    "setup.cfg",
]
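
# Example configuration (illustrative): any of the files above may carry a
# [doc8] section that extract_config() below reads, e.g.:
#
#   [doc8]
#   max-line-length = 99
#   ignore = D002,D004
#   allow-long-titles = true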


def check_max_length(fn, cfg, contents):

    def contains_url(line):
        if "http://" in line or "https://" in line:
            return True
        return False

    def any_node_type(node, node_types):
        # Check the node itself and every one of its ancestors.
        n = node
        node_types = tuple(node_types)
        while n is not None:
            if isinstance(n, node_types):
                return True
            n = n.parent
        return False

    def extract_lines(node, start_line):
        # A title's reported line falls after its raw text, while a literal
        # block's falls on its first line; widen the range to cover the
        # node's full raw source either way.
        lines = [start_line]
        if isinstance(node, doc_nodes.title):
            start = start_line - len(node.rawsource.splitlines())
            if start >= 0:
                lines.append(start)
        if isinstance(node, doc_nodes.literal_block):
            end = start_line + len(node.rawsource.splitlines()) - 1
            lines.append(end)
        return lines

    def gather_lines(node):
        lines = []
        for n in node.traverse(include_self=True):
            lines.extend(extract_lines(n, find_line(n)))
        return lines

    def find_line(node):
        # The line may be recorded on an ancestor rather than on the node
        # itself.
        n = node
        while n is not None:
            if n.line is not None:
                return n.line
            n = n.parent
        return None

    def find_containing_nodes(num, node_lines, first_line):
        if num < first_line and len(node_lines):
            return [node_lines[0][0]]
        contained_in = []
        for (n, (line_min, line_max)) in node_lines:
            if num >= line_min and num <= line_max:
                contained_in.append((n, (line_min, line_max)))
        smallest_span = None
        best_nodes = []
        for (n, (line_min, line_max)) in contained_in:
            span = line_max - line_min
            if smallest_span is None:
                smallest_span = span
                best_nodes = [n]
            elif span < smallest_span:
                smallest_span = span
                best_nodes = [n]
            elif span == smallest_span:
                best_nodes.append(n)
        return best_nodes
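
    # find_containing_nodes example (illustrative): if node_lines holds a
    # document spanning lines (1, 40) and a paragraph spanning lines (10, 12),
    # then for num=11 both ranges match and only the paragraph -- the node
    # with the smallest span -- is returned; nodes that tie are all returned.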

    def find_directive_end(start, lines):

        def starting_whitespace(line):
            m = re.match(r"^(\s+)(.*)$", line)
            if not m:
                return 0
            return len(m.group(1))

        def all_whitespace(line):
            return bool(re.match(r"^(\s*)$", line))

        # A directive body continues for as long as the lines after it are
        # blank or indented.
        after_lines = collections.deque(lines[start + 1:])
        k = 0
        while after_lines:
            line = after_lines.popleft()
            if all_whitespace(line) or starting_whitespace(line) >= 1:
                k += 1
            else:
                break
        return start + k

    # Use the rst parser's document output to do as much of the validation
    # as we can without resorting to custom logic (this parser is what sphinx
    # and others use anyway, so it's very mature).
    parser = rst.Parser()
    defaults = {
        'input_encoding': 'utf8',
        'halt_level': 5,
        'report_level': 5,
        'quiet': True,
        'file_insertion_enabled': False,
        'traceback': True,
    }
    opt = frontend.OptionParser(components=[parser], defaults=defaults)
    doc = utils.new_document(source_path=fn, settings=opt.get_default_values())
    parser.parse(contents, doc)
    node_lines = []
    first_line = -1
    for n in doc.traverse(include_self=True):
        line = find_line(n)
        if line is None:
            continue
        if any_node_type(n, [doc_nodes.system_message]):
            # These are failures, and their node content isn't correct,
            # so skip them; we should work on making it so that the parser
            # stops doing this custom parent creation in the first place.
            continue
        if first_line == -1:
            first_line = line
        contained_lines = set(gather_lines(n))
        node_lines.append((n, (min(contained_lines), max(contained_lines))))
    # Find where directives start & end so that we can exclude content in
    # these directive regions (the rst parser may not handle this correctly
    # for unknown directives, so we have to do it manually).
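    #
    # For instance (illustrative), given the input:
    #
    #   .. code-block:: python
    #
    #       print("an over-long line in here is not flagged by D001")
    #
    # the ".." line plus the blank and indented lines after it form one
    # excluded (start, end) region.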
    lines = contents.split("\n")
    directives = []
    for i, line in enumerate(lines):
        if re.match(r"^\.\.\s(.*?)::\s*", line):
            directives.append((i, find_directive_end(i, lines)))
        elif re.match(r"^::\s*$", line):
            directives.append((i, find_directive_end(i, lines)))
    skip_types = (
        doc_nodes.target,
        doc_nodes.literal_block,
    )
    title_types = (
        doc_nodes.title,
    )
    max_line_length = cfg['max_line_length']
    allow_long = cfg['allow_long_titles']
    for i, line in enumerate(lines):
        if len(line) > max_line_length:
            in_directive = False
            for (start, end) in directives:
                if i >= start and i <= end:
                    in_directive = True
                    break
            if in_directive:
                continue
            stripped = line.strip()
            if ' ' not in stripped:
                continue
            if contains_url(stripped):
                continue
            nodes = find_containing_nodes(i + 1, node_lines, first_line)
            if any([isinstance(n, skip_types) for n in nodes]):
                continue
            if allow_long and any([isinstance(n, title_types) for n in nodes]):
                continue
            yield (i + 1, 'D001', 'Line too long')
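
# check_max_length is a generator of (line, code, message) tuples; for a file
# whose line 7 is an over-long prose line it would yield, illustratively,
# (7, 'D001', 'Line too long').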


def check_trailing_whitespace(fn, cfg, line):
    if TRAILING_WHITESPACE_REGEX.search(line):
        yield ('D002', 'Trailing whitespace')


def check_indentation_no_tab(fn, cfg, line):
    match = STARTING_WHITESPACE_REGEX.search(line)
    if match:
        spaces = match.group(1)
        if '\t' in spaces:
            yield ('D003', 'Tabulation used for indentation')


def check_carriage_return(fn, cfg, line):
    if "\r" in line:
        yield ('D004', 'Found literal carriage return')


def check_lines(fn, cfg, lines, line_checks):
    for idx, line in enumerate(lines, 1):
        line = six.text_type(line, encoding='utf8')
        line = line.rstrip('\n')
        for check in line_checks:
            for code, message in check(fn, cfg, line):
                yield idx, code, message


def check_files(cfg, filenames, line_checks, content_checks):
    for fn in filenames:
        with open(fn, 'rb') as fh:
            content = six.text_type(fh.read(), encoding='utf8')
            for content_check in content_checks:
                for line_num, code, message in content_check(fn, cfg, content):
                    yield fn, line_num, code, message
            # Rewind so the same handle can be iterated line-by-line for
            # the per-line checks.
            fh.seek(0)
            for line_num, code, message in check_lines(fn, cfg,
                                                       fh, line_checks):
                yield fn, line_num, code, message


def find_files(paths, patterns):
    for path in paths:
        if os.path.isfile(path):
            yield path
        elif os.path.isdir(path):
            for root, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    if any(fnmatch.fnmatch(filename, pattern)
                           for pattern in patterns):
                        yield os.path.join(root, filename)
        else:
            print('Invalid path: %s' % path)


def split_string(text):
    return [i.strip() for i in text.split(",") if i.strip()]
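
# split_string example (illustrative): split_string(" D001, D002 , ")
# returns ['D001', 'D002']; empty chunks are dropped.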


def extract_config(args, default_cfg):
    if args.config:
        parser = configparser.RawConfigParser()
        for fn in list(args.config):
            with open(fn, 'r') as fh:
                parser.readfp(fh, filename=fn)
    else:
        parser = configparser.RawConfigParser()
        parser.read(CONFIG_FILENAMES)
    cfg = dict(default_cfg)
    try:
        cfg['max_line_length'] = parser.getint("doc8", "max-line-length")
    except (configparser.NoSectionError, configparser.NoOptionError):
        pass
    try:
        ignores = parser.get("doc8", "ignore")
    except (configparser.NoSectionError, configparser.NoOptionError):
        pass
    else:
        cfg['ignore'].update(split_string(ignores))
    try:
        cfg['allow_long_titles'] = parser.getboolean("doc8",
                                                     "allow-long-titles")
    except (configparser.NoSectionError, configparser.NoOptionError):
        pass
    return cfg


def unique_itr(itr):
    seen = set()
    for i in itr:
        if i in seen:
            continue
        yield i
        seen.add(i)
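
# unique_itr example (illustrative): list(unique_itr(['a', 'b', 'a'])) returns
# ['a', 'b']; duplicates are dropped while first-seen order is preserved.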


def main():
    file_types = ", ".join(FILE_PATTERNS)
    default_configs = ", ".join(CONFIG_FILENAMES)
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("paths", metavar='path', type=str, nargs='*',
                        help=("path to scan for %s files"
                              " (default: os.getcwd())") % file_types,
                        default=[os.getcwd()])
    parser.add_argument("--config", metavar='path', action="append",
                        help="user config file location"
                             " (default: %s)" % default_configs)
    parser.add_argument("--allow-long-titles", action="store_true",
                        help="allow long section titles (default: False)",
                        default=False)
    parser.add_argument("--ignore", action="append", metavar="code",
                        help="ignore the given error code/codes",
                        default=[])
    args = parser.parse_args()
    default_cfg = {
        'max_line_length': MAX_LINE_LENGTH,
        'ignore': set(),
        'allow_long_titles': args.allow_long_titles,
    }
    for c in args.ignore:
        default_cfg['ignore'].update(split_string(c))
    cfg = extract_config(args, default_cfg)
    line_checks = [
        check_trailing_whitespace,
        check_indentation_no_tab,
        check_carriage_return,
    ]
    content_checks = [
        check_max_length,
    ]
    ok = True
    paths = unique_itr(args.paths)
    for error in check_files(cfg, find_files(paths, FILE_PATTERNS),
                             line_checks, content_checks):
        if error[2] in cfg['ignore']:
            continue
        ok = False
        print('%s:%s: %s %s' % error)
    sys.exit(0 if ok else 1)


if __name__ == '__main__':
    main()