From 40f2f0a96b31212c5bdba3183739ee580a57f614 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sun, 25 Feb 2018 21:31:46 +0000 Subject: [PATCH 1/5] DOC: Adding script to validate docstrings, and generate list of all functions/methods with state --- scripts/validate_docstrings.py | 256 +++++++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100755 scripts/validate_docstrings.py diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py new file mode 100755 index 0000000000000..2dffbf65cc223 --- /dev/null +++ b/scripts/validate_docstrings.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python +""" +Analyze docstrings to detect errors. + +If no argument is provided, it does a quick check of docstrings and returns +a csv with all API functions and results of basic checks. + +If a function or method is provided in the form "pandas.function", +"pandas.module.class.method", etc. a list of all errors in the docstring for +the specified function or method. + +Usage:: + $ ./validate_docstrings.py + $ ./validate_docstrings.py pandas.DataFrame.head +""" +import os +import sys +import csv +import re +import argparse +import inspect +import types +import importlib +import doctest +import numpy +import pandas + +BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +sys.path.insert(0, os.path.join(BASE_PATH, 'doc', 'sphinxext')) +from numpydoc.docscrape import NumpyDocString + + +class Docstring: + def __init__(self, method_name, method_obj): + self.method_name = method_name + self.method_obj = method_obj + self.raw_doc = method_obj.__doc__ or '' + self.doc = NumpyDocString(self.raw_doc) + + def __len__(self): + return len(self.raw_doc) + + @property + def first_line_blank(self): + if self.raw_doc: + return not bool(self.raw_doc.split('\n')[0].strip()) + + @property + def summary(self): + return ' '.join(self.doc['Summary']) + + @property + def extended_summary(self): + return ' '.join(self.doc['Extended Summary']) + + @property + def needs_summary(self): + return not (bool(self.summary) and bool(self.extended_summary)) + + @property + def doc_parameters(self): + return self.doc['Parameters'] + + @property + def signature_parameters(self): + # skip properties and C functions + if not isinstance(self.method_obj, types.FunctionType): + return tuple() + params = tuple(inspect.signature(self.method_obj).parameters.keys()) + if params and params[0] in ('self', 'cls'): + return params[1:] + return params + + @property + def correct_parameters(self): + if self.doc_parameters: + doc_param_names = list(zip(*self.doc_parameters))[0] + return doc_param_names == self.signature_parameters + + return not bool(self.signature_parameters) + + @property + def see_also(self): + return self.doc['See Also'] + + @property + def examples(self): + return self.doc['Examples'] + + @property + def first_line_ends_in_dot(self): + if self.doc: + return self.doc.split('\n')[0][-1] == '.' + + @property + def deprecated(self): + pattern = re.compile('.. deprecated:: ') + return (self.method_name.startswith('pandas.Panel') or + bool(pattern.search(self.summary)) or + bool(pattern.search(self.extended_summary))) + + @property + def examples_pass_tests(self): + finder = doctest.DocTestFinder() + runner = doctest.DocTestRunner() + context = {'np': numpy, 'pd': pandas} + for test in finder.find(self.raw_doc, self.method_name, globs=context): + print(runner.run(test)) + + +def get_api_items(remove_duplicates=True): + api_fname = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst') + + seen = set() + position = None + with open(api_fname) as f: + for line in f: + if line.startswith('.. currentmodule::'): + current_module = line.replace('.. currentmodule::', '').strip() + continue + + if line == '.. autosummary::\n': + position = 'autosummary' + continue + + if position == 'autosummary': + if line == '\n': + position = 'items' + continue + + if position == 'items': + if line == '\n': + position = None + continue + item = line.strip() + func = importlib.import_module(current_module) + for part in item.split('.'): + func = getattr(func, part) + + # skip properties and C functions + code_line = '' + fname = '' + if isinstance(func, types.FunctionType): + fname = inspect.getsourcefile(func) + fname = os.path.relpath(fname, BASE_PATH) + code_line = inspect.getsourcelines(func)[-1] + key = fname, code_line + if remove_duplicates and key in seen: + continue + else: + seen.add(key) + + yield '.'.join([current_module, item]), func, fname, code_line + + +def validate_all(): + writer = csv.writer(sys.stdout) + writer.writerow(['Function or method', + 'File', + 'Code line', + 'GitHub link', + 'Is deprecated', + 'Has summary', + 'Has extended summary', + 'Parameters ok', + 'Has examples']) + for func_name, func_obj, fname, code_line in get_api_items(): + doc = Docstring(func_name, func_obj) + url = '' + if fname: + url = 'https://github.com/pandas-dev/pandas/blob/master/' + url += '{}#L{}'.format(fname, code_line) + writer.writerow([func_name, + fname, + code_line, + url, + int(doc.deprecated), + int(bool(doc.summary)), + int(bool(doc.extended_summary)), + int(doc.correct_parameters), + int(bool(doc.examples))]) + + return 0 + + +def validate_one(func_name): + for maxsplit in range(1, func_name.count('.') + 1): + # TODO when py3 only replace by: module, *func_parts = ... + func_name_split = func_name.rsplit('.', maxsplit=maxsplit) + module = func_name_split[0] + func_parts = func_name_split[1:] + try: + func_obj = importlib.import_module(module) + except ImportError: + pass + else: + continue + + if 'module' not in locals(): + raise ImportError('No module can be imported ' + 'from "{}"'.format(func_name)) + + for part in func_parts: + func_obj = getattr(func_obj, part) + + doc = Docstring(func_name, func_obj) + + errs = [] + if not doc.summary: + errs.append('No summary found') + else: + if not doc.summary[0].isupper(): + errs.append('Summary does not start with capital') + if doc.summary[-1] != '.': + errs.append('Summary does not end with dot') + if doc.summary.split(' ')[0][-1] == 's': + errs.append('Summary must start with infinitive verb, ' + 'not third person (e.g. use "Generate" instead of ' + '"Generates")') + if not doc.extended_summary: + errs.append('No extended summary found') + if not doc.correct_parameters: + errs.append('Documented parameters do not match the signature') + if not doc.examples: + errs.append('No examples') + + if errs: + sys.stderr.write('Errors for "{}" docstring:\n'.format(func_name)) + for err in errs: + sys.stderr.write('\t* {}\n'.format(err)) + else: + sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name)) + + return len(errs) + + +def main(function): + if function is None: + return validate_all() + else: + return validate_one(function) + + +if __name__ == '__main__': + argparser = argparse.ArgumentParser( + description='validate pandas docstrings') + argparser.add_argument('function', + nargs='?', + default=None, + help=('function or method to validate ' + '(e.g. pandas.DataFrame.head) ' + 'if not provided, all docstrings ' + 'are validated')) + args = argparser.parse_args() + sys.exit(main(args.function)) From 070841e9cf274df2f933056d55b33d7e91d7d1f0 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 28 Feb 2018 18:12:04 +0000 Subject: [PATCH 2/5] Some refactoring, trying harder to get the source code, and removing the filter of duplicates in favor of a column to show functions with shared code --- scripts/validate_docstrings.py | 108 ++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 37 deletions(-) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 2dffbf65cc223..945d1ed645647 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -17,20 +17,39 @@ import sys import csv import re +import functools import argparse import inspect -import types import importlib import doctest import numpy -import pandas BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.insert(0, os.path.join(BASE_PATH, 'doc', 'sphinxext')) +sys.path.insert(0, os.path.join(BASE_PATH)) +import pandas + +sys.path.insert(1, os.path.join(BASE_PATH, 'doc', 'sphinxext')) from numpydoc.docscrape import NumpyDocString +def _to_original_callable(obj): + while True: + if inspect.isfunction(obj) or inspect.isclass(obj): + f = inspect.getfile(obj) + if f.startswith('<') and f.endswith('>'): + return None + return obj + if inspect.ismethod(obj): + obj = obj.__func__ + elif isinstance(obj, functools.partial): + obj = obj.func + elif isinstance(obj, property): + obj = obj.fget + else: + return None + + class Docstring: def __init__(self, method_name, method_obj): self.method_name = method_name @@ -41,6 +60,27 @@ def __init__(self, method_name, method_obj): def __len__(self): return len(self.raw_doc) + @property + def source_file_name(self): + fname = inspect.getsourcefile(self.method_obj) + if fname: + fname = os.path.relpath(fname, BASE_PATH) + return fname + + @property + def source_file_def_line(self): + try: + return inspect.getsourcelines(self.method_obj)[-1] + except OSError: + pass + + @property + def github_url(self): + url = 'https://github.com/pandas-dev/pandas/blob/master/' + url += '{}#L{}'.format(self.source_file_name, + self.source_file_def_line) + return url + @property def first_line_blank(self): if self.raw_doc: @@ -64,8 +104,7 @@ def doc_parameters(self): @property def signature_parameters(self): - # skip properties and C functions - if not isinstance(self.method_obj, types.FunctionType): + if not inspect.isfunction(self.method_obj): return tuple() params = tuple(inspect.signature(self.method_obj).parameters.keys()) if params and params[0] in ('self', 'cls'): @@ -109,10 +148,9 @@ def examples_pass_tests(self): print(runner.run(test)) -def get_api_items(remove_duplicates=True): +def get_api_items(): api_fname = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst') - seen = set() position = None with open(api_fname) as f: for line in f: @@ -138,25 +176,13 @@ def get_api_items(remove_duplicates=True): for part in item.split('.'): func = getattr(func, part) - # skip properties and C functions - code_line = '' - fname = '' - if isinstance(func, types.FunctionType): - fname = inspect.getsourcefile(func) - fname = os.path.relpath(fname, BASE_PATH) - code_line = inspect.getsourcelines(func)[-1] - key = fname, code_line - if remove_duplicates and key in seen: - continue - else: - seen.add(key) - - yield '.'.join([current_module, item]), func, fname, code_line + yield '.'.join([current_module, item]), func def validate_all(): writer = csv.writer(sys.stdout) writer.writerow(['Function or method', + 'Type', 'File', 'Code line', 'GitHub link', @@ -164,22 +190,30 @@ def validate_all(): 'Has summary', 'Has extended summary', 'Parameters ok', - 'Has examples']) - for func_name, func_obj, fname, code_line in get_api_items(): - doc = Docstring(func_name, func_obj) - url = '' - if fname: - url = 'https://github.com/pandas-dev/pandas/blob/master/' - url += '{}#L{}'.format(fname, code_line) - writer.writerow([func_name, - fname, - code_line, - url, - int(doc.deprecated), - int(bool(doc.summary)), - int(bool(doc.extended_summary)), - int(doc.correct_parameters), - int(bool(doc.examples))]) + 'Has examples', + 'Shared code with']) + seen = {} + for func_name, func in get_api_items(): + obj_type = type(func).__name__ + original_callable = _to_original_callable(func) + if original_callable is None: + writer.writerow([func_name, obj_type] + [''] * 9) + else: + doc = Docstring(func_name, original_callable) + key = doc.source_file_name, doc.source_file_def_line + shared_code = seen.get(key, '') + seen[key] = func_name + writer.writerow([func_name, + obj_type, + doc.source_file_name, + doc.source_file_def_line, + doc.github_url, + int(doc.deprecated), + int(bool(doc.summary)), + int(bool(doc.extended_summary)), + int(doc.correct_parameters), + int(bool(doc.examples)), + shared_code]) return 0 From 78419b716fe7c0cc13800ece095a77f844248f8e Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 28 Feb 2018 23:37:15 +0000 Subject: [PATCH 3/5] Testing examples when validating docstring --- scripts/validate_docstrings.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 945d1ed645647..b2f485164e701 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -141,11 +141,14 @@ def deprecated(self): @property def examples_pass_tests(self): + flags = doctest.NORMALIZE_WHITESPACE finder = doctest.DocTestFinder() - runner = doctest.DocTestRunner() + runner = doctest.DocTestRunner(optionflags=flags) context = {'np': numpy, 'pd': pandas} + total_failed = 0 for test in finder.find(self.raw_doc, self.method_name, globs=context): - print(runner.run(test)) + total_failed += runner.run(test).failed + return total_failed == 0 def get_api_items(): @@ -258,6 +261,9 @@ def validate_one(func_name): errs.append('Documented parameters do not match the signature') if not doc.examples: errs.append('No examples') + else: + if not doc.examples_pass_tests: + errs.append('Examples do not pass test') if errs: sys.stderr.write('Errors for "{}" docstring:\n'.format(func_name)) From b32df4b39fbec453c6dcdf328f8e24b8440d9217 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 2 Mar 2018 00:48:21 +0000 Subject: [PATCH 4/5] Addressing comments from review --- scripts/validate_docstrings.py | 89 ++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 16 deletions(-) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index b2f485164e701..8ceafebec7dc5 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -19,9 +19,14 @@ import re import functools import argparse +import contextlib import inspect import importlib import doctest +try: + from io import StringIO +except ImportError: + from cStringIO import StringIO import numpy BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -50,6 +55,18 @@ def _to_original_callable(obj): return None +def _output_header(title, width=80, char='#'): + full_line = char * width + side_len = (width - len(title) - 2) // 2 + adj = '' if len(title) % 2 == 0 else ' ' + title_line = '{side} {title}{adj} {side}'.format(side=char * side_len, + title=title, + adj=adj) + + return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format( + full_line=full_line, title_line=title_line) + + class Docstring: def __init__(self, method_name, method_obj): self.method_name = method_name @@ -88,10 +105,14 @@ def first_line_blank(self): @property def summary(self): + if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1: + return '' return ' '.join(self.doc['Summary']) @property def extended_summary(self): + if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1: + return ' '.join(self.doc['Summary']) return ' '.join(self.doc['Extended Summary']) @property @@ -112,12 +133,30 @@ def signature_parameters(self): return params @property - def correct_parameters(self): + def parameter_mismatches(self): + errs = [] + signature_params = self.signature_parameters if self.doc_parameters: - doc_param_names = list(zip(*self.doc_parameters))[0] - return doc_param_names == self.signature_parameters + doc_params = list(zip(*self.doc_parameters))[0] + else: + doc_params = [] + + missing = set(signature_params) - set(doc_params) + if missing: + errs.append('Parameters {!r} not documented'.format(missing)) + extra = set(doc_params) - set(signature_params) + if extra: + errs.append('Unknown parameters {!r}'.format(extra)) + if not missing and not extra and signature_params != doc_params: + errs.append('Wrong parameters order. ' + + 'Actual: {!r}. '.format(signature_params) + + 'Documented: {!r}'.format(doc_params)) - return not bool(self.signature_parameters) + return errs + + @property + def correct_parameters(self): + return not bool(self.parameter_mismatches) @property def see_also(self): @@ -140,15 +179,18 @@ def deprecated(self): bool(pattern.search(self.extended_summary))) @property - def examples_pass_tests(self): - flags = doctest.NORMALIZE_WHITESPACE + def examples_errors(self): + flags = doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL finder = doctest.DocTestFinder() runner = doctest.DocTestRunner(optionflags=flags) context = {'np': numpy, 'pd': pandas} - total_failed = 0 + error_msgs = '' for test in finder.find(self.raw_doc, self.method_name, globs=context): - total_failed += runner.run(test).failed - return total_failed == 0 + f = StringIO() + with contextlib.redirect_stdout(f): + runner.run(test) + error_msgs += f.getvalue() + return error_msgs def get_api_items(): @@ -243,6 +285,9 @@ def validate_one(func_name): doc = Docstring(func_name, func_obj) + sys.stderr.write(_output_header('Docstring ({})'.format(func_name))) + sys.stderr.write('{}\n'.format(doc.raw_doc)) + errs = [] if not doc.summary: errs.append('No summary found') @@ -257,21 +302,33 @@ def validate_one(func_name): '"Generates")') if not doc.extended_summary: errs.append('No extended summary found') - if not doc.correct_parameters: - errs.append('Documented parameters do not match the signature') + + param_errs = doc.parameter_mismatches + if param_errs: + errs.append('Errors in parameters section') + for param_err in param_errs: + errs.append('\t{}'.format(param_err)) + + examples_errs = '' if not doc.examples: - errs.append('No examples') + errs.append('No examples section found') else: - if not doc.examples_pass_tests: - errs.append('Examples do not pass test') + examples_errs = doc.examples_errors + if examples_errs: + errs.append('Examples do not pass tests') + sys.stderr.write(_output_header('Validation')) if errs: - sys.stderr.write('Errors for "{}" docstring:\n'.format(func_name)) + sys.stderr.write('Errors found:\n') for err in errs: - sys.stderr.write('\t* {}\n'.format(err)) + sys.stderr.write('\t{}\n'.format(err)) else: sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name)) + if examples_errs: + sys.stderr.write(_output_header('Doctests')) + sys.stderr.write(examples_errs) + return len(errs) From da24d08fa21f8b848e26a4aaf20e1344e787fcd7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 2 Mar 2018 10:35:57 +0100 Subject: [PATCH 5/5] dedent docstring --- scripts/validate_docstrings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 8ceafebec7dc5..7807785c24751 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -23,6 +23,7 @@ import inspect import importlib import doctest +import textwrap try: from io import StringIO except ImportError: @@ -72,6 +73,7 @@ def __init__(self, method_name, method_obj): self.method_name = method_name self.method_obj = method_obj self.raw_doc = method_obj.__doc__ or '' + self.raw_doc = textwrap.dedent(self.raw_doc) self.doc = NumpyDocString(self.raw_doc) def __len__(self):