From b3f92285bdb8e162638bb975ceb887100a9c4740 Mon Sep 17 00:00:00 2001
From: Frank Harrison
Date: Fri, 27 Mar 2020 07:39:29 +0000
Subject: [PATCH 1/3] mapreduce| chore| Updating
 ChangeLog/Contributors/Release notes

---
 CONTRIBUTORS.txt     | 2 ++
 ChangeLog            | 4 ++++
 doc/whatsnew/2.5.rst | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
index 689be864f3..107492dbe5 100644
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -423,3 +423,5 @@ contributors:
 * Joshua Cannon: contributor
 
 * Giuseppe Valente: contributor
+
+* Frank Harrison (doublethefish): contributor
diff --git a/ChangeLog b/ChangeLog
index d0ee2fed91..b51af8657f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -216,6 +216,10 @@ What's New in Pylint 2.5.0?
 Release date: 2020-04-27
 
+* Fixes duplicate-errors not working with -j2+
+
+  Close #3314
+
 * Fix a false negative for ``undefined-variable`` when using class attribute in
   comprehension.
 
   Close #3494
diff --git a/doc/whatsnew/2.5.rst b/doc/whatsnew/2.5.rst
index 172484ceac..577d365114 100644
--- a/doc/whatsnew/2.5.rst
+++ b/doc/whatsnew/2.5.rst
@@ -100,3 +100,5 @@ multiple types of string formatting to be used.
 The type of formatting to use is chosen through enabling and disabling
 messages rather than through the logging-format-style option. The fstr
 value of the logging-format-style option is not valid.
+
+* Fixes duplicate code detection for --jobs=2+

From 796b29319c5015bc6dba97ae6207cb6181171543 Mon Sep 17 00:00:00 2001
From: Frank Harrison
Date: Thu, 26 Mar 2020 10:50:55 +0000
Subject: [PATCH 2/3] mapreduce| Adds map/reduce functionality to
 SimilarChecker

Before adding a new mixin, this proves the concept works, adding tests
as examples of how this would work in the main linter.

The idea here is that, because `check_parallel()` uses a multiprocess
`map` function, the natural follow-on is to use a `reduce` paradigm.
This should demonstrate that.
---
 pylint/checkers/similar.py         |  30 ++++++-
 tests/checkers/unittest_similar.py | 140 +++++++++++++++++++++++++++++
 tests/input/similar_lines_a.py     |  63 +++++++++++++
 tests/input/similar_lines_b.py     |  36 ++++++++
 4 files changed, 268 insertions(+), 1 deletion(-)
 create mode 100644 tests/input/similar_lines_a.py
 create mode 100644 tests/input/similar_lines_b.py

diff --git a/pylint/checkers/similar.py b/pylint/checkers/similar.py
index 82f79e8cc8..20b67088be 100644
--- a/pylint/checkers/similar.py
+++ b/pylint/checkers/similar.py
@@ -160,6 +160,20 @@ def _iter_sims(self):
             for lineset2 in self.linesets[idx + 1 :]:
                 yield from self._find_common(lineset, lineset2)
 
+    def get_map_data(self):
+        """ Returns the data we can use for a map/reduce process
+
+        In this case we are returning this instance's Linesets, that is all file
+        information that will later be used for vectorisation.
+ """ + return self.linesets + + def combine_mapreduce_data(self, linesets_collection): + """ Reduces and recombines data into a format that we can report on + + The partner function of get_map_data() """ + self.linesets = [line for lineset in linesets_collection for line in lineset] + def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports): """return lines with leading/trailing whitespace and any ignored code @@ -352,7 +366,7 @@ def __init__(self, linter=None): def set_option(self, optname, value, action=None, optdict=None): """method called to set an option (registered in the options list) - overridden to report options setting to Similar + Overridden to report options setting to Similar """ BaseChecker.set_option(self, optname, value, action, optdict) if optname == "min-similarity-lines": @@ -402,6 +416,20 @@ def close(self): stats["nb_duplicated_lines"] = duplicated stats["percent_duplicated_lines"] = total and duplicated * 100.0 / total + def get_map_data(self): + """ Passthru override """ + return Similar.get_map_data(self) + + @classmethod + def reduce_map_data(cls, linter, data): + """ Reduces and recombines data into a format that we can report on + + The partner function of get_map_data() """ + recombined = SimilarChecker(linter) + recombined.open() + Similar.combine_mapreduce_data(recombined, linesets_collection=data) + recombined.close() + def register(linter): """required method to auto register this checker """ diff --git a/tests/checkers/unittest_similar.py b/tests/checkers/unittest_similar.py index ed4af2f5cd..a17eadd343 100644 --- a/tests/checkers/unittest_similar.py +++ b/tests/checkers/unittest_similar.py @@ -21,6 +21,8 @@ import pytest from pylint.checkers import similar +from pylint.lint import PyLinter +from pylint.testutils import TestReporter as Reporter INPUT = Path(__file__).parent / ".." 
/ "input" SIMILAR1 = str(INPUT / "similar1") @@ -234,3 +236,141 @@ def test_no_args(): assert ex.code == 1 else: pytest.fail("not system exit") + + +def test_get_map_data(): + """ Tests that a SimilarChecker respects the MapReduceMixin interface + """ + linter = PyLinter(reporter=Reporter()) + + # Add a parallel checker to ensure it can map and reduce + linter.register_checker(similar.SimilarChecker(linter)) + + source_streams = ( + str(INPUT / "similar_lines_a.py"), + str(INPUT / "similar_lines_b.py"), + ) + expected_linelists = ( + ( + "", + "", + "", + "", + "", + "", + "def adipiscing(elit):", + 'etiam = "id"', + 'dictum = "purus,"', + 'vitae = "pretium"', + 'neque = "Vivamus"', + 'nec = "ornare"', + 'tortor = "sit"', + "return etiam, dictum, vitae, neque, nec, tortor", + "", + "", + "class Amet:", + "def similar_function_3_lines(self, tellus):", + "agittis = 10", + "tellus *= 300", + "return agittis, tellus", + "", + "def lorem(self, ipsum):", + 'dolor = "sit"', + 'amet = "consectetur"', + "return (lorem, dolor, amet)", + "", + "def similar_function_5_lines(self, similar):", + "some_var = 10", + "someother_var *= 300", + 'fusce = "sit"', + 'amet = "tortor"', + "return some_var, someother_var, fusce, amet", + "", + 'def __init__(self, moleskie, lectus="Mauris", ac="pellentesque"):', + 'metus = "ut"', + 'lobortis = "urna."', + 'Integer = "nisl"', + '(mauris,) = "interdum"', + 'non = "odio"', + 'semper = "aliquam"', + 'malesuada = "nunc."', + 'iaculis = "dolor"', + 'facilisis = "ultrices"', + 'vitae = "ut."', + "", + "return (", + "metus,", + "lobortis,", + "Integer,", + "mauris,", + "non,", + "semper,", + "malesuada,", + "iaculis,", + "facilisis,", + "vitae,", + ")", + "", + "def similar_function_3_lines(self, tellus):", + "agittis = 10", + "tellus *= 300", + "return agittis, tellus", + ), + ( + "", + "", + "", + "", + "", + "", + "", + "class Nulla:", + 'tortor = "ultrices quis porta in"', + 'sagittis = "ut tellus"', + "", + "def pulvinar(self, blandit, metus):", + "egestas = [mauris for mauris in zip(blandit, metus)]", + "neque = (egestas, blandit)", + "", + "def similar_function_5_lines(self, similar):", + "some_var = 10", + "someother_var *= 300", + 'fusce = "sit"', + 'amet = "tortor"', + 'iaculis = "dolor"', + "return some_var, someother_var, fusce, amet, iaculis, iaculis", + "", + "", + "def tortor(self):", + "ultrices = 2", + 'quis = ultricies * "porta"', + "return ultricies, quis", + "", + "", + "class Commodo:", + "def similar_function_3_lines(self, tellus):", + "agittis = 10", + "tellus *= 300", + 'laoreet = "commodo "', + "return agittis, tellus, laoreet", + ), + ) + + data = [] + + # Manually perform a 'map' type function + for source_fname in source_streams: + sim = similar.SimilarChecker(linter) + with open(source_fname) as stream: + sim.append_stream(source_fname, stream) + # The map bit, can you tell? ;) + data.extend(sim.get_map_data()) + + assert len(expected_linelists) == len(data) + for source_fname, expected_lines, lineset_obj in zip( + source_streams, expected_linelists, data + ): + assert source_fname == lineset_obj.name + # There doesn't seem to be a faster way of doing this, yet. 
+        lines = (line for idx, line in lineset_obj.enumerate_stripped())
+        assert tuple(expected_lines) == tuple(lines)
diff --git a/tests/input/similar_lines_a.py b/tests/input/similar_lines_a.py
new file mode 100644
index 0000000000..65a72a79d5
--- /dev/null
+++ b/tests/input/similar_lines_a.py
@@ -0,0 +1,63 @@
+""" A file designed to have lines of similarity when compared to similar_lines_b
+
+We use lorem-ipsum to generate 'random' code. """
+# Copyright (c) 2020 Frank Harrison
+
+
+def adipiscing(elit):
+    etiam = "id"
+    dictum = "purus,"
+    vitae = "pretium"
+    neque = "Vivamus"
+    nec = "ornare"
+    tortor = "sit"
+    return etiam, dictum, vitae, neque, nec, tortor
+
+
+class Amet:
+    def similar_function_3_lines(self, tellus):  # line same #1
+        agittis = 10  # line same #2
+        tellus *= 300  # line same #3
+        return agittis, tellus  # line diff
+
+    def lorem(self, ipsum):
+        dolor = "sit"
+        amet = "consectetur"
+        return (lorem, dolor, amet)
+
+    def similar_function_5_lines(self, similar):  # line same #1
+        some_var = 10  # line same #2
+        someother_var *= 300  # line same #3
+        fusce = "sit"  # line same #4
+        amet = "tortor"  # line same #5
+        return some_var, someother_var, fusce, amet  # line diff
+
+    def __init__(self, moleskie, lectus="Mauris", ac="pellentesque"):
+        metus = "ut"
+        lobortis = "urna."
+        Integer = "nisl"
+        (mauris,) = "interdum"
+        non = "odio"
+        semper = "aliquam"
+        malesuada = "nunc."
+        iaculis = "dolor"
+        facilisis = "ultrices"
+        vitae = "ut."
+
+        return (
+            metus,
+            lobortis,
+            Integer,
+            mauris,
+            non,
+            semper,
+            malesuada,
+            iaculis,
+            facilisis,
+            vitae,
+        )
+
+    def similar_function_3_lines(self, tellus):  # line same #1
+        agittis = 10  # line same #2
+        tellus *= 300  # line same #3
+        return agittis, tellus  # line diff
diff --git a/tests/input/similar_lines_b.py b/tests/input/similar_lines_b.py
new file mode 100644
index 0000000000..21634883d8
--- /dev/null
+++ b/tests/input/similar_lines_b.py
@@ -0,0 +1,36 @@
+""" The sister file of similar_lines_a, another file designed to have lines of
+similarity when compared to its sister file
+
+As with the sister file, we use lorem-ipsum to generate 'random' code. """
+# Copyright (c) 2020 Frank Harrison
+
+
+class Nulla:
+    tortor = "ultrices quis porta in"
+    sagittis = "ut tellus"
+
+    def pulvinar(self, blandit, metus):
+        egestas = [mauris for mauris in zip(blandit, metus)]
+        neque = (egestas, blandit)
+
+    def similar_function_5_lines(self, similar):  # line same #1
+        some_var = 10  # line same #2
+        someother_var *= 300  # line same #3
+        fusce = "sit"  # line same #4
+        amet = "tortor"  # line same #5
+        iaculis = "dolor"  # line diff
+        return some_var, someother_var, fusce, amet, iaculis, iaculis  # line diff
+
+
+def tortor(self):
+    ultrices = 2
+    quis = ultricies * "porta"
+    return ultricies, quis
+
+
+class Commodo:
+    def similar_function_3_lines(self, tellus):  # line same #1
+        agittis = 10  # line same #2
+        tellus *= 300  # line same #3
+        laoreet = "commodo "  # line diff
+        return agittis, tellus, laoreet  # line diff

From cd21a1d3fc18618d6d8e9a7d590bc5dd71efa416 Mon Sep 17 00:00:00 2001
From: Frank Harrison
Date: Thu, 26 Mar 2020 11:41:22 +0000
Subject: [PATCH 3/3] mapreduce| Fixes -jN for map/reduce Checkers (e.g.
 SimilarChecker)

This integrates the map/reduce functionality into lint.check_parallel().

We previously had `map` being invoked; here we add `reduce` support.
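
In outline, the contract this asks of a checker looks as follows (an
illustrative sketch only - `MyChecker` and `self._map_data` are
placeholder names; the real interface and the SimilarChecker
implementation are in the diff below):

    class MyChecker(BaseChecker, MapReduceMixin):
        def get_map_data(self):
            # The "map" step: runs in each worker process and returns
            # that worker's picklable per-file data for this checker.
            return self._map_data

        @classmethod
        def reduce_map_data(cls, linter, data):
            # The "reduce" step: runs once in the parent process and
            # receives the collated get_map_data() results from all
            # worker runs, ready to be recombined and reported on.
            ...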
We do this by collecting the map-data by worker and then passing it to a
reducer function on the Checker object, if available - determined by
whether it conforms to the `mapreduce_checker.MapReduceMixin` mixin
interface or not.

This allows Checker objects to function across file-streams when using
multiprocessing/-j2+. For example, SimilarChecker needs to be able to
compare data across all files.

The tests, which we also add here, check that a Checker instance returns
and reports the expected data and errors, such as error messages and
stats - at least in an exit-ok (0) situation.

On a personal note, as we are copying more data across process
boundaries, I suspect that the memory implications of this might cause
issues for large projects already running with -jN and duplicate code
detection on. That said, given that it takes a long time to lint large
code bases, that is an issue for the [near?] future and likely to be
part of the performance work. Either way, let's get it working first and
deal with memory and performance considerations later - I say this as
there are many quick wins we can make here, e.g. file-batching, hashing
lines, data compression and so on.
---
 pylint/checkers/__init__.py          |  2 ++
 pylint/checkers/mapreduce_checker.py | 18 ++++++++++
 pylint/checkers/similar.py           | 14 ++++----
 pylint/lint/parallel.py              | 50 ++++++++++++++++++++++++----
 tests/checkers/unittest_similar.py   |  3 +-
 tests/test_check_parallel.py         | 27 +++++++++++----
 tests/test_self.py                   |  5 +--
 7 files changed, 96 insertions(+), 23 deletions(-)
 create mode 100644 pylint/checkers/mapreduce_checker.py

diff --git a/pylint/checkers/__init__.py b/pylint/checkers/__init__.py
index bfde22b3d7..389e3d3d86 100644
--- a/pylint/checkers/__init__.py
+++ b/pylint/checkers/__init__.py
@@ -10,6 +10,7 @@
 # Copyright (c) 2018-2019 Pierre Sassoulas
 # Copyright (c) 2018 ssolanki
 # Copyright (c) 2019 Bruno P. Kinoshita
+# Copyright (c) 2020 Frank Harrison
 
 # Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 # For details: https://github.com/PyCQA/pylint/blob/master/COPYING
@@ -43,6 +44,7 @@
 """
 
 from pylint.checkers.base_checker import BaseChecker, BaseTokenChecker
+from pylint.checkers.mapreduce_checker import MapReduceMixin
 from pylint.utils import register_plugins
 
diff --git a/pylint/checkers/mapreduce_checker.py b/pylint/checkers/mapreduce_checker.py
new file mode 100644
index 0000000000..17a1090bf2
--- /dev/null
+++ b/pylint/checkers/mapreduce_checker.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 Frank Harrison
+
+# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
+# For details: https://github.com/PyCQA/pylint/blob/master/COPYING
+import abc
+
+
+class MapReduceMixin(metaclass=abc.ABCMeta):
+    """ A mixin designed to allow multiprocess/threaded runs of a Checker """
+
+    @abc.abstractmethod
+    def get_map_data(self):
+        """ Returns mergeable/reducible data that will be examined """
+
+    @classmethod
+    @abc.abstractmethod
+    def reduce_map_data(cls, linter, data):
+        """ For a given Checker, receives data for all mapped runs """
diff --git a/pylint/checkers/similar.py b/pylint/checkers/similar.py
index 20b67088be..1f817ada25 100644
--- a/pylint/checkers/similar.py
+++ b/pylint/checkers/similar.py
@@ -31,7 +31,7 @@
 
 import astroid
 
-from pylint.checkers import BaseChecker, table_lines_from_stats
+from pylint.checkers import BaseChecker, MapReduceMixin, table_lines_from_stats
 from pylint.interfaces import IRawChecker
 from pylint.reporters.ureports.nodes import Table
 from pylint.utils import decoding_stream
@@ -161,7 +161,7 @@ def _iter_sims(self):
             yield from self._find_common(lineset, lineset2)
 
     def get_map_data(self):
-        """ Returns the data we can use for a map/reduce process
+        """Returns the data we can use for a map/reduce process
 
         In this case we are returning this instance's Linesets, that is all file
         information that will later be used for vectorisation.
@@ -169,9 +169,9 @@ def get_map_data(self):
         return self.linesets
 
     def combine_mapreduce_data(self, linesets_collection):
-        """ Reduces and recombines data into a format that we can report on
+        """Reduces and recombines data into a format that we can report on
 
-        The partner function of get_map_data() """
+        The partner function of get_map_data()"""
         self.linesets = [line for lineset in linesets_collection for line in lineset]
 
 
@@ -302,7 +302,7 @@ def report_similarities(sect, stats, old_stats):
 
 
 # wrapper to get a pylint checker from the similar class
-class SimilarChecker(BaseChecker, Similar):
+class SimilarChecker(BaseChecker, Similar, MapReduceMixin):
     """checks for similarities and duplicated code. This computation may be
     memory / CPU intensive, so you should disable it if you experiment some
     problems.
@@ -422,9 +422,9 @@ def get_map_data(self):
 
     @classmethod
     def reduce_map_data(cls, linter, data):
-        """ Reduces and recombines data into a format that we can report on
+        """Reduces and recombines data into a format that we can report on
 
-        The partner function of get_map_data() """
+        The partner function of get_map_data()"""
         recombined = SimilarChecker(linter)
         recombined.open()
         Similar.combine_mapreduce_data(recombined, linesets_collection=data)
         recombined.close()
diff --git a/pylint/lint/parallel.py b/pylint/lint/parallel.py
index b9257191cc..204cbdf742 100644
--- a/pylint/lint/parallel.py
+++ b/pylint/lint/parallel.py
@@ -67,28 +67,59 @@ def _worker_check_single_file(file_item):
 
     _worker_linter.open()
     _worker_linter.check_single_file(name, filepath, modname)
-
+    mapreduce_data = collections.defaultdict(list)
+    for checker in _worker_linter.get_checkers():
+        try:
+            data = checker.get_map_data()
+        except AttributeError:
+            continue
+        mapreduce_data[checker.name].append(data)
     msgs = [_get_new_args(m) for m in _worker_linter.reporter.messages]
     _worker_linter.reporter.reset()
     return (
+        id(multiprocessing.current_process()),
         _worker_linter.current_name,
         filepath,
         _worker_linter.file_state.base_name,
         msgs,
         _worker_linter.stats,
         _worker_linter.msg_status,
+        mapreduce_data,
     )
 
 
+def _merge_mapreduce_data(linter, all_mapreduce_data):
+    """ Merges map/reduce data across workers, invoking relevant APIs on checkers """
+    # First collate the data, preparing it so we can send it to the checkers for
+    # validation. The intent here is to collect all the mapreduce data for all checker-
+    # runs across processes - that will then be passed to a static method on the
+    # checkers to be reduced and further processed.
+    collated_map_reduce_data = collections.defaultdict(list)
+    for linter_data in all_mapreduce_data.values():
+        for run_data in linter_data:
+            for checker_name, data in run_data.items():
+                collated_map_reduce_data[checker_name].extend(data)
+
+    # Send the data to checkers that support/require consolidated data
+    original_checkers = linter.get_checkers()
+    for checker in original_checkers:
+        if checker.name in collated_map_reduce_data:
+            # Assume that if the check has returned map/reduce data that it has the
+            # reducer function
+            checker.reduce_map_data(linter, collated_map_reduce_data[checker.name])
+
+
 def check_parallel(linter, jobs, files, arguments=None):
-    """Use the given linter to lint the files with given amount of workers (jobs)"""
-    # The reporter does not need to be passed to worker processess, i.e. the reporter does
-    # not need to be pickleable
+    """Use the given linter to lint the files with given amount of workers (jobs)
+    This splits the work filestream-by-filestream. If you need to do work across
+    multiple files, as in the similarity-checker, then inherit from MapReduceMixin and
+    implement the map/reduce mixin functionality"""
+    # The reporter does not need to be passed to worker processes, i.e. the reporter does
+    # not need to be pickleable
     original_reporter = linter.reporter
     linter.reporter = None
 
     # The linter is inherited by all the pool's workers, i.e. the linter
-    # is identical to the linter object here. This is requred so that
+    # is identical to the linter object here. This is required so that
     # a custom PyLinter object can be used.
     initializer = functools.partial(_worker_initialize, arguments=arguments)
     with multiprocessing.Pool(jobs, initializer=initializer, initargs=[linter]) as pool:
@@ -99,14 +130,20 @@ def check_parallel(linter, jobs, files, arguments=None):
         linter.open()
 
         all_stats = []
+        all_mapreduce_data = collections.defaultdict(list)
 
+        # Maps each file to be worked on by a single _worker_check_single_file() call,
+        # collecting any map/reduce data by checker module so that we can 'reduce' it
+        # later.
         for (
+            worker_idx,  # used to merge map/reduce data across workers
             module,
             file_path,
             base_name,
             messages,
             stats,
             msg_status,
+            mapreduce_data,
         ) in pool.imap_unordered(_worker_check_single_file, files):
             linter.file_state.base_name = base_name
             linter.set_current_module(module, file_path)
@@ -115,8 +152,9 @@ def check_parallel(linter, jobs, files, arguments=None):
                 linter.reporter.handle_message(msg)
 
             all_stats.append(stats)
+            all_mapreduce_data[worker_idx].append(mapreduce_data)
             linter.msg_status |= msg_status
-
+    _merge_mapreduce_data(linter, all_mapreduce_data)
     linter.stats = _merge_stats(all_stats)
 
     # Insert stats data to local checkers.
diff --git a/tests/checkers/unittest_similar.py b/tests/checkers/unittest_similar.py
index a17eadd343..f7a91df5ae 100644
--- a/tests/checkers/unittest_similar.py
+++ b/tests/checkers/unittest_similar.py
@@ -239,8 +239,7 @@ def test_no_args():
 
 
 def test_get_map_data():
-    """ Tests that a SimilarChecker respects the MapReduceMixin interface
-    """
+    """Tests that a SimilarChecker respects the MapReduceMixin interface"""
     linter = PyLinter(reporter=Reporter())
 
     # Add a parallel checker to ensure it can map and reduce
diff --git a/tests/test_check_parallel.py b/tests/test_check_parallel.py
index f44ce666dc..f1c968216d 100644
--- a/tests/test_check_parallel.py
+++ b/tests/test_check_parallel.py
@@ -103,9 +103,17 @@ def test_worker_check_single_file_uninitialised(self):
     def test_worker_check_single_file_no_checkers(self):
         linter = PyLinter(reporter=Reporter())
         worker_initialize(linter=linter)
-        (name, _, _, msgs, stats, msg_status) = worker_check_single_file(
-            _gen_file_data()
-        )
+
+        (
+            _,  # proc-id
+            name,
+            _,  # file_path
+            _,  # base_name
+            msgs,
+            stats,
+            msg_status,
+            _,  # mapreduce_data
+        ) = worker_check_single_file(_gen_file_data())
         assert name == "--test-file_data-name-0--"
         assert [] == msgs
         no_errors_status = 0
@@ -140,9 +148,16 @@ def test_worker_check_sequential_checker(self):
         # Add the only checker we care about in this test
         linter.register_checker(SequentialTestChecker(linter))
 
-        (name, _, _, msgs, stats, msg_status) = worker_check_single_file(
-            _gen_file_data()
-        )
+        (
+            _,  # proc-id
+            name,
+            _,  # file_path
+            _,  # base_name
+            msgs,
+            stats,
+            msg_status,
+            _,  # mapreduce_data
+        ) = worker_check_single_file(_gen_file_data())
 
         # Ensure we return the same data as the single_file_no_checkers test
         assert name == "--test-file_data-name-0--"
diff --git a/tests/test_self.py b/tests/test_self.py
index cd138e7892..f7c88862ae 100644
--- a/tests/test_self.py
+++ b/tests/test_self.py
@@ -46,7 +46,7 @@
 
 import pytest
 
-from pylint.constants import MAIN_CHECKER_NAME
+from pylint.constants import MAIN_CHECKER_NAME, MSG_TYPES_STATUS
 from pylint.lint import Run
 from pylint.reporters import JSONReporter
 from pylint.reporters.text import BaseReporter, ColorizedTextReporter, TextReporter
@@ -249,7 +249,8 @@ def test_parallel_execution(self):
                 join(HERE, "functional", "a", "arguments.py"),
                 join(HERE, "functional", "a", "arguments.py"),
             ],
-            code=2,
+            # We expect similarities to fail and an error
+            code=MSG_TYPES_STATUS["R"] | MSG_TYPES_STATUS["E"],
         )
 
     def test_parallel_execution_missing_arguments(self):
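
A quick manual check of the whole series (illustrative only - the input
files are the ones added in PATCH 2/3, and both invocations should now
report the same duplicate-code (R0801) findings):

    pylint --disable=all --enable=duplicate-code \
        tests/input/similar_lines_a.py tests/input/similar_lines_b.py
    pylint --disable=all --enable=duplicate-code -j2 \
        tests/input/similar_lines_a.py tests/input/similar_lines_b.py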