
Commit 0e9f425

Refactor parallel linting
The previous implementation created new PyLinter objects in the worker (child) processes. This failed when running under Prospector, because Prospector uses a custom PyLinter class (a class inherited from PyLinter) while pylint simply created plain PyLinter objects in the workers; options belonging to Prospector's IndentChecker were therefore missing in the worker process and linting failed.

The new implementation passes the original PyLinter object into the workers when the workers are created. See https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods

Note that because Windows uses the spawn start method by default, the PyLinter object (and its members) now needs to be pickleable, with the exception of PyLinter.reporter, which is not passed to the child processes.

Performance has remained about the same in quick tests with a Django project of roughly 30 000 lines of code: with 8 jobs on a quad-core i7, linting took 26-28 seconds with the old implementation and 24-27 seconds with the new one.
1 parent d7daa8e commit 0e9f425
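
A minimal sketch of the pattern described above, using hypothetical names (BaseLinter, CustomLinter, _init_worker, _check_file) rather than pylint's real API: the parent process hands its linter object, which may be an instance of a subclass, to every pool worker through the initializer, so workers reuse that exact object instead of constructing a plain base-class instance.

import multiprocessing

# Hypothetical stand-ins for PyLinter and a Prospector-style subclass.
class BaseLinter:
    def check_one(self, path):
        return (path, "checked by " + type(self).__name__)

class CustomLinter(BaseLinter):
    pass

_worker_linter = None  # set once in each worker process

def _init_worker(linter):
    # Each worker receives the parent's linter object (pickled under "spawn").
    global _worker_linter
    _worker_linter = linter

def _check_file(path):
    return _worker_linter.check_one(path)

if __name__ == "__main__":
    linter = CustomLinter()  # the subclass instance survives the trip to the workers
    files = ["a.py", "b.py", "c.py"]
    with multiprocessing.Pool(2, initializer=_init_worker, initargs=[linter]) as pool:
        for path, result in pool.imap_unordered(_check_file, files):
            print(path, result)

Under the spawn start method (the Windows default noted above), the object passed via initargs is pickled on its way to each worker, which is why the real PyLinter now has to be pickleable.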

File tree

7 files changed (+140, -179 lines)


ChangeLog

Lines changed: 8 additions & 0 deletions
@@ -5,6 +5,14 @@ Pylint's ChangeLog
 What's New in Pylint 2.4.0?
 ===========================
 
+  * Allow parallel linting when run under Prospector
+
+    Pass the actual PyLinter object to sub processes to allow using custom
+    PyLinter classes.
+
+    PyLinter object (and all its members except reporter) needs to support
+    pickling so the PyLinter object can be passed to worker processes.
+
   * Refactor file checking
 
     Remove code duplication from file checking.
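
A rough way to verify the pickling requirement stated in this entry, sketched with a hypothetical helper rather than anything shipped by pylint: detach the reporter, round-trip the linter through pickle, then restore the reporter.

import pickle

def assert_picklable_without_reporter(linter):
    # Detach the reporter first, mirroring how the parallel implementation
    # hands the linter to worker processes without it.
    reporter, linter.reporter = linter.reporter, None
    try:
        pickle.loads(pickle.dumps(linter))  # raises if any member is unpicklable
    finally:
        linter.reporter = reporter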

pylint/checkers/classes.py

Lines changed: 5 additions & 1 deletion
@@ -640,11 +640,15 @@ def _has_same_layout_slots(slots, assigned_value):
 }
 
 
+def _scope_default():
+    return collections.defaultdict(list)
+
+
 class ScopeAccessMap:
     """Store the accessed variables per scope."""
 
     def __init__(self):
-        self._scopes = collections.defaultdict(lambda: collections.defaultdict(list))
+        self._scopes = collections.defaultdict(_scope_default)
 
     def set_accessed(self, node):
         """Set the given node as accessed."""

pylint/config.py

Lines changed: 7 additions & 4 deletions
@@ -39,6 +39,7 @@
 import configparser
 import contextlib
 import copy
+import functools
 import io
 import optparse
 import os
@@ -693,10 +694,8 @@ def read_config_file(self, config_file=None, verbose=None):
             opt = "-".join(["long"] * helplevel) + "-help"
             if opt in self._all_options:
                 break  # already processed
-            # pylint: disable=unused-argument
-            def helpfunc(option, opt, val, p, level=helplevel):
-                print(self.help(level))
-                sys.exit(0)
+
+            helpfunc = functools.partial(self.helpfunc, level=helplevel)
 
             helpmsg = "%s verbose help." % " ".join(["more"] * helplevel)
             optdict = {"action": "callback", "callback": helpfunc, "help": helpmsg}
@@ -790,6 +789,10 @@ def help(self, level=0):
         with _patch_optparse():
             return self.cmdline_parser.format_help()
 
+    def helpfunc(self, option, opt, val, p, level):  # pylint: disable=unused-argument
+        print(self.help(level))
+        sys.exit(0)
+
 
 class OptionsProviderMixIn:
     """Mixin to provide options to an OptionsManager"""

pylint/lint.py

Lines changed: 102 additions & 160 deletions
@@ -243,70 +243,8 @@ def _cpu_count() -> int:
     return 1
 
 
-if multiprocessing is not None:
-
-    class ChildLinter(multiprocessing.Process):
-        def run(self):
-            # pylint: disable=no-member, unbalanced-tuple-unpacking
-            tasks_queue, results_queue, self._config = self._args
-
-            self._config["jobs"] = 1  # Child does not parallelize any further.
-            self._python3_porting_mode = self._config.pop("python3_porting_mode", None)
-            self._plugins = self._config.pop("plugins", None)
-
-            # Run linter for received files/modules.
-            for file_or_module in iter(tasks_queue.get, "STOP"):
-                try:
-                    result = self._run_linter(file_or_module[0])
-                    results_queue.put(result)
-                except Exception as ex:
-                    print(
-                        "internal error with sending report for module %s"
-                        % file_or_module,
-                        file=sys.stderr,
-                    )
-                    print(ex, file=sys.stderr)
-                    results_queue.put({})
-
-        def _run_linter(self, file_or_module):
-            linter = PyLinter()
-
-            # Register standard checkers.
-            linter.load_default_plugins()
-            # Load command line plugins.
-            if self._plugins:
-                linter.load_plugin_modules(self._plugins)
-
-            linter.load_configuration_from_config(self._config)
-
-            # Load plugin specific configuration
-            linter.load_plugin_configuration()
-
-            linter.set_reporter(reporters.CollectingReporter())
-
-            # Enable the Python 3 checker mode. This option is
-            # passed down from the parent linter up to here, since
-            # the Python 3 porting flag belongs to the Run class,
-            # instead of the Linter class.
-            if self._python3_porting_mode:
-                linter.python3_porting_mode()
-
-            # Run the checks.
-            linter.check(file_or_module)
-
-            msgs = [_get_new_args(m) for m in linter.reporter.messages]
-            return (
-                file_or_module,
-                linter.file_state.base_name,
-                linter.current_name,
-                msgs,
-                linter.stats,
-                linter.msg_status,
-            )
-
-
 # pylint: disable=too-many-instance-attributes
-class PyLinter(
+class PyLinter(  # pylint: disable=too-many-public-methods
     config.OptionsManagerMixIn,
     MessagesHandlerMixIn,
     reporters.ReportsHandlerMixIn,
@@ -323,6 +261,9 @@ class PyLinter(
     IDE plugin developers: you may have to call
     `astroid.builder.MANAGER.astroid_cache.clear()` across runs if you want
     to ensure the latest code version is actually checked.
+
+    This class needs to support pickling for parallel linting to work. The exception
+    is the reporter member; see the check_parallel function for more details.
     """
 
     __implements__ = (interfaces.ITokenChecker,)
@@ -971,16 +912,20 @@ def should_analyze_file(modname, path, is_argument=False):
 
     # pylint: enable=unused-argument
 
-    def check(self, files_or_modules):
-        """main checking entry: check a list of files or modules from their
-        name.
-        """
+    def initialize(self):
         # initialize msgs_state now that all messages have been registered into
         # the store
         for msg in self.msgs_store.messages:
             if not msg.may_be_emitted():
                 self._msgs_state[msg.msgid] = False
 
+    def check(self, files_or_modules):
+        """main checking entry: check a list of files or modules from their
+        name.
+        """
+
+        self.initialize()
+
         if not isinstance(files_or_modules, (list, tuple)):
             files_or_modules = (files_or_modules,)
 
@@ -998,100 +943,21 @@ def check(self, files_or_modules):
         elif self.config.jobs == 1:
             self._check_files(self.get_ast, self._iterate_file_descrs(files_or_modules))
         else:
-            self._parallel_check(files_or_modules)
-
-    def _get_jobs_config(self):
-        child_config = collections.OrderedDict()
-        filter_options = {"long-help"}
-        filter_options.update((opt_name for opt_name, _ in self._external_opts))
-        for opt_providers in self._all_options.values():
-            for optname, optdict, val in opt_providers.options_and_values():
-                if optdict.get("deprecated"):
-                    continue
-
-                if optname not in filter_options:
-                    child_config[optname] = utils._format_option_value(optdict, val)
-        child_config["python3_porting_mode"] = self._python3_porting_mode
-        child_config["plugins"] = self._dynamic_plugins
-        return child_config
-
-    def _parallel_task(self, files_or_modules):
-        # Prepare configuration for child linters.
-        child_config = self._get_jobs_config()
-
-        children = []
-        manager = multiprocessing.Manager()
-        tasks_queue = manager.Queue()
-        results_queue = manager.Queue()
-
-        # Send files to child linters.
-        expanded_files = []
-        for descr in self._expand_files(files_or_modules):
-            modname, filepath, is_arg = descr["name"], descr["path"], descr["isarg"]
-            if self.should_analyze_file(modname, filepath, is_argument=is_arg):
-                expanded_files.append(descr)
-
-        # do not start more jobs than needed
-        for _ in range(min(self.config.jobs, len(expanded_files))):
-            child_linter = ChildLinter(args=(tasks_queue, results_queue, child_config))
-            child_linter.start()
-            children.append(child_linter)
-
-        for files_or_module in expanded_files:
-            path = files_or_module["path"]
-            tasks_queue.put([path])
-
-        # collect results from child linters
-        failed = False
-        for _ in expanded_files:
-            try:
-                result = results_queue.get()
-            except Exception as ex:
-                print(
-                    "internal error while receiving results from child linter",
-                    file=sys.stderr,
-                )
-                print(ex, file=sys.stderr)
-                failed = True
-                break
-            yield result
-
-        # Stop child linters and wait for their completion.
-        for _ in range(self.config.jobs):
-            tasks_queue.put("STOP")
-        for child in children:
-            child.join()
-
-        if failed:
-            print("Error occurred, stopping the linter.", file=sys.stderr)
-            sys.exit(32)
-
-    def _parallel_check(self, files_or_modules):
-        # Reset stats.
-        self.open()
-
-        all_stats = []
-        module = None
-        for result in self._parallel_task(files_or_modules):
-            if not result:
-                continue
-            (_, self.file_state.base_name, module, messages, stats, msg_status) = result
-
-            for msg in messages:
-                msg = Message(*msg)
-                self.set_current_module(module)
-                self.reporter.handle_message(msg)
+            check_parallel(
+                self, self.config.jobs, self._iterate_file_descrs(files_or_modules)
+            )
 
-            all_stats.append(stats)
-            self.msg_status |= msg_status
+    def check_single_file(self, name, filepath, modname):
+        """Check single file
 
-        self.stats = _merge_stats(all_stats)
-        self.current_name = module
+        The arguments are the same as those documented in _check_files
 
-        # Insert stats data to local checkers.
-        for checker in self.get_checkers():
-            if checker is not self:
-                checker.stats = self.stats
+        The initialize() method should be called before calling this method
+        """
+        with self._astroid_module_checker() as check_astroid_module:
+            self._check_file(
+                self.get_ast, check_astroid_module, name, filepath, modname
+            )
 
     def _check_files(self, get_ast, file_descrs):
         """Check all files from file_descrs
@@ -1326,6 +1192,78 @@ def _report_evaluation(self):
         self.reporter.display_reports(sect)
 
 
+def check_parallel(linter, jobs, files):
+    """Use the given linter to lint the files with the given amount of workers (jobs)
+    """
+    # The reporter does not need to be passed to worker processes, i.e. the reporter does
+    # not need to be pickleable
+    original_reporter = linter.reporter
+    linter.reporter = None
+
+    # The linter is inherited by all the pool's workers, i.e. the linter
+    # is identical to the linter object here. This is required so that
+    # a custom PyLinter object (inherited from PyLinter) can be used.
+    # See https://github.com/PyCQA/prospector/issues/320
+    with multiprocessing.Pool(
+        jobs, initializer=_worker_initialize, initargs=[linter]
+    ) as pool:
+        # ..and now when the workers have inherited the linter, the actual reporter
+        # can be set back here on the parent process so that results get stored into
+        # correct reporter
+        linter.set_reporter(original_reporter)
+        linter.open()
+
+        all_stats = []
+
+        for module, messages, stats, msg_status in pool.imap_unordered(
+            _worker_check_single_file, files
+        ):
+            linter.set_current_module(module)
+            for msg in messages:
+                msg = Message(*msg)
+                linter.reporter.handle_message(msg)
+
+            all_stats.append(stats)
+            linter.msg_status |= msg_status
+
+    linter.stats = _merge_stats(all_stats)
+
+    # Insert stats data to local checkers.
+    for checker in linter.get_checkers():
+        if checker is not linter:
+            checker.stats = linter.stats
+
+
+# PyLinter object used by worker processes when checking files using multiprocessing
+# should only be used by the worker processes
+_worker_linter = None
+
+
+def _worker_initialize(linter):
+    global _worker_linter  # pylint: disable=global-statement
+    _worker_linter = linter
+
+    # On the worker process side the messages are just collected and passed back to
+    # parent process as _worker_check_single_file function's return value
+    _worker_linter.set_reporter(reporters.CollectingReporter())
+    _worker_linter.open()
+
+
+def _worker_check_single_file(file_item):
+    name, filepath, modname = file_item
+
+    _worker_linter.open()
+    _worker_linter.check_single_file(name, filepath, modname)
+
+    msgs = [_get_new_args(m) for m in _worker_linter.reporter.messages]
+    return (
+        _worker_linter.current_name,
+        msgs,
+        _worker_linter.stats,
+        _worker_linter.msg_status,
+    )
+
+
 # some reporting functions ####################################################
 
 
@@ -1479,6 +1417,10 @@ class Run:
         ),
     )
 
+    @staticmethod
+    def _return_one(*args):  # pylint: disable=unused-argument
+        return 1
+
     def __init__(self, args, reporter=None, do_exit=True):
         self._rcfile = None
         self._plugins = []
@@ -1504,7 +1446,7 @@ def __init__(self, args, reporter=None, do_exit=True):
                     "rcfile",
                     {
                         "action": "callback",
-                        "callback": lambda *args: 1,
+                        "callback": Run._return_one,
                         "type": "string",
                         "metavar": "<file>",
                         "help": "Specify a configuration file.",
@@ -1514,7 +1456,7 @@ def __init__(self, args, reporter=None, do_exit=True):
                     "init-hook",
                     {
                         "action": "callback",
-                        "callback": lambda *args: 1,
+                        "callback": Run._return_one,
                         "type": "string",
                         "metavar": "<code>",
                         "level": 1,
