Improve performance for errors on class with many attributes (#14379)

hauntsaninja · web-flow · commit f104914b9d4f · 2023-01-02T23:47:03.000-08:00
When checking manticore with `--check-untyped-defs`, this is a 4x total speedup from master, from ~320s to ~80s (uncompiled). I looked into this because of a comment on python/typeshed#9443.
diff --git a/mypy/messages.py b/mypy/messages.py
@@ -15,7 +15,7 @@
 import re
 from contextlib import contextmanager
 from textwrap import dedent
-from typing import Any, Callable, Iterable, Iterator, List, Sequence, cast
+from typing import Any, Callable, Collection, Iterable, Iterator, List, Sequence, cast
 from typing_extensions import Final
 
 from mypy import errorcodes as codes, message_registry
@@ -440,7 +440,7 @@ def has_no_attr(
                         alternatives.discard(member)
 
                         matches = [m for m in COMMON_MISTAKES.get(member, []) if m in alternatives]
-                        matches.extend(best_matches(member, alternatives)[:3])
+                        matches.extend(best_matches(member, alternatives, n=3))
                         if member == "__aiter__" and matches == ["__iter__"]:
                             matches = []  # Avoid misleading suggestion
                         if matches:
@@ -928,11 +928,11 @@ def unexpected_keyword_argument(
                     matching_type_args.append(callee_arg_name)
                 else:
                     not_matching_type_args.append(callee_arg_name)
-        matches = best_matches(name, matching_type_args)
+        matches = best_matches(name, matching_type_args, n=3)
         if not matches:
-            matches = best_matches(name, not_matching_type_args)
+            matches = best_matches(name, not_matching_type_args, n=3)
         if matches:
-            msg += f"; did you mean {pretty_seq(matches[:3], 'or')}?"
+            msg += f"; did you mean {pretty_seq(matches, 'or')}?"
         self.fail(msg, context, code=codes.CALL_ARG)
         module = find_defining_module(self.modules, callee)
         if module:
@@ -1695,10 +1695,10 @@ def typeddict_key_not_found(
                 context,
                 code=codes.TYPEDDICT_ITEM,
             )
-            matches = best_matches(item_name, typ.items.keys())
+            matches = best_matches(item_name, typ.items.keys(), n=3)
             if matches:
                 self.note(
-                    "Did you mean {}?".format(pretty_seq(matches[:3], "or")),
+                    "Did you mean {}?".format(pretty_seq(matches, "or")),
                     context,
                     code=codes.TYPEDDICT_ITEM,
                 )
@@ -2798,11 +2798,24 @@ def find_defining_module(modules: dict[str, MypyFile], typ: CallableType) -> Myp
 COMMON_MISTAKES: Final[dict[str, Sequence[str]]] = {"add": ("append", "extend")}
 
 
-def best_matches(current: str, options: Iterable[str]) -> list[str]:
-    ratios = {v: difflib.SequenceMatcher(a=current, b=v).ratio() for v in options}
-    return sorted(
-        (o for o in options if ratios[o] > 0.75), reverse=True, key=lambda v: (ratios[v], v)
-    )
+def _real_quick_ratio(a: str, b: str) -> float:
+    # this is an upper bound on difflib.SequenceMatcher.ratio
+    # similar to difflib.SequenceMatcher.real_quick_ratio, but faster since we don't instantiate
+    al = len(a)
+    bl = len(b)
+    return 2.0 * min(al, bl) / (al + bl)
+
+
+def best_matches(current: str, options: Collection[str], n: int) -> list[str]:
+    # narrow down options cheaply
+    assert current
+    options = [o for o in options if _real_quick_ratio(current, o) > 0.75]
+    if len(options) >= 50:
+        options = [o for o in options if abs(len(o) - len(current)) <= 1]
+
+    ratios = {option: difflib.SequenceMatcher(a=current, b=option).ratio() for option in options}
+    options = [option for option, ratio in ratios.items() if ratio > 0.75]
+    return sorted(options, key=lambda v: (-ratios[v], v))[:n]
 
 
 def pretty_seq(args: Sequence[str], conjunction: str) -> str:
diff --git a/mypy/semanal.py b/mypy/semanal.py
@@ -2531,7 +2531,7 @@ def report_missing_module_attribute(
                 )
             else:
                 alternatives = set(module.names.keys()).difference({source_id})
-                matches = best_matches(source_id, alternatives)[:3]
+                matches = best_matches(source_id, alternatives, n=3)
                 if matches:
                     suggestion = f"; maybe {pretty_seq(matches, 'or')}?"
                     message += f"{suggestion}"
diff --git a/test-data/unit/check-kwargs.test b/test-data/unit/check-kwargs.test
@@ -87,7 +87,7 @@ class A: pass
 
 [case testMultipleKeywordsForMisspelling]
 def f(thing : 'A', other: 'A', atter: 'A', btter: 'B') -> None: pass # N: "f" defined here
-f(otter=A()) # E: Unexpected keyword argument "otter" for "f"; did you mean "other" or "atter"?
+f(otter=A()) # E: Unexpected keyword argument "otter" for "f"; did you mean "atter" or "other"?
 class A: pass
 class B: pass
 
@@ -99,15 +99,15 @@ class B: pass
 
 [case testKeywordMisspellingInheritance]
 def f(atter: 'A', btter: 'B', ctter: 'C') -> None: pass # N: "f" defined here
-f(otter=B()) # E: Unexpected keyword argument "otter" for "f"; did you mean "btter" or "atter"?
+f(otter=B()) # E: Unexpected keyword argument "otter" for "f"; did you mean "atter" or "btter"?
 class A: pass
 class B(A): pass
 class C: pass
 
 [case testKeywordMisspellingFloatInt]
 def f(atter: float, btter: int) -> None: pass # N: "f" defined here
 x: int = 5
-f(otter=x) # E: Unexpected keyword argument "otter" for "f"; did you mean "btter" or "atter"?
+f(otter=x) # E: Unexpected keyword argument "otter" for "f"; did you mean "atter" or "btter"?
 
 [case testKeywordMisspellingVarArgs]
 def f(other: 'A', *atter: 'A') -> None: pass # N: "f" defined here
diff --git a/test-data/unit/check-modules.test b/test-data/unit/check-modules.test
@@ -2871,7 +2871,7 @@ aaaaa: int
 
 [case testModuleAttributeThreeSuggestions]
 import m
-m.aaaaa # E: Module has no attribute "aaaaa"; maybe "aabaa", "aaaba", or "aaaab"?
+m.aaaaa # E: Module has no attribute "aaaaa"; maybe "aaaab", "aaaba", or "aabaa"?
 
 [file m.py]
 aaaab: int
diff --git a/test-data/unit/semanal-modules.test b/test-data/unit/semanal-modules.test
@@ -814,7 +814,7 @@ def somef_unction():
 [file f.py]
 from m.x import somefunction
 [out]
-tmp/f.py:1: error: Module "m.x" has no attribute "somefunction"; maybe "somef_unction" or "some_function"?
+tmp/f.py:1: error: Module "m.x" has no attribute "somefunction"; maybe "some_function" or "somef_unction"?
 
 [case testImportMisspellingMultipleCandidatesTruncated]
 import f
@@ -831,7 +831,7 @@ def somefun_ction():
 [file f.py]
 from m.x import somefunction
 [out]
-tmp/f.py:1: error: Module "m.x" has no attribute "somefunction"; maybe "somefun_ction", "somefu_nction", or "somef_unction"?
+tmp/f.py:1: error: Module "m.x" has no attribute "somefunction"; maybe "some_function", "somef_unction", or "somefu_nction"?
 
 [case testFromImportAsInStub]
 from m import *

Original file line number	Diff line number	Diff line change
`@@ -2531,7 +2531,7 @@ def report_missing_module_attribute(`
`2531`	`2531`	`)`
`2532`	`2532`	`else:`
`2533`	`2533`	`alternatives = set(module.names.keys()).difference({source_id})`
`2534`		`- matches = best_matches(source_id, alternatives)[:3]`
	`2534`	`+ matches = best_matches(source_id, alternatives, n=3)`
`2535`	`2535`	`if matches:`
`2536`	`2536`	`suggestion = f"; maybe {pretty_seq(matches, 'or')}?"`
`2537`	`2537`	`message += f"{suggestion}"`