Skip to content

Commit ddfa220

Browse files
committed
internal/fuzzy: improvements to the symbol scoring algorithm
Based on feedback in golang/go#60027, tweak the fuzzy symbol scoring algorithm to much more strongly prefer sequential and exact matches.

Fixes golang/go#60027

Change-Id: I1c6d019065c4dff4adf2db9e94397a635e13d50f
Reviewed-on: https://go-review.googlesource.com/c/tools/+/493623
gopls-CI: kokoro <[email protected]>
Run-TryBot: Robert Findley <[email protected]>
Reviewed-by: Paul Jolly <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Alan Donovan <[email protected]>
1 parent 3449242 commit ddfa220

File tree

5 files changed

+65
-26
lines changed

5 files changed

+65
-26
lines changed

gopls/internal/lsp/cmd/usage/workspace_symbol.hlp

+2-2
Original file line number | Diff line number | Diff line change
@@ -9,5 +9,5 @@ Example:
99

1010
workspace_symbol-flags:
1111
-matcher=string
12-
specifies the type of matcher: fuzzy, caseSensitive, or caseInsensitive.
13-
The default is caseInsensitive.
12+
specifies the type of matcher: fuzzy, fastfuzzy, casesensitive, or caseinsensitive.
13+
The default is caseinsensitive.

gopls/internal/lsp/cmd/workspace_symbol.go

+4-3
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import (
88
"context"
99
"flag"
1010
"fmt"
11+
"strings"
1112

1213
"golang.org/x/tools/gopls/internal/lsp/protocol"
1314
"golang.org/x/tools/gopls/internal/lsp/source"
@@ -16,7 +17,7 @@ import (
1617

1718
// workspaceSymbol implements the workspace_symbol verb for gopls.
1819
type workspaceSymbol struct {
19-
Matcher string `flag:"matcher" help:"specifies the type of matcher: fuzzy, caseSensitive, or caseInsensitive.\nThe default is caseInsensitive."`
20+
Matcher string `flag:"matcher" help:"specifies the type of matcher: fuzzy, fastfuzzy, casesensitive, or caseinsensitive.\nThe default is caseinsensitive."`
2021

2122
app *Application
2223
}
@@ -46,10 +47,10 @@ func (r *workspaceSymbol) Run(ctx context.Context, args ...string) error {
4647
if opts != nil {
4748
opts(o)
4849
}
49-
switch r.Matcher {
50+
switch strings.ToLower(r.Matcher) {
5051
case "fuzzy":
5152
o.SymbolMatcher = source.SymbolFuzzy
52-
case "caseSensitive":
53+
case "casesensitive":
5354
o.SymbolMatcher = source.SymbolCaseSensitive
5455
case "fastfuzzy":
5556
o.SymbolMatcher = source.SymbolFastFuzzy

gopls/internal/lsp/source/workspace_symbol.go

+4-1
Original file line number | Diff line number | Diff line change
@@ -484,7 +484,10 @@ func matchFile(store *symbolStore, symbolizer symbolizer, matcher matcherFunc, r
484484
// every field or method nesting level to access the field decreases
485485
// the score by a factor of 1.0 - depth*depthFactor, up to a depth of
486486
// 3.
487-
depthFactor = 0.2
487+
//
488+
// Use a small constant here, as this exists mostly to break ties
489+
// (e.g. given a type Foo and a field x.Foo, prefer Foo).
490+
depthFactor = 0.01
488491
)
489492

490493
startWord := true

internal/fuzzy/symbol.go

+26-18
Original file line number | Diff line number | Diff line change
@@ -26,9 +26,6 @@ import (
2626
// symbol or identifiers, so doing this avoids allocating strings.
2727
// - We can return the index of the right-most match, allowing us to trim
2828
// irrelevant qualification.
29-
//
30-
// This implementation is experimental, serving as a reference fast algorithm
31-
// to compare to the fuzzy algorithm implemented by Matcher.
3229
type SymbolMatcher struct {
3330
// Using buffers of length 256 is both a reasonable size for most qualified
3431
// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
@@ -169,19 +166,29 @@ input:
169166
// Score is the average score for each character.
170167
//
171168
// A character score is the product of:
172-
// 1. 1.0 if the character starts a segment, .8 if the character start a
173-
// mid-segment word, otherwise 0.6. This carries over to immediately
174-
// following characters.
175-
// 2. For the final character match, the multiplier from (1) is reduced to
176-
// .8 if the next character in the input is a mid-segment word, or 0.6 if
177-
// the next character in the input is not a word or segment start. This
178-
// ensures that we favor whole-word or whole-segment matches over prefix
179-
// matches.
180-
// 3. 1.0 if the character is part of the last segment, otherwise
181-
// 1.0-.2*<segments from the right>, with a max segment count of 3.
169+
// 1. 1.0 if the character starts a segment or is preceded by a matching
170+
// character, 0.9 if the character starts a mid-segment word, else 0.6.
171+
//
172+
// Note that characters preceded by a matching character get the max
173+
// score of 1.0 so that sequential or exact matches are preferred, even
174+
// if they don't start/end at a segment or word boundary. For example, a
175+
// match for "func" in intfuncs should have a higher score than in
176+
// ifunmatched.
177+
//
178+
// For the final character match, the multiplier from (1) is reduced to
179+
// 0.9 if the next character in the input is a mid-segment word, or 0.6
180+
// if the next character in the input is not a word or segment start.
181+
// This ensures that we favor whole-word or whole-segment matches over
182+
// prefix matches.
183+
//
184+
// 2. 1.0 if the character is part of the last segment, otherwise
185+
// 1.0-0.1*<segments from the right>, with a max segment count of 3.
186+
// Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
187+
// early in a qualified symbol name) still scores higher than _f_o_o_
188+
// (a completely split match).
182189
//
183-
// This is a very naive algorithm, but it is fast. There's lots of prior art
184-
// here, and we should leverage it. For example, we could explicitly consider
190+
// This is a naive algorithm, but it is fast. There's lots of prior art here
191+
// that could be leveraged. For example, we could explicitly consider
185192
// character distance, and exact matches of words or segments.
186193
//
187194
// Also note that this might not actually find the highest scoring match, as
@@ -192,10 +199,10 @@ input:
192199
p = m.pattern[pi]
193200

194201
const (
195-
segStreak = 1.0
196-
wordStreak = 0.8
202+
segStreak = 1.0 // start of segment or sequential match
203+
wordStreak = 0.9 // start of word match
197204
noStreak = 0.6
198-
perSegment = 0.2 // we count at most 3 segments above
205+
perSegment = 0.1 // we count at most 3 segments above
199206
)
200207

201208
streakBonus := noStreak
@@ -228,6 +235,7 @@ input:
228235
if finalChar {
229236
break
230237
}
238+
streakBonus = segStreak // see above: sequential characters get the max score
231239
} else {
232240
streakBonus = noStreak
233241
}

internal/fuzzy/symbol_test.go

+29-2
Original file line number | Diff line number | Diff line change
@@ -40,12 +40,12 @@ func TestSymbolRanking(t *testing.T) {
4040
symbols := []string{
4141
"this.is.better.than.most",
4242
"test.foo.bar",
43-
"atest",
4443
"thebest",
4544
"test.foo",
4645
"test.foo",
47-
"tTest",
46+
"atest",
4847
"testage",
48+
"tTest",
4949
"foo.test",
5050
"test",
5151
}
@@ -60,6 +60,33 @@ func TestSymbolRanking(t *testing.T) {
6060
}
6161
}
6262

63+
// Test that we strongly prefer exact matches.
64+
//
65+
// In golang/go#60027, we preferred "Runner" for the query "rune" over several
66+
// results containing the word "rune" exactly. Following this observation,
67+
// scoring was tweaked to more strongly emphasize sequential characters and
68+
// exact matches.
69+
func TestSymbolRanking_Issue60027(t *testing.T) {
70+
matcher := NewSymbolMatcher("rune")
71+
72+
// symbols to match, in ascending order of ranking.
73+
symbols := []string{
74+
"Runner",
75+
"singleRuneParam",
76+
"Config.ifsRune",
77+
"Parser.rune",
78+
}
79+
prev := 0.0
80+
for _, sym := range symbols {
81+
_, score := matcher.Match([]string{sym})
82+
t.Logf("Match(%q) = %v", sym, score)
83+
if score < prev {
84+
t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
85+
}
86+
prev = score
87+
}
88+
}
89+
6390
func TestChunkedMatch(t *testing.T) {
6491
matcher := NewSymbolMatcher("test")
6592

0 commit comments

Comments
 (0)