Skip to content

Commit 0ba9c84

Browse files
findleyr authored and gopherbot committed
internal/fuzzy: several improvements for symbol matching
Following the edge case discovered in golang/go#60201, take a more scientific approach to improving symbol match scoring:

- Add a conformance test that compares Matcher with SymbolMatcher, querying all identifiers in x/tools. The two are not expected to agree in all cases, but this test helped find interesting ranking edge cases, which are added to the ranking test.
- Don't count a capital letter in the middle of a sequence of capital letters (e.g. the M in YAML) as a word start. This was the inconsistency that led to golang/go#60201.
- Compute the sequence bonus before role score; role score should take precedence.
- Simplify the sequence scoring logic: a sequential character gets the same score as a word start, unless it is the final character in the pattern, in which case we also adjust for whether it completes a word or segment. This feels like a reasonable heuristic.
- Fix a bug in final-rune adjustment where we were checking the next input rune for a segment start, not a separator.

Notably, the scoring improvements above were all derived from first principles, and happened to also improve the conformance rate in the new test.

Additionally, make the following cleanup:

- s/character/rune throughout, since that's what we mean
- add debugging support for more easily understanding the match algorithm
- add additional commentary
- add benchmarks

Fixes golang/go#60201

Change-Id: I838898c49cbb69af083a8cc837612da047778c40
Reviewed-on: https://go-review.googlesource.com/c/tools/+/531697
Reviewed-by: Alan Donovan <[email protected]>
Auto-Submit: Robert Findley <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent c2725ad commit 0ba9c84

File tree

4 files changed

+334
-84
lines changed

4 files changed

+334
-84
lines changed

gopls/internal/lsp/command/interface.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ type Interface interface {
3737
//
3838
// Applies a fix to a region of source code.
3939
ApplyFix(context.Context, ApplyFixArgs) error
40+
4041
// Test: Run test(s) (legacy)
4142
//
4243
// Runs `go test` for a specific set of test or benchmark functions.

internal/fuzzy/self_test.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright 2023 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package fuzzy_test
6+
7+
import (
8+
"testing"
9+
10+
. "golang.org/x/tools/internal/fuzzy"
11+
)
12+
13+
func BenchmarkSelf_Matcher(b *testing.B) {
14+
idents := collectIdentifiers(b)
15+
patterns := generatePatterns()
16+
17+
for i := 0; i < b.N; i++ {
18+
for _, pattern := range patterns {
19+
sm := NewMatcher(pattern)
20+
for _, ident := range idents {
21+
_ = sm.Score(ident)
22+
}
23+
}
24+
}
25+
}
26+
27+
func BenchmarkSelf_SymbolMatcher(b *testing.B) {
28+
idents := collectIdentifiers(b)
29+
patterns := generatePatterns()
30+
31+
for i := 0; i < b.N; i++ {
32+
for _, pattern := range patterns {
33+
sm := NewSymbolMatcher(pattern)
34+
for _, ident := range idents {
35+
_, _ = sm.Match([]string{ident})
36+
}
37+
}
38+
}
39+
}

internal/fuzzy/symbol.go

Lines changed: 125 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
package fuzzy
66

77
import (
8+
"bytes"
9+
"fmt"
10+
"log"
811
"unicode"
912
)
1013

@@ -36,10 +39,12 @@ type SymbolMatcher struct {
3639
segments [256]uint8 // how many segments from the right is each rune
3740
}
3841

42+
// Rune roles.
3943
const (
40-
segmentStart uint32 = 1 << iota
41-
wordStart
42-
separator
44+
segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.')
45+
wordStart // input rune starts a word, per camel-case naming rules
46+
separator // input rune is a separator ('/' or '.')
47+
upper // input rune is an upper case letter
4348
)
4449

4550
// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
@@ -61,17 +66,17 @@ func NewSymbolMatcher(pattern string) *SymbolMatcher {
6166
return m
6267
}
6368

64-
// Match looks for the right-most match of the search pattern within the symbol
65-
// represented by concatenating the given chunks, returning its offset and
66-
// score.
69+
// Match searches for the right-most match of the search pattern within the
70+
// symbol represented by concatenating the given chunks.
6771
//
68-
// If a match is found, the first return value will hold the absolute byte
69-
// offset within all chunks for the start of the symbol. In other words, the
70-
// index of the match within strings.Join(chunks, ""). If no match is found,
71-
// the first return value will be -1.
72+
// If a match is found, the first result holds the absolute byte offset within
73+
// all chunks for the start of the symbol. In other words, the index of the
74+
// match within strings.Join(chunks, "").
7275
//
7376
// The second return value will be the score of the match, which is always
7477
// between 0 and 1, inclusive. A score of 0 indicates no match.
78+
//
79+
// If no match is found, Match returns (-1, 0).
7580
func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
7681
// Explicit behavior for an empty pattern.
7782
//
@@ -81,11 +86,25 @@ func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
8186
return -1, 0
8287
}
8388

84-
// First phase: populate the input buffer with lower-cased runes.
89+
// Matching implements a heavily optimized linear scoring algorithm on the
90+
// input. This is not guaranteed to produce the highest score, but works well
91+
// enough, particularly due to the right-to-left significance of qualified
92+
// symbols.
93+
//
94+
// Matching proceeds in three passes through the input:
95+
// - The first pass populates the input buffer and collects rune roles.
96+
// - The second pass proceeds right-to-left to find the right-most match.
97+
// - The third pass proceeds left-to-right from the start of the right-most
98+
// match, to find the most *compact* match, and computes the score of this
99+
// match.
100+
//
101+
// See below for more details of each pass, as well as the scoring algorithm.
102+
103+
// First pass: populate the input buffer out of the provided chunks
104+
// (lower-casing in the process), and collect rune roles.
85105
//
86106
// We could also check for a forward match here, but since we'd have to write
87107
// the entire input anyway this has negligible impact on performance.
88-
89108
var (
90109
inputLen = uint8(0)
91110
modifiers = wordStart | segmentStart
@@ -107,7 +126,16 @@ input:
107126
l = unicode.ToLower(r)
108127
}
109128
if l != r {
110-
modifiers |= wordStart
129+
modifiers |= upper
130+
131+
// If the current rune is capitalized *and the preceding rune was not*,
132+
// mark this as a word start. This avoids spuriously high ranking of
133+
// non-camelcase naming schemas, such as the
134+
// yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of
135+
// golang/go#60201.
136+
if inputLen == 0 || m.roles[inputLen-1]&upper == 0 {
137+
modifiers |= wordStart
138+
}
111139
}
112140
m.inputBuffer[inputLen] = l
113141
m.roles[inputLen] = modifiers
@@ -125,14 +153,13 @@ input:
125153
}
126154
}
127155

128-
// Second phase: find the right-most match, and count segments from the
156+
// Second pass: find the right-most match, and count segments from the
129157
// right.
130-
131158
var (
132159
pi = uint8(m.patternLen - 1) // pattern index
133160
p = m.pattern[pi] // pattern rune
134161
start = -1 // start offset of match
135-
rseg = uint8(0)
162+
rseg = uint8(0) // effective "depth" from the right of the current rune in consideration
136163
)
137164
const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
138165

@@ -144,6 +171,8 @@ input:
144171
m.segments[ii] = rseg
145172
if p == r {
146173
if pi == 0 {
174+
// TODO(rfindley): BUG: the docstring for Match says that it returns an
175+
// absolute byte offset, but clearly it is returning a rune offset here.
147176
start = int(ii)
148177
break
149178
}
@@ -161,85 +190,120 @@ input:
161190
return -1, 0
162191
}
163192

164-
// Third phase: find the shortest match, and compute the score.
193+
// Third pass: find the shortest match and compute the score.
165194

166-
// Score is the average score for each character.
195+
// Score is the average score for each rune.
167196
//
168-
// A character score is the multiple of:
169-
// 1. 1.0 if the character starts a segment or is preceded by a matching
170-
// character, 0.9 if the character starts a mid-segment word, else 0.6.
197+
// A rune score is the multiple of:
198+
// 1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the
199+
// rune starts a mid-segment word, else 0.6.
171200
//
172-
// Note that characters preceded by a matching character get the max
173-
// score of 1.0 so that sequential or exact matches are preferred, even
174-
// if they don't start/end at a segment or word boundary. For example, a
175-
// match for "func" in intfuncs should have a higher score than in
176-
// ifunmatched.
201+
// Runes preceded by a matching rune are treated the same as the start
202+
// of a mid-segment word (with a 0.9 score), so that sequential or exact
203+
// matches are preferred. We call this a sequential bonus.
177204
//
178-
// For the final character match, the multiplier from (1) is reduced to
179-
// 0.9 if the next character in the input is a mid-segment word, or 0.6
180-
// if the next character in the input is not a word or segment start.
181-
// This ensures that we favor whole-word or whole-segment matches over
182-
// prefix matches.
205+
// For the final rune match, this sequential bonus is reduced to 0.8 if
206+
// the next rune in the input is a mid-segment word, or 0.7 if the next
207+
// rune in the input is not a word or segment start. This ensures that
208+
// we favor whole-word or whole-segment matches over prefix matches.
183209
//
184-
// 2. 1.0 if the character is part of the last segment, otherwise
210+
// 2. 1.0 if the rune is part of the last segment, otherwise
185211
// 1.0-0.1*<segments from the right>, with a max segment count of 3.
186212
// Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
187-
// early in a qualified symbol name) still scores higher than _f_o_o_
188-
// (a completely split match).
213+
// early in a qualified symbol name) still scores higher than _f_o_o_ (a
214+
// completely split match).
189215
//
190216
// This is a naive algorithm, but it is fast. There's lots of prior art here
191217
// that could be leveraged. For example, we could explicitly consider
192-
// character distance, and exact matches of words or segments.
218+
// rune distance, and exact matches of words or segments.
193219
//
194220
// Also note that this might not actually find the highest scoring match, as
195221
// doing so could require a non-linear algorithm, depending on how the score
196222
// is calculated.
197223

224+
// debugging support
225+
const debug = false // enable to log debugging information
226+
var (
227+
runeScores []float64
228+
runeIdxs []int
229+
)
230+
198231
pi = 0
199232
p = m.pattern[pi]
200233

201234
const (
202-
segStreak = 1.0 // start of segment or sequential match
203-
wordStreak = 0.9 // start of word match
204-
noStreak = 0.6
205-
perSegment = 0.1 // we count at most 3 segments above
235+
segStartScore = 1.0 // base score of runes starting a segment
236+
wordScore = 0.9 // base score of runes starting or continuing a word
237+
noStreak = 0.6
238+
perSegment = 0.1 // we count at most 3 segments above
206239
)
207240

208-
streakBonus := noStreak
209241
totScore := 0.0
242+
lastMatch := uint8(255)
210243
for ii := uint8(start); ii < inputLen; ii++ {
211244
r := m.inputBuffer[ii]
212245
if r == p {
213246
pi++
247+
finalRune := pi >= m.patternLen
214248
p = m.pattern[pi]
215-
// Note: this could be optimized with some bit operations.
249+
250+
baseScore := noStreak
251+
252+
// Calculate the sequence bonus based on preceding matches.
253+
//
254+
// We do this first as it is overridden by role scoring below.
255+
if lastMatch == ii-1 {
256+
baseScore = wordScore
257+
// Reduce the sequence bonus for the final rune of the pattern based on
258+
// whether it borders a new segment or word.
259+
if finalRune {
260+
switch {
261+
case ii == inputLen-1 || m.roles[ii+1]&separator != 0:
262+
// Full segment: no reduction
263+
case m.roles[ii+1]&wordStart != 0:
264+
baseScore = wordScore - 0.1
265+
default:
266+
baseScore = wordScore - 0.2
267+
}
268+
}
269+
}
270+
lastMatch = ii
271+
272+
// Calculate the rune's role score. If the rune starts a segment or word,
273+
// this overrides the sequence score, as the rune starts a new sequence.
216274
switch {
217-
case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus:
218-
streakBonus = segStreak
219-
case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus:
220-
streakBonus = wordStreak
275+
case m.roles[ii]&segmentStart != 0:
276+
baseScore = segStartScore
277+
case m.roles[ii]&wordStart != 0:
278+
baseScore = wordScore
221279
}
222-
finalChar := pi >= m.patternLen
223-
// finalCost := 1.0
224-
if finalChar && streakBonus > noStreak {
225-
switch {
226-
case ii == inputLen-1 || m.roles[ii+1]&segmentStart != 0:
227-
// Full segment: no reduction
228-
case m.roles[ii+1]&wordStart != 0:
229-
streakBonus = wordStreak
230-
default:
231-
streakBonus = noStreak
232-
}
280+
281+
// Apply the segment-depth penalty (segments from the right).
282+
runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment)
283+
if debug {
284+
runeScores = append(runeScores, runeScore)
285+
runeIdxs = append(runeIdxs, int(ii))
233286
}
234-
totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment)
235-
if finalChar {
287+
totScore += runeScore
288+
if finalRune {
236289
break
237290
}
238-
streakBonus = segStreak // see above: sequential characters get the max score
239-
} else {
240-
streakBonus = noStreak
241291
}
242292
}
243293

294+
if debug {
295+
// Format rune roles and scores in line:
296+
// fo[o:.52].[b:1]a[r:.6]
297+
var summary bytes.Buffer
298+
last := 0
299+
for i, idx := range runeIdxs {
300+
summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes
301+
fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i])
302+
last = idx + 1
303+
}
304+
summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes
305+
log.Println(summary.String())
306+
}
307+
244308
return start, totScore / float64(m.patternLen)
245309
}

0 commit comments

Comments
 (0)