5
5
package fuzzy
6
6
7
7
import (
8
+ "bytes"
9
+ "fmt"
10
+ "log"
8
11
"unicode"
9
12
)
10
13
@@ -36,10 +39,12 @@ type SymbolMatcher struct {
36
39
segments [256 ]uint8 // how many segments from the right is each rune
37
40
}
38
41
42
+ // Rune roles.
39
43
const (
40
- segmentStart uint32 = 1 << iota
41
- wordStart
42
- separator
44
+ segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.')
45
+ wordStart // input rune starts a word, per camel-case naming rules
46
+ separator // input rune is a separator ('/' or '.')
47
+ upper // input rune is an upper case letter
43
48
)
44
49
45
50
// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
@@ -61,17 +66,17 @@ func NewSymbolMatcher(pattern string) *SymbolMatcher {
61
66
return m
62
67
}
63
68
64
- // Match looks for the right-most match of the search pattern within the symbol
65
- // represented by concatenating the given chunks, returning its offset and
66
- // score.
69
+ // Match searches for the right-most match of the search pattern within the
70
+ // symbol represented by concatenating the given chunks.
67
71
//
68
- // If a match is found, the first return value will hold the absolute byte
69
- // offset within all chunks for the start of the symbol. In other words, the
70
- // index of the match within strings.Join(chunks, ""). If no match is found,
71
- // the first return value will be -1.
72
+ // If a match is found, the first result holds the absolute byte offset within
73
+ // all chunks for the start of the symbol. In other words, the index of the
74
+ // match within strings.Join(chunks, "").
72
75
//
73
76
// The second return value will be the score of the match, which is always
74
77
// between 0 and 1, inclusive. A score of 0 indicates no match.
78
+ //
79
+ // If no match is found, Match returns (-1, 0).
75
80
func (m * SymbolMatcher ) Match (chunks []string ) (int , float64 ) {
76
81
// Explicit behavior for an empty pattern.
77
82
//
@@ -81,11 +86,25 @@ func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
81
86
return - 1 , 0
82
87
}
83
88
84
- // First phase: populate the input buffer with lower-cased runes.
89
+ // Matching implements a heavily optimized linear scoring algorithm on the
90
+ // input. This is not guaranteed to produce the highest score, but works well
91
+ // enough, particularly due to the right-to-left significance of qualified
92
+ // symbols.
93
+ //
94
+ // Matching proceeds in three passes through the input:
95
+ // - The first pass populates the input buffer and collects rune roles.
96
+ // - The second pass proceeds right-to-left to find the right-most match.
97
+ // - The third pass proceeds left-to-right from the start of the right-most
98
+ // match, to find the most *compact* match, and computes the score of this
99
+ // match.
100
+ //
101
+ // See below for more details of each pass, as well as the scoring algorithm.
102
+
103
+ // First pass: populate the input buffer out of the provided chunks
104
+ // (lower-casing in the process), and collect rune roles.
85
105
//
86
106
// We could also check for a forward match here, but since we'd have to write
87
107
// the entire input anyway this has negligible impact on performance.
88
-
89
108
var (
90
109
inputLen = uint8 (0 )
91
110
modifiers = wordStart | segmentStart
@@ -107,7 +126,16 @@ input:
107
126
l = unicode .ToLower (r )
108
127
}
109
128
if l != r {
110
- modifiers |= wordStart
129
+ modifiers |= upper
130
+
131
+ // If the current rune is capitalized *and the preceding rune was not*,
132
+ // mark this as a word start. This avoids spuriously high ranking of
133
+ // non-camel-case naming schemes, such as the
134
+ // yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of
135
+ // golang/go#60201.
136
+ if inputLen == 0 || m .roles [inputLen - 1 ]& upper == 0 {
137
+ modifiers |= wordStart
138
+ }
111
139
}
112
140
m .inputBuffer [inputLen ] = l
113
141
m .roles [inputLen ] = modifiers
@@ -125,14 +153,13 @@ input:
125
153
}
126
154
}
127
155
128
- // Second phase : find the right-most match, and count segments from the
156
+ // Second pass : find the right-most match, and count segments from the
129
157
// right.
130
-
131
158
var (
132
159
pi = uint8 (m .patternLen - 1 ) // pattern index
133
160
p = m .pattern [pi ] // pattern rune
134
161
start = - 1 // start offset of match
135
- rseg = uint8 (0 )
162
+ rseg = uint8 (0 ) // effective "depth" from the right of the current rune in consideration
136
163
)
137
164
const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
138
165
@@ -144,6 +171,8 @@ input:
144
171
m .segments [ii ] = rseg
145
172
if p == r {
146
173
if pi == 0 {
174
+ // TODO(rfindley): BUG: the docstring for Match says that it returns an
175
+ // absolute byte offset, but clearly it is returning a rune offset here.
147
176
start = int (ii )
148
177
break
149
178
}
@@ -161,85 +190,120 @@ input:
161
190
return - 1 , 0
162
191
}
163
192
164
- // Third phase : find the shortest match, and compute the score.
193
+ // Third pass : find the shortest match and compute the score.
165
194
166
- // Score is the average score for each character .
195
+ // Score is the average score for each rune .
167
196
//
168
- // A character score is the multiple of:
169
- // 1. 1.0 if the character starts a segment or is preceded by a matching
170
- // character, 0.9 if the character starts a mid-segment word, else 0.6.
197
+ // A rune score is the product of:
198
+ // 1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the
199
+ // rune starts a mid-segment word, else 0.6.
171
200
//
172
- // Note that characters preceded by a matching character get the max
173
- // score of 1.0 so that sequential or exact matches are preferred, even
174
- // if they don't start/end at a segment or word boundary. For example, a
175
- // match for "func" in intfuncs should have a higher score than in
176
- // ifunmatched.
201
+ // Runes preceded by a matching rune are treated the same as the start
202
+ // of a mid-segment word (with a 0.9 score), so that sequential or exact
203
+ // matches are preferred. We call this a sequential bonus.
177
204
//
178
- // For the final character match, the multiplier from (1) is reduced to
179
- // 0.9 if the next character in the input is a mid-segment word, or 0.6
180
- // if the next character in the input is not a word or segment start.
181
- // This ensures that we favor whole-word or whole-segment matches over
182
- // prefix matches.
205
+ // For the final rune match, this sequential bonus is reduced to 0.8 if
206
+ // the next rune in the input starts a mid-segment word, or 0.7 if the next
207
+ // rune in the input is not a word or segment start. This ensures that
208
+ // we favor whole-word or whole-segment matches over prefix matches.
183
209
//
184
- // 2. 1.0 if the character is part of the last segment, otherwise
210
+ // 2. 1.0 if the rune is part of the last segment, otherwise
185
211
// 1.0-0.1*<segments from the right>, with a max segment count of 3.
186
212
// Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
187
- // early in a qualified symbol name) still scores higher than _f_o_o_
188
- // (a completely split match).
213
+ // early in a qualified symbol name) still scores higher than _f_o_o_ (a
214
+ // completely split match).
189
215
//
190
216
// This is a naive algorithm, but it is fast. There's lots of prior art here
191
217
// that could be leveraged. For example, we could explicitly consider
192
- // character distance, and exact matches of words or segments.
218
+ // rune distance, and exact matches of words or segments.
193
219
//
194
220
// Also note that this might not actually find the highest scoring match, as
195
221
// doing so could require a non-linear algorithm, depending on how the score
196
222
// is calculated.
197
223
224
+ // debugging support
225
+ const debug = false // enable to log debugging information
226
+ var (
227
+ runeScores []float64
228
+ runeIdxs []int
229
+ )
230
+
198
231
pi = 0
199
232
p = m .pattern [pi ]
200
233
201
234
const (
202
- segStreak = 1.0 // start of segment or sequential match
203
- wordStreak = 0.9 // start of word match
204
- noStreak = 0.6
205
- perSegment = 0.1 // we count at most 3 segments above
235
+ segStartScore = 1.0 // base score of runes starting a segment
236
+ wordScore = 0.9 // base score of runes starting or continuing a word
237
+ noStreak = 0.6
238
+ perSegment = 0.1 // we count at most 3 segments above
206
239
)
207
240
208
- streakBonus := noStreak
209
241
totScore := 0.0
242
+ lastMatch := uint8 (255 )
210
243
for ii := uint8 (start ); ii < inputLen ; ii ++ {
211
244
r := m .inputBuffer [ii ]
212
245
if r == p {
213
246
pi ++
247
+ finalRune := pi >= m .patternLen
214
248
p = m .pattern [pi ]
215
- // Note: this could be optimized with some bit operations.
249
+
250
+ baseScore := noStreak
251
+
252
+ // Calculate the sequence bonus based on preceding matches.
253
+ //
254
+ // We do this first as it is overridden by role scoring below.
255
+ if lastMatch == ii - 1 {
256
+ baseScore = wordScore
257
+ // Reduce the sequence bonus for the final rune of the pattern based on
258
+ // whether it borders a new segment or word.
259
+ if finalRune {
260
+ switch {
261
+ case ii == inputLen - 1 || m .roles [ii + 1 ]& separator != 0 :
262
+ // Full segment: no reduction
263
+ case m .roles [ii + 1 ]& wordStart != 0 :
264
+ baseScore = wordScore - 0.1
265
+ default :
266
+ baseScore = wordScore - 0.2
267
+ }
268
+ }
269
+ }
270
+ lastMatch = ii
271
+
272
+ // Calculate the rune's role score. If the rune starts a segment or word,
273
+ // this overrides the sequence score, as the rune starts a new sequence.
216
274
switch {
217
- case m .roles [ii ]& segmentStart != 0 && segStreak > streakBonus :
218
- streakBonus = segStreak
219
- case m .roles [ii ]& wordStart != 0 && wordStreak > streakBonus :
220
- streakBonus = wordStreak
275
+ case m .roles [ii ]& segmentStart != 0 :
276
+ baseScore = segStartScore
277
+ case m .roles [ii ]& wordStart != 0 :
278
+ baseScore = wordScore
221
279
}
222
- finalChar := pi >= m .patternLen
223
- // finalCost := 1.0
224
- if finalChar && streakBonus > noStreak {
225
- switch {
226
- case ii == inputLen - 1 || m .roles [ii + 1 ]& segmentStart != 0 :
227
- // Full segment: no reduction
228
- case m .roles [ii + 1 ]& wordStart != 0 :
229
- streakBonus = wordStreak
230
- default :
231
- streakBonus = noStreak
232
- }
280
+
281
+ // Apply the segment-depth penalty (segments from the right).
282
+ runeScore := baseScore * (1.0 - float64 (m .segments [ii ])* perSegment )
283
+ if debug {
284
+ runeScores = append (runeScores , runeScore )
285
+ runeIdxs = append (runeIdxs , int (ii ))
233
286
}
234
- totScore += streakBonus * ( 1.0 - float64 ( m . segments [ ii ]) * perSegment )
235
- if finalChar {
287
+ totScore += runeScore
288
+ if finalRune {
236
289
break
237
290
}
238
- streakBonus = segStreak // see above: sequential characters get the max score
239
- } else {
240
- streakBonus = noStreak
241
291
}
242
292
}
243
293
294
+ if debug {
295
+ // Format the matched runes and their scores inline:
296
+ // fo[o:.52].[b:1]a[r:.6]
297
+ var summary bytes.Buffer
298
+ last := 0
299
+ for i , idx := range runeIdxs {
300
+ summary .WriteString (string (m .inputBuffer [last :idx ])) // encode runes
301
+ fmt .Fprintf (& summary , "[%s:%.2g]" , string (m .inputBuffer [idx ]), runeScores [i ])
302
+ last = idx + 1
303
+ }
304
+ summary .WriteString (string (m .inputBuffer [last :inputLen ])) // encode runes
305
+ log .Println (summary .String ())
306
+ }
307
+
244
308
return start , totScore / float64 (m .patternLen )
245
309
}
0 commit comments