internal/fuzzy: improvements to the symbol scoring algorithm

findleyr · findleyr · commit ddfa2200ae0b · 2023-05-09T14:15:48.000Z
Based on feedback in golang/go#60027, tweak the fuzzy symbol scoring algorithm to much more strongly prefer sequential and exact matches. Fixes golang/go#60027 Change-Id: I1c6d019065c4dff4adf2db9e94397a635e13d50f Reviewed-on: https://go-review.googlesource.com/c/tools/+/493623 gopls-CI: kokoro <noreply+kokoro@google.com> Run-TryBot: Robert Findley <rfindley@google.com> Reviewed-by: Paul Jolly <paul@myitcv.org.uk> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Alan Donovan <adonovan@google.com>
diff --git a/gopls/internal/lsp/cmd/usage/workspace_symbol.hlp b/gopls/internal/lsp/cmd/usage/workspace_symbol.hlp
@@ -9,5 +9,5 @@ Example:
 
 workspace_symbol-flags:
   -matcher=string
-    	specifies the type of matcher: fuzzy, caseSensitive, or caseInsensitive.
-    	The default is caseInsensitive.
+    	specifies the type of matcher: fuzzy, fastfuzzy, casesensitive, or caseinsensitive.
+    	The default is caseinsensitive.
diff --git a/gopls/internal/lsp/cmd/workspace_symbol.go b/gopls/internal/lsp/cmd/workspace_symbol.go
@@ -8,6 +8,7 @@ import (
 	"context"
 	"flag"
 	"fmt"
+	"strings"
 
 	"golang.org/x/tools/gopls/internal/lsp/protocol"
 	"golang.org/x/tools/gopls/internal/lsp/source"
@@ -16,7 +17,7 @@ import (
 
 // workspaceSymbol implements the workspace_symbol verb for gopls.
 type workspaceSymbol struct {
-	Matcher string `flag:"matcher" help:"specifies the type of matcher: fuzzy, caseSensitive, or caseInsensitive.\nThe default is caseInsensitive."`
+	Matcher string `flag:"matcher" help:"specifies the type of matcher: fuzzy, fastfuzzy, casesensitive, or caseinsensitive.\nThe default is caseinsensitive."`
 
 	app *Application
 }
@@ -46,10 +47,10 @@ func (r *workspaceSymbol) Run(ctx context.Context, args ...string) error {
 		if opts != nil {
 			opts(o)
 		}
-		switch r.Matcher {
+		switch strings.ToLower(r.Matcher) {
 		case "fuzzy":
 			o.SymbolMatcher = source.SymbolFuzzy
-		case "caseSensitive":
+		case "casesensitive":
 			o.SymbolMatcher = source.SymbolCaseSensitive
 		case "fastfuzzy":
 			o.SymbolMatcher = source.SymbolFastFuzzy
diff --git a/gopls/internal/lsp/source/workspace_symbol.go b/gopls/internal/lsp/source/workspace_symbol.go
@@ -484,7 +484,10 @@ func matchFile(store *symbolStore, symbolizer symbolizer, matcher matcherFunc, r
 			// every field or method nesting level to access the field decreases
 			// the score by a factor of 1.0 - depth*depthFactor, up to a depth of
 			// 3.
-			depthFactor = 0.2
+			//
+			// Use a small constant here, as this exists mostly to break ties
+			// (e.g. given a type Foo and a field x.Foo, prefer Foo).
+			depthFactor = 0.01
 		)
 
 		startWord := true
diff --git a/internal/fuzzy/symbol.go b/internal/fuzzy/symbol.go
@@ -26,9 +26,6 @@ import (
 //     symbol or identifiers, so doing this avoids allocating strings.
 //   - We can return the index of the right-most match, allowing us to trim
 //     irrelevant qualification.
-//
-// This implementation is experimental, serving as a reference fast algorithm
-// to compare to the fuzzy algorithm implemented by Matcher.
 type SymbolMatcher struct {
 	// Using buffers of length 256 is both a reasonable size for most qualified
 	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
@@ -169,19 +166,29 @@ input:
 	// Score is the average score for each character.
 	//
 	// A character score is the multiple of:
-	//   1. 1.0 if the character starts a segment, .8 if the character start a
-	//      mid-segment word, otherwise 0.6. This carries over to immediately
-	//      following characters.
-	//   2. For the final character match, the multiplier from (1) is reduced to
-	//     .8 if the next character in the input is a mid-segment word, or 0.6 if
-	//      the next character in the input is not a word or segment start. This
-	//      ensures that we favor whole-word or whole-segment matches over prefix
-	//      matches.
-	//   3. 1.0 if the character is part of the last segment, otherwise
-	//      1.0-.2*<segments from the right>, with a max segment count of 3.
+	//   1. 1.0 if the character starts a segment or is preceded by a matching
+	//      character, 0.9 if the character starts a mid-segment word, else 0.6.
+	//
+	//      Note that characters preceded by a matching character get the max
+	//      score of 1.0 so that sequential or exact matches are preferred, even
+	//      if they don't start/end at a segment or word boundary. For example, a
+	//      match for "func" in intfuncs should have a higher score than in
+	//      ifunmatched.
+	//
+	//      For the final character match, the multiplier from (1) is reduced to
+	//      0.9 if the next character in the input is a mid-segment word, or 0.6
+	//      if the next character in the input is not a word or segment start.
+	//      This ensures that we favor whole-word or whole-segment matches over
+	//      prefix matches.
+	//
+	//   2. 1.0 if the character is part of the last segment, otherwise
+	//      1.0-0.1*<segments from the right>, with a max segment count of 3.
+	//      Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
+	//      early in a qualified symbol name) still scores higher than _f_o_o_
+	//      (a completely split match).
 	//
-	// This is a very naive algorithm, but it is fast. There's lots of prior art
-	// here, and we should leverage it. For example, we could explicitly consider
+	// This is a naive algorithm, but it is fast. There's lots of prior art here
+	// that could be leveraged. For example, we could explicitly consider
 	// character distance, and exact matches of words or segments.
 	//
 	// Also note that this might not actually find the highest scoring match, as
@@ -192,10 +199,10 @@ input:
 	p = m.pattern[pi]
 
 	const (
-		segStreak  = 1.0
-		wordStreak = 0.8
+		segStreak  = 1.0 // start of segment or sequential match
+		wordStreak = 0.9 // start of word match
 		noStreak   = 0.6
-		perSegment = 0.2 // we count at most 3 segments above
+		perSegment = 0.1 // we count at most 3 segments above
 	)
 
 	streakBonus := noStreak
@@ -228,6 +235,7 @@ input:
 			if finalChar {
 				break
 			}
+			streakBonus = segStreak // see above: sequential characters get the max score
 		} else {
 			streakBonus = noStreak
 		}
diff --git a/internal/fuzzy/symbol_test.go b/internal/fuzzy/symbol_test.go
@@ -40,12 +40,12 @@ func TestSymbolRanking(t *testing.T) {
 	symbols := []string{
 		"this.is.better.than.most",
 		"test.foo.bar",
-		"atest",
 		"thebest",
 		"test.foo",
 		"test.foo",
-		"tTest",
+		"atest",
 		"testage",
+		"tTest",
 		"foo.test",
 		"test",
 	}
@@ -60,6 +60,33 @@ func TestSymbolRanking(t *testing.T) {
 	}
 }
 
+// Test that we strongly prefer exact matches.
+//
+// In golang/go#60027, we preferred "Runner" for the query "rune" over several
+// results containing the word "rune" exactly. Following this observation,
+// scoring was tweaked to more strongly emphasize sequential characters and
+// exact matches.
+func TestSymbolRanking_Issue60027(t *testing.T) {
+	matcher := NewSymbolMatcher("rune")
+
+	// symbols to match, in ascending order of ranking.
+	symbols := []string{
+		"Runner",
+		"singleRuneParam",
+		"Config.ifsRune",
+		"Parser.rune",
+	}
+	prev := 0.0
+	for _, sym := range symbols {
+		_, score := matcher.Match([]string{sym})
+		t.Logf("Match(%q) = %v", sym, score)
+		if score < prev {
+			t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
+		}
+		prev = score
+	}
+}
+
 func TestChunkedMatch(t *testing.T) {
 	matcher := NewSymbolMatcher("test")