-
Notifications
You must be signed in to change notification settings - Fork 160
Search order implementation with additional tests. #366
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,66 +71,59 @@ class SimplePackageIndex implements PackageIndex { | |
|
||
@override | ||
Future<PackageSearchResult> search(SearchQuery query) async { | ||
final Map<String, double> total = <String, double>{}; | ||
void addAll(Map<String, double> scores, double weight) { | ||
scores?.forEach((String url, double score) { | ||
if (score != null) { | ||
final double prev = total[url] ?? 0.0; | ||
total[url] = prev + score * weight; | ||
} | ||
}); | ||
} | ||
|
||
addAll(_nameIndex.search(query.text), 0.70); | ||
addAll(_descrIndex.search(query.text), 0.10); | ||
addAll(_readmeIndex.search(query.text), 0.05); | ||
|
||
if ((query.text == null || query.text.isEmpty) && | ||
query.packagePrefix != null) { | ||
addAll(_nameIndex.search(query.packagePrefix), 0.8); | ||
// do text matching | ||
final Score textScore = _searchText(query.text, query.packagePrefix); | ||
|
||
// The set of urls to filter on. | ||
final Set<String> urls = | ||
textScore?.getKeys()?.toSet() ?? _documents.keys.toSet(); | ||
|
||
// filter on package prefix | ||
if (query.packagePrefix != null) { | ||
urls.removeWhere( | ||
(url) => !_documents[url] | ||
.package | ||
.toLowerCase() | ||
.startsWith(query.packagePrefix.toLowerCase()), | ||
); | ||
} | ||
|
||
addAll(getHealthScore(total.keys), 0.05); | ||
addAll(getPopularityScore(total.keys), 0.10); | ||
|
||
List<PackageScore> results = <PackageScore>[]; | ||
for (String url in total.keys) { | ||
final PackageDocument doc = _documents[url]; | ||
|
||
// filter on platform | ||
if (query.platformPredicate != null && | ||
!query.platformPredicate.matches(doc.platforms)) { | ||
continue; | ||
} | ||
|
||
// filter on package prefix | ||
if (query.packagePrefix != null && | ||
!doc.package | ||
.toLowerCase() | ||
.startsWith(query.packagePrefix.toLowerCase())) { | ||
continue; | ||
} | ||
|
||
results.add(new PackageScore( | ||
url: doc.url, | ||
package: doc.package, | ||
score: total[url], | ||
)); | ||
// filter on platform | ||
if (query.platformPredicate != null) { | ||
urls.removeWhere( | ||
(url) => !query.platformPredicate.matches(_documents[url].platforms)); | ||
} | ||
|
||
results.sort((a, b) => -a.score.compareTo(b.score)); | ||
|
||
// filter out the noise (maybe a single matching ngram) | ||
if (results.isNotEmpty) { | ||
final double bestScore = results.first.score; | ||
final double scoreTreshold = bestScore / 25; | ||
results.removeWhere((pr) => pr.score < scoreTreshold); | ||
// reduce text results if filter did remove an url | ||
textScore?.removeWhere((key) => !urls.contains(key)); | ||
|
||
List<PackageScore> results; | ||
switch (query.order ?? SearchOrder.overall) { | ||
case SearchOrder.overall: | ||
final Score overallScore = new Score() | ||
..addValues(textScore?.values, 0.85) | ||
Comment: You could avoid creating a copy of […] (the inline code reference was lost in extraction). Comment: In this case I think it is cleaner to keep it separate, as it is explicit how the text match score relates to the rest. |
||
..addValues(getPopularityScore(urls), 0.10) | ||
..addValues(getHealthScore(urls), 0.05); | ||
results = _rankWithValues(overallScore.values); | ||
break; | ||
case SearchOrder.text: | ||
results = _rankWithValues(textScore.values); | ||
break; | ||
case SearchOrder.updated: | ||
results = _rankWithComparator(urls, _compareUpdated); | ||
break; | ||
case SearchOrder.popularity: | ||
results = _rankWithValues(getPopularityScore(urls)); | ||
break; | ||
case SearchOrder.health: | ||
results = _rankWithValues(getHealthScore(urls)); | ||
break; | ||
} | ||
|
||
// bound by offset and limit | ||
final int totalCount = min(maxSearchResults, results.length); | ||
final int totalCount = results.length; | ||
if (query.offset != null && query.offset > 0) { | ||
if (query.offset > totalCount) { | ||
if (query.offset >= results.length) { | ||
results = <PackageScore>[]; | ||
} else { | ||
results = results.sublist(query.offset); | ||
|
@@ -168,6 +161,81 @@ class SimplePackageIndex implements PackageIndex { | |
value: (String url) => _documents[url].popularity * 100, | ||
); | ||
} | ||
|
||
Score _searchText(String text, String packagePrefix) { | ||
if (text != null && text.isNotEmpty) { | ||
final Score textScore = new Score() | ||
..addValues(_nameIndex.search(text), 0.82) | ||
..addValues(_descrIndex.search(text), 0.12) | ||
..addValues(_readmeIndex.search(text), 0.06); | ||
// removes scores that are less than 5% of the best | ||
Comment: This comment is confusing, since […] (the rest of the remark, an inline code reference, was lost in extraction). |
||
textScore.removeLowScores(0.05); | ||
// removes scores that are low | ||
textScore.removeWhere((url) => textScore.values[url] < 1.0); | ||
return textScore; | ||
} | ||
return null; | ||
} | ||
|
||
/// Builds a [PackageScore] for every key of [values] and returns them ordered
/// by descending score.
///
/// Entries with equal scores are ordered by [_compareUpdated] applied to the
/// documents stored in [_documents] under their urls.
List<PackageScore> _rankWithValues(Map<String, double> values) {
  final List<PackageScore> ranked = <PackageScore>[];
  for (String url in values.keys) {
    ranked.add(new PackageScore(
      url: url,
      package: _documents[url].package,
      score: values[url],
    ));
  }
  ranked.sort((PackageScore x, PackageScore y) {
    final int byScore = -x.score.compareTo(y.score);
    // Equal scores fall back to the comparison on the documents' `updated`.
    return byScore != 0
        ? byScore
        : _compareUpdated(_documents[x.url], _documents[y.url]);
  });
  return ranked;
}
|
||
/// Wraps each url of [urls] in a [PackageScore] that carries no score and
/// orders the list with [compare], which receives the documents looked up in
/// [_documents] for the two urls being compared.
List<PackageScore> _rankWithComparator(
    Set<String> urls, int compare(PackageDocument a, PackageDocument b)) {
  final List<PackageScore> ordered = <PackageScore>[];
  for (String url in urls) {
    ordered.add(new PackageScore(url: url, package: _documents[url].package));
  }
  ordered.sort((PackageScore left, PackageScore right) {
    return compare(_documents[left.url], _documents[right.url]);
  });
  return ordered;
}
|
||
/// Compares two documents by `updated` in descending order, so the larger
/// `updated` value sorts first.
///
/// A document whose `updated` is null sorts before one that has a value.
/// NOTE(review): confirm that nulls-first is intended rather than listing
/// such documents last.
///
/// Two documents that both lack `updated` compare as equal. Returning -1 in
/// that case would make `compare(a, b)` and `compare(b, a)` both claim that
/// their first argument comes first, which is an inconsistent ordering for
/// `List.sort`.
int _compareUpdated(PackageDocument a, PackageDocument b) {
  if (a.updated == null) {
    return b.updated == null ? 0 : -1;
  }
  if (b.updated == null) return 1;
  return -a.updated.compareTo(b.updated);
}
} | ||
|
||
class Score { | ||
final Map<String, double> values = <String, double>{}; | ||
|
||
/// The keys that currently have an entry in [values].
Iterable<String> getKeys() {
  return values.keys;
}
|
||
/// Adds `score * weight` for every entry of [newValues] to the value already
/// stored in [values] under the same key; a missing key counts as 0.0.
///
/// Does nothing when [newValues] is null. Entries whose score is null are
/// skipped.
void addValues(Map<String, double> newValues, double weight) {
  if (newValues != null) {
    for (String key in newValues.keys) {
      final double score = newValues[key];
      if (score == null) continue;
      values[key] = (values[key] ?? 0.0) + score * weight;
    }
  }
}
|
||
/// Removes every entry of [values] whose key satisfies [keyCondition].
///
/// All keys are tested first and only then removed, so [keyCondition] always
/// sees the map as it was before this call.
void removeWhere(bool keyCondition(String key)) {
  final List<String> doomed = <String>[];
  for (String key in values.keys) {
    if (keyCondition(key)) doomed.add(key);
  }
  for (String key in doomed) {
    values.remove(key);
  }
}
|
||
void removeLowScores(double fraction) { | ||
final double maxValue = values.values.fold(0.0, max); | ||
Comment: I think this is a bit weird. Basically, if I search for "foobar" and none of the packages are relevant, though our character n-grams return some low-score results, we find the max here, and someone passes like […] (inline code lost in extraction). I think a better way of doing this is to always ensure that if a package is a perfect match, then its score needs to be […] (inline code lost in extraction) removeWhere((key) => values[key] < 0.9); Then it's even possible to have optimizations which push this filter down to the things calling […] (inline code lost in extraction). Comment: This may happen automatically, because we weight longer N-grams and prefixes more, and if there is a 6+ character match, we will most likely remove the low-quality matches anyway. Comment: Maybe said slightly differently: if a search query has only results with poor scores, we should display none of them - not even the highest scored one. Comment: Yeah, that makes sense. I've added a low-score filter, but it is very conservative now. I would still experiment with it more, because it is not always the package name that people are searching for, and the rules around it may become complex. |
||
final double cutoff = maxValue * fraction; | ||
removeWhere((key) => values[key] < cutoff); | ||
} | ||
} | ||
|
||
class TokenIndex { | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For future optimizations: there might be possibilities where one can avoid constructing the big maps in `Score` if we know only a small subset of keys will survive, i.e. instead of building up big data structures and removing from them later, we could try filtering early on and construct smaller data structures.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point, will address it in a follow-up.