Skip to content

Search order implementation with additional tests. #366

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 120 additions & 52 deletions app/lib/search/index_simple.dart
Original file line number Diff line number Diff line change
Expand Up @@ -71,66 +71,59 @@ class SimplePackageIndex implements PackageIndex {

@override
Future<PackageSearchResult> search(SearchQuery query) async {
final Map<String, double> total = <String, double>{};
void addAll(Map<String, double> scores, double weight) {
scores?.forEach((String url, double score) {
if (score != null) {
final double prev = total[url] ?? 0.0;
total[url] = prev + score * weight;
}
});
}

addAll(_nameIndex.search(query.text), 0.70);
addAll(_descrIndex.search(query.text), 0.10);
addAll(_readmeIndex.search(query.text), 0.05);

if ((query.text == null || query.text.isEmpty) &&
query.packagePrefix != null) {
addAll(_nameIndex.search(query.packagePrefix), 0.8);
// do text matching
final Score textScore = _searchText(query.text, query.packagePrefix);

// The set of urls to filter on.
final Set<String> urls =
textScore?.getKeys()?.toSet() ?? _documents.keys.toSet();

// filter on package prefix
if (query.packagePrefix != null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For future optimizations: There might be possibilities where one can avoid constructing the big maps in Score if we know only a small subset of keys will survive.

i.e. Instead of building up big datastructures and removing from them later, we could try filtering early on and construct smaller datastructures.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, will address it in a follow-up.

urls.removeWhere(
(url) => !_documents[url]
.package
.toLowerCase()
.startsWith(query.packagePrefix.toLowerCase()),
);
}

addAll(getHealthScore(total.keys), 0.05);
addAll(getPopularityScore(total.keys), 0.10);

List<PackageScore> results = <PackageScore>[];
for (String url in total.keys) {
final PackageDocument doc = _documents[url];

// filter on platform
if (query.platformPredicate != null &&
!query.platformPredicate.matches(doc.platforms)) {
continue;
}

// filter on package prefix
if (query.packagePrefix != null &&
!doc.package
.toLowerCase()
.startsWith(query.packagePrefix.toLowerCase())) {
continue;
}

results.add(new PackageScore(
url: doc.url,
package: doc.package,
score: total[url],
));
// filter on platform
if (query.platformPredicate != null) {
urls.removeWhere(
(url) => !query.platformPredicate.matches(_documents[url].platforms));
}

results.sort((a, b) => -a.score.compareTo(b.score));

// filter out the noise (maybe a single matching ngram)
if (results.isNotEmpty) {
final double bestScore = results.first.score;
final double scoreTreshold = bestScore / 25;
results.removeWhere((pr) => pr.score < scoreTreshold);
// reduce text results if filter did remove an url
textScore?.removeWhere((key) => !urls.contains(key));

List<PackageScore> results;
switch (query.order ?? SearchOrder.overall) {
case SearchOrder.overall:
final Score overallScore = new Score()
..addValues(textScore?.values, 0.85)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could avoid creating a copy of textScore and modify the copy, by just modify the textScore itself.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case I think it is cleaner to keep it separate, as it is explicit how the text match score relates to the rest.

..addValues(getPopularityScore(urls), 0.10)
..addValues(getHealthScore(urls), 0.05);
results = _rankWithValues(overallScore.values);
break;
case SearchOrder.text:
results = _rankWithValues(textScore.values);
break;
case SearchOrder.updated:
results = _rankWithComparator(urls, _compareUpdated);
break;
case SearchOrder.popularity:
results = _rankWithValues(getPopularityScore(urls));
break;
case SearchOrder.health:
results = _rankWithValues(getHealthScore(urls));
break;
}

// bound by offset and limit
final int totalCount = min(maxSearchResults, results.length);
final int totalCount = results.length;
if (query.offset != null && query.offset > 0) {
if (query.offset > totalCount) {
if (query.offset >= results.length) {
results = <PackageScore>[];
} else {
results = results.sublist(query.offset);
Expand Down Expand Up @@ -168,6 +161,81 @@ class SimplePackageIndex implements PackageIndex {
value: (String url) => _documents[url].popularity * 100,
);
}

Score _searchText(String text, String packagePrefix) {
if (text != null && text.isNotEmpty) {
final Score textScore = new Score()
..addValues(_nameIndex.search(text), 0.82)
..addValues(_descrIndex.search(text), 0.12)
..addValues(_readmeIndex.search(text), 0.06);
// removes scores that are less than 5% of the best
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is confusing, since less than 5% of the best could be interpreted as we drop the lowest 95%. Maybe just consider removing these comments if the code itself is self-explaining.

textScore.removeLowScores(0.05);
// removes scores that are low
textScore.removeWhere((url) => textScore.values[url] < 1.0);
return textScore;
}
return null;
}

List<PackageScore> _rankWithValues(Map<String, double> values) {
final List<PackageScore> list = values.keys
.map((url) => new PackageScore(
url: url,
package: _documents[url].package,
score: values[url],
))
.toList();
list.sort((a, b) {
final int scoreCompare = -a.score.compareTo(b.score);
if (scoreCompare != 0) return scoreCompare;
// if two packages got the same score, order by last updated
return _compareUpdated(_documents[a.url], _documents[b.url]);
});
return list;
}

List<PackageScore> _rankWithComparator(
Set<String> urls, int compare(PackageDocument a, PackageDocument b)) {
final List<PackageScore> list = urls
.map((url) =>
new PackageScore(url: url, package: _documents[url].package))
.toList();
list.sort((a, b) => compare(_documents[a.url], _documents[b.url]));
return list;
}

int _compareUpdated(PackageDocument a, PackageDocument b) {
if (a.updated == null) return -1;
if (b.updated == null) return 1;
return -a.updated.compareTo(b.updated);
}
}

class Score {
final Map<String, double> values = <String, double>{};

Iterable<String> getKeys() => values.keys;

void addValues(Map<String, double> newValues, double weight) {
if (newValues == null) return;
newValues.forEach((String key, double score) {
if (score != null) {
final double prev = values[key] ?? 0.0;
values[key] = prev + score * weight;
}
});
}

void removeWhere(bool keyCondition(String key)) {
final Set<String> keysToRemove = values.keys.where(keyCondition).toSet();
keysToRemove.forEach(values.remove);
}

void removeLowScores(double fraction) {
final double maxValue = values.values.fold(0.0, max);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a big weird: Basically if I search for "foobar" and none of the packages are relevant, though our character n-grams returns some low-score results, we find the max here, and someone passes like fraction = 0.9 (meaning should be in top 10%) then we get results, even though none are relevant.

I think a better way of doing this is to always ensure that if a package is perfect match, then it's score needs to be 1.0 then you can just say

removeWhere((key) => values[key] < 0.9);

Then it's even possible to have optimizations which pushes this filter down to the things calling addValues so you can avoid work if you know already it can't pass the filter.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may happen automatically, because we weight longer N-grams and prefixes more, and if there is a 6+ character match, we will most likely remove the low-quality matches anyway.
I'll experiment with this, and do a follow-up in a subsequent PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe slightly said in another way: If a search query has only results with poor scores, we should display none of them - not even the highest scored one.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that makes sense. I've added a low-score filter, but it is very conservative now. I would still experiment with it more, because it is not always the package name that people are searching for, and the rules around it may become complex.

final double cutoff = maxValue * fraction;
removeWhere((key) => values[key] < cutoff);
}
}

class TokenIndex {
Expand Down
16 changes: 10 additions & 6 deletions app/lib/shared/search_service.dart
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,14 @@ class SearchQuery {
/// Sanity check, whether the query object is to be expected a valid result.
bool get isValid {
final bool hasText = text != null && text.isNotEmpty;
final bool hasPackagePrefix =
packagePrefix != null && packagePrefix.isNotEmpty;
final bool hasNonTextOrdering = order != null &&
order != SearchOrder.overall &&
order != SearchOrder.text;
return hasText || hasPackagePrefix || hasNonTextOrdering;
final bool hasNonTextOrdering = order != SearchOrder.text;
final bool isEmpty = !hasText &&
order == null &&
packagePrefix == null &&
(platformPredicate == null || !platformPredicate.isNotEmpty);
if (isEmpty) return false;

return hasText || hasNonTextOrdering;
}
}

Expand Down Expand Up @@ -221,6 +223,8 @@ class PackageSearchResult extends Object
class PackageScore extends Object with _$PackageScoreSerializerMixin {
final String url;
final String package;

@JsonKey(includeIfNull: false)
final double score;

PackageScore({
Expand Down
17 changes: 15 additions & 2 deletions app/lib/shared/search_service.g.dart

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions app/test/search/handlers_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ void main() {
{
'url': 'https://pub.domain/packages/pkg_foo',
'package': 'pkg_foo',
'score': closeTo(71.1, 0.1),
'score': closeTo(70.9, 0.1),
}
],
});
Expand Down Expand Up @@ -105,7 +105,7 @@ void main() {
{
'url': 'https://pub.domain/packages/pkg_foo',
'package': 'pkg_foo',
'score': closeTo(13.3, 0.1),
'score': closeTo(1.0, 0.1),
}
],
});
Expand Down
Loading