From bac34ee20a367848534e475d4b3ecef2da1baa67 Mon Sep 17 00:00:00 2001 From: Gershom Date: Sat, 30 Sep 2017 18:46:50 -0400 Subject: [PATCH 1/2] bumping query limits --- Distribution/Server/Features/Search/PkgSearch.hs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Distribution/Server/Features/Search/PkgSearch.hs b/Distribution/Server/Features/Search/PkgSearch.hs index 0312f7b87..f153c96e7 100644 --- a/Distribution/Server/Features/Search/PkgSearch.hs +++ b/Distribution/Server/Features/Search/PkgSearch.hs @@ -79,8 +79,8 @@ defaultSearchRankParameters = paramFieldWeights, paramFeatureWeights, paramFeatureFunctions, - paramResultsetSoftLimit = 200, - paramResultsetHardLimit = 400 + paramResultsetSoftLimit = 400, + paramResultsetHardLimit = 800 } where paramK1 :: Float @@ -146,7 +146,7 @@ main = do print ("search engine invariant", invariant searchengine) -- print [ avgFieldLength ctx s | s <- [minBound..maxBound] ] - + -- print $ take 100 $ sortBy (flip compare) $ map Set.size $ Map.elems (termMap searchindex) -- T.putStr $ T.unlines $ Map.keys (termMap searchindex) -- let SearchEngine{searchIndex=SearchIndex{termMap, termIdMap, docKeyMap, docIdMap}} = searchengine @@ -154,7 +154,7 @@ main = do let loop = do putStr "search term> " - hFlush stdout + hFlush stdout t <- getLine unless (null t) $ do let terms = stems English From 59f95714b2e82fede85dc1de28f5043572ae95d8 Mon Sep 17 00:00:00 2001 From: Gershom Date: Sat, 30 Sep 2017 19:19:10 -0400 Subject: [PATCH 2/2] some extra search tweaks to handle stemmed/inexact matches on suffixes that don't occur in normal language but do in software --- .../Search/ExtractDescriptionTerms.hs | 25 +++++++++++-------- .../Server/Features/Search/PkgSearch.hs | 12 ++++++--- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/Distribution/Server/Features/Search/ExtractDescriptionTerms.hs b/Distribution/Server/Features/Search/ExtractDescriptionTerms.hs index 08916c0bc..65e1d7169 100644 --- a/Distribution/Server/Features/Search/ExtractDescriptionTerms.hs +++ b/Distribution/Server/Features/Search/ExtractDescriptionTerms.hs @@ -2,7 +2,8 @@ module Distribution.Server.Features.Search.ExtractDescriptionTerms ( extractSynopsisTerms, - extractDescriptionTerms + extractDescriptionTerms, + extraStems ) where import Distribution.Server.Prelude @@ -20,10 +21,13 @@ import Documentation.Haddock.Types import qualified Distribution.Server.Pages.Package.HaddockParse as Haddock (parse) +extraStems :: [Text] -> Text -> [Text] +extraStems ss x = x : mapMaybe (x `T.stripSuffix`) ss -extractSynopsisTerms :: Set Text -> String -> [Text] -extractSynopsisTerms stopWords = - NLP.stems NLP.English +extractSynopsisTerms :: [Text] -> Set Text -> String -> [Text] +extractSynopsisTerms ss stopWords = + concatMap (extraStems ss) --note this adds extra possible stems, it doesn't delete any given one. + . NLP.stems NLP.English . filter (`Set.notMember` stopWords) . map (T.toCaseFold . T.pack) . concatMap splitTok @@ -31,7 +35,7 @@ extractSynopsisTerms stopWords = . NLP.tokenize -ignoreTok :: String -> Bool +ignoreTok :: String -> Bool ignoreTok = all isPunctuation splitTok :: String -> [String] @@ -48,9 +52,10 @@ splitTok tok = (leading, []) -> leading : [] -extractDescriptionTerms :: Set Text -> String -> [Text] -extractDescriptionTerms stopWords = - NLP.stems NLP.English +extractDescriptionTerms :: [Text] -> Set Text -> String -> [Text] +extractDescriptionTerms ss stopWords = + concatMap (extraStems ss) + . NLP.stems NLP.English . filter (`Set.notMember` stopWords) . map (T.toCaseFold . T.pack) . maybe @@ -98,7 +103,7 @@ main = do let mostFreq :: [String] pkgs :: [PackageDescription] (mostFreq, pkgs) = read pkgsFile - + stopWordsFile <- T.readFile "stopwords.txt" -- wordsFile <- T.readFile "/usr/share/dict/words" -- let ws = Set.fromList (map T.toLower $ T.lines wordsFile) @@ -114,7 +119,7 @@ main = do sequence_ [ putStrLn $ display (packageName pkg) ++ ": " ++ --intercalate ", " - (description pkg) ++ "\n" + (description pkg) ++ "\n" ++ intercalate ", " (map T.unpack $ extractDescriptionTerms stopWords (description pkg)) ++ "\n" | pkg <- pkgs diff --git a/Distribution/Server/Features/Search/PkgSearch.hs b/Distribution/Server/Features/Search/PkgSearch.hs index f153c96e7..6f19cfcfb 100644 --- a/Distribution/Server/Features/Search/PkgSearch.hs +++ b/Distribution/Server/Features/Search/PkgSearch.hs @@ -56,13 +56,15 @@ pkgSearchConfig = } where extractTokens :: PackageDescription -> PkgDocField -> [Text] - extractTokens pkg NameField = extractPackageNameTerms (display $ packageName pkg) - extractTokens pkg SynopsisField = extractSynopsisTerms stopWords (synopsis pkg) - extractTokens pkg DescriptionField = extractDescriptionTerms stopWords (description pkg) + extractTokens pkg NameField = concatMap (extraStems computerStems) $ + extractPackageNameTerms (display $ packageName pkg) + extractTokens pkg SynopsisField = extractSynopsisTerms computerStems stopWords (synopsis pkg) + extractTokens pkg DescriptionField = extractDescriptionTerms computerStems stopWords (description pkg) normaliseQueryToken :: Text -> PkgDocField -> Text normaliseQueryToken tok = let tokFold = T.toCaseFold tok + -- we don't need to use extraStems here because the index is inflated by it already. tokStem = stem English tokFold in \field -> case field of NameField -> tokFold @@ -114,6 +116,10 @@ stopWords = "now","how","where","when","up","has","been","about","them","then","see", "no","do","than","should","out","off","much","if","i","have","also"] +-- Extra stems that tend to occur with software packages +computerStems :: [Text] +computerStems = map T.pack ["ql","db","ml","gl"] + {- -------------------