Skip to content

Ad hoc stemming #633

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions Distribution/Server/Features/Search/ExtractDescriptionTerms.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

module Distribution.Server.Features.Search.ExtractDescriptionTerms (
extractSynopsisTerms,
extractDescriptionTerms
extractDescriptionTerms,
extraStems
) where

import Distribution.Server.Prelude
Expand All @@ -20,18 +21,21 @@ import Documentation.Haddock.Types

import qualified Distribution.Server.Pages.Package.HaddockParse as Haddock (parse)

-- | Given a list of extra stem suffixes (e.g. @[\"ql\",\"db\",\"ml\",\"gl\"]@)
-- and a token, return the token itself plus every variant obtained by
-- stripping one of those suffixes from the *end of the token*.
--
-- E.g. @extraStems [\"ql\",\"db\"] \"postgresql\" == [\"postgresql\",\"postgres\"]@.
--
-- Note this only adds extra possible stems; it never removes the given one.
--
-- BUG FIX: the original used the left section @(x `T.stripSuffix`)@, which is
-- @\\s -> T.stripSuffix x s@ — i.e. it tried to strip the whole *token* as a
-- suffix of the short stem (\"ql\", \"db\", ...), which can essentially never
-- match, so no extra stems were ever produced. 'T.stripSuffix' takes the
-- suffix as its first argument, so the right section @(`T.stripSuffix` x)@
-- is required to strip each listed suffix from the token.
extraStems :: [Text] -> Text -> [Text]
extraStems ss x = x : mapMaybe (`T.stripSuffix` x) ss

extractSynopsisTerms :: Set Text -> String -> [Text]
extractSynopsisTerms stopWords =
NLP.stems NLP.English
-- | Turn a package synopsis into a list of search terms: tokenise, drop
-- punctuation-only tokens, split compound tokens, case-fold, remove stop
-- words, stem (English), and finally widen each stem via 'extraStems' with
-- the supplied extra suffixes. The extra stems only add candidate terms;
-- no term produced by the normal stemmer is removed.
extractSynopsisTerms :: [Text] -> Set Text -> String -> [Text]
extractSynopsisTerms ss stopWords synopsisText =
    concatMap (extraStems ss) stemmedToks
  where
    rawToks     = NLP.tokenize synopsisText
    wordToks    = concatMap splitTok (filter (not . ignoreTok) rawToks)
    foldedToks  = map (T.toCaseFold . T.pack) wordToks
    keptToks    = filter (`Set.notMember` stopWords) foldedToks
    stemmedToks = NLP.stems NLP.English keptToks


ignoreTok :: String -> Bool
-- | A token is ignorable when every character in it is punctuation
-- (vacuously true for the empty token).
ignoreTok :: String -> Bool
ignoreTok tok = all isPunctuation tok

splitTok :: String -> [String]
Expand All @@ -48,9 +52,10 @@ splitTok tok =
(leading, []) -> leading : []


extractDescriptionTerms :: Set Text -> String -> [Text]
extractDescriptionTerms stopWords =
NLP.stems NLP.English
extractDescriptionTerms :: [Text] -> Set Text -> String -> [Text]
extractDescriptionTerms ss stopWords =
concatMap (extraStems ss)
. NLP.stems NLP.English
. filter (`Set.notMember` stopWords)
. map (T.toCaseFold . T.pack)
. maybe
Expand Down Expand Up @@ -98,7 +103,7 @@ main = do
let mostFreq :: [String]
pkgs :: [PackageDescription]
(mostFreq, pkgs) = read pkgsFile

stopWordsFile <- T.readFile "stopwords.txt"
-- wordsFile <- T.readFile "/usr/share/dict/words"
-- let ws = Set.fromList (map T.toLower $ T.lines wordsFile)
Expand All @@ -114,7 +119,7 @@ main = do
sequence_
[ putStrLn $ display (packageName pkg) ++ ": "
++ --intercalate ", "
(description pkg) ++ "\n"
(description pkg) ++ "\n"
++ intercalate ", "
(map T.unpack $ extractDescriptionTerms stopWords (description pkg)) ++ "\n"
| pkg <- pkgs
Expand Down
20 changes: 13 additions & 7 deletions Distribution/Server/Features/Search/PkgSearch.hs
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,15 @@ pkgSearchConfig =
}
where
extractTokens :: PackageDescription -> PkgDocField -> [Text]
extractTokens pkg NameField = extractPackageNameTerms (display $ packageName pkg)
extractTokens pkg SynopsisField = extractSynopsisTerms stopWords (synopsis pkg)
extractTokens pkg DescriptionField = extractDescriptionTerms stopWords (description pkg)
extractTokens pkg NameField = concatMap (extraStems computerStems) $
extractPackageNameTerms (display $ packageName pkg)
extractTokens pkg SynopsisField = extractSynopsisTerms computerStems stopWords (synopsis pkg)
extractTokens pkg DescriptionField = extractDescriptionTerms computerStems stopWords (description pkg)

normaliseQueryToken :: Text -> PkgDocField -> Text
normaliseQueryToken tok =
let tokFold = T.toCaseFold tok
-- we don't need to use extraStems here because the index is inflated by it already.
tokStem = stem English tokFold
in \field -> case field of
NameField -> tokFold
Expand All @@ -79,8 +81,8 @@ defaultSearchRankParameters =
paramFieldWeights,
paramFeatureWeights,
paramFeatureFunctions,
paramResultsetSoftLimit = 200,
paramResultsetHardLimit = 400
paramResultsetSoftLimit = 400,
paramResultsetHardLimit = 800
}
where
paramK1 :: Float
Expand Down Expand Up @@ -114,6 +116,10 @@ stopWords =
"now","how","where","when","up","has","been","about","them","then","see",
"no","do","than","should","out","off","much","if","i","have","also"]

-- | Extra stem suffixes that tend to occur in software package names
-- (e.g. \"postgresql\", \"mongodb\", \"opengl\"), fed to 'extraStems' so
-- that both the full name and the suffix-stripped form are indexed.
computerStems :: [Text]
computerStems = map T.pack suffixes
  where
    suffixes = ["ql", "db", "ml", "gl"]


{-
-------------------
Expand Down Expand Up @@ -146,15 +152,15 @@ main = do
print ("search engine invariant", invariant searchengine)

-- print [ avgFieldLength ctx s | s <- [minBound..maxBound] ]

-- print $ take 100 $ sortBy (flip compare) $ map Set.size $ Map.elems (termMap searchindex)
-- T.putStr $ T.unlines $ Map.keys (termMap searchindex)
-- let SearchEngine{searchIndex=SearchIndex{termMap, termIdMap, docKeyMap, docIdMap}} = searchengine
-- print (Map.size termMap, IntMap.size termIdMap, Map.size docKeyMap, IntMap.size docIdMap)

let loop = do
putStr "search term> "
hFlush stdout
hFlush stdout
t <- getLine
unless (null t) $ do
let terms = stems English
Expand Down