jalgos · alexandre-jalgos · Jun 12, 2019 · Jun 17, 2019 · Jul 15, 2019 · Jul 15, 2019
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -29,3 +29,4 @@ Author: Ingo Feinerer [aut, cre] (<https://orcid.org/0000-0001-7656-8338>),
 Maintainer: Ingo Feinerer <[email protected]>
 Repository: CRAN
 Date/Publication: 2018-12-21 13:55:26 UTC
+RoxygenNote: 6.1.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -92,6 +92,10 @@ export("as.DocumentTermMatrix",
        "weightTfIdf",
        "weightBin",
        "weightSMART",
+       "weightSMART2",
+       "LearnTf",
+       "LearnWeightingParams",
+       "DocumentTermMatrix.TF",
        "writeCorpus",
        "XMLSource",
        "XMLTextDocument",

diff --git a/R/matrix.R b/R/matrix.R
@@ -147,6 +147,50 @@ function(x, control = list())
     .TermDocumentMatrix(m, control$weighting)
 }
 
+LearnTf <- function(x, control = list())
+{
+    tflist <- tm_parLapply(unname(content(x)), termFreq, control)
+    names(tflist) <- names(x)
+    TF <- rbindlist(lapply(seq_along(tflist),
+                           ith.tf.data.table,
+                           tflist = tflist))
+    TF[, doc_id := factor(doc_id)]
+    TF[, term := factor(term)]
+    setkeyv(TF, c("doc_id", "term"))
+    return(TF)
+}
+
+ith.tf.data.table <- function(tflist, i)
+{
+    if(length(tflist[[i]]) < 1)
+        NULL
+    else
+        data.table(doc_id = names(tflist)[i],
+                   term = names(tflist[[i]]),
+                   freq = tflist[[i]])
+}
+
+TF.to.vector <- function(TF, id)
+{
+    tf <- TF[doc_id == id, freq]
+    names(tf) <- TF[doc_id == id, term]
+    return(tf)
+}
+
+data.table.to.tflist <- function(TF)
+{
+    lapply(TF[, unique(doc_id)],
+           TF.to.vector,
+           TF = TF)
+}
+
+LearnWeightingParams <- function(TF, control = list())
+{
+    m <- TermDocumentMatrix.TF(TF, control = control)
+    return(list(docfreq = row_sums(m > 0),
+                ndoc = nDocs(m)))
+}
+
 TermDocumentMatrix.PCorpus <-
 TermDocumentMatrix.VCorpus <-
 function(x, control = list())
@@ -167,6 +211,37 @@ function(x, control = list())
     .TermDocumentMatrix(m, control$weighting)
 }
 
+TermDocumentMatrix.TF <-
+function(TF, control = list())
+{
+    if(!is.null(control$dictionary))
+        TF <- TF[term %in% control$dictionary]
+
+    tflist <- data.table.to.tflist(TF)
+    v <- unlist(tflist)
+    i <- names(v)
+    terms <- sort(unique(as.character(if (is.null(control$dictionary)) i
+                                      else control$dictionary)))
+    i <- match(i, terms)
+    j <- rep.int(seq_along(TF[, unique(doc_id)]), lengths(tflist))
+
+    docs <- as.character(TF[, unique(doc_id)])
+    ndoc <- length(docs)
+
+    m <- simple_triplet_matrix(i, j, as.numeric(v),
+                               nrow = length(terms),
+                               ncol = ndoc,
+                               dimnames = list(Terms = terms, Docs = docs))
+
+    m <- filter_global_bounds(m, control$bounds$global)
+
+    .TermDocumentMatrix(m, control$weighting)
+}
+
+DocumentTermMatrix.TF <-
+function(TF, control = list())
+    t(TermDocumentMatrix.TF(TF, control))
+
 DocumentTermMatrix <-
 function(x, control = list())
     t(TermDocumentMatrix(x, control))

diff --git a/R/tm.R b/R/tm.R
@@ -0,0 +1 @@
+.datatable.aware <- TRUE
diff --git a/R/weight.R b/R/weight.R
@@ -41,6 +41,108 @@ weightTfIdf <-
         if (isDTM) t(m) else m
     }, "term frequency - inverse document frequency", "tf-idf")
 
+weightSMART2 <- WeightFunction(function(m, spec = "nnn", control = list()) {
+    stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")),
+              is.character(spec), nchar(spec) == 3L, is.list(control))
+
+    term_frequency <-
+        match.arg(substr(spec, 1L, 1L),
+                  c("n", "l", "a", "b", "L"))
+    document_frequency <-
+        match.arg(substr(spec, 2L, 2L),
+                  c("n", "t", "p"))
+    normalization <-
+        match.arg(substr(spec, 3L, 3L),
+                  c("n", "c", "u", "b"))
+
+    isDTM <- inherits(m, "DocumentTermMatrix")
+    if (isDTM) m <- t(m)
+
+    if (normalization == "b") {
+            ## Need to compute the character lengths of the documents
+            ## before starting the weighting.
+            charlengths <-
+                tapply(nchar(Terms(m))[m$i] * m$v, m$j, sum)
+        }
+
+    ## Term frequency
+    m$v <- switch(term_frequency,
+                  ## natural
+                  n = m$v,
+                  ## logarithm
+                  l = 1 + log2(m$v),
+                  ## augmented
+                  a = {
+        s <- tapply(m$v, m$j, max)
+        0.5 + (0.5 * m$v) / s[as.character(m$j)]
+    },
+    ## boolean
+    b = as.numeric(m$v > 0),
+    ## log ave
+    L = {
+        s <- tapply(m$v, m$j, mean)
+        ((1 + log2(m$v)) / (1 + log2(s[as.character(m$j)])))
+    })
+
+    ## Document frequency 
+    if(!is.null(control$docfreq))
+        rs <- control$docfreq + row_sums(m > 0)
+    else
+        rs <- row_sums(m > 0)
+    if(!is.null(control$ndoc))
+        ndoc <- control$ndoc + nDocs(m)
+    else
+        ndoc <- nDocs(m)
+
+    if (any(rs == 0))
+        warning("unreferenced term(s): ",
+                paste(Terms(m)[rs == 0], collapse = " "))
+    df <- switch(document_frequency,
+                 ## natural
+                 n = 1,
+                 ## idf
+                 t = log2(ndoc / rs),
+                 ## prob idf
+                 p = max(0, log2((ndoc - rs) / rs)))
+    df[!is.finite(df)] <- 0
+
+    ## Normalization
+    cs <- col_sums(m)
+    if (any(cs == 0))
+        warning("empty document(s): ",
+                paste(Docs(m)[cs == 0], collapse = " "))
+    norm <- switch(normalization,
+                   ## none
+                   n = rep.int(1, ndoc),
+                   ## cosine
+                   c = sqrt(col_sums(m ^ 2)),
+                   ## pivoted unique
+                   u = {
+        if (is.null(pivot <- control$pivot))
+            stop("invalid control argument pivot")
+        if (is.null(slope <- control$slope))
+            stop("invalid control argument slope")
+        (slope * sqrt(col_sums(m ^ 2)) +
+         (1 - slope) * pivot)
+    },
+    ## byte size
+    b = {
+        if (is.null(alpha <- control$alpha))
+            stop("invalid control argument alpha")
+        norm <- double(ndoc)
+        norm[match(names(charlengths),
+                   seq_along(norm))] <-
+            charlengths ^ alpha
+        norm
+    })
+
+    m <- m * df
+    m$v <- m$v / norm[m$j]
+    attr(m, "weighting") <- c(paste("SMART", spec), "SMART")
+
+     m <- if (isDTM) t(m) else m
+}, "SMART2", "SMART2")
+
 weightSMART <-
     WeightFunction(function(m, spec = "nnn", control = list()) {
         stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")),

diff --git a/src/RcppExports.o b/src/RcppExports.o
diff --git a/src/copy.o b/src/copy.o
diff --git a/src/init.o b/src/init.o
diff --git a/src/remove.o b/src/remove.o
diff --git a/src/scan.o b/src/scan.o
diff --git a/src/tdm.o b/src/tdm.o
diff --git a/src/tm.so b/src/tm.so
diff --git a/src/tokenizer.o b/src/tokenizer.o