diff --git a/DESCRIPTION b/DESCRIPTION index d4aaf01..9d79cb5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,3 +29,4 @@ Author: Ingo Feinerer [aut, cre] (), Maintainer: Ingo Feinerer Repository: CRAN Date/Publication: 2018-12-21 13:55:26 UTC +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index 5f2c0fd..2a2bbba 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -92,6 +92,10 @@ export("as.DocumentTermMatrix", "weightTfIdf", "weightBin", "weightSMART", + "weightSMART2", + "LearnTf", + "LearnWeightingParams", + "DocumentTermMatrix.TF", "writeCorpus", "XMLSource", "XMLTextDocument", diff --git a/R/matrix.R b/R/matrix.R index 2912b4d..9ab75c7 100644 --- a/R/matrix.R +++ b/R/matrix.R @@ -147,6 +147,50 @@ function(x, control = list()) .TermDocumentMatrix(m, control$weighting) } +LearnTf <- function(x, control = list()) +{ + tflist <- tm_parLapply(unname(content(x)), termFreq, control) + names(tflist) <- names(x) + TF <- rbindlist(lapply(seq_along(tflist), + ith.tf.data.table, + tflist = tflist)) + TF[, doc_id := factor(doc_id)] + TF[, term := factor(term)] + setkeyv(TF, c("doc_id", "term")) + return(TF) +} + +ith.tf.data.table <- function(tflist, i) +{ + if(length(tflist[[i]]) < 1) + NULL + else + data.table(doc_id = names(tflist)[i], + term = names(tflist[[i]]), + freq = tflist[[i]]) +} + +TF.to.vector <- function(TF, id) +{ + tf <- TF[doc_id == id, freq] + names(tf) <- TF[doc_id == id, term] + return(tf) +} + +data.table.to.tflist <- function(TF) +{ + lapply(TF[, unique(doc_id)], + TF.to.vector, + TF = TF) +} + +LearnWeightingParams <- function(TF, control = list()) +{ + m <- TermDocumentMatrix.TF(TF, control = control) + return(list(docfreq = row_sums(m > 0), + ndoc = nDocs(m))) +} + TermDocumentMatrix.PCorpus <- TermDocumentMatrix.VCorpus <- function(x, control = list()) @@ -167,6 +211,37 @@ function(x, control = list()) .TermDocumentMatrix(m, control$weighting) } +TermDocumentMatrix.TF <- +function(TF, control = list()) +{ + if(!is.null(control$dictionary)) + TF <- TF[term %in% control$dictionary] + + tflist <- data.table.to.tflist(TF) + v <- unlist(tflist) + i <- names(v) + terms <- sort(unique(as.character(if (is.null(control$dictionary)) i + else control$dictionary))) + i <- match(i, terms) + j <- rep.int(seq_along(TF[, unique(doc_id)]), lengths(tflist)) + + docs <- as.character(TF[, unique(doc_id)]) + ndoc <- length(docs) + + m <- simple_triplet_matrix(i, j, as.numeric(v), + nrow = length(terms), + ncol = ndoc, + dimnames = list(Terms = terms, Docs = docs)) + + m <- filter_global_bounds(m, control$bounds$global) + + .TermDocumentMatrix(m, control$weighting) +} + +DocumentTermMatrix.TF <- +function(TF, control = list()) + t(TermDocumentMatrix.TF(TF, control)) + DocumentTermMatrix <- function(x, control = list()) t(TermDocumentMatrix(x, control)) diff --git a/R/tm.R b/R/tm.R new file mode 100644 index 0000000..2901762 --- /dev/null +++ b/R/tm.R @@ -0,0 +1 @@ +.datatable.aware <- TRUE diff --git a/R/weight.R b/R/weight.R index 8890646..33e33e0 100644 --- a/R/weight.R +++ b/R/weight.R @@ -41,6 +41,108 @@ weightTfIdf <- if (isDTM) t(m) else m }, "term frequency - inverse document frequency", "tf-idf") +weightSMART2 <- WeightFunction(function(m, spec = "nnn", control = list()) { + stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")), + is.character(spec), nchar(spec) == 3L, is.list(control)) + + term_frequency <- + match.arg(substr(spec, 1L, 1L), + c("n", "l", "a", "b", "L")) + document_frequency <- + match.arg(substr(spec, 2L, 2L), + c("n", "t", "p")) + normalization <- + match.arg(substr(spec, 3L, 3L), + c("n", "c", "u", "b")) + + isDTM <- inherits(m, "DocumentTermMatrix") + if (isDTM) m <- t(m) + + if (normalization == "b") { + ## Need to compute the character lengths of the documents + ## before starting the weighting. + charlengths <- + tapply(nchar(Terms(m))[m$i] * m$v, m$j, sum) + } + + ## Term frequency + m$v <- switch(term_frequency, + ## natural + n = m$v, + ## logarithm + l = 1 + log2(m$v), + ## augmented + a = { + s <- tapply(m$v, m$j, max) + 0.5 + (0.5 * m$v) / s[as.character(m$j)] + }, + ## boolean + b = as.numeric(m$v > 0), + ## log ave + L = { + s <- tapply(m$v, m$j, mean) + ((1 + log2(m$v)) / (1 + log2(s[as.character(m$j)]))) + }) + + ## Document frequency + if(!is.null(control$docfreq)) + rs <- control$docfreq + row_sums(m > 0) + else + rs <- row_sums(m > 0) + if(!is.null(control$ndoc)) + ndoc <- control$ndoc + nDocs(m) + else + ndoc <- nDocs(m) + + if (any(rs == 0)) + warning("unreferenced term(s): ", + paste(Terms(m)[rs == 0], collapse = " ")) + df <- switch(document_frequency, + ## natural + n = 1, + ## idf + t = log2(ndoc / rs), + ## prob idf + p = max(0, log2((ndoc - rs) / rs))) + df[!is.finite(df)] <- 0 + + ## Normalization + cs <- col_sums(m) + if (any(cs == 0)) + warning("empty document(s): ", + paste(Docs(m)[cs == 0], collapse = " ")) + norm <- switch(normalization, + ## none + n = rep.int(1, ndoc), + ## cosine + c = sqrt(col_sums(m ^ 2)), + ## pivoted unique + u = { + if (is.null(pivot <- control$pivot)) + stop("invalid control argument pivot") + if (is.null(slope <- control$slope)) + stop("invalid control argument slope") + (slope * sqrt(col_sums(m ^ 2)) + + (1 - slope) * pivot) + }, + ## byte size + b = { + if (is.null(alpha <- control$alpha)) + stop("invalid control argument alpha") + norm <- double(ndoc) + norm[match(names(charlengths), + seq_along(norm))] <- + charlengths ^ alpha + norm + }) + + m <- m * df + m$v <- m$v / norm[m$j] + attr(m, "weighting") <- c(paste("SMART", spec), "SMART") + + m <- if (isDTM) t(m) else m +}, "SMART2", "SMART2") + weightSMART <- WeightFunction(function(m, spec = "nnn", control = list()) { stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")), diff --git a/src/RcppExports.o b/src/RcppExports.o new file mode 100644 index 0000000..4907210 Binary files /dev/null and b/src/RcppExports.o differ diff --git a/src/copy.o b/src/copy.o new file mode 100644 index 0000000..294a12c Binary files /dev/null and b/src/copy.o differ diff --git a/src/init.o b/src/init.o new file mode 100644 index 0000000..ee94330 Binary files /dev/null and b/src/init.o differ diff --git a/src/remove.o b/src/remove.o new file mode 100644 index 0000000..8b7d27c Binary files /dev/null and b/src/remove.o differ diff --git a/src/scan.o b/src/scan.o new file mode 100644 index 0000000..1511c29 Binary files /dev/null and b/src/scan.o differ diff --git a/src/tdm.o b/src/tdm.o new file mode 100644 index 0000000..6177c4b Binary files /dev/null and b/src/tdm.o differ diff --git a/src/tm.so b/src/tm.so new file mode 100755 index 0000000..9ac6351 Binary files /dev/null and b/src/tm.so differ diff --git a/src/tokenizer.o b/src/tokenizer.o new file mode 100644 index 0000000..a91a0f1 Binary files /dev/null and b/src/tokenizer.o differ