diff --git a/NEWS.md b/NEWS.md
index e0c9193ac..9e115794b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -11,9 +11,9 @@
 
 * The `liquidSVM` engine for `svm_rbf()` was deprecated due to that package's removal from CRAN. (#425)
 
-* New model specification `survival_reg()` for the new mode `"censored regression"` (#444). `surv_reg()` is now soft-deprecated (#448).
+* The xgboost engine for boosted trees was translating `mtry` to xgboost's `colsample_bytree`. We now map `mtry` to `colsample_bynode` since that is more consistent with how random forest works. `colsample_bytree` can still be optimized by passing it in as an engine argument. `colsample_bynode` was added to xgboost after the `parsnip` package code was written. (#495)
 
-* New model specification `proportional_hazards()` for the `"censored regression"` mode (#451).
+* For xgboost boosting, `mtry` and `colsample_bytree` can be passed as integer counts or proportions, while `subsample` and `validation` should always be proportions. `xgb_train()` now has a new option `counts` to state which scale `mtry` and `colsample_bytree` are being given on. (#461)
 
 ## Other Changes
 
@@ -23,12 +23,8 @@
 
 * Re-organized model documentation for `update` methods (#479).
 
-
-
 * `generics::required_pkgs()` was extended for `parsnip` objects.
 
-
-
 # parsnip 0.1.5
 
 * An RStudio add-in is available that makes writing multiple `parsnip` model specifications to the source window. It can be accessed via the IDE addin menus or by calling `parsnip_addin()`.
diff --git a/R/boost_tree.R b/R/boost_tree.R
index 4707ebf0d..0ba9635ed 100644
--- a/R/boost_tree.R
+++ b/R/boost_tree.R
@@ -264,20 +264,26 @@ check_args.boost_tree <- function(object) {
 #' @param max_depth An integer for the maximum depth of the tree.
 #' @param nrounds An integer for the number of boosting iterations.
 #' @param eta A numeric value between zero and one to control the learning rate.
-#' @param colsample_bytree Subsampling proportion of columns.
+#' @param colsample_bytree Subsampling proportion of columns for each tree.
+#' See the `counts` argument below. The default uses all columns.
+#' @param colsample_bynode Subsampling proportion of columns for each node
+#' within each tree. See the `counts` argument below. The default uses all
+#' columns.
 #' @param min_child_weight A numeric value for the minimum sum of instance
 #' weights needed in a child to continue to split.
 #' @param gamma A number for the minimum loss reduction required to make a
 #' further partition on a leaf node of the tree
-#' @param subsample Subsampling proportion of rows.
-#' @param validation A positive number. If on `[0, 1)` the value, `validation`
-#' is a random proportion of data in `x` and `y` that are used for performance
-#' assessment and potential early stopping. If 1 or greater, it is the _number_
-#' of training set samples use for these purposes.
+#' @param subsample Subsampling proportion of rows. By default, all of the
+#' training data are used.
+#' @param validation The _proportion_ of the data that are used for performance
+#' assessment and potential early stopping.
 #' @param early_stop An integer or `NULL`. If not `NULL`, it is the number of
 #' training iterations without improvement before stopping. If `validation` is
 #' used, performance is base on the validation set; otherwise, the training set
 #' is used.
+#' @param counts A logical. If `FALSE`, `colsample_bynode` and
+#' `colsample_bytree` are both assumed to be _proportions_ of the columns
+#' (instead of counts of columns).
 #' @param objective A single string (or NULL) that defines the loss function that
 #' `xgboost` uses to create trees. See [xgboost::xgb.train()] for options. If left
 #' NULL, an appropriate loss function is chosen.
@@ -290,11 +296,10 @@ check_args.boost_tree <- function(object) {
 #' @export
 xgb_train <- function(
   x, y,
-  max_depth = 6, nrounds = 15, eta = 0.3, colsample_bytree = 1,
-  min_child_weight = 1, gamma = 0, subsample = 1, validation = 0,
-  early_stop = NULL, objective = NULL,
-  event_level = c("first", "second"),
-  ...) {
+  max_depth = 6, nrounds = 15, eta = 0.3, colsample_bynode = NULL,
+  colsample_bytree = NULL, min_child_weight = 1, gamma = 0, subsample = 1,
+  validation = 0, early_stop = NULL, objective = NULL, counts = TRUE,
+  event_level = c("first", "second"), ...) {
 
   event_level <- rlang::arg_match(event_level, c("first", "second"))
   others <- list(...)
@@ -304,6 +309,7 @@ xgb_train <- function(
   if (!is.numeric(validation) || validation < 0 || validation >= 1) {
     rlang::abort("`validation` should be on [0, 1).")
   }
+
   if (!is.null(early_stop)) {
     if (early_stop <= 1) {
       rlang::abort(paste0("`early_stop` should be on [2, ", nrounds, ")."))
@@ -313,7 +319,6 @@
     }
   }
 
-
   if (is.null(objective)) {
     if (is.numeric(y)) {
       objective <- "reg:squarederror"
@@ -331,19 +336,21 @@ xgb_train <- function(
 
   x <- as_xgb_data(x, y, validation, event_level)
 
-  # translate `subsample` and `colsample_bytree` to be on (0, 1] if not
-  if (subsample > 1) {
-    subsample <- subsample/n
-  }
-  if (subsample > 1) {
-    subsample <- 1
-  }
-  if (colsample_bytree > 1) {
-    colsample_bytree <- colsample_bytree/p
+  if (!is.numeric(subsample) || subsample < 0 || subsample > 1) {
+    rlang::abort("`subsample` should be on [0, 1].")
   }
-  if (colsample_bytree > 1) {
+
+  # initialize
+  if (is.null(colsample_bytree)) {
     colsample_bytree <- 1
+  } else {
+    colsample_bytree <- recalc_param(colsample_bytree, counts, p)
+  }
+  if (is.null(colsample_bynode)) {
+    colsample_bynode <- 1
+  } else {
+    colsample_bynode <- recalc_param(colsample_bynode, counts, p)
   }
 
   if (min_child_weight > n) {
@@ -358,6 +365,7 @@ xgb_train <- function(
     max_depth = max_depth,
     gamma = gamma,
     colsample_bytree = colsample_bytree,
+    colsample_bynode = colsample_bynode,
     min_child_weight = min(min_child_weight, n),
     subsample = subsample,
     objective = objective
@@ -390,6 +398,30 @@ xgb_train <- function(
   eval_tidy(call, env = current_env())
 }
 
+recalc_param <- function(x, counts, denom) {
+  nm <- as.character(match.call()$x)
+  if (is.null(x)) {
+    x <- 1
+  } else {
+    if (counts) {
+      maybe_proportion(x, nm)
+      x <- min(denom, x)/denom
+    }
+  }
+  x
+}
+
+maybe_proportion <- function(x, nm) {
+  if (x < 1) {
+    msg <- paste0(
+      "The option `counts = TRUE` was used but parameter `", nm,
+      "` was given as ", signif(x, 3), ". Please use a value >= 1 or use ",
+      "`counts = FALSE`."
+    )
+    rlang::abort(msg)
+  }
+}
+
 #' @importFrom stats binomial
 xgb_pred <- function(object, newdata, ...) {
   if (!inherits(newdata, "xgb.DMatrix")) {
@@ -432,7 +464,8 @@ as_xgb_data <- function(x, y, validation = 0, event_level = "first", ...) {
   if (!inherits(x, "xgb.DMatrix")) {
     if (validation > 0) {
-      trn_index <- sample(1:n, size = floor(n * (1 - validation)) + 1)
+      m <- floor(n * (1 - validation)) + 1
+      trn_index <- sample(1:n, size = max(m, 2))
       wlist <- list(validation = xgboost::xgb.DMatrix(x[-trn_index, ], label = y[-trn_index], missing = NA))
       dat <- xgboost::xgb.DMatrix(x[trn_index, ], label = y[trn_index], missing = NA)
diff --git a/R/boost_tree_data.R b/R/boost_tree_data.R
index da069d8c9..75adef565 100644
--- a/R/boost_tree_data.R
+++ b/R/boost_tree_data.R
@@ -37,7 +37,7 @@ set_model_arg(
   model = "boost_tree",
   eng = "xgboost",
   parsnip = "mtry",
-  original = "colsample_bytree",
+  original = "colsample_bynode",
   func = list(pkg = "dials", fun = "mtry"),
   has_submodel = FALSE
 )
diff --git a/man/boost_tree.Rd b/man/boost_tree.Rd
index 30d1649df..0141a218d 100644
--- a/man/boost_tree.Rd
+++ b/man/boost_tree.Rd
@@ -227,11 +227,11 @@ parameter.\tabular{llll}{
    tree_depth \tab max_depth (6) \tab NA \tab max_depth (5) \cr
    trees \tab nrounds (15) \tab trials (15) \tab max_iter (20) \cr
    learn_rate \tab eta (0.3) \tab NA \tab step_size (0.1) \cr
-   mtry \tab colsample_bytree (1) \tab NA \tab feature_subset_strategy (see below) \cr
+   mtry \tab colsample_bynode (character(0)) \tab NA \tab feature_subset_strategy (see below) \cr
    min_n \tab min_child_weight (1) \tab minCases (2) \tab min_instances_per_node (1) \cr
    loss_reduction \tab gamma (0) \tab NA \tab min_info_gain (0) \cr
    sample_size \tab subsample (1) \tab sample (0) \tab subsampling_rate (1) \cr
-   stop_iter \tab early_stop \tab NA \tab NA \cr
+   stop_iter \tab early_stop (NULL) \tab NA \tab NA \cr
 }
diff --git a/man/rmd/boost-tree.Rmd b/man/rmd/boost-tree.Rmd
index 258b83590..43c338a1b 100644
--- a/man/rmd/boost-tree.Rmd
+++ b/man/rmd/boost-tree.Rmd
@@ -38,8 +38,7 @@ mod_param <-
   update(sample_size = sample_prop(c(0.4, 0.9)))
 ```
 
-For this engine, tuning over `trees` is very efficient since the same model
-object can be used to make predictions over multiple values of `trees`.
+For this engine, tuning over `trees` is very efficient since the same model object can be used to make predictions over multiple values of `trees`.
 
 Note that `xgboost` models require that non-numeric predictors (e.g., factors) must be converted to dummy variables or some other numeric representation. By default, when using `fit()` with `xgboost`, a one-hot encoding is used to convert factor predictors to indicator variables.
@@ -89,7 +88,7 @@ get_defaults_boost_tree <- function() {
     "boost_tree", "xgboost", "tree_depth", "max_depth", get_arg("parsnip", "xgb_train", "max_depth"),
     "boost_tree", "xgboost", "trees", "nrounds", get_arg("parsnip", "xgb_train", "nrounds"),
     "boost_tree", "xgboost", "learn_rate", "eta", get_arg("parsnip", "xgb_train", "eta"),
-    "boost_tree", "xgboost", "mtry", "colsample_bytree", get_arg("parsnip", "xgb_train", "colsample_bytree"),
+    "boost_tree", "xgboost", "mtry", "colsample_bynode", get_arg("parsnip", "xgb_train", "colsample_bynode"),
    "boost_tree", "xgboost", "min_n", "min_child_weight", get_arg("parsnip", "xgb_train", "min_child_weight"),
    "boost_tree", "xgboost", "loss_reduction", "gamma", get_arg("parsnip", "xgb_train", "gamma"),
    "boost_tree", "xgboost", "sample_size", "subsample", get_arg("parsnip", "xgb_train", "subsample"),
diff --git a/man/xgb_train.Rd b/man/xgb_train.Rd
index 8b0c9a8cc..9b963ad11 100644
--- a/man/xgb_train.Rd
+++ b/man/xgb_train.Rd
@@ -10,13 +10,15 @@ xgb_train(
   max_depth = 6,
   nrounds = 15,
   eta = 0.3,
-  colsample_bytree = 1,
+  colsample_bynode = NULL,
+  colsample_bytree = NULL,
   min_child_weight = 1,
   gamma = 0,
   subsample = 1,
   validation = 0,
   early_stop = NULL,
   objective = NULL,
+  counts = TRUE,
   event_level = c("first", "second"),
   ...
 )
@@ -32,7 +34,12 @@ xgb_train(
 
 \item{eta}{A numeric value between zero and one to control the learning rate.}
 
-\item{colsample_bytree}{Subsampling proportion of columns.}
+\item{colsample_bynode}{Subsampling proportion of columns for each node
+within each tree. See the \code{counts} argument below. The default uses all
+columns.}
+
+\item{colsample_bytree}{Subsampling proportion of columns for each tree.
+See the \code{counts} argument below. The default uses all columns.}
 
 \item{min_child_weight}{A numeric value for the minimum sum of instance
 weights needed in a child to continue to split.}
@@ -40,12 +47,11 @@ weights needed in a child to continue to split.}
 \item{gamma}{A number for the minimum loss reduction required to make a
 further partition on a leaf node of the tree}
 
-\item{subsample}{Subsampling proportion of rows.}
+\item{subsample}{Subsampling proportion of rows. By default, all of the
+training data are used.}
 
-\item{validation}{A positive number. If on \verb{[0, 1)} the value, \code{validation}
-is a random proportion of data in \code{x} and \code{y} that are used for performance
-assessment and potential early stopping. If 1 or greater, it is the \emph{number}
-of training set samples use for these purposes.}
+\item{validation}{The \emph{proportion} of the data that are used for performance
+assessment and potential early stopping.}
 
 \item{early_stop}{An integer or \code{NULL}. If not \code{NULL}, it is the number of
 training iterations without improvement before stopping. If \code{validation} is
@@ -56,6 +62,10 @@ is used.}
 \code{xgboost} uses to create trees. See \code{\link[xgboost:xgb.train]{xgboost::xgb.train()}} for options. If left
 NULL, an appropriate loss function is chosen.}
 
+\item{counts}{A logical. If \code{FALSE}, \code{colsample_bynode} and
+\code{colsample_bytree} are both assumed to be \emph{proportions} of the columns
+(instead of counts of columns).}
+
 \item{event_level}{For binary classification, this is a single string of either
 \code{"first"} or \code{"second"} to pass along describing which level of the
 outcome should be considered the "event".}
diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
index 94448b6df..56a567dc9 100644
--- a/tests/testthat/test_boost_tree_xgboost.R
+++ b/tests/testthat/test_boost_tree_xgboost.R
@@ -414,9 +414,9 @@ test_that('argument checks for data dimensions', {
     xy_fit <- spec %>% fit_xy(x = penguins_dummy, y = penguins$species),
     "1000 samples were requested"
   )
-  expect_equal(f_fit$fit$params$colsample_bytree, 1)
+  expect_equal(f_fit$fit$params$colsample_bynode, 1)
   expect_equal(f_fit$fit$params$min_child_weight, nrow(penguins))
-  expect_equal(xy_fit$fit$params$colsample_bytree, 1)
+  expect_equal(xy_fit$fit$params$colsample_bynode, 1)
   expect_equal(xy_fit$fit$params$min_child_weight, nrow(penguins))
 
 })
@@ -482,3 +482,49 @@
 
   expect_equal(pred_p_2[[".pred_male"]], pred_xgb_2)
 })
+
+test_that("count/proportion parameters", {
+  skip_if_not_installed("xgboost")
+  fit1 <-
+    boost_tree(mtry = 7, trees = 4) %>%
+    set_engine("xgboost") %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit1$fit$params$colsample_bytree, 1)
+  expect_equal(fit1$fit$params$colsample_bynode, 7/(ncol(mtcars) - 1))
+
+  fit2 <-
+    boost_tree(mtry = 7, trees = 4) %>%
+    set_engine("xgboost", colsample_bytree = 4) %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit2$fit$params$colsample_bytree, 4/(ncol(mtcars) - 1))
+  expect_equal(fit2$fit$params$colsample_bynode, 7/(ncol(mtcars) - 1))
+
+  fit3 <-
+    boost_tree(trees = 4) %>%
+    set_engine("xgboost") %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit3$fit$params$colsample_bytree, 1)
+  expect_equal(fit3$fit$params$colsample_bynode, 1)
+
+  fit4 <-
+    boost_tree(mtry = .9, trees = 4) %>%
+    set_engine("xgboost", colsample_bytree = .1, counts = FALSE) %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit4$fit$params$colsample_bytree, .1)
+  expect_equal(fit4$fit$params$colsample_bynode, .9)
+
+  expect_error(
+    boost_tree(mtry = .9, trees = 4) %>%
+      set_engine("xgboost") %>%
+      set_mode("regression") %>%
+      fit(mpg ~ ., data = mtcars),
+    "was given as 0.9"
+  )
+
+})
+
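As a usage sketch of the new behavior (not part of the diff; it assumes this branch of `parsnip` plus `xgboost` are installed, and mirrors the tests above — `mtcars` has 10 predictors, so a count of 7 maps to `colsample_bynode = 0.7`):

```r
library(parsnip)

# Default scale (`counts = TRUE`): `mtry` is an integer count of predictor
# columns and is translated internally to a proportion for xgboost's
# colsample_bynode parameter.
boost_tree(mtry = 7, trees = 4) %>%
  set_engine("xgboost") %>%
  set_mode("regression") %>%
  fit(mpg ~ ., data = mtcars)

# With `counts = FALSE`, `mtry` and the engine argument `colsample_bytree`
# are passed through to xgboost as proportions, unchanged.
boost_tree(mtry = 0.9, trees = 4) %>%
  set_engine("xgboost", colsample_bytree = 0.1, counts = FALSE) %>%
  set_mode("regression") %>%
  fit(mpg ~ ., data = mtcars)
```

With the default `counts = TRUE`, giving `mtry` (or `colsample_bytree`) a value less than one triggers the `maybe_proportion()` error exercised by the `expect_error()` call in the last test.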