Commit 46a2018

Merge pull request #499 from tidymodels/xgb-mtry
xgboost mtry parameter swap for #495
2 parents adf0f32 + 374dbf9 commit 46a2018

7 files changed: +128 −44 lines changed

NEWS.md

Lines changed: 2 additions & 6 deletions
@@ -11,9 +11,9 @@

 * The `liquidSVM` engine for `svm_rbf()` was deprecated due to that package's removal from CRAN. (#425)

-* New model specification `survival_reg()` for the new mode `"censored regression"` (#444). `surv_reg()` is now soft-deprecated (#448).
+* The xgboost engine for boosted trees was translating `mtry` to xgboost's `colsample_bytree`. We now map `mtry` to `colsample_bynode` since that is more consistent with how random forest works. `colsample_bytree` can still be optimized by passing it in as an engine argument. `colsample_bynode` was added to xgboost after the `parsnip` package code was written. (#495)

-* New model specification `proportional_hazards()` for the `"censored regression"` mode (#451).
+* For xgboost boosting, `mtry` and `colsample_bytree` can be passed as integer counts or as proportions, while `subsample` and `validation` should be proportions. `xgb_train()` now has a new option, `counts`, for stating which scale `mtry` and `colsample_bytree` are on. (#461)

 ## Other Changes

@@ -23,12 +23,8 @@

 * Re-organized model documentation for `update` methods (#479).

-
-
 * `generics::required_pkgs()` was extended for `parsnip` objects.

-
-
 # parsnip 0.1.5

 * An RStudio add-in is available that writes multiple `parsnip` model specifications to the source window. It can be accessed via the IDE addin menus or by calling `parsnip_addin()`.
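A quick usage sketch of the two NEWS items above (hedged: it mirrors the new tests added later in this commit and assumes this version of parsnip plus xgboost are installed):

```r
library(parsnip)

# `mtry` now maps to xgboost's `colsample_bynode`; `colsample_bytree` can
# still be set (or tuned) as an engine argument. Both are counts here.
boost_tree(mtry = 7, trees = 4) %>%
  set_engine("xgboost", colsample_bytree = 4) %>%
  set_mode("regression") %>%
  fit(mpg ~ ., data = mtcars)

# With `counts = FALSE`, both arguments are read as proportions instead.
boost_tree(mtry = 0.9, trees = 4) %>%
  set_engine("xgboost", colsample_bytree = 0.1, counts = FALSE) %>%
  set_mode("regression") %>%
  fit(mpg ~ ., data = mtcars)
```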

R/boost_tree.R

Lines changed: 56 additions & 23 deletions
@@ -264,20 +264,26 @@ check_args.boost_tree <- function(object) {
 #' @param max_depth An integer for the maximum depth of the tree.
 #' @param nrounds An integer for the number of boosting iterations.
 #' @param eta A numeric value between zero and one to control the learning rate.
-#' @param colsample_bytree Subsampling proportion of columns.
+#' @param colsample_bytree Subsampling proportion of columns for each tree.
+#' See the `counts` argument below. The default uses all columns.
+#' @param colsample_bynode Subsampling proportion of columns for each node
+#' within each tree. See the `counts` argument below. The default uses all
+#' columns.
 #' @param min_child_weight A numeric value for the minimum sum of instance
 #' weights needed in a child to continue to split.
 #' @param gamma A number for the minimum loss reduction required to make a
 #' further partition on a leaf node of the tree.
-#' @param subsample Subsampling proportion of rows.
-#' @param validation A positive number. If on `[0, 1)`, `validation` is a
-#' random proportion of the data in `x` and `y` that are used for performance
-#' assessment and potential early stopping. If 1 or greater, it is the _number_
-#' of training set samples used for these purposes.
+#' @param subsample Subsampling proportion of rows. By default, all of the
+#' training data are used.
+#' @param validation The _proportion_ of the data that are used for performance
+#' assessment and potential early stopping.
 #' @param early_stop An integer or `NULL`. If not `NULL`, it is the number of
 #' training iterations without improvement before stopping. If `validation` is
 #' used, performance is based on the validation set; otherwise, the training set
 #' is used.
+#' @param counts A logical. If `FALSE`, `colsample_bynode` and
+#' `colsample_bytree` are both assumed to be _proportions_ of the number of
+#' columns (instead of integer counts).
 #' @param objective A single string (or NULL) that defines the loss function that
 #' `xgboost` uses to create trees. See [xgboost::xgb.train()] for options. If left
 #' NULL, an appropriate loss function is chosen.
@@ -290,11 +296,10 @@ check_args.boost_tree <- function(object) {
 #' @export
 xgb_train <- function(
   x, y,
-  max_depth = 6, nrounds = 15, eta = 0.3, colsample_bytree = 1,
-  min_child_weight = 1, gamma = 0, subsample = 1, validation = 0,
-  early_stop = NULL, objective = NULL,
-  event_level = c("first", "second"),
-  ...) {
+  max_depth = 6, nrounds = 15, eta = 0.3, colsample_bynode = NULL,
+  colsample_bytree = NULL, min_child_weight = 1, gamma = 0, subsample = 1,
+  validation = 0, early_stop = NULL, objective = NULL, counts = TRUE,
+  event_level = c("first", "second"), ...) {

   event_level <- rlang::arg_match(event_level, c("first", "second"))
   others <- list(...)
@@ -304,6 +309,7 @@ xgb_train <- function(
   if (!is.numeric(validation) || validation < 0 || validation >= 1) {
     rlang::abort("`validation` should be on [0, 1).")
   }
+
   if (!is.null(early_stop)) {
     if (early_stop <= 1) {
       rlang::abort(paste0("`early_stop` should be on [2, ", nrounds, ")."))
@@ -313,7 +319,6 @@ xgb_train <- function(
     }
   }

-
   if (is.null(objective)) {
     if (is.numeric(y)) {
       objective <- "reg:squarederror"
@@ -331,19 +336,21 @@ xgb_train <- function(

   x <- as_xgb_data(x, y, validation, event_level)

-  # translate `subsample` and `colsample_bytree` to be on (0, 1] if not
-  if (subsample > 1) {
-    subsample <- subsample/n
-  }
-  if (subsample > 1) {
-    subsample <- 1
-  }

-  if (colsample_bytree > 1) {
-    colsample_bytree <- colsample_bytree/p
+  if (!is.numeric(subsample) || subsample < 0 || subsample > 1) {
+    rlang::abort("`subsample` should be on [0, 1].")
   }
-  if (colsample_bytree > 1) {
+
+  # initialize
+  if (is.null(colsample_bytree)) {
     colsample_bytree <- 1
+  } else {
+    colsample_bytree <- recalc_param(colsample_bytree, counts, p)
+  }
+  if (is.null(colsample_bynode)) {
+    colsample_bynode <- 1
+  } else {
+    colsample_bynode <- recalc_param(colsample_bynode, counts, p)
   }

   if (min_child_weight > n) {
@@ -358,6 +365,7 @@ xgb_train <- function(
     max_depth = max_depth,
     gamma = gamma,
     colsample_bytree = colsample_bytree,
+    colsample_bynode = colsample_bynode,
     min_child_weight = min(min_child_weight, n),
     subsample = subsample,
     objective = objective
@@ -390,6 +398,30 @@ xgb_train <- function(
   eval_tidy(call, env = current_env())
 }

+recalc_param <- function(x, counts, denom) {
+  nm <- as.character(match.call()$x)
+  if (is.null(x)) {
+    x <- 1
+  } else {
+    if (counts) {
+      maybe_proportion(x, nm)
+      x <- min(denom, x)/denom
+    }
+  }
+  x
+}
+
+maybe_proportion <- function(x, nm) {
+  if (x < 1) {
+    msg <- paste0(
+      "The option `counts = TRUE` was used but parameter `", nm,
+      "` was given as ", signif(x, 3), ". Please use a value >= 1 or use ",
+      "`counts = FALSE`."
+    )
+    rlang::abort(msg)
+  }
+}
+
 #' @importFrom stats binomial
 xgb_pred <- function(object, newdata, ...) {
   if (!inherits(newdata, "xgb.DMatrix")) {
@@ -432,7 +464,8 @@ as_xgb_data <- function(x, y, validation = 0, event_level = "first", ...) {

   if (!inherits(x, "xgb.DMatrix")) {
     if (validation > 0) {
-      trn_index <- sample(1:n, size = floor(n * (1 - validation)) + 1)
+      m <- floor(n * (1 - validation)) + 1
+      trn_index <- sample(1:n, size = max(m, 2))
       wlist <-
         list(validation = xgboost::xgb.DMatrix(x[-trn_index, ], label = y[-trn_index], missing = NA))
       dat <- xgboost::xgb.DMatrix(x[trn_index, ], label = y[trn_index], missing = NA)
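As a quick illustration of what the new internal helpers above do (a sketch with assumed values, not part of the commit): with `counts = TRUE`, a column count is capped at the number of predictors and divided by it, while a value below 1 aborts via `maybe_proportion()`.

```r
# Assuming p = 10 predictors and mtry = 7 given as a count:
p <- 10
mtry <- 7
min(p, mtry) / p
#> [1] 0.7   # the proportion recalc_param() passes on as colsample_bynode

# mtry = 0.9 with counts = TRUE would error, since a value < 1 looks like
# a proportion rather than a count; use counts = FALSE to pass proportions.
```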

R/boost_tree_data.R

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ set_model_arg(
   model = "boost_tree",
   eng = "xgboost",
   parsnip = "mtry",
-  original = "colsample_bytree",
+  original = "colsample_bynode",
   func = list(pkg = "dials", fun = "mtry"),
   has_submodel = FALSE
 )
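To see the effect of this re-registration (an illustrative sketch, not part of the commit), `translate()` shows where `mtry` now lands in the generated `xgb_train()` call:

```r
library(parsnip)

# With the updated registration, the printed model template maps `mtry`
# to the `colsample_bynode` argument of parsnip::xgb_train().
boost_tree(mtry = 7, trees = 4) %>%
  set_engine("xgboost") %>%
  set_mode("regression") %>%
  translate()
```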

man/boost_tree.Rd

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default.

man/rmd/boost-tree.Rmd

Lines changed: 2 additions & 3 deletions
@@ -38,8 +38,7 @@ mod_param <-
   update(sample_size = sample_prop(c(0.4, 0.9)))
 ```

-For this engine, tuning over `trees` is very efficient since the same model
-object can be used to make predictions over multiple values of `trees`.
+For this engine, tuning over `trees` is very efficient since the same model object can be used to make predictions over multiple values of `trees`.

 Note that `xgboost` models require that non-numeric predictors (e.g., factors) be converted to dummy variables or some other numeric representation. By default, when using `fit()` with `xgboost`, a one-hot encoding is used to convert factor predictors to indicator variables.

@@ -89,7 +88,7 @@ get_defaults_boost_tree <- function() {
   "boost_tree", "xgboost", "tree_depth", "max_depth", get_arg("parsnip", "xgb_train", "max_depth"),
   "boost_tree", "xgboost", "trees", "nrounds", get_arg("parsnip", "xgb_train", "nrounds"),
   "boost_tree", "xgboost", "learn_rate", "eta", get_arg("parsnip", "xgb_train", "eta"),
-  "boost_tree", "xgboost", "mtry", "colsample_bytree", get_arg("parsnip", "xgb_train", "colsample_bytree"),
+  "boost_tree", "xgboost", "mtry", "colsample_bynode", get_arg("parsnip", "xgb_train", "colsample_bynode"),
   "boost_tree", "xgboost", "min_n", "min_child_weight", get_arg("parsnip", "xgb_train", "min_child_weight"),
   "boost_tree", "xgboost", "loss_reduction", "gamma", get_arg("parsnip", "xgb_train", "gamma"),
   "boost_tree", "xgboost", "sample_size", "subsample", get_arg("parsnip", "xgb_train", "subsample"),

man/xgb_train.Rd

Lines changed: 17 additions & 7 deletions
Some generated files are not rendered by default.

tests/testthat/test_boost_tree_xgboost.R

Lines changed: 48 additions & 2 deletions
@@ -414,9 +414,9 @@ test_that('argument checks for data dimensions', {
     xy_fit <- spec %>% fit_xy(x = penguins_dummy, y = penguins$species),
     "1000 samples were requested"
   )
-  expect_equal(f_fit$fit$params$colsample_bytree, 1)
+  expect_equal(f_fit$fit$params$colsample_bynode, 1)
   expect_equal(f_fit$fit$params$min_child_weight, nrow(penguins))
-  expect_equal(xy_fit$fit$params$colsample_bytree, 1)
+  expect_equal(xy_fit$fit$params$colsample_bynode, 1)
   expect_equal(xy_fit$fit$params$min_child_weight, nrow(penguins))

 })

@@ -482,3 +482,49 @@ test_that("fit and prediction with `event_level`", {
   expect_equal(pred_p_2[[".pred_male"]], pred_xgb_2)

 })
+
+test_that("count/proportion parameters", {
+  skip_if_not_installed("xgboost")
+  fit1 <-
+    boost_tree(mtry = 7, trees = 4) %>%
+    set_engine("xgboost") %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit1$fit$params$colsample_bytree, 1)
+  expect_equal(fit1$fit$params$colsample_bynode, 7/(ncol(mtcars) - 1))
+
+  fit2 <-
+    boost_tree(mtry = 7, trees = 4) %>%
+    set_engine("xgboost", colsample_bytree = 4) %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit2$fit$params$colsample_bytree, 4/(ncol(mtcars) - 1))
+  expect_equal(fit2$fit$params$colsample_bynode, 7/(ncol(mtcars) - 1))
+
+  fit3 <-
+    boost_tree(trees = 4) %>%
+    set_engine("xgboost") %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit3$fit$params$colsample_bytree, 1)
+  expect_equal(fit3$fit$params$colsample_bynode, 1)
+
+  fit4 <-
+    boost_tree(mtry = .9, trees = 4) %>%
+    set_engine("xgboost", colsample_bytree = .1, counts = FALSE) %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit4$fit$params$colsample_bytree, .1)
+  expect_equal(fit4$fit$params$colsample_bynode, .9)
+
+  expect_error(
+    boost_tree(mtry = .9, trees = 4) %>%
+      set_engine("xgboost") %>%
+      set_mode("regression") %>%
+      fit(mpg ~ ., data = mtcars),
+    "was given as 0.9"
+  )
+
+})
