xgboost mtry parameter swap for #495

topepo · topepo · commit adc9d7781d89 · 2021-05-19T10:43:57.000-04:00
diff --git a/NEWS.md b/NEWS.md
@@ -11,9 +11,7 @@
   
 * The `liquidSVM` engine for `svm_rbf()` was deprecated due to that package's removal from CRAN. (#425)
 
-* New model specification `survival_reg()` for the new mode `"censored regression"` (#444). `surv_reg()` is now soft-deprecated (#448).
-
-* New model specification `proportional_hazards()` for the `"censored regression"` mode (#451).
+* The xgboost engine for boosted trees was translating `mtry` to xgboost's `colsample_bytree`. We now map `mtry` to `colsample_bynode` since that is more consistent with how random forest works. `colsample_bytree` can still be optimized by passing it in as an engine argument. (#495)
 
 ## Other Changes
 
diff --git a/R/boost_tree.R b/R/boost_tree.R
@@ -264,7 +264,9 @@ check_args.boost_tree <- function(object) {
 #' @param max_depth An integer for the maximum depth of the tree.
 #' @param nrounds An integer for the number of boosting iterations.
 #' @param eta A numeric value between zero and one to control the learning rate.
-#' @param colsample_bytree Subsampling proportion of columns.
+#' @param colsample_bytree Subsampling proportion of columns for each tree.
+#' @param colsample_bynode Subsampling proportion of columns for each node
+#' within each tree.
 #' @param min_child_weight A numeric value for the minimum sum of instance
 #'  weights needed in a child to continue to split.
 #' @param gamma A number for the minimum loss reduction required to make a
@@ -290,8 +292,8 @@ check_args.boost_tree <- function(object) {
 #' @export
 xgb_train <- function(
   x, y,
-  max_depth = 6, nrounds = 15, eta  = 0.3, colsample_bytree = 1,
-  min_child_weight = 1, gamma = 0, subsample = 1, validation = 0,
+  max_depth = 6, nrounds = 15, eta  = 0.3, colsample_bynode = 1,
+  colsample_bytree = 1, min_child_weight = 1, gamma = 0, subsample = 1, validation = 0,
   early_stop = NULL, objective = NULL,
   event_level = c("first", "second"),
   ...) {
@@ -346,6 +348,13 @@ xgb_train <- function(
     colsample_bytree <- 1
   }
 
+  if (colsample_bynode > 1) {
+    colsample_bynode <- colsample_bynode/p
+  }
+  if (colsample_bynode > 1) {
+    colsample_bynode <- 1
+  }
+
   if (min_child_weight > n) {
     msg <- paste0(min_child_weight, " samples were requested but there were ",
                   n, " rows in the data. ", n, " will be used.")
@@ -358,6 +367,7 @@ xgb_train <- function(
     max_depth = max_depth,
     gamma = gamma,
     colsample_bytree = colsample_bytree,
+    colsample_bynode = colsample_bynode,
     min_child_weight = min(min_child_weight, n),
     subsample = subsample,
     objective = objective
diff --git a/R/boost_tree_data.R b/R/boost_tree_data.R
@@ -37,7 +37,7 @@ set_model_arg(
   model = "boost_tree",
   eng = "xgboost",
   parsnip = "mtry",
-  original = "colsample_bytree",
+  original = "colsample_bynode",
   func = list(pkg = "dials", fun = "mtry"),
   has_submodel = FALSE
 )
diff --git a/man/boost_tree.Rd b/man/boost_tree.Rd
diff --git a/man/rmd/boost-tree.Rmd b/man/rmd/boost-tree.Rmd
@@ -38,8 +38,7 @@ mod_param <-
   update(sample_size = sample_prop(c(0.4, 0.9)))
 ```
 
-For this engine, tuning over `trees` is very efficient since the same model 
-object can be used to make predictions over multiple values of `trees`.  
+For this engine, tuning over `trees` is very efficient since the same model object can be used to make predictions over multiple values of `trees`.  
 
 Note that `xgboost` models require that non-numeric predictors (e.g., factors) be converted to dummy variables or some other numeric representation. By default, when using `fit()` with `xgboost`, a one-hot encoding is used to convert factor predictors to indicator variables. 
 
@@ -89,7 +88,7 @@ get_defaults_boost_tree <- function() {
     "boost_tree", "xgboost",      "tree_depth",                "max_depth", get_arg("parsnip", "xgb_train", "max_depth"),
     "boost_tree", "xgboost",           "trees",                 "nrounds",  get_arg("parsnip", "xgb_train", "nrounds"),
     "boost_tree", "xgboost",      "learn_rate",                     "eta",  get_arg("parsnip", "xgb_train", "eta"),
-    "boost_tree", "xgboost",            "mtry",        "colsample_bytree",  get_arg("parsnip", "xgb_train", "colsample_bytree"),
+    "boost_tree", "xgboost",            "mtry",        "colsample_bynode",  get_arg("parsnip", "xgb_train", "colsample_bynode"),
     "boost_tree", "xgboost",           "min_n",        "min_child_weight",  get_arg("parsnip", "xgb_train", "min_child_weight"),
     "boost_tree", "xgboost",  "loss_reduction",                   "gamma",  get_arg("parsnip", "xgb_train", "gamma"),
     "boost_tree", "xgboost",     "sample_size",               "subsample",  get_arg("parsnip", "xgb_train", "subsample"),
diff --git a/man/xgb_train.Rd b/man/xgb_train.Rd
diff --git a/tests/testthat/test_boost_tree_xgboost.R b/tests/testthat/test_boost_tree_xgboost.R
@@ -414,9 +414,9 @@ test_that('argument checks for data dimensions', {
     xy_fit <- spec %>% fit_xy(x = penguins_dummy, y = penguins$species),
     "1000 samples were requested"
   )
-  expect_equal(f_fit$fit$params$colsample_bytree, 1)
+  expect_equal(f_fit$fit$params$colsample_bynode, 1)
   expect_equal(f_fit$fit$params$min_child_weight, nrow(penguins))
-  expect_equal(xy_fit$fit$params$colsample_bytree, 1)
+  expect_equal(xy_fit$fit$params$colsample_bynode, 1)
   expect_equal(xy_fit$fit$params$min_child_weight, nrow(penguins))
 
 })
@@ -482,3 +482,16 @@ test_that("fit and prediction with `event_level`", {
   expect_equal(pred_p_2[[".pred_male"]], pred_xgb_2)
 
 })
+
+test_that("mtry parameters", {
+  skip_if_not_installed("xgboost")
+  fit <-
+    boost_tree(mtry = .7, trees = 4) %>%
+    set_engine("xgboost") %>%
+    set_mode("regression") %>%
+    fit(mpg ~ ., data = mtcars)
+  expect_equal(fit$fit$params$colsample_bytree, 1)
+  expect_equal(fit$fit$params$colsample_bynode, 0.7)
+})
+
+

Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@ set_model_arg(`
`37`	`37`	`model = "boost_tree",`
`38`	`38`	`eng = "xgboost",`
`39`	`39`	`parsnip = "mtry",`
`40`		`- original = "colsample_bytree",`
	`40`	`+ original = "colsample_bynode",`
`41`	`41`	`func = list(pkg = "dials", fun = "mtry"),`
`42`	`42`	`has_submodel = FALSE`
`43`	`43`	`)`