-
Notifications
You must be signed in to change notification settings - Fork 29
Description
After finalizing a model_stack with fit_members(), I get a large object (~13.5 GB) that is inconvenient to use in my specific research production setting. Is there a way to trim the final stack object while still maintaining its wonderfully simple predict() implementation?
My initial thought was to simply remove the "data_stack" and "splits" elements (~6.5 GB and ~5.2 GB, respectively, out of the 13.5 GB total). As far as I can tell, the splits aren't required in predict.model_stack(), but I see that the data_stack is being accessed to retrieve the outcome when type = "class".
If I'm only interested in type = "prob", I can remove the data_stack and splits elements using the proper indexing, and still pass this trimmed object to predict.model_stack() without consequence (I think).
But maybe there is a more elegant solution to this.
I've attached a reprex example below.
# HOUSEKEEPING ####
# Deliberately no `rm(list = ls())`: reprex renders this script in a fresh R
# session, so there is nothing to clear, and wiping the global environment
# would only destroy the workspace of anyone pasting this into a live session.
# CRAN libraries
library(tidyverse) # install.packages("tidyverse")
library(tidymodels) # install.packages("tidymodels")
library(stacks) # install.packages("stacks")
# DEFINE SEED ####
# One seed, used for the resampling below and passed on to the tuning step.
seed <- 42
# DEFINE SET ####
# Training data: `tree_frogs` without the `clutch` and `latency` columns.
tree_frogs_class_train <- dplyr::select(tree_frogs, -c(clutch, latency))
# SET UP RESAMPLING ####
# Seed right before splitting so the 5 folds are reproducible.
set.seed(seed)
cv_folds <- rsample::vfold_cv(tree_frogs_class_train, v = 5)
# BUILD MODELS ####
# Linear SVM on the kernlab engine; `cost` and `margin` are left to be tuned.
mod_svmlinear <-
  parsnip::svm_linear(cost = tune(), margin = tune()) %>%
  parsnip::set_engine("kernlab") %>%
  parsnip::set_mode("classification")
# Logistic regression on the glmnet engine; `penalty` and `mixture` are tuned.
mod_elasticnet <-
  parsnip::logistic_reg(penalty = tune(), mixture = tune()) %>%
  parsnip::set_engine("glmnet") %>%
  parsnip::set_mode("classification")
# BUILD RECIPE ####
# Shared preprocessing with `hatched` as the outcome:
#   1. dummy-encode nominal columns, excluding `hatched` itself
#   2. drop zero-variance predictors
#   3. normalize all numeric columns
recipe_set <-
  recipes::recipe(hatched ~ ., data = tree_frogs_class_train) %>%
  recipes::step_dummy(recipes::all_nominal(), -hatched) %>%
  recipes::step_zv(recipes::all_predictors()) %>%
  recipes::step_normalize(recipes::all_numeric())
# CREATE WORKFLOW ####
# Pair the single recipe with each of the two model specifications.
workflow_set <- workflowsets::workflow_set(
  preproc = list(recipe_set),
  models = list(
    svm_linear = mod_svmlinear,
    elasticnet = mod_elasticnet
  )
)
# TUNING ####
# Run `tune_grid` on every workflow over the same folds, with `grid = 25` and
# ROC AUC as the only metric. Predictions and workflows are saved because the
# stacking step consumes `res_tune` (stacks documents both as required for
# candidate members).
res_tune <- workflow_set %>%
  workflowsets::workflow_map(
    seed = seed,
    resamples = cv_folds,
    fn = "tune_grid",
    grid = 25,
    metrics = yardstick::metric_set(yardstick::roc_auc),
    # No trailing comma after the last argument: a dangling comma passes an
    # empty positional argument, which only works by accident.
    control = tune::control_grid(
      save_pred = TRUE,
      save_workflow = TRUE,
      verbose = FALSE,
      allow_par = FALSE
    )
  )
# DEFINE ENSEMBLE STACK ####
# Collect the tuning results as candidates, blend them, then fit the members.
stack_finalized <- stacks::stacks() %>%
  stacks::add_candidates(res_tune) %>%
  stacks::blend_predictions() %>%
  stacks::fit_members()
# CHECK THE ELEMENTS OF THE MODEL_STACK ####
stack_finalized %>% names()
#> [1] "model_defs" "coefs" "penalty" "metrics"
#> [5] "equations" "cols_map" "model_metrics" "train"
#> [9] "mode" "outcome" "data_stack" "splits"
#> [13] "member_fits"
# CHECK THE SIZE OF THE WHOLE MODEL_STACK ####
stack_finalized %>%
  object.size() %>%
  format(units = "MB")
#> [1] "2 Mb"
# CHECK THE SIZE OF THE MODEL_STACK WITHOUT data_stack AND splits ####
# Select the elements to keep by name rather than by position (11 and 12 in
# the listing above), so the trim stays correct if the element order changes.
elements_kept <- setdiff(names(stack_finalized), c("data_stack", "splits"))
stack_finalized[elements_kept] %>%
  object.size() %>%
  format(units = "MB")
#> [1] "0.9 Mb"
# TRIM THE MODEL_STACK
stack_finalized_trimmed <- stack_finalized[elements_kept]
# COMPARE THE CLASSES ####
stack_finalized %>% class()
#> [1] "linear_stack" "model_stack" "list"
# Subsetting with `[` does not carry the stack classes over to the result.
stack_finalized_trimmed %>% class() # just a list
#> [1] "list"
# MAKE PREDICTIONS WITH WHOLE MODEL_STACK ####
# The method is called directly instead of through `predict()` so the exact
# same call can be reused on the trimmed object, which is a plain list.
head(
  stacks::predict.model_stack(
    stack_finalized,
    new_data = tree_frogs_class_train,
    type = "prob"
  )
)
#> # A tibble: 6 × 2
#> .pred_yes .pred_no
#> <dbl> <dbl>
#> 1 0.934 0.0660
#> 2 0.397 0.603
#> 3 0.602 0.398
#> 4 0.165 0.835
#> 5 0.0815 0.918
#> 6 0.112 0.888
# MAKE PREDICTIONS WITH TRIMMED MODEL_STACK ####
# Same call on the object without `data_stack` and `splits`; the class
# probabilities printed below match the ones from the whole stack.
head(
  stacks::predict.model_stack(
    stack_finalized_trimmed,
    new_data = tree_frogs_class_train,
    type = "prob"
  )
)
#> # A tibble: 6 × 2
#> .pred_yes .pred_no
#> <dbl> <dbl>
#> 1 0.934 0.0660
#> 2 0.397 0.603
#> 3 0.602 0.398
#> 4 0.165 0.835
#> 5 0.0815 0.918
#> 6 0.112 0.888
# still works!
Created on 2024-05-01 with reprex v2.1.0
Session info
sessioninfo::session_info()
#> - Session info ---------------------------------------------------------------
#> setting value
#> version R version 4.3.3 (2024-02-29 ucrt)
#> os Windows 11 x64 (build 22631)
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_Canada.utf8
#> ctype English_Canada.utf8
#> tz America/Edmonton
#> date 2024-05-01
#> pandoc 3.1.6.1 @ C:/PROGRA~1/Pandoc/ (via rmarkdown)
#>
#> - Packages -------------------------------------------------------------------
#> package * version date (UTC) lib source
#> backports 1.4.1 2021-12-13 [1] CRAN (R 4.3.0)
#> broom * 1.0.5 2023-06-09 [1] CRAN (R 4.3.1)
#> butcher 0.3.4 2024-04-11 [1] CRAN (R 4.3.3)
#> class 7.3-22 2023-05-03 [2] CRAN (R 4.3.3)
#> cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.3)
#> codetools 0.2-20 2024-03-31 [1] CRAN (R 4.3.3)
#> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.1)
#> data.table 1.15.4 2024-03-30 [1] CRAN (R 4.3.3)
#> dials * 1.2.1 2024-02-22 [1] CRAN (R 4.3.3)
#> DiceDesign 1.10 2023-12-07 [1] CRAN (R 4.3.3)
#> digest 0.6.35 2024-03-11 [1] CRAN (R 4.3.3)
#> dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.3)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.3.1)
#> evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.2)
#> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.3)
#> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.1)
#> forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.1)
#> foreach 1.5.2 2022-02-02 [1] CRAN (R 4.3.1)
#> fs 1.6.3 2023-07-20 [1] CRAN (R 4.3.1)
#> furrr 0.3.1 2022-08-15 [1] CRAN (R 4.3.1)
#> future 1.33.2 2024-03-26 [1] CRAN (R 4.3.3)
#> future.apply 1.11.2 2024-03-28 [1] CRAN (R 4.3.3)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.1)
#> ggplot2 * 3.5.0 2024-02-23 [1] CRAN (R 4.3.3)
#> glmnet * 4.1-8 2023-08-22 [1] CRAN (R 4.3.2)
#> globals 0.16.3 2024-03-08 [1] CRAN (R 4.3.3)
#> glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.3)
#> gower 1.0.1 2022-12-22 [1] CRAN (R 4.3.0)
#> GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.3.1)
#> gtable 0.3.4 2023-08-21 [1] CRAN (R 4.3.1)
#> hardhat 1.3.1 2024-02-02 [1] CRAN (R 4.3.3)
#> hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.1)
#> htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.3)
#> infer * 1.0.7 2024-03-25 [1] CRAN (R 4.3.3)
#> ipred 0.9-14 2023-03-09 [1] CRAN (R 4.3.1)
#> iterators 1.0.14 2022-02-05 [1] CRAN (R 4.3.1)
#> kernlab * 0.9-32 2023-01-31 [1] CRAN (R 4.3.0)
#> knitr 1.46 2024-04-06 [1] CRAN (R 4.3.3)
#> lattice 0.22-6 2024-03-20 [1] CRAN (R 4.3.3)
#> lava 1.8.0 2024-03-05 [1] CRAN (R 4.3.3)
#> lhs 1.1.6 2022-12-17 [1] CRAN (R 4.3.1)
#> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.2)
#> listenv 0.9.1 2024-01-29 [1] CRAN (R 4.3.3)
#> lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.3)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.1)
#> MASS 7.3-60.0.1 2024-01-13 [1] CRAN (R 4.3.3)
#> Matrix * 1.6-5 2024-01-11 [1] CRAN (R 4.3.2)
#> modeldata * 1.3.0 2024-01-21 [1] CRAN (R 4.3.3)
#> munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.3)
#> nnet 7.3-19 2023-05-03 [1] CRAN (R 4.3.1)
#> parallelly 1.37.1 2024-02-29 [1] CRAN (R 4.3.3)
#> parsnip * 1.2.1 2024-03-22 [1] CRAN (R 4.3.3)
#> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.1)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.1)
#> prodlim 2023.08.28 2023-08-28 [1] CRAN (R 4.3.3)
#> purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.1)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.3.1)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.3.0)
#> R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.3.3)
#> R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.3.2)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.2)
#> Rcpp 1.0.12 2024-01-09 [1] CRAN (R 4.3.3)
#> readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.3)
#> recipes * 1.0.10.9000 2024-03-03 [1] Github (tidymodels/recipes@7858c1e)
#> reprex 2.1.0 2024-01-11 [1] CRAN (R 4.3.3)
#> rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.3)
#> rmarkdown 2.26 2024-03-05 [1] CRAN (R 4.3.3)
#> rpart 4.1.23 2023-12-05 [1] CRAN (R 4.3.3)
#> rsample * 1.2.1 2024-03-25 [1] CRAN (R 4.3.3)
#> scales * 1.3.0 2023-11-28 [1] CRAN (R 4.3.2)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.1)
#> shape 1.4.6.1 2024-02-23 [1] CRAN (R 4.3.2)
#> stacks * 1.0.4.9000 2024-04-29 [1] Github (tidymodels/stacks@f95369f)
#> stringi 1.8.3 2023-12-11 [1] CRAN (R 4.3.2)
#> stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.2)
#> styler 1.10.3 2024-04-07 [1] CRAN (R 4.3.3)
#> survival 3.5-8 2024-02-14 [1] CRAN (R 4.3.3)
#> tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.1)
#> tidymodels * 1.2.0 2024-03-25 [1] CRAN (R 4.3.3)
#> tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.3)
#> tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.3)
#> tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.1)
#> timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.3)
#> timeDate 4032.109 2023-12-14 [1] CRAN (R 4.3.2)
#> tune * 1.2.0 2024-03-20 [1] CRAN (R 4.3.3)
#> tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.1)
#> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
#> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.3)
#> withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.3)
#> workflows * 1.1.4 2024-02-19 [1] CRAN (R 4.3.3)
#> workflowsets * 1.1.0 2024-03-21 [1] CRAN (R 4.3.3)
#> xfun 0.43 2024-03-25 [1] CRAN (R 4.3.3)
#> yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.2)
#> yardstick * 1.3.1 2024-03-21 [1] CRAN (R 4.3.3)
#>
#> [1] C:/Users/pgaut/AppData/Local/R/win-library/4.3
#> [2] C:/Program Files/R/R-4.3.3/library
#>
#> ------------------------------------------------------------------------------