From 1a71329f0cbf7524f8edb7c07ca9706b1952ced2 Mon Sep 17 00:00:00 2001 From: bcjaeger Date: Wed, 17 Apr 2024 09:18:52 -0400 Subject: [PATCH 1/3] variables_included for selecting it is annoying to use predictors selected because they are referent coded and the dataset is usually not. With the additional column it should be easy to subset dataframes to include just the outcome and the variables that were selected by orsf_vs --- R/orsf_R6.R | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/R/orsf_R6.R b/R/orsf_R6.R index 92e24282..ec5462fe 100644 --- a/R/orsf_R6.R +++ b/R/orsf_R6.R @@ -2883,6 +2883,7 @@ ObliqueForest <- R6::R6Class( oob_data <- data.table( n_predictors = seq(n_predictors), stat_value = rep(NA_real_, n_predictors), + variables_included = vector(mode = 'list', length = n_predictors), predictors_included = vector(mode = 'list', length = n_predictors), predictor_dropped = rep(NA_character_, n_predictors) ) @@ -2915,6 +2916,26 @@ ObliqueForest <- R6::R6Class( importance_group_factors = TRUE, write_forest = FALSE) + + fctr_key <- lapply(self$get_fctr_info()$keys, function(x) x[-1]) + + fctr_key <- data.frame( + variable = rep(names(fctr_key), sapply(fctr_key, length)), + predictor = unlist(fctr_key), + row.names = NULL + ) + + variable_key <- data.frame( + predictor = self$get_names_x(ref_coded = TRUE) + ) + + variable_key <- merge(variable_key, fctr_key, + by = 'predictor', + all.x = TRUE) + + variable_key$variable[is.na(variable_key$variable)] <- + variable_key$predictor[is.na(variable_key$variable)] + max_progress <- n_predictors - n_predictor_min current_progress <- 0 start_time <- last_time <- Sys.time() @@ -2963,9 +2984,17 @@ ObliqueForest <- R6::R6Class( worst_index <- which.min(cpp_output$importance) worst_predictor <- colnames(cpp_args$x)[worst_index] + + .variables_included <- with( + variable_key, + unique(variable[predictor %in% colnames(cpp_args$x)]) + ) + + oob_data[n_predictors, `:=`(n_predictors = n_predictors, stat_value = cpp_output$eval_oobag$stat_values[1,1], + variables_included = .variables_included, predictors_included = colnames(cpp_args$x), predictor_dropped = worst_predictor)] From ff635944438198a59911609757885993881e24c1 Mon Sep 17 00:00:00 2001 From: bcjaeger Date: Wed, 17 Apr 2024 09:23:24 -0400 Subject: [PATCH 2/3] update docs --- R/orsf_vs.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/orsf_vs.R b/R/orsf_vs.R index 96d1a7c6..ac5236f5 100644 --- a/R/orsf_vs.R +++ b/R/orsf_vs.R @@ -8,11 +8,19 @@ #' @return a [data.table][data.table::data.table-package] with four columns: #' - *n_predictors*: the number of predictors used #' - *stat_value*: the out-of-bag statistic +#' - *variables_included*: the names of the variables included #' - *predictors_included*: the names of the predictors included #' - *predictor_dropped*: the predictor selected to be dropped #' #' @details #' +#' The difference between `variables_included` and `predictors_included` is +#' referent coding. The `variable` would be the name of a factor variable +#' in the training data, while the `predictor` would be the name of that +#' same factor with the levels of the factor appended. For example, if +#' the variable is `diabetes` with `levels = c("no", "yes")`, then the +#' variable name is `diabetes` and the predictor name is `diabetes_yes`. +#' #' `tree_seeds` should be specified in `object` so that each successive run #' of `orsf` will be evaluated in the same out-of-bag samples as the initial #' run. From f9f45fc5f9a9de7904ec237cc3db6791c11ccdae Mon Sep 17 00:00:00 2001 From: bcjaeger Date: Thu, 18 Apr 2024 09:37:16 -0400 Subject: [PATCH 3/3] allow case with no factors --- R/orsf_R6.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/R/orsf_R6.R b/R/orsf_R6.R index ec5462fe..7349f6a3 100644 --- a/R/orsf_R6.R +++ b/R/orsf_R6.R @@ -2929,9 +2929,17 @@ ObliqueForest <- R6::R6Class( predictor = self$get_names_x(ref_coded = TRUE) ) - variable_key <- merge(variable_key, fctr_key, - by = 'predictor', - all.x = TRUE) + if(is_empty(fctr_key)){ + + variable_key$variable <- variable_key$predictor + + } else { + + variable_key <- merge(variable_key, fctr_key, + by = 'predictor', + all.x = TRUE) + + } variable_key$variable[is.na(variable_key$variable)] <- variable_key$predictor[is.na(variable_key$variable)]