Merge pull request #28 from ropensci/pd-R

Partial dependence fixes to plug memory leaks
ropensci · Oct 25, 2023 · 2370ef8 · 2370ef8
2 parents 0e4d243 + dfeb57d
commit 2370ef8
Show file tree

Hide file tree

Showing 4 changed files with 213 additions and 44 deletions.
diff --git a/R/melt_aorsf.R b/R/melt_aorsf.R
@@ -0,0 +1,39 @@
+
+# need to make this to avoid possible memory leak in data.table melt
+
+melt_aorsf <-
+ function(data,
+          id.vars,
+          measure.vars,
+          variable.name = "variable",
+          value.name = "value") {
+  if (!is.data.frame(data)) {
+   stop("Input 'data' must be a data frame.")
+  }
+
+  if (!is.character(id.vars)) {
+   stop("Input 'id.vars' must be a character vector.")
+  }
+
+  if (!is.character(measure.vars)) {
+   stop("Input 'measure.vars' must be a character vector.")
+  }
+
+  # Select id variables and measure variables
+  id_data <- data[id.vars]
+  measure_data <- data[measure.vars]
+
+  # Create a sequence variable to represent the variable names
+  variable_data <- rep(names(measure_data), each = nrow(data))
+
+  # Reshape the data
+  long_data <- data.frame(id_data,
+                          variable = variable_data,
+                          value = unlist(measure_data, use.names = FALSE))
+
+  names(long_data)[names(long_data) == 'variable'] <- variable.name
+  names(long_data)[names(long_data) == 'value'] <- value.name
+
+  return(long_data)
+ }
+
diff --git a/R/orsf_pd.R b/R/orsf_pd.R
@@ -430,7 +430,7 @@ orsf_pred_dependence <- function(object,
 
    }
 
-   x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(x_new))-1
+   x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(x_new)) - 1
    pred_spec_new[[i]] <- as.matrix(pred_spec_new[[i]])
 
   }
@@ -439,6 +439,99 @@ orsf_pred_dependence <- function(object,
 
  control <- get_control(object)
 
+ pred_horizon_order <- order(pred_horizon)
+ pred_horizon_ordered <- pred_horizon[pred_horizon_order]
+
+ # results <- list()
+ #
+ # for(i in seq_along(pred_spec_new)){
+ #
+ #  results_i <- list()
+ #
+ #  x_pd <- x_new
+ #
+ #  for(j in seq(nrow(pred_spec_new[[i]]))){
+ #
+ #   x_pd[, x_cols[[i]]] <- pred_spec_new[[i]][j, ]
+ #
+ #   results_i[[j]] <- orsf_cpp(
+ #    x = x_pd,
+ #    y = matrix(1, ncol=2),
+ #    w = rep(1, nrow(x_new)),
+ #    tree_type_R = get_tree_type(object),
+ #    tree_seeds = get_tree_seeds(object),
+ #    loaded_forest = object$forest,
+ #    n_tree = get_n_tree(object),
+ #    mtry = get_mtry(object),
+ #    sample_with_replacement = get_sample_with_replacement(object),
+ #    sample_fraction = get_sample_fraction(object),
+ #    vi_type_R = 0,
+ #    vi_max_pvalue = get_vi_max_pvalue(object),
+ #    oobag_R_function = get_f_oobag_eval(object),
+ #    leaf_min_events = get_leaf_min_events(object),
+ #    leaf_min_obs = get_leaf_min_obs(object),
+ #    split_rule_R = switch(get_split_rule(object),
+ #                          "logrank" = 1,
+ #                          "cstat" = 2),
+ #    split_min_events = get_split_min_events(object),
+ #    split_min_obs = get_split_min_obs(object),
+ #    split_min_stat = get_split_min_stat(object),
+ #    split_max_cuts = get_n_split(object),
+ #    split_max_retry = get_n_retry(object),
+ #    lincomb_R_function = control$lincomb_R_function,
+ #    lincomb_type_R = switch(control$lincomb_type,
+ #                            'glm' = 1,
+ #                            'random' = 2,
+ #                            'net' = 3,
+ #                            'custom' = 4),
+ #    lincomb_eps = control$lincomb_eps,
+ #    lincomb_iter_max = control$lincomb_iter_max,
+ #    lincomb_scale = control$lincomb_scale,
+ #    lincomb_alpha = control$lincomb_alpha,
+ #    lincomb_df_target = control$lincomb_df_target,
+ #    lincomb_ties_method = switch(tolower(control$lincomb_ties_method),
+ #                                 'breslow' = 0,
+ #                                 'efron'   = 1),
+ #    pred_type_R = pred_type_R,
+ #    pred_mode = TRUE,
+ #    pred_aggregate = TRUE,
+ #    pred_horizon = pred_horizon_ordered,
+ #    oobag = oobag,
+ #    oobag_eval_type_R = 0,
+ #    oobag_eval_every = get_n_tree(object),
+ #    pd_type_R = 0,
+ #    pd_x_vals = list(matrix(0, ncol=1, nrow=1)),
+ #    pd_x_cols = list(matrix(1L, ncol=1, nrow=1)),
+ #    pd_probs = c(0),
+ #    n_thread = n_thread,
+ #    write_forest = FALSE,
+ #    run_forest = TRUE,
+ #    verbosity = 0)$pred_new
+ #
+ #  }
+ #
+ #  if(type_output == 'smry'){
+ #   results_i <- lapply(
+ #    results_i,
+ #    function(x) {
+ #     apply(x, 2, function(x_col){
+ #      as.numeric(
+ #       c(mean(x_col, na.rm = TRUE),
+ #         quantile(x_col, probs = prob_values, na.rm = TRUE))
+ #      )
+ #     })
+ #    }
+ #   )
+ #  }
+ #
+ #
+ #  results[[i]] <- results_i
+ #
+ # }
+ #
+ # pd_vals <- results
+ # browser()
+
  orsf_out <- orsf_cpp(x = x_new,
                       y = matrix(1, ncol=2),
                       w = rep(1, nrow(x_new)),
@@ -513,14 +606,25 @@ orsf_pred_dependence <- function(object,
    else
     colnames(pd_vals[[i]][[j]]) <- c(paste(1:nrow(x_new)))
 
-   pd_vals[[i]][[j]] <- as.data.table(pd_vals[[i]][[j]],
-                                      keep.rownames = 'pred_horizon')
+   ph <- rownames(pd_vals[[i]][[j]])
+
+   pd_vals[[i]][[j]] <- as.data.frame(pd_vals[[i]][[j]])
+
+   rownames(pd_vals[[i]][[j]]) <- NULL
+
+   pd_vals[[i]][[j]][['pred_horizon']] <- ph
+
+   if(type_output == 'ice'){
+
+    pd_vals[[i]][[j]] <- melt_aorsf(
+     data = pd_vals[[i]][[j]],
+     id.vars = 'pred_horizon',
+     variable.name = 'id_row',
+     value.name = 'pred',
+     measure.vars = setdiff(names(pd_vals[[i]][[j]]), 'pred_horizon'))
+
+   }
 
-   if(type_output == 'ice')
-    pd_vals[[i]][[j]] <- melt(data = pd_vals[[i]][[j]],
-                              id.vars = 'pred_horizon',
-                              variable.name = 'id_row',
-                              value.name = 'pred')
 
   }
 
@@ -592,3 +696,33 @@ orsf_pred_dependence <- function(object,
 }
 
 
+pd_list_split <- function(x_vals, x_cols){
+
+ x_vals_out <- x_cols_out <- vector(mode = 'list')
+ counter <- 1
+
+ for(i in seq_along(x_vals)){
+
+  x_vals_split <- split(x_vals[[i]], row(x_vals[[i]]))
+
+  for(j in seq_along(x_vals_split)){
+
+   x_vals_out[[counter]] <- matrix(x_vals_split[[j]],
+                                   ncol = ncol(x_vals[[i]]),
+                                   nrow = 1)
+   colnames(x_vals_out[[counter]]) <- colnames(x_vals[[i]])
+
+   x_cols_out[[counter]] <- x_cols[[i]]
+
+   counter <- counter + 1
+
+  }
+
+ }
+
+ list(
+  x_vals = x_vals_out,
+  x_cols = x_cols_out
+ )
+
+}
diff --git a/cran-comments.md b/cran-comments.md
@@ -1,5 +1,7 @@
 ## Version 0.1.1
 
+Update, October 25: Thank you for helping me with this. I have tidied up threads and avoided usage of the `data.table` functions that were creating possible memory leaks.  I have checked this submission locally with valgrind and on rhub, with both indicating 0 memory leaks. However, if this submission does not pass on your end, I would like to request an extension on the October 28th deadline.
+
 Update, October 21: I have updated the submission to fix memory leaks. Many of the leaks were caused by my omission of a virtual de-constructor for derived classes or by omission of a delete statement for dynamically allocated memory. I apologize for these oversights. After reviewing, you may still see a possible memory leak from `orsf_ice` functions. From what I can tell, this possible leak could either be measurement error or could be attributed to `data.table`. I do not think it's from `aorsf`.
 
 Initial submission: This version is being submitted to CRAN early due to a memory error that was identified in version 1.0.0. I apologize for the oversight. As `aorsf` would be removed from CRAN if the issue is not fixed before October 28, I would like to request an expedited submission. I have run the current submission's tests and examples with valgrind to ensure the memory error has been fixed. 

diff --git a/tests/testthat/test-orsf_pd.R b/tests/testthat/test-orsf_pd.R
@@ -57,9 +57,6 @@ test_that(
 )
 
 funs <- list(
- # ice_new = orsf_ice_new,
- # ice_inb = orsf_ice_inb,
- # ice_oob = orsf_ice_oob,
  pd_new = orsf_pd_new,
  pd_inb = orsf_pd_inb,
  pd_oob = orsf_pd_oob
@@ -87,8 +84,7 @@ for(i in seq_along(funs)){
 
  formals <- setdiff(names(formals(funs[[i]])), '...')
 
- for(pred_type in c('mort')){
- # for(pred_type in setdiff(pred_types_surv, c('leaf', 'mort'))){
+ for(pred_type in setdiff(pred_types_surv, c('leaf'))){
 
   args_grid$pred_type = pred_type
   args_loop$pred_type = pred_type
@@ -147,43 +143,40 @@ for(i in seq_along(funs)){
 }
 
 
-# pd_vals_ice <- orsf_ice_new(
-#  fit,
-#  new_data = pbc_orsf,
-#  pred_spec = list(bili = 1:4),
-#  pred_horizon = 1000
-# )
-#
-pd_vals_smry <- orsf_pd_new(
+pd_vals_ice <- orsf_ice_new(
  fit,
- new_data = pbc_orsf,
+ new_data = pbc_test,
  pred_spec = list(bili = 1:4),
  pred_horizon = 1000
 )
-#
-# test_that(
-#  'ice values summarized are the same as pd values',
-#  code = {
-#
-#   pd_vals_check <- pd_vals_ice[, .(medn = median(pred)), by = id_variable]
-#
-#   expect_equal(
-#    pd_vals_check$medn,
-#    pd_vals_smry$medn
-#   )
-#
-#  }
-# )
 
 
+pd_vals_smry <- orsf_pd_new(
+ fit,
+ new_data = pbc_test,
+ pred_spec = list(bili = 1:4),
+ pred_horizon = 1000
+)
+
 test_that(
- 'No missing values in output',
+ 'ice values summarized are the same as pd values',
  code = {
 
-  # expect_false(any(is.na(pd_vals_ice)))
-  # expect_false(any(is.nan(as.matrix(pd_vals_ice))))
-  # expect_false(any(is.infinite(as.matrix(pd_vals_ice))))
+  grps <- split(pd_vals_ice, pd_vals_ice$id_variable)
+  pd_vals_check <- sapply(grps, function(x) median(x$pred))
+
+  expect_equal(
+   as.numeric(pd_vals_check),
+   pd_vals_smry$medn
+  )
+
+ }
+)
 
+
+test_that(
+ 'No missing values in summary output',
+ code = {
   expect_false(any(is.na(pd_vals_smry)))
   expect_false(any(is.nan(as.matrix(pd_vals_smry))))
   expect_false(any(is.infinite(as.matrix(pd_vals_smry))))
@@ -200,10 +193,9 @@ test_that(
    pred_horizon = c(1000, 2000, 3000)
   )
 
-  # risk must increase or remain steady over time
+  # risk monotonically increases
   expect_lte(pd_smry_multi_horiz$mean[1], pd_smry_multi_horiz$mean[2])
   expect_lte(pd_smry_multi_horiz$mean[2], pd_smry_multi_horiz$mean[3])
-
   expect_lte(pd_smry_multi_horiz$medn[1], pd_smry_multi_horiz$medn[2])
   expect_lte(pd_smry_multi_horiz$medn[2], pd_smry_multi_horiz$medn[3])
 
@@ -213,9 +205,11 @@ test_that(
    pred_horizon = c(1000, 2000, 3000)
   )
 
-  ice_check <- pd_ice_multi_horiz[, .(m = mean(pred, na.rm=TRUE)), by = pred_horizon]
+  grps <- split(pd_ice_multi_horiz, pd_ice_multi_horiz$pred_horizon)
+
+  ice_check <- sapply(grps, function(x) mean(x$pred, na.rm=TRUE))
 
-  expect_equal(ice_check$m, pd_smry_multi_horiz$mean)
+  expect_equal(as.numeric(ice_check), pd_smry_multi_horiz$mean)
 
  }