Skip to content

Commit

Permalink
Merge pull request #28 from ropensci/pd-R
Browse files Browse the repository at this point in the history
Partial dependence fixes to plug memory leaks
  • Loading branch information
bcjaeger authored Oct 25, 2023
2 parents 0e4d243 + dfeb57d commit 2370ef8
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 44 deletions.
39 changes: 39 additions & 0 deletions R/melt_aorsf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

# need to make this to avoid possible memory leak in data.table melt

melt_aorsf <-
function(data,
id.vars,
measure.vars,
variable.name = "variable",
value.name = "value") {
if (!is.data.frame(data)) {
stop("Input 'data' must be a data frame.")
}

if (!is.character(id.vars)) {
stop("Input 'id.vars' must be a character vector.")
}

if (!is.character(measure.vars)) {
stop("Input 'measure.vars' must be a character vector.")
}

# Select id variables and measure variables
id_data <- data[id.vars]
measure_data <- data[measure.vars]

# Create a sequence variable to represent the variable names
variable_data <- rep(names(measure_data), each = nrow(data))

# Reshape the data
long_data <- data.frame(id_data,
variable = variable_data,
value = unlist(measure_data, use.names = FALSE))

names(long_data)[names(long_data) == 'variable'] <- variable.name
names(long_data)[names(long_data) == 'value'] <- value.name

return(long_data)
}

150 changes: 142 additions & 8 deletions R/orsf_pd.R
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ orsf_pred_dependence <- function(object,

}

x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(x_new))-1
x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(x_new)) - 1
pred_spec_new[[i]] <- as.matrix(pred_spec_new[[i]])

}
Expand All @@ -439,6 +439,99 @@ orsf_pred_dependence <- function(object,

control <- get_control(object)

pred_horizon_order <- order(pred_horizon)
pred_horizon_ordered <- pred_horizon[pred_horizon_order]

# results <- list()
#
# for(i in seq_along(pred_spec_new)){
#
# results_i <- list()
#
# x_pd <- x_new
#
# for(j in seq(nrow(pred_spec_new[[i]]))){
#
# x_pd[, x_cols[[i]]] <- pred_spec_new[[i]][j, ]
#
# results_i[[j]] <- orsf_cpp(
# x = x_pd,
# y = matrix(1, ncol=2),
# w = rep(1, nrow(x_new)),
# tree_type_R = get_tree_type(object),
# tree_seeds = get_tree_seeds(object),
# loaded_forest = object$forest,
# n_tree = get_n_tree(object),
# mtry = get_mtry(object),
# sample_with_replacement = get_sample_with_replacement(object),
# sample_fraction = get_sample_fraction(object),
# vi_type_R = 0,
# vi_max_pvalue = get_vi_max_pvalue(object),
# oobag_R_function = get_f_oobag_eval(object),
# leaf_min_events = get_leaf_min_events(object),
# leaf_min_obs = get_leaf_min_obs(object),
# split_rule_R = switch(get_split_rule(object),
# "logrank" = 1,
# "cstat" = 2),
# split_min_events = get_split_min_events(object),
# split_min_obs = get_split_min_obs(object),
# split_min_stat = get_split_min_stat(object),
# split_max_cuts = get_n_split(object),
# split_max_retry = get_n_retry(object),
# lincomb_R_function = control$lincomb_R_function,
# lincomb_type_R = switch(control$lincomb_type,
# 'glm' = 1,
# 'random' = 2,
# 'net' = 3,
# 'custom' = 4),
# lincomb_eps = control$lincomb_eps,
# lincomb_iter_max = control$lincomb_iter_max,
# lincomb_scale = control$lincomb_scale,
# lincomb_alpha = control$lincomb_alpha,
# lincomb_df_target = control$lincomb_df_target,
# lincomb_ties_method = switch(tolower(control$lincomb_ties_method),
# 'breslow' = 0,
# 'efron' = 1),
# pred_type_R = pred_type_R,
# pred_mode = TRUE,
# pred_aggregate = TRUE,
# pred_horizon = pred_horizon_ordered,
# oobag = oobag,
# oobag_eval_type_R = 0,
# oobag_eval_every = get_n_tree(object),
# pd_type_R = 0,
# pd_x_vals = list(matrix(0, ncol=1, nrow=1)),
# pd_x_cols = list(matrix(1L, ncol=1, nrow=1)),
# pd_probs = c(0),
# n_thread = n_thread,
# write_forest = FALSE,
# run_forest = TRUE,
# verbosity = 0)$pred_new
#
# }
#
# if(type_output == 'smry'){
# results_i <- lapply(
# results_i,
# function(x) {
# apply(x, 2, function(x_col){
# as.numeric(
# c(mean(x_col, na.rm = TRUE),
# quantile(x_col, probs = prob_values, na.rm = TRUE))
# )
# })
# }
# )
# }
#
#
# results[[i]] <- results_i
#
# }
#
# pd_vals <- results
# browser()

orsf_out <- orsf_cpp(x = x_new,
y = matrix(1, ncol=2),
w = rep(1, nrow(x_new)),
Expand Down Expand Up @@ -513,14 +606,25 @@ orsf_pred_dependence <- function(object,
else
colnames(pd_vals[[i]][[j]]) <- c(paste(1:nrow(x_new)))

pd_vals[[i]][[j]] <- as.data.table(pd_vals[[i]][[j]],
keep.rownames = 'pred_horizon')
ph <- rownames(pd_vals[[i]][[j]])

pd_vals[[i]][[j]] <- as.data.frame(pd_vals[[i]][[j]])

rownames(pd_vals[[i]][[j]]) <- NULL

pd_vals[[i]][[j]][['pred_horizon']] <- ph

if(type_output == 'ice'){

pd_vals[[i]][[j]] <- melt_aorsf(
data = pd_vals[[i]][[j]],
id.vars = 'pred_horizon',
variable.name = 'id_row',
value.name = 'pred',
measure.vars = setdiff(names(pd_vals[[i]][[j]]), 'pred_horizon'))

}

if(type_output == 'ice')
pd_vals[[i]][[j]] <- melt(data = pd_vals[[i]][[j]],
id.vars = 'pred_horizon',
variable.name = 'id_row',
value.name = 'pred')

}

Expand Down Expand Up @@ -592,3 +696,33 @@ orsf_pred_dependence <- function(object,
}


pd_list_split <- function(x_vals, x_cols){

x_vals_out <- x_cols_out <- vector(mode = 'list')
counter <- 1

for(i in seq_along(x_vals)){

x_vals_split <- split(x_vals[[i]], row(x_vals[[i]]))

for(j in seq_along(x_vals_split)){

x_vals_out[[counter]] <- matrix(x_vals_split[[j]],
ncol = ncol(x_vals[[i]]),
nrow = 1)
colnames(x_vals_out[[counter]]) <- colnames(x_vals[[i]])

x_cols_out[[counter]] <- x_cols[[i]]

counter <- counter + 1

}

}

list(
x_vals = x_vals_out,
x_cols = x_cols_out
)

}
2 changes: 2 additions & 0 deletions cran-comments.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
## Version 0.1.1

Update, October 25: Thank you for helping me with this. I have tidied up threads and avoided usage of the `data.table` functions that were creating possible memory leaks. I have checked this submission locally with valgrind and on rhub, with both indicating 0 memory leaks. However, if this submission does not pass on your end, I would like to request an extension on the October 28th deadline.

Update, October 21: I have updated the submission to fix memory leaks. Many of the leaks were caused by my omission of a virtual de-constructor for derived classes or by omission of a delete statement for dynamically allocated memory. I apologize for these oversights. After reviewing, you may still see a possible memory leak from `orsf_ice` functions. From what I can tell, this possible leak could either be measurement error or could be attributed to `data.table`. I do not think it's from `aorsf`.

Initial submission: This version is being submitted to CRAN early due to a memory error that was identified in version 1.0.0. I apologize for the oversight. As `aorsf` would be removed from CRAN if the issue is not fixed before October 28, I would like to request an expedited submission. I have run the current submission's tests and examples with valgrind to ensure the memory error has been fixed.
Expand Down
66 changes: 30 additions & 36 deletions tests/testthat/test-orsf_pd.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,6 @@ test_that(
)

funs <- list(
# ice_new = orsf_ice_new,
# ice_inb = orsf_ice_inb,
# ice_oob = orsf_ice_oob,
pd_new = orsf_pd_new,
pd_inb = orsf_pd_inb,
pd_oob = orsf_pd_oob
Expand Down Expand Up @@ -87,8 +84,7 @@ for(i in seq_along(funs)){

formals <- setdiff(names(formals(funs[[i]])), '...')

for(pred_type in c('mort')){
# for(pred_type in setdiff(pred_types_surv, c('leaf', 'mort'))){
for(pred_type in setdiff(pred_types_surv, c('leaf'))){

args_grid$pred_type = pred_type
args_loop$pred_type = pred_type
Expand Down Expand Up @@ -147,43 +143,40 @@ for(i in seq_along(funs)){
}


# pd_vals_ice <- orsf_ice_new(
# fit,
# new_data = pbc_orsf,
# pred_spec = list(bili = 1:4),
# pred_horizon = 1000
# )
#
pd_vals_smry <- orsf_pd_new(
pd_vals_ice <- orsf_ice_new(
fit,
new_data = pbc_orsf,
new_data = pbc_test,
pred_spec = list(bili = 1:4),
pred_horizon = 1000
)
#
# test_that(
# 'ice values summarized are the same as pd values',
# code = {
#
# pd_vals_check <- pd_vals_ice[, .(medn = median(pred)), by = id_variable]
#
# expect_equal(
# pd_vals_check$medn,
# pd_vals_smry$medn
# )
#
# }
# )


pd_vals_smry <- orsf_pd_new(
fit,
new_data = pbc_test,
pred_spec = list(bili = 1:4),
pred_horizon = 1000
)

test_that(
'No missing values in output',
'ice values summarized are the same as pd values',
code = {

# expect_false(any(is.na(pd_vals_ice)))
# expect_false(any(is.nan(as.matrix(pd_vals_ice))))
# expect_false(any(is.infinite(as.matrix(pd_vals_ice))))
grps <- split(pd_vals_ice, pd_vals_ice$id_variable)
pd_vals_check <- sapply(grps, function(x) median(x$pred))

expect_equal(
as.numeric(pd_vals_check),
pd_vals_smry$medn
)

}
)


test_that(
'No missing values in summary output',
code = {
expect_false(any(is.na(pd_vals_smry)))
expect_false(any(is.nan(as.matrix(pd_vals_smry))))
expect_false(any(is.infinite(as.matrix(pd_vals_smry))))
Expand All @@ -200,10 +193,9 @@ test_that(
pred_horizon = c(1000, 2000, 3000)
)

# risk must increase or remain steady over time
# risk monotonically increases
expect_lte(pd_smry_multi_horiz$mean[1], pd_smry_multi_horiz$mean[2])
expect_lte(pd_smry_multi_horiz$mean[2], pd_smry_multi_horiz$mean[3])

expect_lte(pd_smry_multi_horiz$medn[1], pd_smry_multi_horiz$medn[2])
expect_lte(pd_smry_multi_horiz$medn[2], pd_smry_multi_horiz$medn[3])

Expand All @@ -213,9 +205,11 @@ test_that(
pred_horizon = c(1000, 2000, 3000)
)

ice_check <- pd_ice_multi_horiz[, .(m = mean(pred, na.rm=TRUE)), by = pred_horizon]
grps <- split(pd_ice_multi_horiz, pd_ice_multi_horiz$pred_horizon)

ice_check <- sapply(grps, function(x) mean(x$pred, na.rm=TRUE))

expect_equal(ice_check$m, pd_smry_multi_horiz$mean)
expect_equal(as.numeric(ice_check), pd_smry_multi_horiz$mean)

}

Expand Down

0 comments on commit 2370ef8

Please sign in to comment.