diff --git a/DESCRIPTION b/DESCRIPTION index 6c3e46ae..a5a662a6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,7 +25,7 @@ Authors@R: c( family = "Burk", role = "rev") ) -Description: Fit, interpret, and make predictions with oblique random survival forests. Oblique decision trees are notoriously slow compared to their axis based counterparts, but 'aorsf' runs as fast or faster than axis-based decision tree algorithms for right-censored time-to-event outcomes. Methods to accelerate and interpret the oblique random survival forest are described in Jaeger et al., (2022) . +Description: Fit, interpret, and make predictions with oblique random survival forests. Oblique decision trees are notoriously slow compared to their axis based counterparts, but 'aorsf' runs as fast or faster than axis-based decision tree algorithms for right-censored time-to-event outcomes. Methods to accelerate and interpret the oblique random survival forest are described in Jaeger et al., (2023) . License: MIT + file LICENSE Encoding: UTF-8 LazyData: true diff --git a/R/orsf.R b/R/orsf.R index 1235b147..c1d55f35 100644 --- a/R/orsf.R +++ b/R/orsf.R @@ -325,7 +325,7 @@ #' #' `r roxy_cite_jaeger_2019()` #' -#' `r roxy_cite_jaeger_2022()` +#' `r roxy_cite_jaeger_2023()` #' #' @export #' diff --git a/R/orsf_vi.R b/R/orsf_vi.R index 996c1a71..f556a132 100644 --- a/R/orsf_vi.R +++ b/R/orsf_vi.R @@ -76,7 +76,7 @@ #' #' `r roxy_cite_menze_2011()` #' -#' `r roxy_cite_jaeger_2022()` +#' `r roxy_cite_jaeger_2023()` #' #' orsf_vi <- function(object, diff --git a/R/roxy.R b/R/roxy.R index 509fcac4..afaa26e2 100644 --- a/R/roxy.R +++ b/R/roxy.R @@ -191,15 +191,16 @@ roxy_cite_jaeger_2019 <- function(){ } -roxy_cite_jaeger_2022 <- function(){ +roxy_cite_jaeger_2023 <- function(){ roxy_cite( authors = "Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM", title = "Accelerated and interpretable oblique random survival forests", - journal = "arXiv e-prints", - date = "2022 Aug", - number = 'arXiv-2208', - url = "https://arxiv.org/abs/2208.01129" + journal = "Journal of Computational and Graphical Statistics", + date = "Published online 08 Aug 2023", + number = NULL, + # doi = "10.1080/10618600.2023.2231048", + url = "https://doi.org/10.1080/10618600.2023.2231048" ) } @@ -270,7 +271,7 @@ roxy_dots <- function(){ roxy_vi_describe <- function(type){ switch(type, - 'negate' = "Each variable is assessed separately by multiplying the variable's coefficients by -1 and then determining how much the model's performance changes. The worse the model's performance after negating coefficients for a given variable, the more important the variable. This technique is promising b/c it does not require permutation and it emphasizes variables with larger coefficients in linear combinations, but it is also relatively new and hasn't been studied as much as permutation importance. See [Jaeger, 2022](https://arxiv.org/abs/2208.01129) for more details on this technique.", + 'negate' = "Each variable is assessed separately by multiplying the variable's coefficients by -1 and then determining how much the model's performance changes. The worse the model's performance after negating coefficients for a given variable, the more important the variable. This technique is promising b/c it does not require permutation and it emphasizes variables with larger coefficients in linear combinations, but it is also relatively new and hasn't been studied as much as permutation importance. 
See [Jaeger, 2023](https://doi.org/10.1080/10618600.2023.2231048) for more details on this technique.", 'permute' = "Each variable is assessed separately by randomly permuting the variable's values and then determining how much the model's performance changes. The worse the model's performance after permuting the values of a given variable, the more important the variable. This technique is flexible, intuitive, and frequently used. It also has several [known limitations](https://christophm.github.io/interpretable-ml-book/feature-importance.html#disadvantages-9)", 'anova' = "A p-value is computed for each coefficient in each linear combination of variables in each decision tree. Importance for an individual predictor variable is the proportion of times a p-value for its coefficient is < 0.01. This technique is very efficient computationally, but may not be as effective as permutation or negation in terms of selecting signal over noise variables. See [Menze, 2011](https://link.springer.com/chapter/10.1007/978-3-642-23783-6_29) for more details on this technique.") diff --git a/README.Rmd b/README.Rmd index 3062991a..c043206d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -78,7 +78,7 @@ knitr::include_graphics('man/figures/tree_axis_v_oblique.png') ## Examples -The `orsf()` function can fit several types of ORSF ensembles. My personal favorite is the accelerated ORSF because it has a great combination of prediction accuracy and computational efficiency (see [arXiv paper](https://arxiv.org/abs/2208.01129)).^2^ +The `orsf()` function can fit several types of ORSF ensembles. My personal favorite is the accelerated ORSF because it has a great combination of prediction accuracy and computational efficiency (see [JCGS paper](https://doi.org/10.1080/10618600.2023.2231048)).^2^ ```{r, child='Rmd/orsf-fit-accelerated.Rmd'} @@ -152,7 +152,7 @@ For more on ICE, see the [vignette](https://docs.ropensci.org/aorsf/articles/pd. ## Comparison to existing software -Comparisons between `aorsf` and existing software are presented in our [arXiv paper](https://arxiv.org/abs/2208.01129). The paper +Comparisons between `aorsf` and existing software are presented in our [JCGS paper](https://doi.org/10.1080/10618600.2023.2231048). The paper: - describes `aorsf` in detail with a summary of the procedures used in the tree fitting algorithm @@ -173,7 +173,7 @@ A more hands-on comparison of `aorsf` and other R packages is provided in [orsf cat("1. ", aorsf:::roxy_cite_jaeger_2019(), '\n\n') -cat("2. ", aorsf:::roxy_cite_jaeger_2022(), '\n\n') +cat("2. ", aorsf:::roxy_cite_jaeger_2023(), '\n\n') cat("3. ", aorsf:::roxy_cite_menze_2011()) diff --git a/README.md b/README.md index 4416347e..56d46957 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,8 @@ separating the two classes. The `orsf()` function can fit several types of ORSF ensembles. My personal favorite is the accelerated ORSF because it has a great combination of prediction accuracy and computational efficiency (see -[arXiv paper](https://arxiv.org/abs/2208.01129)).2 +[JCGS +paper](https://doi.org/10.1080/10618600.2023.2231048)).2 ``` r @@ -144,20 +145,20 @@ using `aorsf`: require permutation and it emphasizes variables with larger coefficients in linear combinations, but it is also relatively new and hasn’t been studied as much as permutation importance. See [Jaeger, - 2022](https://arxiv.org/abs/2208.01129) for more details on this - technique. + 2023](https://doi.org/10.1080/10618600.2023.2231048) for more details + on this technique. 
``` r orsf_vi_negate(fit) #> bili sex copper ast age - #> 0.1190578208 0.0619364315 0.0290605798 0.0260108174 0.0251162396 + #> 0.1190290560 0.0619448918 0.0290622719 0.0260108174 0.0251263919 #> stage protime edema ascites hepato - #> 0.0237810058 0.0158443269 0.0117270641 0.0105685230 0.0092028195 + #> 0.0237725455 0.0158527871 0.0117258458 0.0105685230 0.0092045115 #> albumin chol trt alk.phos spiders - #> 0.0082647861 0.0041510636 0.0036548364 0.0010239241 -0.0003298163 + #> 0.0082732463 0.0041510636 0.0036632967 0.0010256161 -0.0003298163 #> trig platelet - #> -0.0011111508 -0.0045314656 + #> -0.0011060747 -0.0045517701 ``` - **permutation**: Each variable is assessed separately by randomly @@ -172,13 +173,13 @@ using `aorsf`: orsf_vi_permute(fit) #> bili copper ast age sex - #> 0.0514084384 0.0170611427 0.0142227933 0.0140274813 0.0131527430 + #> 0.0514033622 0.0170611427 0.0142515581 0.0140224052 0.0131459748 #> stage protime ascites edema albumin - #> 0.0119752045 0.0102865556 0.0098067817 0.0081730899 0.0080568255 + #> 0.0119768965 0.0102950158 0.0098067817 0.0081730899 0.0080652857 #> hepato chol alk.phos trig spiders - #> 0.0069734562 0.0032811220 0.0015862128 0.0014909643 0.0007811902 + #> 0.0069734562 0.0032811220 0.0015862128 0.0014943484 0.0007825752 #> trt platelet - #> -0.0007067631 -0.0022135241 + #> -0.0007067631 -0.0022338286 ``` - **analysis of variance (ANOVA)**3: A p-value is computed @@ -223,18 +224,18 @@ orsf_summarize_uni(fit, n_variables = 2) #> #> -- bili (VI Rank: 1) ---------------------------- #> -#> |----------------- risk -----------------| +#> |----------------- Risk -----------------| #> Value Mean Median 25th % 75th % -#> 0.70 0.2074286 0.09039332 0.03827337 0.3146957 -#> 1.3 0.2261739 0.10784929 0.04915971 0.3425934 -#> 3.2 0.3071951 0.21242141 0.11889617 0.4358309 +#> 0.70 0.2094827 0.09046313 0.03827429 0.3184979 +#> 1.3 0.2283358 0.11078307 0.05347112 0.3492104 +#> 3.2 0.3090977 0.21368937 0.11889617 0.4412656 #> #> -- sex (VI Rank: 2) ----------------------------- #> -#> |----------------- risk -----------------| +#> |----------------- Risk -----------------| #> Value Mean Median 25th % 75th % -#> m 0.3648659 0.2572239 0.15554270 0.5735661 -#> f 0.2479179 0.1021787 0.04161796 0.3591612 +#> m 0.3667488 0.2614335 0.15611841 0.5836574 +#> f 0.2507675 0.1051310 0.04355687 0.3596206 #> #> Predicted risk at time t = 1826.25 for top 2 predictors ``` @@ -255,7 +256,7 @@ For more on ICE, see the ## Comparison to existing software Comparisons between `aorsf` and existing software are presented in our -[arXiv paper](https://arxiv.org/abs/2208.01129). The paper +[JCGS paper](https://doi.org/10.1080/10618600.2023.2231048). The paper: - describes `aorsf` in detail with a summary of the procedures used in the tree fitting algorithm @@ -286,8 +287,9 @@ examples](https://docs.ropensci.org/aorsf/reference/orsf.html#tidymodels) 2. Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival - forests. *arXiv e-prints* 2022 Aug; arXiv-2208. URL: - + forests. *Journal of Computational and Graphical Statistics* + Published online 08 Aug 2023. URL: + 3. Menze BH, Kelm BM, Splitthoff DN, Koethe U, Hamprecht FA. On oblique random forests. 
*Joint European Conference on Machine Learning and diff --git a/cran-comments.md b/cran-comments.md index 27af3ae7..a6d682fb 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,11 +1,15 @@ +## Version 0.1.0 ## R CMD check results -Duration: 4m 3.8s +Duration: 3m 53.1s -0 errors v | 0 warnings v | 0 notes v +❯ checking C++ specification ... NOTE + Specified C++14: please drop specification unless essential -R CMD check succeeded +0 errors ✔ | 0 warnings ✔ | 1 note ✖ + +I have specified C++14 for this release. C++14 is essential, as this release uses `std::make_unique`. ## Downstream dependencies diff --git a/man/aorsf-package.Rd b/man/aorsf-package.Rd index 18d87b92..606da2a8 100644 --- a/man/aorsf-package.Rd +++ b/man/aorsf-package.Rd @@ -8,7 +8,7 @@ \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} -Fit, interpret, and make predictions with oblique random survival forests. Oblique decision trees are notoriously slow compared to their axis based counterparts, but 'aorsf' runs as fast or faster than axis-based decision tree algorithms for right-censored time-to-event outcomes. Methods to accelerate and interpret the oblique random survival forest are described in Jaeger et al., (2022) \href{https://arxiv.org/abs/2208.01129}{arXiv:2208.01129}. +Fit, interpret, and make predictions with oblique random survival forests. Oblique decision trees are notoriously slow compared to their axis based counterparts, but 'aorsf' runs as fast or faster than axis-based decision tree algorithms for right-censored time-to-event outcomes. Methods to accelerate and interpret the oblique random survival forest are described in Jaeger et al., (2023) \doi{ 10.1080/10618600.2023.2231048}. } \seealso{ Useful links: diff --git a/man/orsf.Rd b/man/orsf.Rd index d9c05477..da540775 100644 --- a/man/orsf.Rd +++ b/man/orsf.Rd @@ -520,12 +520,12 @@ The AUC values, from highest to lowest: }\if{html}{\out{}} \if{html}{\out{
}}\preformatted{## model times AUC se lower upper -## 1: net 1788 0.9179396 0.02012887 0.8784877 0.9573915 -## 2: accel 1788 0.9106396 0.02076004 0.8699507 0.9513286 -## 3: cph 1788 0.9061167 0.02277540 0.8614777 0.9507556 -## 4: rlt 1788 0.9012605 0.02178982 0.8585533 0.9439678 -## 5: rando 1788 0.8997729 0.02201363 0.8566270 0.9429188 -## 6: pca 1788 0.8996927 0.02245483 0.8556821 0.9437034 +## 1: net 1788 0.9134593 0.02079935 0.8726933 0.9542253 +## 2: cph 1788 0.9109155 0.02111657 0.8695278 0.9523032 +## 3: accel 1788 0.9099638 0.02122647 0.8683607 0.9515669 +## 4: rlt 1788 0.9069752 0.02132529 0.8651783 0.9487720 +## 5: rando 1788 0.9023489 0.02218936 0.8588586 0.9458393 +## 6: pca 1788 0.8994220 0.02201713 0.8562692 0.9425748 }\if{html}{\out{
}} And the indices of prediction accuracy: @@ -534,12 +534,12 @@ And the indices of prediction accuracy: }\if{html}{\out{}} \if{html}{\out{
}}\preformatted{## model times IPA -## 1: net 1788 0.5020652 -## 2: cph 1788 0.4759061 -## 3: accel 1788 0.4743392 -## 4: pca 1788 0.4398468 -## 5: rlt 1788 0.4373910 -## 6: rando 1788 0.4219209 +## 1: net 1788 0.4916815 +## 2: cph 1788 0.4833913 +## 3: accel 1788 0.4749974 +## 4: rlt 1788 0.4630984 +## 5: pca 1788 0.4371223 +## 6: rando 1788 0.4258456 ## 7: Null model 1788 0.0000000 }\if{html}{\out{
}} @@ -651,29 +651,29 @@ glimpse(results) \if{html}{\out{
}}\preformatted{## Rows: 276 ## Columns: 23 -## $ id 16, 29, 43, 62, 79, 82, 103, 105, 111, 114, 115, 139, 141,~ -## $ trt placebo, placebo, d_penicill_main, placebo, d_penicill_mai~ -## $ age 40.44353, 63.87680, 48.87064, 60.70637, 46.51608, 67.31006~ -## $ sex f, f, f, f, f, f, f, f, f, m, f, f, f, f, f, f, f, f, f, f~ -## $ ascites 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0~ -## $ hepato 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1~ -## $ spiders 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1~ -## $ edema 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ -## $ bili 0.7, 0.7, 1.1, 1.3, 0.8, 4.5, 2.5, 1.1, 5.5, 3.2, 0.7, 1.1~ -## $ chol 204, 370, 361, 302, 315, 472, 188, 464, 528, 259, 303, 328~ -## $ albumin 3.66, 3.78, 3.64, 2.75, 4.24, 4.09, 3.67, 4.20, 4.18, 4.30~ -## $ copper 28, 24, 36, 58, 13, 154, 57, 38, 77, 208, 81, 159, 59, 76,~ -## $ alk.phos 685.0, 5833.0, 5430.2, 1523.0, 1637.0, 1580.0, 1273.0, 164~ -## $ ast 72.85, 73.53, 67.08, 43.40, 170.50, 117.80, 119.35, 151.90~ -## $ trig 58, 86, 89, 112, 70, 272, 102, 102, 78, 78, 156, 134, 56, ~ -## $ platelet 198, 390, 203, 329, 426, 412, 110, 348, 467, 268, 307, 142~ -## $ protime 10.8, 10.6, 10.6, 13.2, 10.9, 11.1, 11.1, 10.3, 10.7, 11.7~ -## $ stage 3, 2, 2, 4, 3, 3, 4, 3, 3, 3, 3, 4, 2, 2, 3, 4, 2, 3, 4, 4~ -## $ time 3672, 4509, 4556, 3090, 3707, 3574, 110, 3092, 2350, 3395,~ -## $ status 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0~ -## $ pred_aorsf 0.02210163, 0.12510110, 0.07571520, 0.59580668, 0.12839078~ -## $ pred_rfsrc 0.01861595, 0.15632904, 0.07635485, 0.62281617, 0.19145913~ -## $ pred_ranger 0.02143363, 0.13367920, 0.05892584, 0.54481330, 0.21380654~ +## $ id 3, 39, 43, 48, 50, 54, 64, 66, 78, 80, 83, 114, 131, 141, ~ +## $ trt d_penicill_main, d_penicill_main, d_penicill_main, placebo~ +## $ age 70.07255, 55.39220, 48.87064, 49.13621, 53.50856, 39.19781~ +## $ sex m, f, f, m, f, f, f, m, f, m, f, m, f, f, f, f, m, f, f, f~ +## $ ascites 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ +## $ hepato 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1~ +## $ spiders 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0~ +## $ edema 0.5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, ~ +## $ bili 1.4, 0.7, 1.1, 1.9, 1.1, 1.3, 2.1, 1.4, 6.3, 7.2, 1.3, 3.2~ +## $ chol 176, 282, 361, 259, 257, 288, 373, 427, 436, 247, 250, 259~ +## $ albumin 3.48, 3.00, 3.64, 3.70, 3.36, 3.40, 3.50, 3.70, 3.02, 3.72~ +## $ copper 210, 52, 36, 281, 43, 262, 52, 105, 75, 269, 48, 208, 74, ~ +## $ alk.phos 516.0, 9066.8, 5430.2, 10396.8, 1080.0, 5487.2, 1009.0, 19~ +## $ ast 96.10, 72.24, 67.08, 188.34, 106.95, 73.53, 150.35, 182.90~ +## $ trig 55, 111, 89, 178, 73, 125, 188, 171, 104, 91, 100, 78, 104~ +## $ platelet 151, 563, 203, 214, 128, 254, 178, 123, 236, 360, 81, 268,~ +## $ protime 12.0, 10.6, 10.6, 11.0, 10.6, 11.0, 11.0, 11.0, 10.6, 11.2~ +## $ stage 4, 4, 2, 3, 4, 4, 3, 3, 4, 4, 4, 3, 4, 2, 3, 4, 2, 3, 4, 3~ +## $ time 1012, 2297, 4556, 4427, 2598, 1434, 1487, 4191, 1690, 890,~ +## $ status 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0~ +## $ pred_aorsf 0.76027848, 0.25291419, 0.06284001, 0.59437152, 0.15286015~ +## $ pred_rfsrc 0.47891074, 0.16833427, 0.05141013, 0.46526027, 0.06438684~ +## $ pred_ranger 0.61304990, 0.13930022, 0.03715869, 0.48395613, 0.04959462~ }\if{html}{\out{
}} And finish by aggregating the predictions and computing performance in @@ -699,16 +699,16 @@ counts. ## Results by model: ## ## model times AUC lower upper -## 1: aorsf 1826 91.0 86.8 95.2 -## 2: rfsrc 1826 89.2 84.8 93.7 -## 3: ranger 1826 89.6 85.3 94.0 +## 1: aorsf 1826 90.9 86.7 95.1 +## 2: rfsrc 1826 90.0 85.8 94.3 +## 3: ranger 1826 90.1 86.0 94.3 ## ## Results of model comparisons: ## -## times model reference delta.AUC lower upper p -## 1: 1826 rfsrc aorsf -1.7 -3.4 -0.1 0.04 -## 2: 1826 ranger aorsf -1.3 -2.9 0.2 0.08 -## 3: 1826 ranger rfsrc 0.4 -0.8 1.6 0.52 +## times model reference delta.AUC lower upper p +## 1: 1826 rfsrc aorsf -0.9 -2.2 0.5 0.2 +## 2: 1826 ranger aorsf -0.8 -2.1 0.6 0.3 +## 3: 1826 ranger rfsrc 0.1 -0.8 1.0 0.8 ## ## NOTE: Values are multiplied by 100 and given in \%. @@ -722,19 +722,19 @@ counts. ## ## model times Brier lower upper IPA ## 1: Null model 1826.25 20.5 18.1 22.9 0.0 -## 2: aorsf 1826.25 10.9 8.7 13.1 46.9 -## 3: rfsrc 1826.25 12.0 9.9 14.2 41.3 -## 4: ranger 1826.25 12.0 9.9 14.1 41.5 +## 2: aorsf 1826.25 10.8 8.5 13.0 47.4 +## 3: rfsrc 1826.25 11.8 9.6 13.9 42.6 +## 4: ranger 1826.25 11.7 9.6 13.8 42.7 ## ## Results of model comparisons: ## ## times model reference delta.Brier lower upper p -## 1: 1826.25 aorsf Null model -9.6 -12.2 -7.0 9.364941e-13 -## 2: 1826.25 rfsrc Null model -8.5 -10.7 -6.2 2.074175e-13 -## 3: 1826.25 ranger Null model -8.5 -10.8 -6.2 3.712823e-13 -## 4: 1826.25 rfsrc aorsf 1.1 0.3 2.0 1.075856e-02 -## 5: 1826.25 ranger aorsf 1.1 0.3 1.9 4.825778e-03 -## 6: 1826.25 ranger rfsrc -0.1 -0.6 0.5 8.429772e-01 +## 1: 1826.25 aorsf Null model -9.7 -12.4 -7.0 2.820785e-12 +## 2: 1826.25 rfsrc Null model -8.7 -11.0 -6.4 5.857526e-14 +## 3: 1826.25 ranger Null model -8.7 -11.1 -6.4 1.380943e-13 +## 4: 1826.25 rfsrc aorsf 1.0 0.2 1.8 1.507974e-02 +## 5: 1826.25 ranger aorsf 1.0 0.3 1.7 8.236836e-03 +## 6: 1826.25 ranger rfsrc -0.0 -0.5 0.4 9.336601e-01 ## ## NOTE: Values are multiplied by 100 and given in \%. @@ -919,5 +919,5 @@ Ishwaran H, Kogalur UB, Blackstone EH, Lauer MS. Random survival forests. \emph{ Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min YI, Mcclure LA, Howard G, Simon N. Oblique random survival forests. \emph{Annals of applied statistics} 2019 Sep; 13(3):1847-83. DOI: 10.1214/19-AOAS1261 -Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival forests. \emph{arXiv e-prints} 2022 Aug; arXiv-2208. URL: https://arxiv.org/abs/2208.01129 +Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival forests. \emph{Journal of Computational and Graphical Statistics} Published online 08 Aug 2023. URL: https://doi.org/10.1080/10618600.2023.2231048 } diff --git a/man/orsf_control_custom.Rd b/man/orsf_control_custom.Rd index 40c4620a..fe1e66dd 100644 --- a/man/orsf_control_custom.Rd +++ b/man/orsf_control_custom.Rd @@ -70,7 +70,7 @@ fit_rando ## Average leaves per tree: 20 ## Min observations in leaf: 5 ## Min events in leaf: 1 -## OOB stat value: 0.84 +## OOB stat value: 0.83 ## OOB stat type: Harrell's C-statistic ## Variable importance: anova ## @@ -110,7 +110,7 @@ prediction accuracy based on out-of-bag predictions: \if{html}{\out{
}}\preformatted{library(riskRegression) }\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{## riskRegression version 2023.03.22 +\if{html}{\out{
}}\preformatted{## riskRegression version 2023.09.08 }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{library(survival) @@ -135,15 +135,15 @@ The PCA ORSF does quite well! (higher IPA is better) ## ## model times Brier lower upper IPA ## 1: Null model 1788 20.479 18.090 22.868 0.000 -## 2: rando 1788 11.604 9.535 13.673 43.339 -## 3: pca 1788 12.870 10.872 14.869 37.153 +## 2: rando 1788 11.809 9.727 13.890 42.338 +## 3: pca 1788 12.967 10.983 14.950 36.683 ## ## Results of model comparisons: ## ## times model reference delta.Brier lower upper p -## 1: 1788 rando Null model -8.875 -11.063 -6.688 1.852437e-15 -## 2: 1788 pca Null model -7.609 -9.351 -5.866 1.136928e-17 -## 3: 1788 pca rando 1.267 0.449 2.084 2.384937e-03 +## 1: 1788 rando Null model -8.670 -10.843 -6.498 5.218847e-15 +## 2: 1788 pca Null model -7.512 -9.183 -5.842 1.226512e-18 +## 3: 1788 pca rando 1.158 0.305 2.011 7.810716e-03 ## ## NOTE: Values are multiplied by 100 and given in \%. diff --git a/man/orsf_ice_oob.Rd b/man/orsf_ice_oob.Rd index 8f662c5c..d9396498 100644 --- a/man/orsf_ice_oob.Rd +++ b/man/orsf_ice_oob.Rd @@ -153,16 +153,16 @@ ice_oob \if{html}{\out{
}}\preformatted{## id_variable id_row pred_horizon bili pred ## 1: 1 1 1788 1 0.9295584 -## 2: 1 2 1788 1 0.1412476 +## 2: 1 2 1788 1 0.1422392 ## 3: 1 3 1788 1 0.7047846 -## 4: 1 4 1788 1 0.3673939 +## 4: 1 4 1788 1 0.3845760 ## 5: 1 5 1788 1 0.1206201 ## --- ## 6896: 25 272 1788 10 0.3878561 ## 6897: 25 273 1788 10 0.4854526 ## 6898: 25 274 1788 10 0.4389557 ## 6899: 25 275 1788 10 0.3639220 -## 6900: 25 276 1788 10 0.5409864 +## 6900: 25 276 1788 10 0.5461205 }\if{html}{\out{
}} Much more detailed examples are given in the diff --git a/man/orsf_pd_oob.Rd b/man/orsf_pd_oob.Rd index 44b9cc2f..58e11315 100644 --- a/man/orsf_pd_oob.Rd +++ b/man/orsf_pd_oob.Rd @@ -160,12 +160,12 @@ You can compute partial dependence and ICE three ways with \code{aorsf}: pd_train }\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr -## 1: 1826.25 1 0.2167954 0.01432475 0.0946650 0.8243506 -## 2: 1826.25 2 0.2520765 0.03084190 0.1333465 0.8442959 -## 3: 1826.25 3 0.2964487 0.05324065 0.1937964 0.8578131 -## 4: 1826.25 4 0.3518250 0.09798050 0.2751326 0.8699063 -## 5: 1826.25 5 0.3936739 0.14573200 0.2984227 0.8781099 +\if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr +## 1: 1826.25 1 0.2188047 0.01435497 0.09604722 0.8243506 +## 2: 1826.25 2 0.2540831 0.03086042 0.13766124 0.8442959 +## 3: 1826.25 3 0.2982917 0.05324065 0.19470910 0.8578131 +## 4: 1826.25 4 0.3536969 0.09755193 0.27774884 0.8699063 +## 5: 1826.25 5 0.3955249 0.14622431 0.29945708 0.8775099 }\if{html}{\out{
}} \item using out-of-bag predictions for the training data @@ -175,11 +175,11 @@ pd_train }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr -## 1: 1826.25 1 0.2161745 0.01183390 0.1001640 0.8304537 -## 2: 1826.25 2 0.2521996 0.02447359 0.1419482 0.8484741 -## 3: 1826.25 3 0.2961802 0.04854875 0.1992512 0.8640601 -## 4: 1826.25 4 0.3532215 0.10111235 0.2666702 0.8642393 -## 5: 1826.25 5 0.3940203 0.14768055 0.3270825 0.8737186 +## 1: 1826.25 1 0.2182691 0.01218789 0.1008030 0.8304537 +## 2: 1826.25 2 0.2542021 0.02447359 0.1453580 0.8484741 +## 3: 1826.25 3 0.2980946 0.04854875 0.1997769 0.8640601 +## 4: 1826.25 4 0.3552203 0.10116417 0.2691853 0.8642393 +## 5: 1826.25 5 0.3959143 0.14768055 0.3264149 0.8737186 }\if{html}{\out{
}} \item using predictions for a new set of data @@ -191,11 +191,11 @@ pd_test }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr -## 1: 1826.25 1 0.2643571 0.01758300 0.2098936 0.8410357 -## 2: 1826.25 2 0.2990417 0.04063388 0.2516202 0.8547441 -## 3: 1826.25 3 0.3432271 0.06843859 0.3056799 0.8664949 -## 4: 1826.25 4 0.3967879 0.11801725 0.3593064 0.8721247 -## 5: 1826.25 5 0.4388518 0.16038177 0.4094224 0.8800138 +## 1: 1826.25 1 0.2643662 0.01758300 0.2098936 0.8410357 +## 2: 1826.25 2 0.2990578 0.04063388 0.2516202 0.8553218 +## 3: 1826.25 3 0.3432503 0.06843859 0.3056799 0.8670726 +## 4: 1826.25 4 0.3968111 0.11801725 0.3593064 0.8725208 +## 5: 1826.25 5 0.4388962 0.16038177 0.4094224 0.8809027 }\if{html}{\out{
}} \item in-bag partial dependence indicates relationships that the model has learned during training. This is helpful if your goal is to interpret diff --git a/man/orsf_vi.Rd b/man/orsf_vi.Rd index 9dc36477..1f952a6c 100644 --- a/man/orsf_vi.Rd +++ b/man/orsf_vi.Rd @@ -109,7 +109,7 @@ or not (see examples). \section{Variable importance methods}{ -\strong{negation importance}: Each variable is assessed separately by multiplying the variable's coefficients by -1 and then determining how much the model's performance changes. The worse the model's performance after negating coefficients for a given variable, the more important the variable. This technique is promising b/c it does not require permutation and it emphasizes variables with larger coefficients in linear combinations, but it is also relatively new and hasn't been studied as much as permutation importance. See \href{https://arxiv.org/abs/2208.01129}{Jaeger, 2022} for more details on this technique. +\strong{negation importance}: Each variable is assessed separately by multiplying the variable's coefficients by -1 and then determining how much the model's performance changes. The worse the model's performance after negating coefficients for a given variable, the more important the variable. This technique is promising b/c it does not require permutation and it emphasizes variables with larger coefficients in linear combinations, but it is also relatively new and hasn't been studied as much as permutation importance. See \href{https://doi.org/10.1080/10618600.2023.2231048}{Jaeger, 2023} for more details on this technique. \strong{permutation importance}: Each variable is assessed separately by randomly permuting the variable's values and then determining how much the model's performance changes. The worse the model's performance after permuting the values of a given variable, the more important the variable. This technique is flexible, intuitive, and frequently used. It also has several \href{https://christophm.github.io/interpretable-ml-book/feature-importance.html#disadvantages-9}{known limitations} @@ -221,24 +221,24 @@ orsf_vi_negate(fit_no_vi) }\if{html}{\out{}} \if{html}{\out{
}}\preformatted{## bili copper sex stage protime age -## 0.117812588 0.046758641 0.038085253 0.026584826 0.023881400 0.022583421 +## 0.117833946 0.046771025 0.038096005 0.026596235 0.023892153 0.022568331 ## albumin ascites chol ast edema hepato -## 0.020499282 0.015799793 0.013503968 0.011507068 0.007446917 0.007302135 +## 0.020502226 0.015764542 0.013505575 0.011507061 0.007444267 0.007318432 ## trt spiders alk.phos trig platelet -## 0.006136687 0.005411473 0.003425486 0.003358903 0.001218844 +## 0.006135388 0.005416366 0.003385460 0.003359579 0.001225734 }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{orsf_vi_permute(fit_no_vi) }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{## bili copper age protime albumin -## 0.0557765573 0.0229935033 0.0142175615 0.0138958680 0.0138130775 +## 0.0557854459 0.0230058852 0.0142318894 0.0139189306 0.0138242166 ## ascites stage chol ast edema -## 0.0122566919 0.0122549383 0.0062487607 0.0060082953 0.0057947595 +## 0.0122576604 0.0122514140 0.0062628391 0.0060073065 0.0057933534 ## hepato spiders sex trig alk.phos -## 0.0052884525 0.0038391405 0.0031617255 0.0014482532 0.0009073479 +## 0.0052890246 0.0038620727 0.0031610738 0.0014580912 0.0009063636 ## platelet trt -## 0.0001091183 -0.0018037065 +## 0.0001124081 -0.0017971380 }\if{html}{\out{
}} } @@ -255,13 +255,13 @@ orsf_vi_permute(fit_permute_vi) }\if{html}{\out{}} \if{html}{\out{
}}\preformatted{## bili copper age ascites protime -## 0.0537366984 0.0232882267 0.0135861353 0.0127592970 0.0125356157 +## 0.0537706105 0.0232845222 0.0135823364 0.0127916446 0.0125320108 ## albumin stage ast edema hepato -## 0.0114480445 0.0108792612 0.0063934776 0.0062786593 0.0048395390 -## chol spiders sex trig platelet -## 0.0042838996 0.0030700999 0.0025439674 0.0022397137 0.0010979018 -## alk.phos trt -## 0.0010942194 -0.0006194685 +## 0.0115100144 0.0109035858 0.0063943212 0.0062769135 0.0048230621 +## chol spiders sex trig alk.phos +## 0.0042752565 0.0030699653 0.0025422803 0.0022410492 0.0010977282 +## platelet trt +## 0.0010972387 -0.0005947093 }\if{html}{\out{
}} You can still get negation VI from this fit, but it needs to be computed @@ -270,11 +270,11 @@ You can still get negation VI from this fit, but it needs to be computed }\if{html}{\out{}} \if{html}{\out{
}}\preformatted{## bili copper sex age protime stage -## 0.120813894 0.046517819 0.036351782 0.022681350 0.021826811 0.021032630 +## 0.120854614 0.046515980 0.036380485 0.022668834 0.021816803 0.021111101 ## albumin ascites ast chol edema spiders -## 0.018906444 0.014062700 0.013029040 0.011226505 0.008009926 0.006151980 +## 0.018969867 0.014101778 0.013042103 0.011220170 0.008009693 0.006193354 ## trt hepato trig alk.phos platelet -## 0.005108172 0.005105800 0.003378537 0.003155270 0.002240910 +## 0.005184060 0.005113622 0.003389060 0.003156121 0.002242597 }\if{html}{\out{
}} } } @@ -286,5 +286,5 @@ Breiman L. Random forests. \emph{Machine learning} 2001 Oct; 45(1):5-32. DOI: 10 Menze BH, Kelm BM, Splitthoff DN, Koethe U, Hamprecht FA. On oblique random forests. \emph{Joint European Conference on Machine Learning and Knowledge Discovery in Databases} 2011 Sep 4; pp. 453-469. DOI: 10.1007/978-3-642-23783-6_29 -Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival forests. \emph{arXiv e-prints} 2022 Aug; arXiv-2208. URL: https://arxiv.org/abs/2208.01129 +Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival forests. \emph{Journal of Computational and Graphical Statistics} Published online 08 Aug 2023. URL: https://doi.org/10.1080/10618600.2023.2231048 } diff --git a/man/predict.orsf_fit.Rd b/man/predict.orsf_fit.Rd index 6183a264..1aa15166 100644 --- a/man/predict.orsf_fit.Rd +++ b/man/predict.orsf_fit.Rd @@ -120,7 +120,7 @@ predict(fit, }\if{html}{\out{}} \if{html}{\out{
}}\preformatted{## [,1] [,2] [,3] -## [1,] 0.49828550 0.77570208 0.91812971 +## [1,] 0.49884105 0.77681319 0.91901860 ## [2,] 0.04475471 0.09161544 0.17682278 ## [3,] 0.12850458 0.27603519 0.41455070 ## [4,] 0.01279086 0.02980402 0.06458151 @@ -134,12 +134,12 @@ predict(fit, pred_horizon = c(500, 1000, 1500)) }\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{## [,1] [,2] [,3] -## [1,] 0.5017145 0.2242979 0.08187029 -## [2,] 0.9552453 0.9083846 0.82317722 -## [3,] 0.8714954 0.7239648 0.58544930 -## [4,] 0.9872091 0.9701960 0.93541849 -## [5,] 0.9872268 0.9775023 0.95124323 +\if{html}{\out{
}}\preformatted{## [,1] [,2] [,3] +## [1,] 0.5011589 0.2231868 0.0809814 +## [2,] 0.9552453 0.9083846 0.8231772 +## [3,] 0.8714954 0.7239648 0.5854493 +## [4,] 0.9872091 0.9701960 0.9354185 +## [5,] 0.9872268 0.9775023 0.9512432 }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{# predicted cumulative hazard function @@ -151,7 +151,7 @@ predict(fit, }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{## [,1] [,2] [,3] -## [1,] 0.70791303 1.40367742 1.79658865 +## [1,] 0.70860748 1.40641948 1.79893071 ## [2,] 0.04954335 0.11460828 0.24130253 ## [3,] 0.16616222 0.43287394 0.71524591 ## [4,] 0.01443848 0.03640393 0.08366798 @@ -169,7 +169,7 @@ prediction horizon }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{## [,1] -## [1,] 81.19473 +## [1,] 81.23490 ## [2,] 27.69730 ## [3,] 41.52408 ## [4,] 15.79522 diff --git a/src/Forest.cpp b/src/Forest.cpp index 28afd964..6d70aa27 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -471,6 +471,73 @@ void Forest::compute_prediction_accuracy(Data* prediction_data, } +std::vector> Forest::compute_dependence(bool oobag){ + + std::vector> result; + + result.reserve(pd_x_vals.size()); + + // looping through each item in the pd list + for(uword k = 0; k < pd_x_vals.size(); ++k){ + + uword n = pd_x_vals[k].n_rows; + + std::vector result_k; + + result_k.reserve(n); + + // saving x values + for(const auto& x_col : pd_x_cols[k]){ + data->save_col(x_col); + } + + // loop through each row in the current pd matrix + for(uword i = 0; i < n; ++i){ + + uword j = 0; + // fill x with current pd values + for(const auto& x_col : pd_x_cols[k]){ + data->fill_col(pd_x_vals[k].at(i, j), x_col); + ++j; + } + + if(oobag) oobag_denom.fill(0); + + mat preds = predict(oobag); + + if(pd_type == PD_SUMMARY){ + + if(preds.has_nonfinite()){ + uvec is_finite = find_finite(preds.col(0)); + preds = preds.rows(is_finite); + } + + mat preds_summary = mean(preds, 0); + mat preds_quant = quantile(preds, pd_probs, 0); + + result_k.push_back(join_vert(preds_summary, preds_quant)); + + } else if(pd_type == PD_ICE) { + + result_k.push_back(preds); + + } + + } + + // bring back original values before moving to next pd item + for(const auto& x_col : pd_x_cols[k]){ + data->restore_col(x_col); + } + + result.push_back(result_k); + + } + + return(result); + +} + mat Forest::predict(bool oobag) { mat result; @@ -569,72 +636,6 @@ mat Forest::predict(bool oobag) { } -std::vector> Forest::compute_dependence(bool oobag){ - - std::vector> result; - - result.reserve(pd_x_vals.size()); - - // looping through each item in the pd list - for(uword k = 0; k < pd_x_vals.size(); ++k){ - - uword n = pd_x_vals[k].n_rows; - - std::vector result_k; - - result_k.reserve(n); - - // saving x values - for(const auto& x_col : pd_x_cols[k]){ - data->save_col(x_col); - } - - // loop through each row in the current pd matrix - for(uword i = 0; i < n; ++i){ - - uword j = 0; - // fill x with current pd values - for(const auto& x_col : pd_x_cols[k]){ - data->fill_col(pd_x_vals[k].at(i, j), x_col); - ++j; - } - - if(oobag) oobag_denom.fill(0); - - mat preds = predict(oobag); - - if(pd_type == PD_SUMMARY){ - - if(preds.has_nonfinite()){ - uvec is_finite = find_finite(preds.col(0)); - preds = preds.rows(is_finite); - } - - mat preds_summary = mean(preds, 0); - mat preds_quant = quantile(preds, pd_probs, 0); - - result_k.push_back(join_vert(preds_summary, preds_quant)); - - } else if(pd_type == PD_ICE) { - - result_k.push_back(preds); - - } - - } - - // bring back original values before moving to next pd item - for(const auto& x_col : pd_x_cols[k]){ - data->restore_col(x_col); - } - - result.push_back(result_k); - - } - - return(result); - -} void Forest::predict_single_thread(Data* prediction_data, bool oobag, @@ -787,6 +788,20 @@ void Forest::resize_oobag_eval(){ } +void Forest::resize_pred_mat(arma::mat& p){ + + if(pred_type == PRED_TERMINAL_NODES || !pred_aggregate){ + + p.zeros(data->n_rows, n_tree); + + } else { + + resize_pred_mat_internal(p); + + } + +} + void Forest::show_progress(std::string operation, size_t max_progress) { using std::chrono::steady_clock; @@ -835,19 +850,6 @@ void Forest::show_progress(std::string operation, size_t max_progress) { } } -void Forest::resize_pred_mat(arma::mat& p){ - - 
if(pred_type == PRED_TERMINAL_NODES || !pred_aggregate){ - - p.zeros(data->n_rows, n_tree); - - } else { - - resize_pred_mat_internal(p); - - } - -} } diff --git a/src/Tree.cpp b/src/Tree.cpp index 8c195926..36969c39 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -610,20 +610,21 @@ } + // not currently used but will be in the future + // # nocov start void Tree::sprout_leaf(uword node_id){ if(verbosity > 2){ - // # nocov start Rcout << "-- sprouting node " << node_id << " into a leaf"; Rcout << " (N = " << sum(w_node) << ")"; Rcout << std::endl; Rcout << std::endl; - // # nocov end } leaf_summary[node_id] = mean(y_node.col(0)); } + // # nocov end // not currently used but will be in the future // # nocov start diff --git a/tests/testthat/test-oobag.R b/tests/testthat/test-oobag.R deleted file mode 100644 index 6ac24f22..00000000 --- a/tests/testthat/test-oobag.R +++ /dev/null @@ -1,25 +0,0 @@ - - -test_that( - desc = 'oobag error works w/oobag_eval_every & custom oobag fun works', - code = { - - fit_custom_oobag <- orsf(pbc, - formula = Surv(time, status) ~ ., - n_tree = n_tree_test, - oobag_eval_every = 1, - oobag_fun = oobag_c_survival, - tree_seeds = seeds_standard) - - expect_equal_leaf_summary(fit_custom_oobag, fit_standard_pbc$fast) - - expect_equal( - get_last_oob_stat_value(fit_standard_pbc$fast), - get_last_oob_stat_value(fit_custom_oobag) - ) - - } -) - - - diff --git a/tests/testthat/test-orsf.R b/tests/testthat/test-orsf.R index 89648731..62a9bb64 100644 --- a/tests/testthat/test-orsf.R +++ b/tests/testthat/test-orsf.R @@ -738,6 +738,28 @@ test_that( } ) +test_that( + desc = 'oobag error works w/oobag_eval_every & custom oobag fun works', + code = { + + fit_custom_oobag <- orsf(pbc, + formula = Surv(time, status) ~ ., + n_tree = n_tree_test, + oobag_eval_every = 1, + oobag_fun = oobag_c_survival, + tree_seeds = seeds_standard) + + expect_equal_leaf_summary(fit_custom_oobag, fit_standard_pbc$fast) + + expect_equal( + get_last_oob_stat_value(fit_standard_pbc$fast), + get_last_oob_stat_value(fit_custom_oobag) + ) + + } +) + + # high pred horizon diff --git a/vignettes/aorsf.Rmd b/vignettes/aorsf.Rmd index b4dbfad0..6f2de70a 100644 --- a/vignettes/aorsf.Rmd +++ b/vignettes/aorsf.Rmd @@ -27,7 +27,7 @@ The oblique random survival forest (ORSF) is an extension of the axis-based RSF - See [orsf](https://docs.ropensci.org/aorsf/reference/orsf.html) for more details on ORSFs. -- see the [arXiv](https://arxiv.org/abs/2208.01129) paper for more details on algorithms used specifically by `aorsf`. +- see the [JCGS](https://doi.org/10.1080/10618600.2023.2231048) paper for more details on algorithms used specifically by `aorsf`. ## Accelerated ORSF
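
A note on the partial dependence code moved within `src/Forest.cpp`: `Forest::compute_dependence()` works by temporarily overwriting each predictor column with a fixed grid value (`Data::save_col()` / `fill_col()` / `restore_col()`), re-running `predict()`, and then either summarizing the prediction distribution (`PD_SUMMARY`: mean plus quantiles) or keeping the per-row predictions (`PD_ICE`). The R sketch below mirrors that loop through the exported API so the C++ logic is easier to follow. It is illustrative only: the `manual_pd()` helper is hypothetical, it assumes the packaged `pbc_orsf` data and the `new_data`/`pred_horizon` arguments of `predict.orsf_fit()`, and unlike the C++ routine it does not restrict predictions to out-of-bag trees.

``` r
library(aorsf)
library(survival)

# small forest for illustration; assumes pbc_orsf ships with aorsf
fit <- orsf(pbc_orsf, formula = Surv(time, status) ~ . - id, n_tree = 50)

# partial dependence "by hand": fix one predictor at a grid value,
# predict risk for every row, then summarize the prediction distribution
# (the same save_col / fill_col / predict / quantile idea as in Forest.cpp)
manual_pd <- function(object, data, variable, values, horizon) {
  do.call(rbind, lapply(values, function(v) {
    data_mod <- data
    data_mod[[variable]] <- v
    risk <- predict(object, new_data = data_mod, pred_horizon = horizon)
    data.frame(value = v,
               mean  = mean(risk),
               lwr   = unname(quantile(risk, 0.025)),
               medn  = median(risk),
               upr   = unname(quantile(risk, 0.975)))
  }))
}

manual_pd(fit, pbc_orsf, variable = "bili", values = 1:5, horizon = 1826.25)
```

The exported `orsf_pd_*()` and `orsf_ice_*()` functions run this loop inside the C++ forest, restoring the original column after each grid item and, for the `_oob` variants, aggregating only out-of-bag predictions, which is why their output (shown in the `orsf_pd_oob.Rd` and `orsf_ice_oob.Rd` hunks above) differs slightly from a plain in-sample loop like this sketch.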