|
#' Select informative regions (extended)
#'
#' This function generates a list of informative regions to be used to estimate
#' the purity of a set of tumor samples.
#'
#' Informative regions are divided into \code{hyper} and \code{hypo} depending
#' on their level of methylation with respect to the beta-scores of
#' normal samples. Both sets will be used to compute purity.
#'
#' A region is labelled hyper-methylated when its AUC is above 0.80, the lower
#' tumor percentile is below \code{hyper_range[1]}, the upper tumor percentile
#' is above \code{hyper_range[2]} and the upper quartile of the controls is
#' below \code{control_costraints[1]}. It is labelled hypo-methylated when its
#' AUC is below 0.20, the lower tumor percentile is below
#' \code{hypo_range[1]}, the upper tumor percentile is above
#' \code{hypo_range[2]} and the lower quartile of the controls is above
#' \code{control_costraints[2]}. Regions are ranked by AUC (by \code{1 - AUC}
#' for hypo-methylated regions), best first.
#'
#' @param tumor_table A matrix of beta-values (percentage) from tumor samples.
#' @param control_table A matrix of beta-values (percentage) from normal/control
#' samples, with one row per element of \code{auc}.
#' @param auc A vector of AUC scores generated by \code{compute_AUC}.
#' @param max_sites Maximum number of regions to retrieve (half hyper-, half
#' hypo-methylated when \code{method = "even"}) (default = 20). Must be a single
#' non-negative whole number.
#' @param percentiles Vector of length 2: lower and upper percentiles to
#' select sites with beta values outside hypo- and hyper-ranges (default =
#' 0,100; 0th and 100th percentiles, i.e. only min and max beta should be
#' outside of ranges). Fractional percentiles (e.g. 2.5) are accepted.
#' @param hyper_range A vector of length 2 with minimum lower and upper values
#' required to select hyper-methylated informative sites.
#' @param hypo_range A vector of length 2 with minimum lower and upper values
#' required to select hypo-methylated informative sites.
#' @param control_costraints A vector of length 2: the upper quartile of the
#' control beta-values must be below the first value for hyper-methylated
#' regions, and the lower quartile must be above the second value for
#' hypo-methylated regions (default = 20,80).
#' @param method How to select sites: "even" (half hyper-, half hypo-methylated
#' sites), "top" (highest AUC regardless of hyper- or hypo-methylation),
#' "hyper" (hyper-methylated sites only), "hypo" (hypo-methylated sites only).
#' @param return_info If \code{TRUE}, also return the table with the
#' per-region statistics used for the selection (default = FALSE).
#' @return A named list of indexes of informative regions ("hyper-" and
#' "hypo-methylated"). If \code{return_info} is \code{TRUE}, an unnamed list of
#' length 2: the list of indexes, followed by a tibble with one row per region
#' (columns Probe, Index, AUC, Max_beta, Min_beta, Lower_Quart, Upper_Quart,
#' Type) sorted by decreasing AUC.
#' @importFrom dplyr "%>%"
#' @export
#' @examples
#' reduced_data <- reduce_to_regions(bs_toy_matrix, bs_toy_sites, cpg_islands[1:1000,])
#' auc_data <- compute_AUC(reduced_data[,1:10], reduced_data[,11:20])
#' info_regions <- select_informative_regions_ext(reduced_data[,1:10],
#'                                                reduced_data[,11:20], auc_data)
select_informative_regions_ext <- function(tumor_table, control_table, auc,
                                           max_sites = 20, percentiles = c(0,100),
                                           hyper_range = c(min = 40, max = 90), hypo_range = c(min = 10, max = 60),
                                           control_costraints = c(20,80),
                                           method = c("even", "top", "hyper", "hypo"), return_info=FALSE){

  message(sprintf("[%s] # Select informative regions #", Sys.time()))

  # check parameters
  diff_range_t <- diff(range(tumor_table, na.rm = TRUE))
  diff_range_c <- diff(range(control_table, na.rm = TRUE))
  assertthat::assert_that(diff_range_t > 1, diff_range_t <= 100,
                          msg = "For computation efficiency convert tumor_table to percentage values.")
  assertthat::assert_that(diff_range_c > 1, diff_range_c <= 100,
                          msg = "For computation efficiency convert control_table to percentage values.")

  # tumor beta-values are stored as rounded integers
  tumor_table <- as.matrix(tumor_table)
  tumor_table <- round(tumor_table)
  storage.mode(tumor_table) <- "integer"

  control_table <- as.matrix(control_table)

  # every table must provide exactly one row per AUC score
  assertthat::assert_that(nrow(tumor_table) == length(auc))
  assertthat::assert_that(nrow(control_table) == length(auc),
                          msg = "control_table must have one row per element of auc")

  # max_sites is used both as a slice size and in an integer format string,
  # hence it must be a single non-negative whole number
  assertthat::assert_that(is.numeric(max_sites))
  assertthat::assert_that(length(max_sites) == 1, !is.na(max_sites),
                          max_sites >= 0, max_sites == round(max_sites),
                          msg = "max_sites must be a single non-negative whole number")

  assertthat::assert_that(is.numeric(hyper_range))
  assertthat::assert_that(is.numeric(hypo_range))
  assertthat::assert_that(is.numeric(control_costraints))
  assertthat::assert_that(is.numeric(percentiles))

  assertthat::assert_that(length(hyper_range) == 2)
  assertthat::assert_that(length(hypo_range) == 2)
  assertthat::assert_that(length(control_costraints) == 2)
  assertthat::assert_that(length(percentiles) == 2)

  assertthat::assert_that(all(dplyr::between(hyper_range, 0, 100)))
  assertthat::assert_that(all(dplyr::between(hypo_range, 0, 100)))
  assertthat::assert_that(all(dplyr::between(control_costraints, 0, 100)))
  assertthat::assert_that(all(dplyr::between(percentiles, 0, 100)))

  method <- match.arg(method)
  if (method == "even") {
    assertthat::assert_that(max_sites %% 2 == 0,
                            msg = "method is set to 'even' but max_sites is not even")
  }

  assertthat::assert_that(is.logical(return_info))
  assertthat::assert_that(length(return_info) == 1, !is.na(return_info),
                          msg = "return_info must be either TRUE or FALSE")

  # plain unnamed scalars, so thresholds read clearly in the selection rules
  hyper_low    <- hyper_range[[1]]
  hyper_high   <- hyper_range[[2]]
  hypo_low     <- hypo_range[[1]]
  hypo_high    <- hypo_range[[2]]
  control_low  <- control_costraints[[1]]
  control_high <- control_costraints[[2]]

  # thresholds are printed with %s: they may legitimately be non-integer
  # (e.g. percentiles = c(2.5, 97.5)), which %i would reject
  message(sprintf("- Method: %s", method))
  message(sprintf("- Number of regions to retrieve: %i", max_sites))
  message(sprintf("- Hyper-methylated regions range: %s-%s", hyper_low, hyper_high))
  message(sprintf("- Hypo-methylated regions range: %s-%s", hypo_low, hypo_high))
  message(sprintf("- Control constraints: %s-%s", control_low, control_high))
  message(sprintf("- Percentiles: %sth-%sth", percentiles[[1]], percentiles[[2]]))

  # Row-wise quantiles of a matrix: one pass over the table for both
  # probabilities. Returns a 2 x nrow(tbl) matrix (row 1 = probs[1]).
  row_quantiles <- function(tbl, probs) {
    suppressWarnings(
      apply(tbl, 1, stats::quantile, probs = probs, na.rm = TRUE, names = FALSE)
    )
  }

  # minimum and maximum beta per region
  message(sprintf("[%s] Compute min-/max- beta scores...", Sys.time()))
  tumor_q <- row_quantiles(tumor_table, percentiles / 100)
  min_beta <- tumor_q[1, ]
  max_beta <- tumor_q[2, ]

  message(sprintf("[%s] Compute control interquartiles...", Sys.time()))
  control_q <- row_quantiles(control_table, c(.25, .75))
  lower_quart <- control_q[1, ]
  upper_quart <- control_q[2, ]

  if (is.null(names(auc))) {
    names(auc) <- sprintf("CpG_%06d", seq_along(auc))
  }

  message(sprintf("[%s] Select regions...", Sys.time()))
  diff_meth_regions <- dplyr::tibble(Probe = names(auc),
                                     Index = seq_along(auc),
                                     AUC = auc,
                                     Max_beta = max_beta,
                                     Min_beta = min_beta,
                                     Lower_Quart = lower_quart,
                                     Upper_Quart = upper_quart)
  # rows with missing statistics match neither rule and fall to "No_diff"
  diff_meth_regions <- dplyr::mutate(
    diff_meth_regions,
    Type = dplyr::case_when(
      AUC > .80 & Min_beta < hyper_low & Max_beta > hyper_high &
        Upper_Quart < control_low ~ "Hyper",
      AUC < .20 & Min_beta < hypo_low & Max_beta > hypo_high &
        Lower_Quart > control_high ~ "Hypo",
      TRUE ~ "No_diff"))
  # mirror the AUC of hypo-methylated regions so that both types are ranked
  # on the same "higher is better" scale
  diff_meth_regions <- dplyr::mutate(diff_meth_regions,
                                     AUC = dplyr::if_else(Type == "Hypo", 1 - AUC, AUC))
  diff_meth_regions <- dplyr::arrange(diff_meth_regions, -AUC)

  regions_hyper <- dplyr::filter(diff_meth_regions, Type == "Hyper")
  regions_hypo  <- dplyr::filter(diff_meth_regions, Type == "Hypo")
  message(sprintf("* Total hyper-methylated regions = %i", nrow(regions_hyper)))
  message(sprintf("* Total hypo-methylated regions = %i", nrow(regions_hypo)))

  # indexes of the first n rows of an (already ranked) table
  top_indexes <- function(tbl, n) {
    dplyr::pull(dplyr::slice(tbl, seq_len(n)), Index)
  }

  regions <- switch(
    method,
    even = list(hyper = top_indexes(regions_hyper, max_sites / 2),
                hypo  = top_indexes(regions_hypo, max_sites / 2)),
    top = {
      top_regions <- dplyr::bind_rows(regions_hyper, regions_hypo)
      top_regions <- dplyr::arrange(top_regions, -AUC)
      top_regions <- dplyr::slice(top_regions, seq_len(max_sites))
      list(hyper = dplyr::pull(dplyr::filter(top_regions, Type == "Hyper"), Index),
           hypo  = dplyr::pull(dplyr::filter(top_regions, Type == "Hypo"), Index))
    },
    hyper = list(hyper = top_indexes(regions_hyper, max_sites)),
    hypo  = list(hypo = top_indexes(regions_hypo, max_sites))
  )

  message(sprintf("* Retrieved hyper-methylated regions = %i", length(regions$hyper)))
  message(sprintf("* Retrieved hypo-methylated regions = %i", length(regions$hypo)))

  message(sprintf("[%s] Done", Sys.time()))
  if (return_info) {
    return(list(regions, diff_meth_regions))
  }
  regions
}
0 commit comments