Skip to content

Commit d0e86c1

Browse files
committed
Major update:
- add select_informative_regions_ext (with control costraint) - change compute_AUC `simplify` to `return_info` - update tests, `select_informative_` functions
1 parent 741af9b commit d0e86c1

14 files changed

+325
-129
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
.Rhistory
33
.RData
44
.Ruserdata
5+
install.sh
56
*.un~

DESCRIPTION

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
Package: PAMES
2-
Date: 2019-11-29
2+
Date: 2020-09-09
33
Type: Package
44
Title: Purity Assessment from clonal MEthylation Sites
55
Description: Exploiting data from DNA methylation, this package provides the operations
66
required to evaluate the purity of tumor samples.
7-
Version: 2.5.0
7+
Version: 2.6.0
88
Authors@R: c(person("Dario", "Romagnoli", role=c("aut", "cre"),
99
1010
person("Matteo", "Benelli", role="aut"),

NAMESPACE

+2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ export(compute_AUC)
44
export(compute_purity)
55
export(reduce_to_regions)
66
export(select_informative_regions)
7+
export(select_informative_regions_ext)
78
export(select_informative_sites)
89
export(select_informative_sites_ext)
910
importFrom(dplyr,"%>%")
1011
importFrom(stats,median)
12+
importFrom(stats,setNames)

R/compute_AUC.R

+15-11
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,17 @@
99
#' @param ncores Number of parallel processes to use for parallel computing.
1010
#' @param min_samples_frac Fraction of samples (independently in tumor and
1111
#' control samples) that are not NA required to analyze a site (default=100).
12-
#' @param simplify If TRUE (default) return a vector else compute all AUC and return a data.frame
12+
#' @param return_info If TRUE (default) return a vector else compute all AUC and return a data.frame
1313
#' reporting fraction of NAs in tumor and control tables.
1414
#' @param na_threshold (DEPRECATED) Fraction of NAs (considered independently in tumor and
1515
#' control samples) above which a site will not be selected (default=0).
1616
#' @return A vector of AUC scores (NA if not analyzed) or a data.frame with AUC scores
1717
#' and the fraction of non-NA samples in tumor and contol tables.
1818
#' @examples
1919
#' auc_data <- compute_AUC(tumor_toy_table, control_toy_table)
20+
#' @importFrom stats setNames
2021
#' @export
21-
compute_AUC <- function(tumor_table, control_table, ncores=1, na_threshold, simplify=TRUE, min_samples_frac=1) {
22+
compute_AUC <- function(tumor_table, control_table, ncores=1, na_threshold, return_info=TRUE, min_samples_frac=1) {
2223
message(sprintf("[%s] # Compute AUC #", Sys.time()))
2324
# check parameters
2425
if (!missing(na_threshold)){
@@ -33,13 +34,16 @@ compute_AUC <- function(tumor_table, control_table, ncores=1, na_threshold, simp
3334
assertthat::assert_that(diff_range_t > 1, diff_range_t <= 100, msg="For computation efficiency, convert tumor table to percentage values.")
3435
assertthat::assert_that(diff_range_c > 1, diff_range_c <= 100, msg="For computation efficiency, convert control table to percentage values.")
3536
assertthat::assert_that(nrow(tumor_table) == nrow(control_table))
36-
if (!is.null(rownames(tumor_table)) & !is.null(rownames(tumor_table)))
37-
warning("tumor_table and control_table have different rownames")
37+
if (!is.null(rownames(tumor_table)) & !is.null(rownames(control_table))){
38+
if (any(rownames(tumor_table) != rownames(control_table))){
39+
warning("tumor_table and control_table have different rownames")
40+
}
41+
}
3842

3943
assertthat::assert_that(is.numeric(ncores))
4044
ncores <- min(max(ncores, 1), parallel::detectCores())
4145
assertthat::assert_that(is.numeric(min_samples_frac), dplyr::between(min_samples_frac, 0, 1))
42-
assertthat::assert_that(is.logical(simplify))
46+
assertthat::assert_that(is.logical(return_info))
4347

4448
beta_table <- as.matrix(cbind(tumor_table, control_table))
4549
beta_table <- round(beta_table)
@@ -48,16 +52,16 @@ compute_AUC <- function(tumor_table, control_table, ncores=1, na_threshold, simp
4852
is_tumor <- c(rep(TRUE, ncol(tumor_table)), rep(FALSE, ncol(control_table)))
4953

5054
cl <- parallel::makeCluster(ncores)
51-
if (isTRUE(simplify)){
55+
if (isTRUE(return_info)){
5256
# select rows by NAs
53-
message(sprintf("[%s] Selecting sites with fraction of valid beta-scores greater than or equal to %.2f...", Sys.time(), min_samples_frac))
54-
tumor_valid_sites <- which(rowSums(!is.na(beta_table[,is_tumor]))/sum(is_tumor) >= min_samples_frac)
55-
control_valid_sites <- which(rowSums(!is.na(beta_table[,!is_tumor]))/sum(!is_tumor) >= min_samples_frac)
56-
valid_sites <- intersect(tumor_valid_sites, control_valid_sites)
57+
message(sprintf("[%s] Filter sites with fraction of available beta-scores greater than or equal to %.2f...", Sys.time(), min_samples_frac))
58+
tumor_available_sites <- which(rowSums(!is.na(beta_table[,is_tumor]))/sum(is_tumor) >= min_samples_frac)
59+
control_available_sites <- which(rowSums(!is.na(beta_table[,!is_tumor]))/sum(!is_tumor) >= min_samples_frac)
60+
available_sites <- intersect(tumor_available_sites, control_available_sites)
5761

5862
message(sprintf("[%s] Computing...", Sys.time()))
5963
auc <- setNames(rep(NA_real_, nrow(beta_table)), rownames(beta_table))
60-
auc[valid_sites] <- parallel::parApply(cl, beta_table[valid_sites,,drop=FALSE], 1, single_AUC, is_tumor=is_tumor)
64+
auc[available_sites] <- parallel::parApply(cl, beta_table[available_sites,,drop=FALSE], 1, single_AUC, is_tumor=is_tumor)
6165

6266
} else {
6367
message(sprintf("[%s] Computing...", Sys.time()))

R/compute_purity.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ compute_purity <- function(tumor_table, list_of_sites, platform) {
2020
assertthat::assert_that(any(c("hyper", "hypo") %in% names(list_of_sites)))
2121
diff_range_t <- diff(range(tumor_table, na.rm = TRUE))
2222
assertthat::assert_that(diff_range_t > 1, diff_range_t <= 100, msg="Unexpected range of beta values: convert tumor_table to percentage values.")
23-
message(sprintf("> Using %i hyper- and %i hypo-methylated sites",
23+
message(sprintf("- Using %i hyper- and %i hypo-methylated sites",
2424
length(list_of_sites[["hyper"]]),
2525
length(list_of_sites[["hypo"]])))
2626

R/select_informative_regions.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ select_informative_regions <- function(tumor_table, auc, max_sites = 20,
7070
message(sprintf("- Percentiles: %ith-%ith", percentiles[1], percentiles[2]))
7171

7272
# minimum and maximum beta per region
73-
message(sprintf("[%s] Evaluating sites...", Sys.time()))
73+
message(sprintf("[%s] Compute min-/max- beta scores...", Sys.time()))
7474
min_beta <- suppressWarnings(apply(tumor_table, 1, quantile, probs = percentiles[1]/100, na.rm = TRUE))
7575
max_beta <- suppressWarnings(apply(tumor_table, 1, quantile, probs = percentiles[2]/100, na.rm = TRUE))
7676

R/select_informative_regions_ext.R

+135
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#' Select informative regions (extended)
2+
#'
3+
#' This function generates a list of informative regions to be used to estimate
4+
#' the purity of a set of tumor samples.
5+
#'
6+
#' Informative regions are divided into \code{hyper} and \code{hypo} depending
7+
#' on their level of methylation with respect to the average beta-score of
8+
#' normal samples. Both sets will be used to compute purity.
9+
#'
10+
#' @param tumor_table A matrix of beta-values (percentage) from tumor samples.
11+
#' @param control_table A matrix of beta-values (percentage) from normal/control samples.
12+
#' @param auc A vector of AUC scores generated by \code{compute_AUC}.
13+
#' @param max_sites Maximum number of regions to retrieve (half hyper-, half
14+
#' hypo-methylated) (default = 20).
15+
#' @param hyper_range A vector of length 2 with minimum lower and upper values
16+
#' required to select hyper-methylated informative sites.
17+
#' @param hypo_range A vector of length 2 with minimum lower and upper values
18+
#' required to select hypo-methylated informative sites.
19+
#' @param method How to select sites: "even" (half hyper-, half hypo-methylated sites),
20+
#' "top" (highest AUC irregardless of hyper or hypomethylation), "hyper" (hyper-methylated sites only),
21+
#' "hypo" (hypo-methylated, sites only).
22+
#' @param percentiles Vector of length 2: lower and upper percentiles to
23+
#' select sites with beta values outside hypo- and hyper-ranges (default =
24+
#' 0,100; 0th and 100th percentiles, i.e. only min and max beta should be outside of ranges).
25+
#' @return A named list of indexes of informative regions ("hyper-" and "hypo-methylated").
26+
#' @importFrom dplyr "%>%"
27+
#' @export
28+
#' @examples
29+
#' reduced_data <- reduce_to_regions(bs_toy_matrix, bs_toy_sites, cpg_islands[1:1000,])
30+
#' auc_data <- compute_AUC(reduced_data[,1:10], reduced_data[,11:20])
31+
#' info_regions <- select_informative_regions(reduced_data[,1:10], auc_data)
32+
select_informative_regions_ext <- function(tumor_table, control_table, auc,
33+
max_sites = 20, percentiles = c(0,100),
34+
hyper_range = c(min = 40, max = 90), hypo_range = c(min = 10, max = 60),
35+
control_costraints = c(20,80),
36+
method = c("even", "top", "hyper", "hypo"), return_info=FALSE){
37+
38+
message(sprintf("[%s] # Select informative regions #", Sys.time()))
39+
# check parameters
40+
diff_range_t <- diff(range(tumor_table, na.rm = TRUE))
41+
diff_range_c <- diff(range(control_table, na.rm = TRUE))
42+
assertthat::assert_that(diff_range_t > 1, diff_range_t <= 100, msg="For computation efficiency convert tumor_table to percentage values.")
43+
assertthat::assert_that(diff_range_c > 1, diff_range_c <= 100, msg="For computation efficiency convert control_table to percentage values.")
44+
45+
tumor_table <- as.matrix(tumor_table)
46+
tumor_table <- round(tumor_table)
47+
storage.mode(tumor_table) <- "integer"
48+
49+
assertthat::assert_that(nrow(tumor_table) == length(auc))
50+
51+
assertthat::assert_that(is.numeric(max_sites))
52+
53+
assertthat::assert_that(is.numeric(hyper_range))
54+
assertthat::assert_that(is.numeric(hypo_range))
55+
assertthat::assert_that(is.numeric(control_costraints))
56+
assertthat::assert_that(is.numeric(percentiles))
57+
58+
assertthat::assert_that(length(hyper_range) == 2)
59+
assertthat::assert_that(length(hypo_range) == 2)
60+
assertthat::assert_that(length(control_costraints) == 2)
61+
assertthat::assert_that(length(percentiles) == 2)
62+
63+
assertthat::assert_that(all(dplyr::between(hyper_range, 0, 100)))
64+
assertthat::assert_that(all(dplyr::between(hypo_range, 0, 100)))
65+
assertthat::assert_that(all(dplyr::between(control_costraints, 0, 100)))
66+
assertthat::assert_that(all(dplyr::between(percentiles, 0, 100)))
67+
68+
method <- match.arg(method)
69+
if (method == "even")
70+
assertthat::assert_that(max_sites %% 2 == 0, msg="method is set to 'even' but max_sites is not even")
71+
72+
assertthat::assert_that(is.logical(return_info))
73+
74+
message(sprintf("- Method: %s", method))
75+
message(sprintf("- Number of regions to retrieve: %i", max_sites))
76+
message(sprintf("- Hyper-methylated regions range: %i-%i", hyper_range[1], hyper_range[2]))
77+
message(sprintf("- Hypo-methylated regions range: %i-%i", hypo_range[1], hypo_range[2]))
78+
message(sprintf("- Control constraints: %i-%i", control_costraints[1], control_costraints[2]))
79+
message(sprintf("- Percentiles: %ith-%ith", percentiles[1], percentiles[2]))
80+
81+
# minimum and maximum beta per region
82+
message(sprintf("[%s] Compute min-/max- beta scores...", Sys.time()))
83+
min_beta <- suppressWarnings(apply(tumor_table, 1, quantile, probs = percentiles[1]/100, na.rm = TRUE))
84+
max_beta <- suppressWarnings(apply(tumor_table, 1, quantile, probs = percentiles[2]/100, na.rm = TRUE))
85+
86+
message(sprintf("[%s] Compute control interquartiles...", Sys.time()))
87+
lower_quart <- suppressWarnings(apply(control_table, 1, quantile, probs = .25, na.rm = TRUE))
88+
upper_quart <- suppressWarnings(apply(control_table, 1, quantile, probs = .75, na.rm = TRUE))
89+
90+
if (is.null(names(auc)))
91+
names(auc) <- sprintf("CpG_%06d", seq_along(auc))
92+
93+
message(sprintf("[%s] Select regions...", Sys.time()))
94+
diff_meth_regions <- dplyr::tibble(Probe = names(auc),
95+
Index = seq_along(auc),
96+
AUC = auc,
97+
Max_beta = max_beta,
98+
Min_beta = min_beta,
99+
Lower_Quart = lower_quart,
100+
Upper_Quart = upper_quart)
101+
diff_meth_regions <- dplyr::mutate(diff_meth_regions,
102+
Type = dplyr::case_when(AUC > .80 & Min_beta < hyper_range[1] & Max_beta > hyper_range[2] & Upper_Quart < control_costraints[1] ~ "Hyper",
103+
AUC < .20 & Min_beta < hypo_range[1] & Max_beta > hypo_range[2] & Lower_Quart > control_costraints[2] ~ "Hypo",
104+
TRUE ~ "No_diff"))
105+
diff_meth_regions <- dplyr::mutate(diff_meth_regions, AUC = dplyr::if_else(Type == "Hypo", 1-AUC, AUC))
106+
diff_meth_regions <- dplyr::arrange(diff_meth_regions, -AUC)
107+
108+
regions_hyper <- dplyr::filter(diff_meth_regions, Type == "Hyper")
109+
regions_hypo <- dplyr::filter(diff_meth_regions, Type == "Hypo")
110+
message(sprintf("* Total hyper-methylated regions = %i", nrow(regions_hyper)))
111+
message(sprintf("* Total hypo-methylated regions = %i", nrow(regions_hypo)))
112+
113+
if (method == "even") {
114+
regions <- list(hyper = regions_hyper %>% dplyr::slice(seq_len(max_sites/2)) %>% dplyr::pull(Index),
115+
hypo = regions_hypo %>% dplyr::slice(seq_len(max_sites/2)) %>% dplyr::pull(Index))
116+
} else if (method == "top") {
117+
top_regions <- dplyr::bind_rows(regions_hyper, regions_hypo) %>% dplyr::arrange(-AUC) %>% dplyr::slice(seq_len(max_sites))
118+
regions <- list(hyper = top_regions %>% dplyr::filter(Type == "Hyper") %>% dplyr::pull(Index),
119+
hypo = top_regions %>% dplyr::filter(Type == "Hypo") %>% dplyr::pull(Index))
120+
} else if (method == "hyper") {
121+
regions <- list(hyper = regions_hyper %>% dplyr::slice(seq_len(max_sites)) %>% dplyr::pull(Index))
122+
} else if (method == "hypo") {
123+
regions <- list(hypo = regions_hypo %>% dplyr::slice(seq_len(max_sites)) %>% dplyr::pull(Index))
124+
}
125+
126+
message(sprintf("* Retrieved hyper-methylated regions = %i", length(regions$hyper)))
127+
message(sprintf("* Retrieved hypo-methylated regions = %i", length(regions$hypo)))
128+
129+
message(sprintf("[%s] Done", Sys.time()))
130+
if (return_info) {
131+
return(list(regions, diff_meth_regions))
132+
} else {
133+
return(regions)
134+
}
135+
}

0 commit comments

Comments
 (0)