Skip to content

Commit bf7b6ad

Browse files
committed
removed parallel; added cpg_islands na filtering
1 parent 9d545ff commit bf7b6ad

33 files changed

+382
-293
lines changed

DESCRIPTION

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,13 @@ Type: Package
44
Title: Purity Assessment through clonal MEthylation Sites
55
Description: Exploiting data from DNA methylation, this package provides the operations
66
required to evaluate the purity of tumor samples.
7-
Version: 0.1.0
7+
Version: 0.2.0
88
Authors@R: c(person("Dario", "Romagnoli", email="[email protected]",
99
role=c("aut", "cre")),
1010
person("Matteo", "Benelli", role="aut"))
1111
Depends: R (>= 3.3.1)
1212
biocViews: ROC
1313
Imports:
14-
assertthat,
1514
ROC,
1615
parallel
1716
Suggests:

NAMESPACE

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22

33
export(compute_AUC)
44
export(compute_purity)
5-
export(map_to_islands)
5+
export(reduce_to_islands)
66
export(select_informative_islands)
77
export(select_informative_sites)

PAMES.Rproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ AlwaysSaveHistory: Default
66

77
EnableCodeIndexing: Yes
88
UseSpacesForTab: Yes
9-
NumSpacesForTab: 4
9+
NumSpacesForTab: 2
1010
Encoding: UTF-8
1111

1212
RnwWeave: Sweave

R/PAMES.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
#' The two main functions of PAMES are \code{\link{select_informative_sites}}
77
#' (to retreive sites of interest), and \code{\link{compute_purity}} (to compute
88
#' purity of tumor samples providing a list of sites generated by
9-
#' \code{\link{select_informative_sites}}). Addionally, if working with
9+
#' \code{\link{select_informative_sites}}). Additionally, if working with
1010
#' Bisulphite Sequencing data, users must first
11-
#' map the data with \code{\link{map_to_islands}}, than select informative
11+
#' reduce the data with \code{\link{reduce_to_islands}}, than select informative
1212
#' CpG islands with \code{\link{select_informative_islands}}, and finally
1313
#' estimate purity with \code{\link{compute_purity}}.
1414
#

R/compute_AUC.R

+23-22
Original file line numberDiff line numberDiff line change
@@ -5,30 +5,31 @@
55
#'
66
#' @param tumor a matrix of beta scores generated by an Illumina BeadChip.
77
#' @param control a matrix of beta scores generated by an Illumina BeadChip.
8-
#' @param max_NAs_fraction fraction of NAs (considered independently in tumor and
9-
#' control samples) above which a site will not be selected (default=0.5).
10-
#' @param n_threads number of cores to use
8+
#' @param max_NAs_frac a value between 0 and 1. Sites with a fraction of NAs
9+
#' major than or equal to it are excluded (default = 1; no NAs allowed).
1110
#' @return a vector of AUC scores
1211
#' @export
13-
compute_AUC <- function(tumor, control, max_NAs_frac=0.5){
14-
stopifnot(max_NAs_frac >= 0 | max_NAs_frac <= 1)
15-
stopifnot(nrow(tumor) == nrow(control))
12+
compute_AUC <- function(tumor, control, max_NAs_frac=1){
13+
tumor <- as.matrix(tumor)
14+
control <- as.matrix(control)
15+
stopifnot(max_NAs_frac >= 0 | max_NAs_frac <= 1)
16+
stopifnot(nrow(tumor) == nrow(control))
1617

17-
message(sprintf("[%s] Checking per-row NAs fraction...", Sys.time()))
18-
auc <- rep(NA, nrow(tumor))
19-
valid_tumor_row_logi <- apply(tumor, 1, function(x) sum(is.na(x)) / length(x) < max_NAs_frac)
20-
valid_control_row_logi <- apply(control, 1, function(x) sum(is.na(x)) / length(x) < max_NAs_frac)
21-
valid_row_idx <- which(valid_tumor_row_logi & valid_control_row_logi)
18+
message(sprintf("[%s] Checking per-row NAs fraction...", Sys.time()))
19+
auc <- rep(NA, nrow(tumor))
20+
valid_tumor_row_logi <- apply(tumor, 1, function(x) sum(is.na(x)) / length(x) < max_NAs_frac)
21+
valid_control_row_logi <- apply(control, 1, function(x) sum(is.na(x)) / length(x) < max_NAs_frac)
22+
valid_row_idx <- which(valid_tumor_row_logi & valid_control_row_logi)
2223

23-
full_table <- cbind(tumor[valid_row_idx, ], control[valid_row_idx, ])
24-
state <- c(rep(1, ncol(tumor)), rep(0, ncol(control)))
25-
message(sprintf("[%s] Computing AUC ", Sys.time()))
26-
valid_auc <- apply(full_table, 1, function(x){
27-
non_NA_idx <- which(!is.na(x))
28-
roc <- ROC::rocdemo.sca(truth=state[non_NA_idx], data=x[non_NA_idx], cutpts=seq(0, 1, .01))
29-
ROC::AUC(roc)
30-
})
31-
auc[valid_row_idx] <- valid_auc
32-
message(sprintf("[%s] Done.", Sys.time()))
33-
return(auc)
24+
full_table <- cbind(tumor[valid_row_idx, ], control[valid_row_idx, ])
25+
state <- c(rep(1, ncol(tumor)), rep(0, ncol(control)))
26+
message(sprintf("[%s] Computing AUC ", Sys.time()))
27+
valid_auc <- apply(full_table, 1, function(x){
28+
non_NA_idx <- which(!is.na(x))
29+
roc <- ROC::rocdemo.sca(truth=state[non_NA_idx], data=x[non_NA_idx], cutpts=seq(0, 1, .01))
30+
ROC::AUC(roc)
31+
})
32+
auc[valid_row_idx] <- valid_auc
33+
message(sprintf("[%s] Done.", Sys.time()))
34+
return(auc)
3435
}

R/compute_islands_indexes.R

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#' Associated to each cpg islands the corresponding cpg site
2+
#' @param cpg_sites a data.frame reporting the location of CpG sites from Bisulphite
3+
#' Sequencing data and requiring two columns: chromosome and 1-based coordinate of CpG sites.
4+
#' @param cpg_islands_df a data.frame reporting the location of CpG islands. By default it
5+
#' loads the `cpg_islands` data.frame that comes with the package (based on data
6+
#' downloaded from [UCSC Genome Browser](https://genome.ucsc.edu/) on date 2016-03-01.
7+
#' @examples
8+
#' \dontrun{
9+
#' cpg_indexes <- compute_islands_indexes(bs_toy_sites)
10+
#' }
11+
#' \donttest{
12+
#' cpg_indexes_updated <- compute_islands_indexes(bs_toy_sites, cpg_islands_df = cpg_islands_v2)
13+
#' }
14+
compute_island_indexes <- function(cpg_sites, cpg_islands_df = cpg_islands){
15+
if (!is.data.frame(cpg_sites) | !is.data.frame(cpg_islands_df))
16+
stop("'cpg_sites' and 'cpg_islands_df' must be data.frames")
17+
if (!is.character(cpg_sites[[1]]) | !is.numeric(cpg_sites[[2]]))
18+
stop("First to colums of 'cpg_sites' must be 'character' and 'numeric', respectively")
19+
if (any(!startsWith(cpg_sites[[1]], "chr")))
20+
stop("Please be sure that chromosome symbols in 'cpg_sites' has the format 'chrN'")
21+
if (!is.character(cpg_islands_df[[1]]) | !is.numeric(cpg_islands_df[[2]])| !is.numeric(cpg_islands_df[[3]]))
22+
stop("First column of 'cpg_islands_df' must be 'character', second and third 'numeric'")
23+
24+
message(sprintf("[%s] Computing indexes for 'CpG sites to CpG islands' mapping...", Sys.time()))
25+
cpg_indexes <- apply(cpg_islands_df, 1, function(x) {
26+
same_chromosome <- cpg_sites[[1]] == x[["chr"]]
27+
is_between <- cpg_sites[[2]] >= x[["start"]] & cpg_sites[[2]] <= x[["end"]]
28+
return(which(same_chromosome & is_between))
29+
})
30+
return(cpg_indexes)
31+
}

R/compute_purity.R

+3-3
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
#' }
1717
compute_purity <- function(tumor, list_of_sites) {
1818
tumor <- as.matrix(tumor)
19-
assertthat::assert_that(is.list(list_of_sites),
20-
length(list_of_sites) == 2,
21-
all(names(list_of_sites) %in% c("hyper", "hypo")) )
19+
stopifnot(is.list(list_of_sites))
20+
stopifnot(length(list_of_sites) == 2)
21+
stopifnot(all(names(list_of_sites) %in% c("hyper", "hypo")))
2222

2323
beta_values <- rbind(tumor[list_of_sites[["hyper"]],],
2424
1 - tumor[list_of_sites[["hypo"]],])

R/data.R

+3-5
Original file line numberDiff line numberDiff line change
@@ -24,26 +24,24 @@
2424

2525
#' Curated set of CpG islands
2626
#'
27-
#' Automatically loaded by \code{\link{map_to_islands}}.
28-
#'
2927
#' Downloaded on 2016-03-01.
3028
#' @source \url{https://genome.ucsc.edu/}
3129
"cpg_islands"
3230

3331

3432
#' Toy data
3533
#'
36-
#' Test with \code{\link{map_to_islands}}.
34+
#' Test with \code{\link{reduce_to_islands}}.
3735
"bs_tumor_toy_data"
3836

3937
#' Toy data
4038
#'
41-
#' Test with \code{\link{map_to_islands}}.
39+
#' Test with \code{\link{reduce_to_islands}}.
4240
"bs_control_toy_data"
4341

4442
#' Toy data
4543
#'
46-
#' Test with \code{\link{map_to_islands}}.
44+
#' Test with \code{\link{reduce_to_islands}}.
4745
"bs_toy_sites"
4846

4947

R/map_to_islands.R

-74
This file was deleted.

R/reduce_to_islands.R

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#' Convert beta values from Bisulphite Sequencing to single CpG island beta values
2+
#'
3+
#' Reduce beta values from different CpG sites to single beta values
4+
#' associated to one CpG island where CpG sites are located.
5+
#'
6+
#' Bisulphite Sequencing is used to retrieve level of DNA methylation at
7+
#' base resolution. Different experiments may retrieve
8+
#' different CpG sites. This function makes a direct comparison possible.
9+
#'
10+
#' @param beta_values a matrix of beta values.
11+
#' @param cpg_indexes a list of indexes generated by \code{compute_islands_indexes}.
12+
#' @param min_CpGs an integer (default to 3). Minimum number of CpG sites within
13+
#' a single CpG island required to compute the reduced beta value (return NA otherwise).
14+
#' @return a matrix of beta values (one row per CpG island).
15+
#' @examples
16+
#' \donttest{
17+
#' reduced <- reduce_to_islands(bs_control_toy_data, bs_toy_sites)
18+
#' }
19+
#' @export
20+
reduce_to_islands <- function(cpg_sites_matrix, cpg_indexes, min_CpGs = 3){
21+
cpg_sites_matrix <- as.matrix(cpg_sites_matrix)
22+
message(sprintf("[%s] Reducing beta values...", Sys.time()))
23+
cpg_islands_matrix <- do.call("rbind",
24+
lapply(cpg_indexes, function(x)
25+
reduce_island(cpg_sites_matrix[x, , drop = F], min_CpGs)))
26+
message(sprintf("[%s] Done", Sys.time()))
27+
return(cpg_islands_matrix)
28+
}
29+
30+
31+
#' Reduce a matrix subset
32+
#'
33+
#' Takes into consideration the eventual presence of NAs
34+
#' @param cpg_sites_matrix a matrix
35+
#' @param n minimum required number of sites per island (return NA otherwise)
36+
#' @keywords internal
37+
reduce_island <- function(cpg_sites_matrix, n) {
38+
if (ncol(cpg_sites_matrix) < n) {
39+
beta_values <- NA
40+
} else {
41+
# identify sites without any beta-score
42+
all_NAs_sites <- rowSums(is.na(cpg_sites_matrix)) <= ncol(cpg_sites_matrix)
43+
if (sum(!all_NAs_sites) < n) {
44+
beta_values <- NA
45+
} else {
46+
beta_values <- apply(cpg_sites_matrix, 2, median, na.rm = T)
47+
}
48+
}
49+
return(beta_values)
50+
}

R/select_informative_islands.R

+4-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
#' info_sites <- select_informative_islands(mapped_bs_tumor_toy_data, auc)
1919
#' }
2020
#' @export
21-
select_informative_islands <- function(tumor, auc, max_sites = 20){
21+
select_informative_islands <- function(tumor, auc, hyper_range = c(min = .40, max = .90),
22+
hypo_range = c(min = .10, max = .60), max_sites = 20){
2223
# check parameters ---------------------------------------------------------
2324
if (max_sites %% 2 != 0 & !is.integer(max_sites))
2425
stop("'max_sites' must be even")
@@ -30,8 +31,8 @@ select_informative_islands <- function(tumor, auc, max_sites = 20){
3031
# minimum and maximum beta per site ----------------------------------------
3132
beta_max <- suppressWarnings(apply(tumor, 1, max, na.rm=T))
3233
beta_min <- suppressWarnings(apply(tumor, 1, min, na.rm=T))
33-
hyper_idx <- which(beta_min < .40 & beta_max > .90 & auc > .80)
34-
hypo_idx <- which(beta_min < .10 & beta_max > .60 & auc < .20)
34+
hyper_idx <- which(beta_min < hyper_range[1] & beta_max > hyper_range[2] & auc > .80)
35+
hypo_idx <- which(beta_min < hypo_range[1] & beta_max > hypo_range[2] & auc < .20)
3536

3637
message(sprintf("[%s] Total hyper-methylated sites = %i", Sys.time(), length(hyper_idx)))
3738
message(sprintf("[%s] Total hypo-methylated sites = %i", Sys.time(), length(hypo_idx)))

0 commit comments

Comments
 (0)