-
Notifications
You must be signed in to change notification settings - Fork 7
/
peer.R
85 lines (66 loc) · 2.91 KB
/
peer.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env Rscript
# script to remove both known and hidden covariates using PEER
# partially adapted from: https://github.com/hall-lab/gtex
##' @param mat numeric data.matrix of gene expressions with genes as rows and individuals as columns, with gene entrez ids as rownames
##' @param mat.aligner data.frame of 2 columns contaning the gene entrez id and gene symbols, with "id" and "gene" as colnames
##' @param covs data.frame contaning the meta data / covariate data,
##' @return residuals data.frame of expression residual with genes as rows and individuals as columns
peer.residual <- function(mat, mat.aligner, covs=NULL, n.iteration=1000){
library(peer)
set.seed(666)
model <- PEER()
# normalize the values of one gene to the range of 0~1, so that no variable's range has an unduly large impact on the distance measurement.
# ref: https://stackoverflow.com/questions/16874038/error-with-knn-function
range01 <- function(x, ...){(x - min(x, ...)) / (max(x, ...) - min(x, ...))}
mat <- t(apply(mat, 1, FUN=function(x) range01(x, na.rm=T)))
# optional: Impute missing values in gene expression
if(sum(is.na(mat)) != 0){
# remove genes with more than 30% missing values
row.NAs <- which(rowSums(is.na(mat)) >= 0.3 * ncol(mat))
if(length(row.NAs) != 0){
message("removing genes with more than 30% missing values")
mat <- mat[-row.NAs, ]
}
# remove samples with more than 30% missing values
col.NAs <- which(colSums(is.na(mat)) >= 0.3 * nrow(mat))
if(length(col.NAs) != 0){
message("removing samples with more than 30% missing values")
mat <- mat[, -col.NAs]
}
if(sum(is.na(mat)) != 0){
message("imputing missing values in gene expression data")
library(impute)
mat.impute <- impute.knn(mat)
mat <- mat.impute$data
}
}
PEER_setPhenoMean(model, t(as.matrix(mat)))
# set covariates
if(!is.null(covs)){
PEER_setCovariates(model, as.matrix(covs))
}
# set the number of covariates 25% of the sample number, but less than 100. (just a sufficiently large value)
# ref: (PEER paper) https://www.nature.com/articles/nprot.2011.457
numcov <- 0.25*ncol(mat)
numcov <- min(numcov, 100)
PEER_setNk(model, numcov)
PEER_getNk(model)
# use the default tolerances
PEER_setTolerance(model, 0.001)
PEER_setVarTolerance(model, 0.00001)
# set the number of iterations
PEER_setNmax_iterations(model, n.iteration)
PEER_update(model)
factors=t(PEER_getX(model))
weights=PEER_getW(model)
precision=PEER_getAlpha(model)
residuals=t(PEER_getResiduals(model))
rownames(residuals) <- rownames(mat)
colnames(residuals) <- colnames(mat)
residuals <- as.data.frame(round(residuals, digits=4))
residuals <- cbind(rownames(residuals), residuals)
colnames(residuals)[1] <- "id"
residuals <- plyr::join(residuals, mat.aligner, by="id")
residuals <- residuals[,c(1,ncol(residuals),2:(ncol(residuals)-1))]
return(residuals)
}