DEoverlap_FDR.Rmd

---
title: "Differential Gene Expression overlap"
---
  
```{r setup, echo=F,message=F,warning=F}
# cmapR: presumably supplies the GCT object class behind lvl4_data / lvl5_data
# (accessed via @mat, @rdesc, @cdesc, @cid below) -- TODO confirm.
library(cmapR)
# colorspace: diverging_hcl(), sequential_hcl(), qualitative_hcl() palettes.
library(colorspace)
# scales: alpha() for semi-transparent point colours.
library(scales)
# Expected to define line2user(), used to place legends in margin-line units.
source("~/Dropbox/GDB/line2user.R")
# Hide code, messages and warnings in every chunk of the rendered report.
knitr::opts_chunk$set(
  echo=F,
  message=F,
  warning=F
)
```

```{r FX}
# Draw a two-panel differential-expression summary on the current device.
#
# Left panel:  heatmap of signed -log10(FDR) (sign taken from Z_all) for the
#              genes (x axis) and samples (y axis) that pass the FDR cutoff.
#              Both axes are ordered by Ward clustering of the raw FDR values.
# Right panel: per sample, the number of genes with FDR at or below each of
#              the four thresholds in ALPHA, coloured by -log10(pDE + 1e-4).
#
# Arguments:
#   Z_all      matrix of Z-scores containing the row/column names of FDR_all;
#              only its sign is used.
#   FDR_all    matrix of FDR values, genes in rows and samples in columns.
#   pDE_all    matrix with one row per sample (row names = colnames(FDR_all))
#              and four columns, one per threshold in ALPHA.
#   data_name  label for the sample axis. "Ligands" and "Cell lines" have
#              dedicated margins; any other label is only accepted when at
#              least 100 genes are plotted. Only the first element is used.
#   aCut       FDR cutoff used for filtering and for the green highlight;
#              NA (default) filters at ALPHA[1] and draws no highlight.
#
# Stops when fewer than two genes or two samples pass the cutoff.
# Relies on objects defined outside this function: `lvl4_data` (gene symbols
# for axis labels when < 100 genes are shown), `line2user()` and the
# colorspace palettes. Changes layout() and par() and does not restore them.
DE_heatmap <- function(Z_all,FDR_all,pDE_all,data_name=c("Ligands","Cell lines"),aCut=NA) {

  # The default is a length-2 vector; reduce it to a scalar so the `if`
  # comparisons below are valid when the caller omits data_name.
  data_name <- data_name[1]

  ALPHA <- c(0.1,0.05,0.01,0.001)
  names(ALPHA) <- colnames(pDE_all)

  # Effective cutoff: the caller's aCut, or the loosest threshold by default.
  if (is.na(aCut)) {
    temp_cut <- ALPHA[1]
  } else {
    temp_cut <- aCut
  }

  # Keep genes reaching the cutoff in at least one sample; with many samples
  # (>= 50) also drop samples in which no retained gene reaches it.
  FDR <- FDR_all[apply(FDR_all,1,function(X) any(X <= temp_cut)),,drop=FALSE]
  if (ncol(FDR) >= 50) {
    FDR <- FDR[,apply(FDR,2,function(X) any(X <= temp_cut)),drop=FALSE]
  }
  if (nrow(FDR) < 2 || ncol(FDR) < 2) {
    # Report the cutoff actually applied (aCut may be NA).
    stop(paste0("Less than two samples reached FDR threshold of ",temp_cut * 100,"%"))
  }

  # Order samples and genes by hierarchical clustering of the raw FDR values.
  #### Euclidean distance in -log10 space? ####
  temp_hROW <- hclust(dist(t(FDR)),method="ward.D2")
  temp_hGENE <- hclust(dist(FDR),method="ward.D2")

  FDR <- FDR[temp_hGENE$order,temp_hROW$order]
  # Rows = thresholds in ALPHA, columns = samples.
  countDE <- t(sapply(ALPHA,function(Z) apply(FDR,2,function(X) sum(X <= Z))))
  FDR <- -log10(FDR) * sign(Z_all[rownames(FDR),colnames(FDR)])
  pDE <- pDE_all[colnames(FDR),]
  # Offset and cap so the score is bounded in [0, 4].
  scoreDE <- pDE + 1e-4
  scoreDE[scoreDE > 1] <- 1
  scoreDE <- t(-log10(scoreDE))


  # ---- Left panel: signed -log10 FDR heatmap ----
  layout(rbind(1:2),widths=c(6,1))
  if (nrow(FDR) >= 100) {
    par(mar=c(4.5,6,1,0.5),mgp=2:0)
    label_cex <- 0.9
  } else if (data_name == "Ligands") {
    par(mar=c(4.5,6,1,0.5),mgp=2:0)
    label_cex <- 1
  } else if (data_name == "Cell lines") {
    par(mar=c(4.5,10,1.5,0.5),mgp=2:0)
    label_cex <- 0.9
  } else {
    stop("data_name must be 'Ligands' or 'Cell lines'")
  }
  image(z=FDR / max(abs(FDR)),
        x=1:nrow(FDR),y=1:ncol(FDR),
        col=diverging_hcl(1000,palette="Blue-Red3"),
        breaks=seq(-1,1,length.out=1001),
        xaxt="n",yaxt="n",xlab=NA,ylab=NA)
  if (nrow(FDR) < 100) {
    # Few enough genes to label individually (symbols from global lvl4_data).
    mtext(lvl4_data@rdesc[rownames(FDR),"pr_gene_symbol"],
          side=1,las=2,at=1:nrow(FDR),line=0.1,cex=0.7)
    mtext("Genes",side=1,line=3.5,font=2,cex=1.5)
  } else {
    mtext("Genes",side=1,line=1,font=2,cex=1.5)
  }
  if (ncol(FDR) < 100) {
    mtext(colnames(FDR),side=2,las=2,
          at=1:ncol(FDR),adj=1,line=0.1,cex=label_cex)
  }
  if (nrow(FDR) >= 100) {
    mtext(data_name,side=2,line=2,font=2,cex=1.5,las=0)
  } else if (data_name == "Ligands") {
    mtext(data_name,side=2,line=4.5,font=2,cex=1.5,las=0)
  } else if (data_name == "Cell lines") {
    mtext(data_name,side=3,line=0,font=2,cex=1.5,las=0,
          at=par("usr")[1],adj=1)
  }

  # Colour-bar legend drawn in the bottom-left margin.
  segments(x0=seq(line2user(5,2),line2user(1,2),length.out=1001),
           x1=seq(line2user(5,2),line2user(1,2),length.out=1001),
           y0=rep(line2user(1,1),1001),y1=rep(line2user(1.7,1),1001),
           xpd=NA,col=diverging_hcl(1001,palette="Blue-Red3"))
  text(c(line2user(5,2),line2user(3,2),line2user(1,2)),
       rep(line2user(2.2,1),3),
       labels=c(-1 * round(max(abs(FDR))),
                0,
                round(max(abs(FDR)))),
       xpd=NA)
  text(line2user(3,2),line2user(3.5,1),xpd=NA,
       labels=paste("Signed","-log10 FDR",sep="\n"))

  if (!is.na(aCut)) {
    # Outline every cell at or beyond the requested cutoff.
    temp_sig <- which(abs(FDR) >= -log10(aCut),arr.ind=TRUE)
    rect(xleft=temp_sig[,1] - 0.5,
         xright=temp_sig[,1] + 0.5,
         ybottom=temp_sig[,2] - 0.5,
         ytop=temp_sig[,2] + 0.5,
         border="mediumseagreen",lwd=0.5)

    # Bracket the part of the legend beyond the cutoff on both sides. The
    # position index is clamped to 1..500 so a cutoff at either extreme of
    # the colour scale cannot yield an empty or out-of-range subscript.
    temp_half <- round((-log10(aCut) / max(abs(FDR))) * 1001 / 2)
    temp_pos <- min(max(temp_half,1),500)
    temp_neg <- min(max(500 - temp_half,1),500)
    rect(xleft=seq(line2user(3,2),line2user(1,2),
                   length.out=500)[temp_pos],
         xright=line2user(1,2),ybottom=line2user(1.7,1),ytop=line2user(1,1),
         xpd=NA,col=NA,border="mediumseagreen")
    rect(xright=seq(line2user(5,2),line2user(3,2),
                    length.out=500)[temp_neg],
         xleft=line2user(5,2),ybottom=line2user(1.7,1),ytop=line2user(1,1),
         xpd=NA,col=NA,border="mediumseagreen")
    text(line2user(3,2),line2user(0.5,1),adj=0.5,xpd=NA,
         labels=paste0("FDR \u2264 ",aCut * 100,"%"),col="mediumseagreen",cex=0.9)
  }

  # ---- Right panel: DE counts per sample at each FDR threshold ----
  if (nrow(FDR) >= 100) {
    par(mar=c(4,0.5,1,1),mgp=2:0)
  } else if (data_name == "Ligands") {
    par(mar=c(4,0.5,1,1),mgp=2:0)
  } else if (data_name == "Cell lines") {
    par(mar=c(4,0.5,1.5,1),mgp=2:0)
  }
  image(z=scoreDE / max(scoreDE),
        x=1:nrow(scoreDE),y=1:ncol(scoreDE),
        col=sequential_hcl(99,palette="PinkYl",rev=TRUE),
        breaks=seq(0,1,length.out=100),
        xaxt="n",yaxt="n",xlab=NA,ylab=NA)
  abline(v=seq(1.5,by=1,length.out=3),col="grey50")
  mtext(c("10.0","5.0","1.0","0.1"),
        side=1,at=1:4,las=2,line=0.1,cex=0.9)
  mtext("FDR %",side=1,at=0.2,las=2,line=0.1)
  if (ncol(FDR) < 100) {
    # Print each count in its cell (x = threshold index, y = sample index).
    text(as.vector(row(countDE)),
         as.vector(col(countDE)),
         labels=as.vector(countDE),cex=0.8,col="grey20")
  }
  segments(x0=seq(1.5,4,length.out=1001),
           x1=seq(1.5,4,length.out=1001),
           y0=rep(line2user(2.1,1),1001),y1=rep(line2user(2.6,1),1001),
           xpd=NA,col=sequential_hcl(1001,palette="PinkYl",rev=TRUE))
  text(x=c(1.2,4.3),y=rep(line2user(2.4,1),2),xpd=NA,
       labels=c(0,max(scoreDE)))
  mtext("-log10(P)",side=1,line=2.7,at=2.75)
  mtext("# DE at FDR thresholds",side=4)
}


# Heatmap of -log10(p + 1e-4) built from a named probability vector.
#
# Names of pDEvector are split at the last underscore: the part after it forms
# the matrix columns (labelled "Ligands" on the plot) and the part before it
# forms the rows (labelled "Cell lines"). Rows and columns are ordered by Ward
# clustering; cells at or beyond p = 0.05 are outlined in blue. With more than
# 100 columns, only columns containing at least one such cell are kept.
# Uses line2user() and sequential_hcl() from outside this function, and sets
# par() without restoring it.
pDE_heatmap <- function(pDEvector) {
  # Score corresponding to p = 0.05 after the same 1e-4 offset.
  sig_score <- -log10(0.05 + 1e-4)

  # Offset, cap at 1, and move to the -log10 scale.
  neg_log_p <- -log10(pmin(pDEvector + 1e-4,1))

  # Group by name suffix, strip the suffix, and sort each group by the
  # remaining prefix so the groups line up when stacked.
  by_suffix <- tapply(neg_log_p,as.factor(sub("^.+_","",names(neg_log_p))),c)
  by_suffix <- sapply(names(by_suffix),function(suffix) {
    vals <- by_suffix[[suffix]]
    names(vals) <- sub(paste0("_",suffix),"",names(vals))
    vals[sort(names(vals))]
  },simplify=F)
  scoreMAT <- t(do.call(rbind,by_suffix))

  if (ncol(scoreMAT) > 100) {
    has_hit <- apply(scoreMAT,2,function(col_vals) any(col_vals >= sig_score))
    scoreMAT <- scoreMAT[,has_hit]
  }

  col_order <- hclust(dist(t(scoreMAT)),method="ward.D2")$order
  row_order <- hclust(dist(scoreMAT),method="ward.D2")$order
  scoreMAT <- scoreMAT[row_order,col_order]

  n_row <- nrow(scoreMAT)
  n_col <- ncol(scoreMAT)

  par(mar=c(3,2,7,6),mgp=2:0,las=2)
  image(scoreMAT,
        x=1:n_row,y=1:n_col,
        col=sequential_hcl(100,palette="PinkYl",rev=T),
        xaxt="n",yaxt="n",xlab=NA,ylab=NA)

  # Plot-region extents, used to centre labels and place the legend.
  usr <- par("usr")
  y_centre <- usr[3] + (usr[4] - usr[3]) / 2

  if (n_col < 100) {
    # Individual column labels; shrink them when there are more than 20.
    mtext(colnames(scoreMAT),side=4,line=0.1,
          at=1:n_col,adj=0,cex=if (n_col > 20) 0.5 else 0.9)
    mtext("Ligands",side=4,line=4.5,las=0,font=2,cex=1.5,
          at=y_centre,adj=0.5)
  } else {
    mtext("Ligands",side=4,line=2,las=0,font=2,cex=1.5,
          at=y_centre,adj=0.5)
  }
  text(x=1:n_row,y=rep(line2user(0.3,3),n_row),
       labels=rownames(scoreMAT),xpd=NA,adj=0,srt=45,cex=0.8)
  text(x=0,y=line2user(1,3),labels="Cell lines",
       xpd=NA,adj=0,srt=45,font=2,cex=1.5)

  # Outline the cells at or beyond the p = 0.05 score.
  hits <- which(scoreMAT >= sig_score,arr.ind=T)
  rect(xleft=hits[,1] - 0.5,
       xright=hits[,1] + 0.5,
       ybottom=hits[,2] - 0.5,
       ytop=hits[,2] + 0.5,
       border="dodgerblue")

  # Colour-bar legend spanning the middle 40% of the x range.
  temp_left <- usr[1] + (usr[2] - usr[1]) * 0.3
  temp_right <- usr[1] + (usr[2] - usr[1]) * 0.7
  bar_x <- seq(temp_left,temp_right,length.out=1000)
  bar_bottom <- line2user(1,1)
  bar_top <- line2user(1.6,1)

  segments(x0=bar_x,
           x1=bar_x,
           y0=rep(bar_bottom,1000),y1=rep(bar_top,1000),
           xpd=NA,col=sequential_hcl(1000,palette="PinkYl",rev=T))
  mtext(c("100%","< 0.01%"),side=1,line=0.8,las=0,adj=c(1.1,-0.1),
        at=c(temp_left,temp_right))
  mtext("Probability of at least # DE occuring by chance",
        las=0,side=1,line=1.7,adj=0.5,
        at=temp_left + (temp_right - temp_left) / 2)

  # Bracket the part of the legend at or beyond p = 0.05.
  rect(xleft=temp_left + (sig_score / max(scoreMAT)) * (temp_right - temp_left),
       xright=temp_right,ybottom=bar_bottom,ytop=bar_top,
       xpd=NA,col=NA,border="dodgerblue")
  mtext("p <= 0.05",side=1,las=0,col="dodgerblue",cex=0.9,line=0,adj=1,at=temp_right)
}


# Side-by-side boxplot panels, one per dataset prefix in DSname.
#
# For each prefix D, four .RData files (<D>_lig/_ct/_ligct/_rep_FDR.RData) are
# loaded into this function's frame; each is expected to provide the matching
# pDE_* object. Every pDE_* is converted to -log10(p + 1e-4) (capped at p = 1)
# and drawn as one box per column, in four clusters of four boxes (one cluster
# per FDR threshold). A "*" is printed above a non-ligand box when its
# Wilcoxon p-value against the per-ligand scores is <= 2.2e-16.
# names(DSname) supply the panel titles. Sets par() without restoring it.
BoxP <- function(DSname) {
  par(mfrow=c(1,length(DSname)))

  # Same offset / cap / -log10 transform applied to every pDE object.
  as_score <- function(p) {
    p <- p + 1e-4
    p[p > 1] <- 1
    -log10(p)
  }
  box_border <- qualitative_hcl(4,palette="dynamic")
  box_fill <- qualitative_hcl(4,alpha=0.5,palette="dynamic")

  for (D in DSname) {
    load(paste0("~/Dropbox/GDB_archive/CMapCorr_files/",D,"_lig_FDR.RData"))
    scoreDElig <- as_score(pDE_lig)
    load(paste0("~/Dropbox/GDB_archive/CMapCorr_files/",D,"_ct_FDR.RData"))
    scoreDEct <- as_score(pDE_ct)
    load(paste0("~/Dropbox/GDB_archive/CMapCorr_files/",D,"_ligct_FDR.RData"))
    scoreDEligct <- as_score(pDE_ligct)
    load(paste0("~/Dropbox/GDB_archive/CMapCorr_files/",D,"_rep_FDR.RData"))
    scoreDErep <- as_score(pDE_rep)

    par(mar=c(5,3,2,1),mgp=2:0)
    plot(NA,NA,xlim=c(1,19),ylim=c(0,4),
         xaxt="n",xlab=NA,ylab="-log10(P) # DE at FDR thresholds",
         main=names(DSname)[which(DSname == D)])

    # Drawing order within each cluster: cell line, ligand, ligand per cell
    # line, replicate. Slot k of each cluster sits at x = k, k+5, k+10, k+15.
    groups <- list(ct=scoreDEct,lig=scoreDElig,
                   ligct=scoreDEligct,rep=scoreDErep)
    for (k in seq_along(groups)) {
      slots <- seq(k,by=5,length.out=4)
      boxplot(groups[[k]],at=slots,
              add=T,xaxt="n",yaxt="n",pch=".",cex=2,
              border=box_border[k],
              col=box_fill[k])
      if (names(groups)[k] != "lig") {
        # Compare this group with the per-ligand scores, column by column.
        temp_stat <- sapply(colnames(scoreDElig),function(X)
          wilcox.test(scoreDElig[,X],groups[[k]][,X])$p.value)
        if (any(temp_stat <= 2.2e-16)) {
          mtext("*",side=3,line=-0.4,
                at=slots[temp_stat <= 2.2e-16])
        }
      }
    }

    # Box labels (the fifth slot of each cluster is an unlabelled gap) and
    # cluster labels.
    mtext(rep(c("Cell line","Ligand","Lig / Line","Replicate",NA),4),
          col=c(box_border,NA),
          side=1,line=0.1,las=2,at=1:19,cex=0.8)
    mtext(paste0("FDR = ",c(10,5,1,0.1),"%"),
          side=1,line=3.5,at=seq(2.5,by=5,length.out=4))
  }
}
```


```{r load_data}
# Load the level-4 inputs and keep the data object under a shorter name.
# `ct14` is not created in this chunk; presumably it comes from the loaded
# .RData file -- TODO confirm.
load("~/Dropbox/GDB_archive/CMapCorr_files/lvl4_inputs_allgenes.RData")
lvl4_data <- lvl4_data_all
rm(lvl4_data_all)

temp_cdesc <- lvl4_data@cdesc

# Number of distinct ligands assayed in each cell line.
temp_ligcountsperct <- sapply(unique(temp_cdesc$cell_id),function(CT) {
  temp_ligs <- temp_cdesc[temp_cdesc$cell_id == CT,"pert_iname"]
  length(unique(temp_ligs))
})
# Cell lines treated with more than 100 distinct ligands, named via ct14.
ct9 <- names(temp_ligcountsperct)[temp_ligcountsperct > 100]
names(ct9) <- sapply(ct9,function(CT) names(ct14)[ct14 == CT])

# Number of distinct cell lines in which each ligand was assayed.
temp_ctcountsperlig <- sapply(unique(temp_cdesc$pert_iname),function(LIG) {
  temp_cts <- temp_cdesc[temp_cdesc$pert_iname == LIG,"cell_id"]
  length(unique(temp_cts))
})

# Ligands assayed in at least 9 distinct cell lines.
lig295 <- names(temp_ctcountsperlig)[temp_ctcountsperlig >= 9]

# Drop every temp* helper created above.
rm(list=grep("^temp",ls(),value=T))
```

Significantly differentially expressed genes are called by Z-score threshold (90th, 95th, and 99th percentile). Replicates are averaged without weighting (unlike Connectivity Map level 5 data, which weights correlated replicates more heavily).


# Averaging per ligand

## 16 ligands, 'landmark' genes only

```{r lig16_lig, fig.height=4,fig.width=9}
# load() returns the names of the objects it restored; keeping them in `temp`
# lets the chunk remove exactly those objects once the plot is drawn.
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/lig16_DE_lig_FDR.RData")
# Per-ligand heatmap; cells with FDR <= 10% are outlined.
DE_heatmap(meanZ_lig,FDR_lig,pDE_lig,"Ligands",aCut=0.1)
rm(list=temp)
```

## 295 ligands, 'landmark' genes only

```{r lig295_lig, fig.height=10,fig.width=9}
# Keep the names of the loaded objects in `temp` for later removal.
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/lig295_DE_lig_FDR.RData")
# Per-ligand heatmap; cells with FDR <= 5% are outlined.
DE_heatmap(meanZ_lig,FDR_lig,pDE_lig,"Ligands",aCut=0.05)
# Removal is deliberately disabled: the demo_meanZ chunk below still reads
# FDR_lig and meanZ_lig, and performs rm(list=temp) itself.
# rm(list=temp)
```


Example plot to clarify what "averaging per ligand" means:  

```{r demo_meanZ, fig.width=6,fig.height=4}
# For each example ligand, show the per-sample Z-score distribution of every
# gene with FDR < 1 (boxplots), with the mean Z-score overlaid as a point.
# Gene symbols are printed on the left margin and FDR values on the right.
# Reads FDR_lig and meanZ_lig left in the workspace by the previous chunk.
temp_lig <- c("ADM","IL1A","TNF")
for (LIG in temp_lig) {
  # Number of genes with FDR < 1 for this ligand. seq_len() is used instead of
  # seq(1, n) because seq(1, 0) is c(1, 0), which would select one gene even
  # when none qualifies.
  temp_n <- sum(FDR_lig[,LIG] < 1)
  if (temp_n == 0) {
    # Nothing to plot for this ligand.
    next
  }
  # Qualifying genes, reversed so the smallest FDR ends up at the top.
  temp_gene <- rev(rownames(FDR_lig)[order(FDR_lig[,LIG])[seq_len(temp_n)]])
  # Per-gene Z-scores across all samples treated with this ligand.
  temp_Z <- sapply(temp_gene,function(X)
    lvl4_data@mat[X,lvl4_data@cdesc$pert_iname == LIG],
    simplify=FALSE)

  par(mar=c(3,4,1,4),mgp=2:0)
  plot(NA,NA,xlim=range(unlist(temp_Z)),ylim=c(0.5,length(temp_Z) + 0.5),
       yaxs="i",yaxt="n",ylab=NA,
       xlab=paste("Z-scores from all samples treated with",LIG))
  abline(v=0,lty=2,col="red")
  boxplot(temp_Z,horizontal=TRUE,pch=".",cex=2,add=TRUE,yaxt="n")
  points(meanZ_lig[temp_gene,LIG],seq_along(temp_gene),
         pch=20,col=alpha("dodgerblue",0.8))
  legend("topleft",bty="n",pch=20,col=alpha("dodgerblue",0.8),
         legend="Mean Z-score")
  mtext(lvl4_data@rdesc[temp_gene,"pr_gene_symbol"],
        side=2,at=seq_along(temp_gene),line=0.1,las=2)
  mtext(signif(FDR_lig[temp_gene,LIG],2),
        side=4,at=seq_along(temp_gene),line=0.1,las=2)
  mtext("FDR",side=4,at=length(temp_gene) + 0.7,line=0.5,las=2,font=2)
}

# Remove the loop variable, the objects loaded by the previous chunk (named in
# `temp`), and finally every temp* helper including `temp` itself.
rm(LIG)
rm(list=temp)
rm(list=grep("^temp",ls(),value=TRUE))
```


```{r demo_volcano, eval=F,fig.height=4,fig.width=9}
# Volcano-style plots (mean Z-score vs. -log10 FDR) for three example ligands.
# NOTE(review): this chunk is not evaluated (eval=F). As written it reads
# FDR_lig and `temp`, both of which the demo_meanZ chunk removes, so it would
# need the lig295 per-ligand .RData file reloaded before it could run.
temp_lig <- c("ADM","IL1A","TNF")

# Mean Z-score per landmark gene (pr_is_lm == "1") over the samples of each
# ligand, restricted to the ct9 cell lines.
temp_meanZ <- sapply(temp_lig,function(LIG)
  rowMeans(lvl4_data@mat[lvl4_data@rdesc$pr_is_lm == "1",
                         lvl4_data@cdesc$cell_id %in% ct9 &
                           lvl4_data@cdesc$pert_iname == LIG]))
par(mfrow=c(1,3),mar=c(3,3,2,1),mgp=2:0)
for (LIG in temp_lig) {
  plot(temp_meanZ[rownames(FDR_lig),LIG],-log10(FDR_lig[,LIG]),pch=20,col=alpha(1,0.5),
       xlab="Mean Z-score per gene for all ligand-treated samples",
       ylab="-log10 FDR from Z-score",main=LIG)
}
rm(list=temp)
rm(list=grep("^temp",ls(),value=T))
```


# Averaging per cell line

## 16 ligands, 'landmark' genes only

```{r lig16_ct, fig.height=4,fig.width=9}
# Keep the names of the loaded objects in `temp` for removal below.
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/lig16_DE_ct_FDR.RData")
# DE_heatmap() stops when fewer than two genes or samples pass the cutoff;
# try() keeps the chunk going in that case, and silent=F still prints the error.
try(DE_heatmap(meanZ_ct,FDR_ct,pDE_ct,"Cell lines",aCut=0.1),silent=F)
rm(list=temp)
```


## 295 ligands, 'landmark' genes only

```{r lig295_ct, fig.height=4,fig.width=9}
# Keep the names of the loaded objects in `temp` for removal below.
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/lig295_DE_ct_FDR.RData")
# DE_heatmap() stops when fewer than two genes or samples pass the cutoff;
# try() keeps the chunk going in that case, and silent=F still prints the error.
try(DE_heatmap(meanZ_ct,FDR_ct,pDE_ct,"Cell lines",aCut=0.1),silent=F)
rm(list=temp)
```


# Averaging within each ligand per cell line

## 16 ligands, 'landmark' genes only

```{r lig16_ligct, fig.height=6,fig.width=8}
# The loaded objects are kept in the workspace: the next chunk
# (lig16_ligct_pDE) reads pDE_ligct and then performs rm(list=temp).
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/lig16_DE_ligct_FDR.RData")
# NOTE(review): "Ligands per cell line" is neither "Ligands" nor "Cell lines";
# DE_heatmap() only accepts it when at least 100 genes pass the cutoff and
# stops with "data_name must be ..." otherwise.
DE_heatmap(meanZ_ligct,FDR_ligct,pDE_ligct,"Ligands per cell line",aCut=0.001)
```

```{r lig16_ligct_pDE, fig.height=5,fig.width=6}
# Heatmap of one column of pDE_ligct (loaded by the previous chunk).
# NOTE(review): the column is selected as "01" here, whereas the
# lig295_ligct_pDE chunk selects as.character(temp_FDRcut * 100), i.e. "5" --
# confirm which naming the columns of pDE_ligct actually use.
pDE_heatmap(pDE_ligct[,"01"])
# Remove the objects loaded by the previous chunk.
rm(list=temp)
```


## 295 ligands, 'landmark' genes only

```{r lig295_ligct, fig.height=8,fig.width=9}
# The loaded objects are kept in the workspace: the next chunk
# (lig295_ligct_pDE) reads pDE_ligct, and lig295_clean removes them.
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/lig295_DE_ligct_FDR.RData")
# NOTE(review): "Ligands per cell line" is neither "Ligands" nor "Cell lines";
# DE_heatmap() only accepts it when at least 100 genes pass the cutoff and
# stops with "data_name must be ..." otherwise.
DE_heatmap(meanZ_ligct,FDR_ligct,pDE_ligct,"Ligands per cell line",aCut=0.001)
```

```{r lig295_ligct_pDE, fig.height=4,fig.width=9}
# Transposed variant of pDE_heatmap() for the 295-ligand data: the columns of
# scoreMAT (name suffixes, labelled "Ligands") run along the x axis and its
# rows (name prefixes, labelled "Cell lines") along the y axis.
# All helpers are named temp*/score*/pDEvector so the lig295_clean chunk
# removes them.
temp_FDRcut <- 0.05
temp_FDRlabel <- as.character(temp_FDRcut * 100)
# Score corresponding to p = 0.05 after the same 1e-4 offset.
temp_sigcut <- -log10(0.05 + 1e-4)

pDEvector <- pDE_ligct[,temp_FDRlabel]
names(pDEvector) <- sub("large_","",names(pDEvector))

# Offset, cap at 1, and move to the -log10 scale.
scoreDE <- -log10(pmin(pDEvector + 1e-4,1))

# Group by name suffix, strip the suffix, and sort each group by the remaining
# prefix so the groups line up when stacked.
temp_bysuffix <- tapply(scoreDE,as.factor(sub("^.+_","",names(scoreDE))),c)
temp_bysuffix <- sapply(names(temp_bysuffix),function(SFX) {
  VALS <- temp_bysuffix[[SFX]]
  names(VALS) <- sub(paste0("_",SFX),"",names(VALS))
  VALS[sort(names(VALS))]
},simplify=F)
scoreMAT <- t(do.call(rbind,temp_bysuffix))

# With many columns, keep only those with at least one cell at p <= 0.05.
if (ncol(scoreMAT) > 100) {
  temp_keep <- apply(scoreMAT,2,function(X) any(X >= temp_sigcut))
  scoreMAT <- scoreMAT[,temp_keep]
}

temp_hLIG <- hclust(dist(t(scoreMAT)),method="ward.D2")
temp_hCT <- hclust(dist(scoreMAT),method="ward.D2")
scoreMAT <- scoreMAT[temp_hCT$order,temp_hLIG$order]

par(mar=c(4.5,1,1,5.5),mgp=2:0)
image(t(scoreMAT),
      x=1:ncol(scoreMAT),y=1:nrow(scoreMAT),
      col=sequential_hcl(100,palette="PinkYl",rev=T),
      xaxt="n",yaxt="n",xlab=NA,ylab=NA)
# Axis title: columns shown / columns before filtering.
mtext(paste0("Ligands (",ncol(scoreMAT)," / ",length(scoreDE) / nrow(scoreMAT),")"),
      side=1,line=0.5,font=1,cex=1.2)

text(x=rep(line2user(0.3,4),nrow(scoreMAT)),y=1:nrow(scoreMAT),
     labels=rownames(scoreMAT),xpd=NA,adj=0,srt=315,cex=0.8)
mtext("Cell lines",side=4,las=2,line=0.5,at=line2user(0,3),cex=1.2)

# Outline the cells at or beyond the p = 0.05 score.
temp_hits <- which(t(scoreMAT) >= temp_sigcut,arr.ind=T)
rect(xleft=temp_hits[,1] - 0.5,
     xright=temp_hits[,1] + 0.5,
     ybottom=temp_hits[,2] - 0.5,
     ytop=temp_hits[,2] + 0.5,
     border="dodgerblue")

# Colour-bar legend spanning the middle 60% of the x range.
temp_usr <- par("usr")
temp_left <- temp_usr[1] + (temp_usr[2] - temp_usr[1]) * 0.2
temp_right <- temp_usr[1] + (temp_usr[2] - temp_usr[1]) * 0.8
temp_barx <- seq(temp_left,temp_right,length.out=1000)
temp_bary0 <- line2user(2,1)
temp_bary1 <- line2user(2.6,1)

segments(x0=temp_barx,
         x1=temp_barx,
         y0=rep(temp_bary0,1000),y1=rep(temp_bary1,1000),
         xpd=NA,col=sequential_hcl(1000,palette="PinkYl",rev=T))
mtext(c("100%","< 0.01%"),side=1,line=1.8,las=0,adj=c(1.1,-0.1),
      at=c(temp_left,temp_right))
mtext(paste0("Probability of at least # DE at ",
             temp_FDRlabel,
             "% FDR occuring by chance"),
      las=0,side=1,line=2.7,adj=0.5,
      at=temp_left + (temp_right - temp_left) / 2)

# Bracket the part of the legend at or beyond p = 0.05.
rect(xleft=temp_left + (temp_sigcut / max(scoreMAT)) * (temp_right - temp_left),
     xright=temp_right,ybottom=temp_bary0,ytop=temp_bary1,
     xpd=NA,col=NA,border="dodgerblue")
mtext("p <= 0.05",side=1,las=0,col="dodgerblue",cex=0.9,line=1,adj=1,at=temp_right)

```

```{r lig295_clean}
# Workspace cleanup after the lig295 "ligand per cell line" chunks.
rm(pDEvector)
# Objects restored by load() in the lig295_ligct chunk (names held in `temp`).
rm(list=temp)
# Every score* object, then every temp* helper (including `temp` itself).
rm(list=grep("^score",ls(),value=T))
rm(list=grep("^temp",ls(),value=T))
```


# Comparing DE count distributions

```{r pDE_comparison, fig.height=5,fig.width=9}
# Dataset file prefixes passed to BoxP(); the names are used as panel titles.
inD <- c("lig16_DE","lig295_DE")
names(inD) <- c("16 ligands","295 ligands")

# One boxplot panel per dataset.
BoxP(inD)
```

An asterisk (\*) marks groups whose distribution differs from that obtained by averaging per ligand at p ≤ 2.2e-16 (Wilcoxon rank-sum test).  


```{r pDE_comparison_lig295, fig.width=6,fig.height=3.5}
# Horizontal boxplots for the 295-ligand data: -log10(p + 1e-4) of the pDE_*
# objects at each FDR threshold, comparing averaging per ligand, per ligand
# within cell line, and per replicate. A "*" in the left margin marks a box
# whose Wilcoxon p-value against the per-ligand scores is <= 2.2e-16.
D <- "lig295_DE"

# Same offset / cap / -log10 transform applied to every pDE object.
temp_as_score <- function(P) {
  P <- P + 1e-4
  P[P > 1] <- 1
  -log10(P)
}

load(paste0("~/Dropbox/GDB_archive/CMapCorr_files/",D,"_lig_FDR.RData"))
scoreDElig <- temp_as_score(pDE_lig)
load(paste0("~/Dropbox/GDB_archive/CMapCorr_files/",D,"_ligct_FDR.RData"))
scoreDEligct <- temp_as_score(pDE_ligct)
load(paste0("~/Dropbox/GDB_archive/CMapCorr_files/",D,"_rep_FDR.RData"))
scoreDErep <- temp_as_score(pDE_rep)

par(mar=c(3,4.5,0.5,2),mgp=2:0)
plot(NA,NA,xlim=c(0,4),ylim=c(1,15),
     xlab="Probability of # DE at FDR thresholds",
     xaxt="n",yaxt="n",ylab=NA)
axis(1,at=-log10(c(1,0.1,0.05,0.01,0.001,0.0001)),
     labels=c("100%","10%","5%","1%","0.1%","0.01%"))

temp_border <- qualitative_hcl(3,palette="dark3")
temp_fill <- qualitative_hcl(3,alpha=0.5,palette="dark3")
temp_groups <- list(scoreDElig,scoreDEligct,scoreDErep)

# Slot K of each cluster of three boxes sits at y = K, K+4, K+8, K+12.
for (temp_k in seq_along(temp_groups)) {
  temp_slots <- seq(temp_k,by=4,length.out=4)
  boxplot(temp_groups[[temp_k]],at=temp_slots,
          add=T,horizontal=T,
          xaxt="n",yaxt="n",pch=".",cex=2,
          border=temp_border[temp_k],
          col=temp_fill[temp_k])
  if (temp_k > 1) {
    # Compare this group with the per-ligand scores, column by column.
    temp_stat <- sapply(colnames(scoreDElig),function(X)
      wilcox.test(scoreDElig[,X],temp_groups[[temp_k]][,X])$p.value)
    if (any(temp_stat <= 2.2e-16)) {
      mtext("*",side=2,line=3.3,font=2,
            at=temp_slots[temp_stat <= 2.2e-16])
    }
  }
}

# Box labels (the fourth slot of each cluster is an unlabelled gap) and
# cluster labels.
mtext(rep(c("Ligand","Lig / Line","Replicate",NA),4),
      col=c(temp_border,NA),
      side=2,line=0.1,las=2,at=1:16)
mtext(paste0(c(10,5,1,0.1),"% FDR"),
      side=4,line=0,at=seq(2,by=4,length.out=4),cex=0.9)

```

****

# Probability of overlapping DE


```{r cleanup}
# Start the level-5 section from an empty workspace. This also removes the
# functions defined earlier in the document, so line2user() is sourced again.
rm(list=ls())
source("~/Dropbox/GDB/line2user.R")
```

```{r load_data_lvl5}
# Load the level-5 inputs. `ct14` and `lig16` are not created in this chunk;
# presumably they come from the loaded .RData file -- TODO confirm.
load("~/Dropbox/GDB_archive/CMapCorr_files/lvl5_inputs.RData")

temp_cdesc <- lvl5_data@cdesc

# Number of distinct ligands assayed in each cell line.
temp_ligcountsperct <- sapply(unique(temp_cdesc$cell_id),function(CT) {
  temp_ligs <- temp_cdesc[temp_cdesc$cell_id == CT,"pert_iname"]
  length(unique(temp_ligs))
})
# Cell lines treated with more than 100 distinct ligands, named via ct14.
ct9 <- names(temp_ligcountsperct)[temp_ligcountsperct > 100]
names(ct9) <- sapply(ct9,function(CT) names(ct14)[ct14 == CT])

# Number of distinct cell lines in which each ligand was assayed.
temp_ctcountsperlig <- sapply(unique(temp_cdesc$pert_iname),function(LIG) {
  temp_cts <- temp_cdesc[temp_cdesc$pert_iname == LIG,"cell_id"]
  length(unique(temp_cts))
})

# Ligands assayed in at least 9 distinct cell lines.
lig295 <- names(temp_ctcountsperlig)[temp_ctcountsperlig >= 9]

# Drop the lookup tables that are no longer needed and every temp* helper.
rm(list=c("ct14","lig16",grep("^temp",ls(),value=T)))
```

```{r testing_summary_methods, fig.height=2,fig.width=8}
# Compare two ways of summarising the signatures of one ligand in one cell
# line: (1) genes passing pnorm(-|Z|) <= 0.05 in every signature ("Intersect")
# and (2) genes passing the same test on the mean Z-score ("Mean Z score").
# Each result is a matrix of gene counts, cell lines (ct9) by ligands (lig295).

# (1) Number of genes whose tail probability is <= 0.05 in all signatures.
temp_overlap <- sapply(lig295,function(LIG) {
  sapply(ct9, function(CT) {
    ID <- lvl5_data@cid[lvl5_data@cdesc$pert_iname == LIG & lvl5_data@cdesc$cell_id == CT]
    if (length(ID) == 0) {
      # No signature for this pair. Without this guard all() over zero columns
      # is TRUE for every gene, so the count would equal the number of genes.
      return(NA_integer_)
    }
    temp <- pnorm(-abs(lvl5_data@mat[,ID,drop=FALSE]))
    sum(apply(temp,1,function(X) all(X <= 0.05)))
  })
})

# (2) Number of genes whose mean Z-score has tail probability <= 0.05.
temp_meanZ <- sapply(lig295,function(LIG) {
  sapply(ct9, function(CT) {
    ID <- lvl5_data@cid[lvl5_data@cdesc$pert_iname == LIG & lvl5_data@cdesc$cell_id == CT]
    if (length(ID) == 0) {
      # No signature for this pair: report it as missing explicitly.
      return(NA_integer_)
    }
    sum(pnorm(-abs(rowMeans(lvl5_data@mat[,ID,drop=FALSE]))) <= 0.05)
  })
})

# boxplot() ignores the NA entries produced for missing pairs.
par(mar=c(3,6,1,1),mgp=2:0)
boxplot(list(OVERLAP=temp_overlap,MEANZ=temp_meanZ),horizontal=TRUE,yaxt="n",pch=".",cex=2,
        xlab="# DE genes common across replicates at p <= 0.05 when summarized as shown")
mtext(c("Intersect","Mean Z score"),side=2,las=2,line=0.5,at=c(1,2))
```

This section uses level 5 data (moderated Z-scores computed across all replicates). Signatures are summarized across dosage and duration by taking the mean Z-score. This seems fairer than intersecting signatures, given that different ligand / cell line combinations have differing numbers of dosage/duration replicates.

```{r}

```