|
| 1 | +#!/usr/bin/env Rscript |
| 2 | + |
| 3 | +## --------------------------- |
| 4 | +## |
| 5 | +## Script name: computeCaptureKitCorrectionFactor.R |
| 6 | +## |
| 7 | +## Purpose of script: Compure correction factors for triplets between WGS and WES pipelines. |
| 8 | +## For details you can have a look at the documentation of the R package YAPSA: |
| 9 | +## https://www.bioconductor.org/packages/release/bioc/vignettes/YAPSA/inst/doc/vignette_exomes.html#introduction |
| 10 | +## |
| 11 | +## Author: Niklas Reimer, M.Sc. |
| 12 | +## |
| 13 | +## Date Created: 2020-11-04 |
| 14 | + |
| 15 | +## |
| 16 | +## --------------------------- |
| 17 | + |
| 18 | +require(bedr) |
| 19 | +require(plyr) |
| 20 | +require(stringr) |
| 21 | + |
| 22 | +args = commandArgs(trailingOnly=TRUE) |
| 23 | + |
| 24 | +# read bed file |
| 25 | +bed <- read.table(args[1]) |
| 26 | +bed <- transform(bed, V1 = as.character(V1), V2 = as.numeric(V2), V3 = as.numeric(V3)) |
| 27 | +bed <- bedr.sort.region(bed) |
| 28 | + |
| 29 | +# extract regions from fasta |
| 30 | +fasta <- get.fasta(bed, fasta = args[2]) |
| 31 | + |
| 32 | +# load targetCapture_cor_factors |
| 33 | +load(args[3]) |
| 34 | + |
| 35 | +# read triplets with every possible shift |
| 36 | +tripletlist1 <- strsplit(as.character(fasta$sequence), split = "(?<=.{3})", perl = TRUE) |
| 37 | +tripletlist2 <- strsplit(as.character(substring(fasta$sequence, 2)), split = "(?<=.{3})", perl = TRUE) |
| 38 | +tripletlist3 <- strsplit(as.character(substring(fasta$sequence, 3)), split = "(?<=.{3})", perl = TRUE) |
| 39 | +triplets1 <- unlist(tripletlist1) |
| 40 | +triplets2 <- unlist(tripletlist2) |
| 41 | +triplets3 <- unlist(tripletlist3) |
| 42 | +triplets <- c(triplets1, triplets2, triplets3) |
| 43 | + |
| 44 | +# filter by sequence |
| 45 | +counts <- count(toupper(triplets)) |
| 46 | +# remove sequences with length lower than 3 |
| 47 | +countsFiltered <- counts[str_length(counts$x) == 3,] |
| 48 | + |
| 49 | +# remove ambigious triplets |
| 50 | +countsFiltered <- countsFiltered[str_detect(countsFiltered$x, "N", negate = TRUE),] |
| 51 | + |
| 52 | +# calculate correction factors |
| 53 | +abs_cor <- targetCapture_cor_factors["hs37d5"]$hs37d5$abs_cor / countsFiltered$freq |
| 54 | +rel_cor <- abs_cor / (sum(targetCapture_cor_factors["hs37d5"]$hs37d5$abs_cor) / sum(countsFiltered$freq)) |
| 55 | + |
| 56 | +# save computated data |
| 57 | +targetCapture_cor_factors[[args[4]]] <- list("rel_cor" = rel_cor, "abs_cor" = abs_cor) |
| 58 | +save(targetCapture_cor_factors, file = file.path(args[5])) |
0 commit comments