-
Notifications
You must be signed in to change notification settings - Fork 0
/
alignment_functions.R
66 lines (56 loc) · 2.46 KB
/
alignment_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
####### translateGappedAln: translate() does not deal with gap characters yet (should later - see https://github.com/Bioconductor/Biostrings/issues/30). So I made my own translateGappedAln function
getCodons <- function(myAln) {
seqs <- as.character(myAln)
len <- width(myAln)[1]
starts <- seq(from=1, to=len, by=3)
ends <- starts + 2
myViews <- lapply(myAln, function(x) {
Views(x, starts, ends)
})
myCodons <- lapply(myViews, function(x) {
as.character(DNAStringSet(x))
})
myCodons
}
## translateCodons - takes a character vector of codons as input, outputs the corresponding amino acids
translateCodons <- function(myCodons, unknownCodonTranslatesTo="-") {
## make new genetic code
gapCodon <- "-"
names(gapCodon) <- "---"
my_GENETIC_CODE <- c(GENETIC_CODE, gapCodon)
## translate the codons
pep <- my_GENETIC_CODE[myCodons]
## check for codons that were not possible to translate, e.g. frameshift codons
if (sum(is.na(pep))>0) {
message("\nWarning - there were codons I could not translate. Using this character: ", unknownCodonTranslatesTo, "\n")
unknownCodons <- unique(myCodons[ which(is.na(pep))])
message("The codons in question were: ",
paste(unknownCodons, collapse=","),
"\n\n")
pep[ which(is.na(pep)) ] <- unknownCodonTranslatesTo
}
## prep for output
pep <- paste(pep, collapse="")
return(pep)
}
## wrap the getCodons and translateCodons functions together into one:
translateGappedAln <- function(myAln, unknownCodonTranslatesTo="-") {
myCodons <- getCodons(myAln)
myAAaln <- AAStringSet(unlist(lapply(myCodons, translateCodons, unknownCodonTranslatesTo=unknownCodonTranslatesTo)))
return(myAAaln)
}
########### getAlnPosLookupTable - uses an alignment to get a lookup table of alignment position versus ungapped position in each sequence. Gap positions get an NA
getAlnPosLookupTable <- function(myAln) {
if(length(unique(width(myAln)))>1) {
stop("\n\nERROR - the seqs in the alignment are not all the same length\n\n")
}
output <- tibble(aln_pos = 1:width(myAln)[1])
myAln_chars <- strsplit(as.character(myAln),"")
each_seq_pos <- sapply(myAln_chars, function(eachSeq_chars) {
countNonGap <- cumsum(eachSeq_chars != "-")
countNonGap[which(eachSeq_chars=="-")] <- NA
return(countNonGap)
})
output <- cbind(output, each_seq_pos)
return(output)
}