Skip to content

Commit 83a192f

Browse files
committed
added starter code
1 parent a55b053 commit 83a192f

File tree

2 files changed

+197
-0
lines changed

2 files changed

+197
-0
lines changed

amino_acids.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# One-letter amino acid codes. '|' is the entry for the stop codons
# (TAA, TAG, TGA), which terminate translation instead of coding for an
# amino acid. The order must stay in lockstep with `codons` below.
aa = ['F', 'L', 'I', 'M', 'V', 'S', 'P', 'T', 'A', 'Y',
      '|', 'H', 'Q', 'N', 'K', 'D', 'E', 'C', 'W', 'R',
      'G']

# codons[i] lists every DNA codon that translates to aa[i].
codons = [['TTT', 'TTC'],
          ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
          ['ATT', 'ATC', 'ATA'],
          ['ATG'],
          ['GTT', 'GTC', 'GTA', 'GTG'],
          ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
          ['CCT', 'CCC', 'CCA', 'CCG'],
          ['ACT', 'ACC', 'ACA', 'ACG'],
          ['GCT', 'GCC', 'GCA', 'GCG'],
          ['TAT', 'TAC'],
          ['TAA', 'TAG', 'TGA'],
          ['CAT', 'CAC'],
          ['CAA', 'CAG'],
          ['AAT', 'AAC'],
          ['AAA', 'AAG'],
          ['GAT', 'GAC'],
          ['GAA', 'GAG'],
          ['TGT', 'TGC'],
          ['TGG'],
          ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
          ['GGT', 'GGC', 'GGA', 'GGG']]

# Create a dictionary lookup table for mapping codons into amino acids.
# Pairing the two lists with zip keeps each code next to its codons
# without manual index bookkeeping.
aa_table = {}
for amino_acid, synonymous_codons in zip(aa, codons):
    for codon in synonymous_codons:
        aa_table[codon] = amino_acid

gene_finder.py

+166
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
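Gene finder: locates open reading frames (ORFs) in a DNA sequence and
translates the likely protein-coding ones into amino acid sequences.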
4+
5+
@author: YOUR NAME HERE
6+
7+
"""
8+
9+
import random
10+
from amino_acids import aa, codons, aa_table # you may find these useful
11+
from load import load_seq
12+
13+
14+
def shuffle_string(s):
    """Return a string holding the characters of s in random order.

    NOTE: this is a helper function; you do not have to modify it in
    any way.

    s: the string to shuffle
    returns: a new string with the same characters as s, reordered using
             the `random` module (so results depend on its seed)
    """
    # Sampling len(s) items without replacement yields a random permutation.
    return ''.join(random.sample(s, len(s)))
19+
20+
# YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ###
21+
22+
23+
# Watson-Crick base pairing.
_COMPLEMENTS = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
# Same pairing as a translation table, so a whole strand can be
# complemented in one pass.
_COMPLEMENT_TABLE = str.maketrans(_COMPLEMENTS)

_CODON_LENGTH = 3
_START_CODON = 'ATG'
# The three stop codons of the standard genetic code.
_STOP_CODONS = frozenset(['TAA', 'TAG', 'TGA'])


def get_complement(nucleotide):
    """ Returns the complementary nucleotide

        nucleotide: a nucleotide (A, C, G, or T) represented as a string
        returns: the complementary nucleotide
        raises: ValueError if nucleotide is not one of A, C, G, or T
    >>> get_complement('A')
    'T'
    >>> get_complement('C')
    'G'
    >>> get_complement('G')
    'C'
    >>> get_complement('T')
    'A'
    """
    try:
        return _COMPLEMENTS[nucleotide]
    except KeyError:
        raise ValueError(
            f"expected one of A, C, G, or T, got {nucleotide!r}") from None


def get_reverse_complement(dna):
    """ Computes the reverse complementary sequence of DNA for the specified
        DNA sequence

        dna: a DNA sequence represented as a string
        returns: the reverse complementary DNA sequence represented as a
                 string. Characters other than A, C, G, and T are kept
                 unchanged (only their position is reversed).
    >>> get_reverse_complement("ATGCCCGCTTT")
    'AAAGCGGGCAT'
    >>> get_reverse_complement("CCGCGTTCA")
    'TGAACGCGG'
    >>> get_reverse_complement("")
    ''
    """
    return dna[::-1].translate(_COMPLEMENT_TABLE)


def rest_of_ORF(dna):
    """ Takes a DNA sequence that is assumed to begin with a start
        codon and returns the sequence up to but not including the
        first in frame stop codon.  If there is no in frame stop codon,
        returns the whole string.

        dna: a DNA sequence
        returns: the open reading frame represented as a string
    >>> rest_of_ORF("ATGTGAA")
    'ATG'
    >>> rest_of_ORF("ATGAGATAGG")
    'ATGAGA'
    >>> rest_of_ORF("ATGCCC")
    'ATGCCC'
    """
    # Only complete codons can be stop codons, so stop before a trailing
    # partial codon.
    last_codon_start = len(dna) - _CODON_LENGTH
    for i in range(0, last_codon_start + 1, _CODON_LENGTH):
        if dna[i:i + _CODON_LENGTH] in _STOP_CODONS:
            return dna[:i]
    return dna


def find_all_ORFs_oneframe(dna):
    """ Finds all non-nested open reading frames in the given DNA
        sequence and returns them as a list.  This function should
        only find ORFs that are in the default frame of the sequence
        (i.e. they start on indices that are multiples of 3).
        By non-nested we mean that if an ORF occurs entirely within
        another ORF, it should not be included in the returned list of ORFs.

        dna: a DNA sequence
        returns: a list of non-nested ORFs, in order of appearance
    >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
    ['ATGCATGAATGTAGA', 'ATGTGCCC']
    >>> find_all_ORFs_oneframe("CCCTAG")
    []
    """
    orfs = []
    # Index of the start codon of the ORF currently being read, or None
    # while scanning between ORFs.
    orf_start = None
    last_codon_start = len(dna) - _CODON_LENGTH
    for i in range(0, last_codon_start + 1, _CODON_LENGTH):
        codon = dna[i:i + _CODON_LENGTH]
        if orf_start is None:
            if codon == _START_CODON:
                orf_start = i
        elif codon in _STOP_CODONS:
            # Start codons seen while an ORF is open are nested, so they
            # were skipped; the stop codon closes the open ORF.
            orfs.append(dna[orf_start:i])
            orf_start = None
    if orf_start is not None:
        # No in-frame stop codon: the ORF runs to the end of the sequence
        # (including any trailing partial codon), matching rest_of_ORF.
        orfs.append(dna[orf_start:])
    return orfs


def find_all_ORFs(dna):
    """ Finds all non-nested open reading frames in the given DNA sequence in
        all 3 possible frames and returns them as a list.  By non-nested we
        mean that if an ORF occurs entirely within another ORF and they are
        both in the same frame, it should not be included in the returned list
        of ORFs.

        dna: a DNA sequence
        returns: a list of non-nested ORFs, grouped by frame (offset 0, then
                 1, then 2)

    >>> find_all_ORFs("ATGCATGAATGTAG")
    ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
    """
    orfs = []
    for frame_offset in range(_CODON_LENGTH):
        # Dropping the first `frame_offset` bases turns that frame into the
        # default frame of the shortened sequence.
        orfs.extend(find_all_ORFs_oneframe(dna[frame_offset:]))
    return orfs


def find_all_ORFs_both_strands(dna):
    """ Finds all non-nested open reading frames in the given DNA sequence on
        both strands.

        dna: a DNA sequence
        returns: a list of non-nested ORFs; those of the given strand come
                 first, followed by those of its reverse complement
    >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
    ['ATGCGAATG', 'ATGCTACATTCGCAT']
    """
    return find_all_ORFs(dna) + find_all_ORFs(get_reverse_complement(dna))


def longest_ORF(dna):
    """ Finds the longest ORF on both strands of the specified DNA and returns
        it as a string

        dna: a DNA sequence
        returns: the longest ORF (the first one found if several share the
                 maximum length), or the empty string if there is no ORF
    >>> longest_ORF("ATGCGAATGTAGCATCAAA")
    'ATGCTACATTCGCAT'
    >>> longest_ORF("CCCCCC")
    ''
    """
    orfs = find_all_ORFs_both_strands(dna)
    return max(orfs, key=len) if orfs else ''


def longest_ORF_noncoding(dna, num_trials):
    """ Computes the maximum length of the longest ORF over num_trials shuffles
        of the specified DNA sequence

        dna: a DNA sequence
        num_trials: the number of random shuffles
        returns: the maximum length longest ORF (0 if num_trials is 0 or no
                 shuffle contains an ORF)
    """
    max_length = 0
    for _ in range(num_trials):
        shuffled = shuffle_string(dna)
        max_length = max(max_length, len(longest_ORF(shuffled)))
    return max_length


def coding_strand_to_AA(dna):
    """ Computes the Protein encoded by a sequence of DNA.  This function
        does not check for start and stop codons (it assumes that the input
        DNA sequence represents a protein coding region).

        dna: a DNA sequence represented as a string
        returns: a string containing the sequence of amino acids encoded by
                 the input DNA fragment; a trailing partial codon is ignored
        raises: KeyError if a codon is missing from aa_table (e.g. it
                contains a character other than A, C, G, or T)

    >>> coding_strand_to_AA("ATGCGA")
    'MR'
    >>> coding_strand_to_AA("ATGCCCGCTTT")
    'MPA'
    """
    last_codon_start = len(dna) - _CODON_LENGTH
    return ''.join(
        aa_table[dna[i:i + _CODON_LENGTH]]
        for i in range(0, last_codon_start + 1, _CODON_LENGTH))


def gene_finder(dna, num_trials=1500):
    """ Returns the amino acid sequences that are likely coded by the specified
        dna

        An ORF is treated as a likely gene when it is strictly longer than
        the longest ORF found in num_trials random shuffles of dna, i.e.
        longer than what this base composition produces by chance.  Because
        the threshold comes from random shuffles, results can vary between
        runs unless the `random` module is seeded.

        dna: a DNA sequence
        num_trials: the number of random shuffles used to estimate the
                    length threshold
        returns: a list of all amino acid sequences coded by the sequence dna.
    """
    threshold = longest_ORF_noncoding(dna, num_trials)
    return [coding_strand_to_AA(orf)
            for orf in find_all_ORFs_both_strands(dna)
            if len(orf) > threshold]
163+
164+
if __name__ == "__main__":
    # Running this file as a script checks every docstring example above.
    import doctest
    doctest.testmod()

0 commit comments

Comments
 (0)