-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathprobeTiler.py
executable file
·175 lines (135 loc) · 5.5 KB
/
probeTiler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python
"""
designProbes
This script generates probes of a given length and
tiling density for use in capture experiments.
"""
import shlex, subprocess, sys, os, argparse, re, math, uuid
from string import *
__author__ = "Colby Chiang ([email protected])"
__version__ = "$Revision: 0.0.3 $"
__date__ = "$Date: 2011/06/22 13:22 $"
############# FUNCTIONS ###################
#runs a command in the bash shell
def bash(cmd,cwd=None):
# print(cmd) # diagnostic
retVal = subprocess.Popen(cmd, shell=True, \
stdout=subprocess.PIPE, cwd=cwd).stdout.read().strip('\n').split('\n')
if retVal==['']:
return(0)
else:
return(retVal)
# gets the reverse complement of a DNA strand
def revcomp(dna):
""" reverse complement of a DNA sequence """
comp = dna.translate(maketrans("AGCTagct", "TCGAtcga"))
lcomp = list(comp)
lcomp.reverse()
return ''.join(lcomp)
############# END FUNCTIONS ################
parser = argparse.ArgumentParser(description='Design capture probes over multiple genomic regions.')
parser.add_argument('-g', '--genome', required=True, help='Genome 2bit file')
parser.add_argument('-l', '--length', required=True, help='Length of each probe (bp)')
parser.add_argument('-s', '--separation', help='Separation distance between probes. (Either this or probeNumber is required)')
parser.add_argument('-n', '--probeNumber', help='total number of probes to tile to. (Either this or separation is required)')
parser.add_argument('-str', '--strand', choices=['plus', 'minus', 'both'], default='plus', help='Tile over the plus strand, the minus strand or both strands')
parser.add_argument( '-c', '--coordinates', help='Chromosomal position for region of interest. Coordinates are zero-based. Must be of the form chr6:5302012-5703254')
parser.add_argument('-bed', '--bedFile', help='BED file of regions')
#parser.add_argument('-o', '--output', required=True, help='Output prefix for probe files')
args = parser.parse_args()
genome = args.genome
coords = args.coordinates
probeLength = int(args.length)
strand = args.strand
refGenome = args.genome # 2bit fasta file (downloaded from here http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/)
myID = uuid.uuid1().hex # session id
coordList = []
if coords != None and args.bedFile != None:
parser.error('input must be either BED file or coordinates, but not both')
elif coords != None:
# Input control for bad values
if re.search(r'.*:.*-.*', coords) == None:
parser.error('coordinates must be of the form \'chr#:start-end\'')
coords = re.sub(r'[,\s]', '', coords) # strip whitespace and commas
coords = re.split('[:-]', coords) # splits chr5:2039-23800 into array ['chr5', '2039', '23800']
chr = coords[0].lower() # escape characters, convert to lowercase
if chr[:3] != 'chr':
parser.error('could not understand chromosome \'%s\'' % chr)
try:
n = int(chr[3:])
if n > 22 or n < 1:
parser.error('could not understand chromosome \'%s\'\nchromosomes must be of the form \'chr1-22\', \'chrX\', \'chrY\', or \'chrM\'' % chr)
except ValueError:
if re.search(chr[3:], '[xym]'):
chr = chr[:3] + chr[3:].upper()
else:
parser.error('could not understand chromosome \'%s\'\nchromosomes must be of the form \'chr1-22\', \'chrX\', \'chrY\', or \'chrM\'' % chr)
start = coords[1]
try:
start = int(start)
if start < 0: parser.error('start position cannot be less than 0')
except ValueError:
parser.error('start position must be integer greater than 0')
end = coords[2]
try:
end = int(end)
if end < 0: parser.error('end position cannot be less than 0')
except ValueError:
parser.error('end position must be integer greater than 0')
if start > end:
parser.error('end position must be greater than start position')
coordList.append([chr, start, end])
elif args.bedFile != None:
bedFile = open(args.bedFile, 'r')
for l in bedFile:
bedData = l.rstrip().split('\t')
bedData[1] = int(bedData[1])
bedData[2] = int(bedData[2])
coordList.append(bedData)
bedFile.close()
fullLength = 0
for r in coordList:
fullLength = fullLength + r[2] - r[1]
if args.separation != None:
stepSize = int(args.separation)
elif args.probeNumber != None:
probeNumber = int(args.probeNumber)
stepSize = (float(fullLength) - (probeLength)*len(coordList)) / probeNumber
else:
parser.error('either --separation or --probeNumber is required')
carryOver = 0
counter = 1
for region in coordList:
chr = region[0]
start = region[1]
end = region[2]
cmd = "twoBitToFa %s %s_region.fa -seq=%s -start=%s -end=%s" % (refGenome, myID, chr, start,end)
bash(cmd)
cmd = "tail -n +2 %s_region.fa | tr -d '\n' > %s_region2.fa; mv %s_region2.fa %s_region.fa" % (myID, myID, myID, myID)
bash(cmd)
fasta = open(myID + "_region.fa", 'r')
seq = fasta.read()
seq = seq.upper()
i = carryOver
j = 0
while round(i,4) < (len(seq) - probeLength): # have to round here to protect against a tiny bit of rounding error
roundCoord = int(round(i))
if strand == 'plus':
probeSeq = seq[roundCoord:roundCoord+probeLength]
elif strand == 'minus':
probeSeq = revcomp(seq[roundCoord:roundCoord+probeLength])
elif strand == 'both':
if j % 2 == 1:
probeSeq = revcomp(seq[roundCoord:roundCoord+probeLength])
else:
probeSeq = seq[roundCoord:roundCoord+probeLength]
else:
parser.error('something went wrong with the strands')
print '%s\t%s' % (counter, probeSeq)
counter += 1
i += stepSize
j += 1
carryOver = i - (len(seq)-probeLength)
# print 'carryOver ', carryOver, stepSize
fasta.close()
os.remove(myID + "_region.fa")