Skip to content

Commit

Permalink
Merge pull request #2 from rmagoglia/SimpleFindSNPs
Browse files Browse the repository at this point in the history
Update find_intersecting_snps_2.py and new filter_remapped_reads_2.py
  • Loading branch information
rmagoglia authored Jul 14, 2016
2 parents 1d6087c + 4f3f139 commit 41ed66b
Show file tree
Hide file tree
Showing 2 changed files with 265 additions and 19 deletions.
197 changes: 197 additions & 0 deletions mapping/filter_remapped_reads_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
""" filter_remapped_reads
This is a rewrite of the official WASP code, which no longer requires
a .num.gz file. This code is compatable with the rewritten version of
the first step (find_intersecting_snps), which no longer generates the
.num.gz file. The bam file of remapped reads must be sorted by read name.
Must be run in Python2.
"""

from __future__ import print_function
import gzip
import sys
import pysam
import time

def run(to_remap_bam, remap_bam, keep_bam, is_paired_end):

print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")),\
"Starting run...")

to_remap_bam = pysam.Samfile(to_remap_bam, "rb")
remap_bam = pysam.Samfile(remap_bam, "rb")
keep_bam = pysam.Samfile(keep_bam, "wb", template=to_remap_bam)

# List of correctly mapped reads, represented by read ID
# (e.g. 42 in @42:chr1:14536:3)
correct_maps = []

# List of how many alternate versions of each read there are
# (e.g. 3 in @42:chr1:14536:3)
nums = []

end_of_file = False
prev_read_num = 0

# Keep track of remapped read index
counter = 1
skip = 0

# Get a list of reads that remapped correctly
remap_read = remap_bam.next()

while not end_of_file:


chrm = remap_read.qname.strip().split(":")[1]

if remap_read.is_reverse:
if is_paired_end:
pos = int(remap_read.qname.strip().split(":")[3])
else:
pos = int(remap_read.qname.strip().split(":")[2])

else:
pos = int(remap_read.qname.strip().split(":")[2])

read_num = int(remap_read.qname.strip().split(":")[0])
num = int(remap_read.qname.strip().split(":")[-1])

# For each original read, keep track of how many alternate reads were mapped
if read_num != prev_read_num:

# If none of the alternate versions of the read got mapped, set num to 1
if read_num != counter:
skipped = read_num - counter
nums += ([1]*skipped)
counter = read_num

if is_paired_end:
nums.append(num*2)
else:
nums.append(num)

prev_read_num = read_num
counter += 1

if (remap_read.tid != -1 and remap_read.pos == pos and
remap_bam.getrname(remap_read.tid) == chrm):

# Throw out the remapped read if it remapped with a deletion
dels = 0
for cig in remap_read.cigar:
if not cig[0] in (0, 3, 4):
dels += 1

if dels == 0:
correct_maps.append(read_num)

try:
remap_read = remap_bam.next()
except:
end_of_file = True

correct_maps.sort()

# Original aligned reads
orig_read = to_remap_bam.next()

# Number of different reads generated from the original read
orig_num = int(nums[0])

# Line number of the remap_bam file (if single end data) or read pair
# number if paired end data.
line_num = 1

# Index for walking through correct_maps.
map_indx = 0

# Number of correctly mapped reads for the current read (pair).
correct = 0

# Total number of correctly mapped read (pairs).
total_correct = 0

end_of_file = False
while (not end_of_file and
(map_indx < len(correct_maps)) and
(line_num <= correct_maps[-1])):

if line_num != correct_maps[map_indx]:

# If we saw the correct number of remaps for the last read, keep it.
if correct == orig_num:
total_correct += 1
keep_bam.write(orig_read)

# If the data is paired end, write out the paired read.
if is_paired_end:
try:
orig_read = to_remap_bam.next()
except:
sys.stderr.write("File ended unexpectedly (no pair found).")
exit()
keep_bam.write(orig_read)


else:
try:
second_read = to_remap_bam.next()
except:
end_of_file=True
break

try:
orig_read = to_remap_bam.next()
orig_num = nums[line_num]
except StopIteration:
end_of_file = True

line_num += 1
correct = 0

else:
correct += 1
map_indx += 1

if correct == orig_num:
total_correct += 1
keep_bam.write(orig_read)

# If the data is paired end, write out the paired read.
if is_paired_end:
try:
orig_read = to_remap_bam.next()
except:
sys.stderr.write("File ended unexpectedly (no pair found).")
exit()
keep_bam.write(orig_read)
print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")),\
"Finished!")
print()
print("RUN STATISTICS:")
print(" Total remapped read (pair)s:", line_num)
print(" Read (pair)s remapped to the correct position:", total_correct,\
"(" + "%.2f" % ((total_correct/float(line_num))*100) + "%)")

def main():
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-p", action='store_true', dest='is_paired_end',
default=False, help=('Indicates that reads are '
'paired-end (default is single).'))
h = ('to.remap.bam file from find_intersecting_snps.py.')
parser.add_argument("to_remap_bam", help=h)
parser.add_argument("remap_bam", help='Remapped bam file.')
parser.add_argument("keep_bam", help=('File to write correctly remapped '
'reads to.'))

options = parser.parse_args()

run(options.to_remap_bam, options.remap_bam, options.keep_bam,
options.is_paired_end)

if __name__ == '__main__':
main()
Loading

0 comments on commit 41ed66b

Please sign in to comment.