From 29af1b46721e78bc93f98bdd37de42be88f2b06e Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Mon, 20 Jun 2016 13:12:48 -0700 Subject: [PATCH 01/11] Update find_intersecting_snps_2.py test --- mapping/find_intersecting_snps_2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mapping/find_intersecting_snps_2.py b/mapping/find_intersecting_snps_2.py index ce18917..44b3ed3 100644 --- a/mapping/find_intersecting_snps_2.py +++ b/mapping/find_intersecting_snps_2.py @@ -9,6 +9,7 @@ from __future__ import print_function import argparse import gzip +import time import itertools as it from collections import defaultdict, Counter from glob import glob From cf9fb3ca9d37b207487d4b50847eca821e23abc4 Mon Sep 17 00:00:00 2001 From: Rachel Marie Agoglia Date: Mon, 20 Jun 2016 13:16:07 -0700 Subject: [PATCH 02/11] Removed time This is a longer message saying what I did. --- mapping/find_intersecting_snps_2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mapping/find_intersecting_snps_2.py b/mapping/find_intersecting_snps_2.py index 44b3ed3..ce18917 100644 --- a/mapping/find_intersecting_snps_2.py +++ b/mapping/find_intersecting_snps_2.py @@ -9,7 +9,6 @@ from __future__ import print_function import argparse import gzip -import time import itertools as it from collections import defaultdict, Counter from glob import glob From 5f9a389acaad3a18799c39ef896f30c50aef922e Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Thu, 23 Jun 2016 15:55:17 -0700 Subject: [PATCH 03/11] Format output; fix limit on number of SNPs allowed Added run statistics and runtime to standard output. Fixed limit on number of SNPs allowed per read (default is 1024). Allowed for chromosome names that have a "." in them (e.g. NC_012920.1). Removed -s --sorted option, since it is not used. --- mapping/find_intersecting_snps_2.py | 65 +++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/mapping/find_intersecting_snps_2.py b/mapping/find_intersecting_snps_2.py index ce18917..80846c8 100644 --- a/mapping/find_intersecting_snps_2.py +++ b/mapping/find_intersecting_snps_2.py @@ -9,6 +9,7 @@ from __future__ import print_function import argparse import gzip +import time import itertools as it from collections import defaultdict, Counter from glob import glob @@ -44,7 +45,8 @@ def get_snps(snpdir, chrom_only = None): """ snp_dict = defaultdict(dict) if path.exists(path.join(snpdir, 'all.txt.gz')): - print("Loading snps from consolidated file") + print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")), ".... Load\ + ing snps from consolidated file.") for line in gzip.open(path.join(snpdir, 'all.txt.gz'), 'rt', encoding='ascii'): chrom, pos, ref, alt = line.split() if chrom_only is not None and chrom != chrom_only: @@ -53,10 +55,11 @@ def get_snps(snpdir, chrom_only = None): snp_dict[chrom][pos] = "".join([ref, alt]) return snp_dict for fname in glob(path.join(snpdir, '*.txt.gz')): - chrom = path.basename(fname).split('.')[0] + chrom = path.basename(fname).split('.snps.txt.gz')[0] if chrom_only is not None and chrom != chrom_only: continue - print("Loading snps from ", fname) + print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")), ".... Load\ + ing snps from", fname) i = -1 for i, line in enumerate(gzip.open(fname, 'rt', encoding='ascii')): pos, ref, alt = line.split() @@ -164,7 +167,7 @@ def get_dual_read_seqs(read1, read2, snp_dict, indel_dict, dispositions): dispositions['has_snps'] += 1 return seqs1, seqs2 -def get_read_seqs(read, snp_dict, indel_dict, dispositions): +def get_read_seqs(read, snp_dict, indel_dict, dispositions): # Currently untested """ For each read, get all possible SNP substitutions for N biallelic snps in the read, will return 2^N reads @@ -236,13 +239,14 @@ def assign_reads(insam, snp_dict, indel_dict, is_paired=True): read_results = Counter() remap_num = 1 for i, read in enumerate(insam): + read_results['total'] += 1 if i % 10000 == 0: pass if not is_paired: read_seqs = get_read_seqs(read, snp_dict, indel_dict, read_results) write_read_seqs([(read, read_seqs)], keep, remap_bam, fastqs) elif read.is_proper_pair: - slot_self = read.is_read2 # 0 if is_read1, 1 if read2 + slot_self = read.is_read2 # 0 if is_read1, 1 if is_read2 slot_other = read.is_read1 if read.qname in unpaired_reads[slot_other]: both_reads = [None, None] @@ -260,9 +264,43 @@ def assign_reads(insam, snp_dict, indel_dict, is_paired=True): # Most tools assume reads are paired and do not check IDs. Drop it out. continue print() - print(len(unpaired_reads[0]), len(unpaired_reads[1])) - print(read_results) + print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")), ".... Finished!") + print() + print("RUN STATISTICS:") + if is_paired: + total_pairs = read_results['total']//2 + print(" Total input reads:", total_pairs, "pairs.") + print(" Unpaired reads:", len(unpaired_reads[0]) + len(unpaired_reads[1]), "(" + \ + "%.2f" % ((len(unpaired_reads[0]) + len(unpaired_reads[1]) / total_pairs)*100) + "%)") + else: + total_pairs = read_results['total'] + print(" Total input reads:", total_pairs) + + print(" Reads with no SNPs:", read_results['no_snps'], "(" + "%.2f" % ((read_results\ + ['no_snps'] / total_pairs)*100) + "%)") + print(" Reads overlapping SNPs:", read_results['has_snps'], "(" + "%.2f" % ((read_results\ + ['has_snps'] / total_pairs)*100) + "%)") + + if not is_paired: + print("\tReference SNP matches:", read_results['ref_match'], "(" + "%.2f" % ((read_results\ + ['ref_match'] / total_pairs)*100) + "%)") + print("\tNon-reference SNP matches:", read_results['no_match'], "(" + "%.2f" % ((read_results\ + ['no_match'] / total_pairs)*100) + "%)") + + print(" Reads dropped [INDEL]:", read_results['toss_indel'], "(" + "%.2f" % ((read_results\ + ['toss_indel'] / total_pairs)*100) + "%)") + print(" Reads dropped [too many SNPs]:", read_results['toss_manysnps'], "(" + "%.2f" % \ + ((read_results['toss_manysnps'] / total_pairs)*100) + "%)") + + if is_paired: + print(" Reads dropped [anomalous pair]:", read_results['toss_anomalous_phase'], "(" + \ + "%.2f" % ((read_results['toss_anomalous_phase'] / total_pairs)*100) + "%)") + print(" Reads dropped [not proper pair]:", read_results['not_proper_pair'], "(" + "%.2f" % \ + ((read_results['not_proper_pair'] / total_pairs)*100) + "%)") + + print() + print(read_results) def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap_num=0): """Write the given reads out to the appropriate file @@ -278,9 +316,8 @@ def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap """ reads, seqs = zip(*both_read_seqs) assert len(reads) == len(fastqs) - - num_seqs = product(len(r[1]) for r in both_read_seqs) - if num_seqs == 0 or num_seqs > MAX_SEQS_PER_READ: + num_seqs = len(both_read_seqs[0][1]) + if num_seqs == 0: # or num_seqs > MAX_SEQS_PER_READ: if dropped is not None: for read in reads: dropped.write(read) @@ -335,12 +372,6 @@ def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap default=None, help="Limit loading of SNPs to the specified chromosome") - parser.add_argument("-s", "--sorted", - action='store_true', dest='is_sorted', default=False, - help=('Indicates that the input bam file' - ' is coordinate sorted (default is False)')) - - parser.add_argument("infile", type=Samfile, help=("Coordinate sorted bam " "file.")) snp_dir_help = ('Directory containing the SNPs segregating within the ' @@ -357,6 +388,6 @@ def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap SNP_DICT = get_snps(options.snp_dir, options.limit_to_chrom) INDEL_DICT = get_indels(SNP_DICT) - print("Done with SNPs") + print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")), ".... Done with SNPs.") assign_reads(options.infile, SNP_DICT, INDEL_DICT, options.is_paired_end) From 11334c426a06fa0957355697611a95f9808a429d Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Thu, 23 Jun 2016 16:28:19 -0700 Subject: [PATCH 04/11] Remove unnecessary print statement Run statistics covers everything we need to print. --- mapping/find_intersecting_snps_2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mapping/find_intersecting_snps_2.py b/mapping/find_intersecting_snps_2.py index 80846c8..224c0d8 100644 --- a/mapping/find_intersecting_snps_2.py +++ b/mapping/find_intersecting_snps_2.py @@ -300,7 +300,6 @@ def assign_reads(insam, snp_dict, indel_dict, is_paired=True): ((read_results['not_proper_pair'] / total_pairs)*100) + "%)") print() - print(read_results) def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap_num=0): """Write the given reads out to the appropriate file From 1706f699c99dab6b129da23abf17a93954735023 Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Tue, 28 Jun 2016 11:48:31 -0700 Subject: [PATCH 05/11] Update run statistics Script now tracks how many reads contain the reference SNP alleles given in the SNP files. The number of "reference matches" should ideally be 100%. If this percentage is very low, it may indicte that the SNPs are incorrect or that they are not 1-indexed, etc. --- mapping/find_intersecting_snps_2.py | 41 +++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/mapping/find_intersecting_snps_2.py b/mapping/find_intersecting_snps_2.py index 224c0d8..78b9053 100644 --- a/mapping/find_intersecting_snps_2.py +++ b/mapping/find_intersecting_snps_2.py @@ -23,7 +23,7 @@ # We better hope we're in Python 2. print(exc) -MAX_SEQS_PER_READ = 1024 +MAX_SEQS_PER_READ = 32 def product(iterable): "Returns the product of all items in the iterable" @@ -45,8 +45,8 @@ def get_snps(snpdir, chrom_only = None): """ snp_dict = defaultdict(dict) if path.exists(path.join(snpdir, 'all.txt.gz')): - print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")), ".... Load\ - ing snps from consolidated file.") + print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")),\ + ".... Loading snps from consolidated file.") for line in gzip.open(path.join(snpdir, 'all.txt.gz'), 'rt', encoding='ascii'): chrom, pos, ref, alt = line.split() if chrom_only is not None and chrom != chrom_only: @@ -58,8 +58,8 @@ def get_snps(snpdir, chrom_only = None): chrom = path.basename(fname).split('.snps.txt.gz')[0] if chrom_only is not None and chrom != chrom_only: continue - print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")), ".... Load\ - ing snps from", fname) + print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")), \ + ".... Loading snps from", fname) i = -1 for i, line in enumerate(gzip.open(fname, 'rt', encoding='ascii')): pos, ref, alt = line.split() @@ -129,11 +129,18 @@ def get_dual_read_seqs(read1, read2, snp_dict, indel_dict, dispositions): return [[], []] for ref_pos in snps: + match = False + dispositions['total_snps'] += 1 alleles = snps[ref_pos] pos1, pos2 = read_posns[ref_pos] new_seqs1 = [] new_seqs2 = [] if pos1 is None: + if seq2[pos2] in alleles: + dispositions['ref_match'] += 1 + else: + dispositions['no_match'] += 1 + for allele in alleles: if allele == seq2[pos2]: continue @@ -142,6 +149,11 @@ def get_dual_read_seqs(read1, read2, snp_dict, indel_dict, dispositions): new_seqs2.append(''.join([seq2[:pos2], allele, seq2[pos2+1:]])) elif pos2 is None: + if seq1[pos1] in alleles: + dispositions['ref_match'] += 1 + else: + dispositions['no_match'] += 1 + for allele in alleles: if allele == seq1[pos1]: continue @@ -152,6 +164,10 @@ def get_dual_read_seqs(read1, read2, snp_dict, indel_dict, dispositions): if seq1[pos1] != seq2[pos2]: dispositions['toss_anomalous_phase'] += 1 return [[], []] + if seq2[pos2] in alleles: + dispositions['ref_match'] += 1 + else: + dispositions['no_match'] += 1 for allele in alleles: if allele == seq2[pos2]: continue @@ -187,6 +203,8 @@ def get_read_seqs(read, snp_dict, indel_dict, dispositions): # Currently unteste return [] if ref_pos in snp_dict[chrom]: + dispositions['total_snps'] += 1 + read_base = read.seq[read_pos] if read_base in snp_dict[chrom][ref_pos]: dispositions['ref_match'] += 1 @@ -282,11 +300,12 @@ def assign_reads(insam, snp_dict, indel_dict, is_paired=True): print(" Reads overlapping SNPs:", read_results['has_snps'], "(" + "%.2f" % ((read_results\ ['has_snps'] / total_pairs)*100) + "%)") - if not is_paired: - print("\tReference SNP matches:", read_results['ref_match'], "(" + "%.2f" % ((read_results\ - ['ref_match'] / total_pairs)*100) + "%)") - print("\tNon-reference SNP matches:", read_results['no_match'], "(" + "%.2f" % ((read_results\ - ['no_match'] / total_pairs)*100) + "%)") + total_snps = read_results['total_snps'] + print(" Total SNPs covered:", total_snps) + print("\tReference SNP matches:", read_results['ref_match'], "(" + "%.2f" % ((read_results\ + ['ref_match'] / total_snps)*100) + "%)") + print("\tNon-reference SNP matches:", read_results['no_match'], "(" + "%.2f" % ((read_results\ + ['no_match'] / total_snps)*100) + "%)") print(" Reads dropped [INDEL]:", read_results['toss_indel'], "(" + "%.2f" % ((read_results\ ['toss_indel'] / total_pairs)*100) + "%)") @@ -316,7 +335,7 @@ def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap reads, seqs = zip(*both_read_seqs) assert len(reads) == len(fastqs) num_seqs = len(both_read_seqs[0][1]) - if num_seqs == 0: # or num_seqs > MAX_SEQS_PER_READ: + if num_seqs == 0: if dropped is not None: for read in reads: dropped.write(read) From adb23deb46e702c2c563b406fea8886a428dc389 Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Wed, 29 Jun 2016 10:58:27 -0700 Subject: [PATCH 06/11] Fix bug that writes duplicate reads to fastqs Duplicates of reads were being written to the fastq files. --- mapping/find_intersecting_snps_2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mapping/find_intersecting_snps_2.py b/mapping/find_intersecting_snps_2.py index 78b9053..6604fc1 100644 --- a/mapping/find_intersecting_snps_2.py +++ b/mapping/find_intersecting_snps_2.py @@ -346,6 +346,8 @@ def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap for read, seqs in both_read_seqs: keep.write(read) else: + print() + assert len(reads) > 0 for read in reads: remap_bam.write(read) @@ -362,7 +364,7 @@ def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap first = True # Some python fanciness to deal with single or paired end reads (or # n-ended reads, if such technology ever happens. - for read_seqs in it.product(*seqs): + for read_seqs in zip(*seqs): if first: first = False continue From c9eff328847c6c4ba72870cc62378dac0ac4ad80 Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Wed, 29 Jun 2016 11:02:44 -0700 Subject: [PATCH 07/11] Remove unnecessary print statement --- mapping/find_intersecting_snps_2.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mapping/find_intersecting_snps_2.py b/mapping/find_intersecting_snps_2.py index 6604fc1..3cdcf44 100644 --- a/mapping/find_intersecting_snps_2.py +++ b/mapping/find_intersecting_snps_2.py @@ -346,8 +346,6 @@ def write_read_seqs(both_read_seqs, keep, remap_bam, fastqs, dropped=None, remap for read, seqs in both_read_seqs: keep.write(read) else: - print() - assert len(reads) > 0 for read in reads: remap_bam.write(read) From 97734ae9a5464829fbada05693851727759a5a6f Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Wed, 29 Jun 2016 16:46:26 -0700 Subject: [PATCH 08/11] Rewrite filter_remapped_reads Changes from original version: - No longer requires .num.gz file (now compatible with fild_intersecting_snps_2.py) - Tracks runtime and run statistics --- mapping/filter_remapped_reads_2.py | 183 +++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 mapping/filter_remapped_reads_2.py diff --git a/mapping/filter_remapped_reads_2.py b/mapping/filter_remapped_reads_2.py new file mode 100644 index 0000000..a295436 --- /dev/null +++ b/mapping/filter_remapped_reads_2.py @@ -0,0 +1,183 @@ +""" filter_remapped_reads + +This is a rewrite of the official WASP code, which no longer requires +a .num.gz file. This code is compatable with the rewritten version of +the first step (find_intersecting_snps), which no longer generates the +.num.gz file. + +""" + +from __future__ import print_function +import gzip +import sys +import pysam +import time + +def run(to_remap_bam, remap_bam, keep_bam, is_paired_end): + + print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")),\ + "Starting run...") + + to_remap_bam = pysam.Samfile(to_remap_bam, "rb") + remap_bam = pysam.Samfile(remap_bam, "rb") + keep_bam = pysam.Samfile(keep_bam, "wb", template=to_remap_bam) + + # List of correctly mapped reads, represented by read ID + # (e.g. 42 in @42:chr1:14536:3) + correct_maps = [] + + # List of how many alternate versions of each read there are + # (e.g. 3 in @42:chr1:14536:3) + nums = [] + + end_of_file = False + prev_read_num = 0 + + # Get a list of reads that remapped correctly + remap_read = remap_bam.next() + + while not end_of_file: + + chrm = remap_read.qname.strip().split(":")[1] + + if remap_read.is_reverse: + if is_paired_end: + pos = int(remap_read.qname.strip().split(":")[3]) + else: + pos = int(remap_read.qname.strip().split(":")[2]) + + else: + pos = int(remap_read.qname.strip().split(":")[2]) + + read_num = int(remap_read.qname.strip().split(":")[0]) + num = int(remap_read.qname.strip().split(":")[-1]) + + # For each original read, keep track of how many alternate reads were mapped + if read_num != prev_read_num: + if is_paired_end: + nums.append(num*2) + else: + nums.append(num) + + prev_read_num = read_num + + if (remap_read.tid != -1 and remap_read.pos == pos and + remap_bam.getrname(remap_read.tid) == chrm): + + # Throw out the remapped read if it remapped with a deletion + dels = 0 + for cig in remap_read.cigar: + if not cig[0] in (0, 3, 4): + dels += 1 + + if dels == 0: + correct_maps.append(read_num) + + try: + remap_read = remap_bam.next() + except: + end_of_file = True + + correct_maps.sort() + + # Original aligned reads + orig_read = to_remap_bam.next() + + # Number of different reads generated from the original read + orig_num = int(nums[0]) + + # Line number of the remap_bam file (if single end data) or read pair + # number if paired end data. + line_num = 1 + + # Index for walking through correct_maps. + map_indx = 0 + + # Number of correctly mapped reads for the current read (pair). + correct = 0 + + # Total number of correctly mapped read (pairs). + total_correct = 0 + + end_of_file = False + while (not end_of_file and + (map_indx < len(correct_maps)) and + (line_num <= correct_maps[-1])): + + if line_num != correct_maps[map_indx]: + + # If we saw the correct number of remaps for the last read, keep it. + if correct == orig_num: + total_correct += 1 + keep_bam.write(orig_read) + + # If the data is paired end, write out the paired read. + if is_paired_end: + try: + orig_read = to_remap_bam.next() + except: + sys.stderr.write("File ended unexpectedly (no pair found).") + exit() + keep_bam.write(orig_read) + + + else: + try: + second_read = to_remap_bam.next() + except: + end_of_file=True + break + + try: + orig_read = to_remap_bam.next() + orig_num = nums[line_num] + except StopIteration: + end_of_file = True + + line_num += 1 + correct = 0 + + else: + correct += 1 + map_indx += 1 + + if correct == orig_num: + total_correct += 1 + keep_bam.write(orig_read) + + # If the data is paired end, write out the paired read. + if is_paired_end: + try: + orig_read = to_remap_bam.next() + except: + sys.stderr.write("File ended unexpectedly (no pair found).") + exit() + keep_bam.write(orig_read) + print(time.strftime(("%b %d ") + time.strftime("%I:%M:%S")),\ + "Finished!") + print() + print("RUN STATISTICS:") + print(" Total remapped read (pair)s:", line_num) + print(" Read (pair)s remapped to the correct position:", total_correct,\ + "(" + "%.2f" % ((total_correct/float(line_num))*100) + "%)") + +def main(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("-p", action='store_true', dest='is_paired_end', + default=False, help=('Indicates that reads are ' + 'paired-end (default is single).')) + h = ('to.remap.bam file from find_intersecting_snps.py.') + parser.add_argument("to_remap_bam", help=h) + parser.add_argument("remap_bam", help='Remapped bam file.') + parser.add_argument("keep_bam", help=('File to write correctly remapped ' + 'reads to.')) + + options = parser.parse_args() + + run(options.to_remap_bam, options.remap_bam, options.keep_bam, + options.is_paired_end) + +if __name__ == '__main__': + main() From ebb05d106cfbf247ae567b3c121422650d212bef Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Wed, 29 Jun 2016 16:49:08 -0700 Subject: [PATCH 09/11] Remove unused variable "map_indx" This variable wasn't being used for anything. --- mapping/filter_remapped_reads_2.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mapping/filter_remapped_reads_2.py b/mapping/filter_remapped_reads_2.py index a295436..bb9334e 100644 --- a/mapping/filter_remapped_reads_2.py +++ b/mapping/filter_remapped_reads_2.py @@ -90,9 +90,6 @@ def run(to_remap_bam, remap_bam, keep_bam, is_paired_end): # number if paired end data. line_num = 1 - # Index for walking through correct_maps. - map_indx = 0 - # Number of correctly mapped reads for the current read (pair). correct = 0 @@ -139,7 +136,6 @@ def run(to_remap_bam, remap_bam, keep_bam, is_paired_end): else: correct += 1 - map_indx += 1 if correct == orig_num: total_correct += 1 From f6fb1900ec56abc40305827250039789bd4d4fff Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Thu, 30 Jun 2016 10:19:17 -0700 Subject: [PATCH 10/11] Don't delete map_indx Turns out we do need this. --- mapping/filter_remapped_reads_2.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mapping/filter_remapped_reads_2.py b/mapping/filter_remapped_reads_2.py index bb9334e..a295436 100644 --- a/mapping/filter_remapped_reads_2.py +++ b/mapping/filter_remapped_reads_2.py @@ -90,6 +90,9 @@ def run(to_remap_bam, remap_bam, keep_bam, is_paired_end): # number if paired end data. line_num = 1 + # Index for walking through correct_maps. + map_indx = 0 + # Number of correctly mapped reads for the current read (pair). correct = 0 @@ -136,6 +139,7 @@ def run(to_remap_bam, remap_bam, keep_bam, is_paired_end): else: correct += 1 + map_indx += 1 if correct == orig_num: total_correct += 1 From 4f3f139ad9287520039ea39c33894f66c4e7315a Mon Sep 17 00:00:00 2001 From: Rachel Agoglia Date: Thu, 14 Jul 2016 13:24:42 -0700 Subject: [PATCH 11/11] Fix bug for unmapped reads Script now works for reads that did not remap at all, so the remapped bam file need not contain unmapped reads. Note that the remapped bam file MUST be sorted by read name. --- mapping/filter_remapped_reads_2.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mapping/filter_remapped_reads_2.py b/mapping/filter_remapped_reads_2.py index a295436..1f32e68 100644 --- a/mapping/filter_remapped_reads_2.py +++ b/mapping/filter_remapped_reads_2.py @@ -3,7 +3,8 @@ This is a rewrite of the official WASP code, which no longer requires a .num.gz file. This code is compatable with the rewritten version of the first step (find_intersecting_snps), which no longer generates the -.num.gz file. +.num.gz file. The bam file of remapped reads must be sorted by read name. +Must be run in Python2. """ @@ -33,11 +34,16 @@ def run(to_remap_bam, remap_bam, keep_bam, is_paired_end): end_of_file = False prev_read_num = 0 + # Keep track of remapped read index + counter = 1 + skip = 0 + # Get a list of reads that remapped correctly remap_read = remap_bam.next() while not end_of_file: + chrm = remap_read.qname.strip().split(":")[1] if remap_read.is_reverse: @@ -54,12 +60,20 @@ def run(to_remap_bam, remap_bam, keep_bam, is_paired_end): # For each original read, keep track of how many alternate reads were mapped if read_num != prev_read_num: + + # If none of the alternate versions of the read got mapped, set num to 1 + if read_num != counter: + skipped = read_num - counter + nums += ([1]*skipped) + counter = read_num + if is_paired_end: nums.append(num*2) else: nums.append(num) prev_read_num = read_num + counter += 1 if (remap_read.tid != -1 and remap_read.pos == pos and remap_bam.getrname(remap_read.tid) == chrm):