Skip to content

Commit 4209eb6

Browse files
committed
fix: junctions extractor count overlapping read pairs once
- closes griffithlab#187, implemented by adding set of `reads` to `Junction`, and only incrementing `read_count` if read has not been seen yet - this commit should also increase RegTools speed by removing the barcode updating bottleneck caused by repeatedly copying the barcode map
1 parent bd7df5a commit 4209eb6

File tree

2 files changed

+36
-30
lines changed

2 files changed

+36
-30
lines changed

src/junctions/junctions_extractor.cc

+34-30
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@ bool JunctionsExtractor::junction_qc(Junction &j1) {
196196
}
197197

198198
//Add a junction to the junctions map
199-
//The read_count field is the number of reads supporting the junction.
199+
//The read_count field is the number of reads supporting the junction,
200+
// and reads is a set of the read names supporting the junction
200201
int JunctionsExtractor::add_junction(Junction j1) {
201202
//Check junction_qc
202203
if(!junction_qc(j1)) {
@@ -219,44 +220,46 @@ int JunctionsExtractor::add_junction(Junction j1) {
219220
}
220221
string key = j1.chrom + string(":") + start + "-" + end + ":" + strand_proxy;
221222

222-
//Check if new junction
223+
//add new junction
223224
if(!junctions_.count(key)) {
224225
j1.name = get_new_junction_name();
225226
j1.read_count = 1;
226227
j1.score = common::num_to_str(j1.read_count);
227-
} else { //existing junction
228-
Junction j0 = junctions_[key];
229-
228+
junctions_[key] = j1;
229+
230+
} else { //update existing junction
231+
230232
if (output_barcodes_file_ != "NA"){
231-
unordered_map<string, int>::const_iterator it = j0.barcodes.find(j1.barcodes.begin()->first);
232-
233-
if (it != j0.barcodes.end()) {// barcode exists already
234-
j1.barcodes = j0.barcodes;
235-
j1.barcodes[it->first]++;
233+
// NOTE: Junction j1 will always have exactly one barcode,
234+
// that of the current alignment; i.e. j1.barcodes = {<barcode>: 1}
235+
auto it = junctions_[key].barcodes.find(j1.barcodes.begin()->first);
236+
if (it != junctions_[key].barcodes.end()) {// barcode exists already
237+
junctions_[key].barcodes[it->first]++;
236238
} else {
237-
// this block is where the slowness happens - not sure if it's the instantiation or the insertion
238-
// well, tried to get around instantion by just inserting into j0 but that made it like another 2x slower so I don't think it's that
239-
pair<string, int> tmp_barcode = *j1.barcodes.begin();
240-
j1.barcodes = j0.barcodes;
241-
j1.barcodes.insert(tmp_barcode);
239+
junctions_[key].barcodes[it->first] = 1;
242240
}
243241
}
244-
//increment read count
245-
j1.read_count = j0.read_count + 1;
246-
j1.score = common::num_to_str(j1.read_count);
247-
//Keep the same name
248-
j1.name = j0.name;
242+
// following barcodes convention, Junction j1 has one read,
243+
// that of the current alignment; i.e. j1.reads = {<read_name>}
244+
string read_name = *j1.reads.begin();
245+
//avoid counting the same paired-end read twice
246+
//if both segments overlap junction
247+
if (!junctions_[key].reads.count(read_name)) {
248+
junctions_[key].reads.insert(read_name);
249+
junctions_[key].read_count++;
250+
junctions_[key].score = common::num_to_str(junctions_[key].read_count);
251+
}
249252
//Check if thick starts are any better
250-
if(j0.thick_start < j1.thick_start)
251-
j1.thick_start = j0.thick_start;
252-
if(j0.thick_end > j1.thick_end)
253-
j1.thick_end = j0.thick_end;
254-
//preserve min anchor information
255-
j1.has_left_min_anchor = j1.has_left_min_anchor || j0.has_left_min_anchor;
256-
j1.has_right_min_anchor = j1.has_right_min_anchor || j0.has_right_min_anchor;
253+
if(j1.thick_start < junctions_[key].thick_start)
254+
junctions_[key].thick_start = j1.thick_start;
255+
if(j1.thick_end > junctions_[key].thick_end)
256+
junctions_[key].thick_end = j1.thick_end;
257+
//update min anchor information
258+
junctions_[key].has_left_min_anchor =
259+
junctions_[key].has_left_min_anchor || j1.has_left_min_anchor;
260+
junctions_[key].has_right_min_anchor =
261+
junctions_[key].has_right_min_anchor || j1.has_right_min_anchor;
257262
}
258-
//Add junction and check anchor while printing.
259-
junctions_[key] = j1;
260263
return 0;
261264
}
262265

@@ -420,8 +423,9 @@ int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t
420423

421424
Junction j1;
422425
j1.chrom = chr;
426+
j1.thick_start = read_pos; // start pos of read
423427
j1.start = read_pos; //maintain start pos of junction
424-
j1.thick_start = read_pos;
428+
j1.reads.insert(bam_get_qname(aln));
425429
string intron_motif;
426430

427431
if (output_barcodes_file_ != "NA"){

src/junctions/junctions_extractor.h

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ using namespace std;
3939
struct Junction : BED {
4040
//Number of reads supporting the junction
4141
unsigned int read_count;
42+
//Reads supporting the junction
43+
unordered_set<string> reads;
4244
//This is the start - max overhang
4345
CHRPOS thick_start;
4446
//This is the end + max overhang

0 commit comments

Comments
 (0)