Skip to content

Commit 19644c1

Browse files
committed
fix: junctions extractor count overlapping read pairs once
- closes griffithlab#187, implemented by adding set of `reads` to `Junction`, and only incrementing `read_count` if read has not been seen yet - this commit should also increase RegTools speed by removing the barcode updating bottleneck caused by repeatedly copying the barcode map
1 parent 77451c4 commit 19644c1

File tree

2 files changed

+36
-30
lines changed

2 files changed

+36
-30
lines changed

src/junctions/junctions_extractor.cc

+34-30
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ bool JunctionsExtractor::junction_qc(Junction &j1) {
198198
}
199199

200200
//Add a junction to the junctions map
201-
//The read_count field is the number of reads supporting the junction.
201+
//The read_count field is the number of reads supporting the junction,
202+
// and reads is a set of the read names supporting the junction
202203
int JunctionsExtractor::add_junction(Junction j1) {
203204
//Check junction_qc
204205
if(!junction_qc(j1)) {
@@ -221,44 +222,46 @@ int JunctionsExtractor::add_junction(Junction j1) {
221222
}
222223
string key = j1.chrom + string(":") + start + "-" + end + ":" + strand_proxy;
223224

224-
//Check if new junction
225+
//add new junction
225226
if(!junctions_.count(key)) {
226227
j1.name = get_new_junction_name();
227228
j1.read_count = 1;
228229
j1.score = common::num_to_str(j1.read_count);
229-
} else { //existing junction
230-
Junction j0 = junctions_[key];
231-
230+
junctions_[key] = j1;
231+
232+
} else { //update existing junction
233+
232234
if (output_barcodes_file_ != "NA"){
233-
unordered_map<string, int>::const_iterator it = j0.barcodes.find(j1.barcodes.begin()->first);
234-
235-
if (it != j0.barcodes.end()) {// barcode exists already
236-
j1.barcodes = j0.barcodes;
237-
j1.barcodes[it->first]++;
235+
// NOTE: Junction j1 will always have exactly one barcode,
236+
// that of the current alignment; i.e. j1.barcodes = {<barcode>: 1}
237+
auto it = junctions_[key].barcodes.find(j1.barcodes.begin()->first);
238+
if (it != junctions_[key].barcodes.end()) {// barcode exists already
239+
junctions_[key].barcodes[it->first]++;
238240
} else {
239-
// this block is where the slowness happens - not sure if it's the instantiation or the insertion
240-
// well, tried to get around instantion by just inserting into j0 but that made it like another 2x slower so I don't think it's that
241-
pair<string, int> tmp_barcode = *j1.barcodes.begin();
242-
j1.barcodes = j0.barcodes;
243-
j1.barcodes.insert(tmp_barcode);
241+
junctions_[key].barcodes[it->first] = 1;
244242
}
245243
}
246-
//increment read count
247-
j1.read_count = j0.read_count + 1;
248-
j1.score = common::num_to_str(j1.read_count);
249-
//Keep the same name
250-
j1.name = j0.name;
244+
// following barcodes convention, Junction j1 has one read,
245+
// that of the current alignment; i.e. j1.reads = {<read_name>}
246+
string read_name = *j1.reads.begin();
247+
//avoid counting the same paired-end read twice
248+
//if both segments overlap junction
249+
if (!junctions_[key].reads.count(read_name)) {
250+
junctions_[key].reads.insert(read_name);
251+
junctions_[key].read_count++;
252+
junctions_[key].score = common::num_to_str(junctions_[key].read_count);
253+
}
251254
//Check if thick starts are any better
252-
if(j0.thick_start < j1.thick_start)
253-
j1.thick_start = j0.thick_start;
254-
if(j0.thick_end > j1.thick_end)
255-
j1.thick_end = j0.thick_end;
256-
//preserve min anchor information
257-
j1.has_left_min_anchor = j1.has_left_min_anchor || j0.has_left_min_anchor;
258-
j1.has_right_min_anchor = j1.has_right_min_anchor || j0.has_right_min_anchor;
255+
if(j1.thick_start < junctions_[key].thick_start)
256+
junctions_[key].thick_start = j1.thick_start;
257+
if(j1.thick_end > junctions_[key].thick_end)
258+
junctions_[key].thick_end = j1.thick_end;
259+
//update min anchor information
260+
junctions_[key].has_left_min_anchor =
261+
junctions_[key].has_left_min_anchor || j1.has_left_min_anchor;
262+
junctions_[key].has_right_min_anchor =
263+
junctions_[key].has_right_min_anchor || j1.has_right_min_anchor;
259264
}
260-
//Add junction and check anchor while printing.
261-
junctions_[key] = j1;
262265
return 0;
263266
}
264267

@@ -422,8 +425,9 @@ int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t
422425

423426
Junction j1;
424427
j1.chrom = chr;
428+
j1.thick_start = read_pos; // start pos of read
425429
j1.start = read_pos; //maintain start pos of junction
426-
j1.thick_start = read_pos;
430+
j1.reads.insert(bam_get_qname(aln));
427431
string intron_motif;
428432

429433
if (output_barcodes_file_ != "NA"){

src/junctions/junctions_extractor.h

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ using namespace std;
3939
struct Junction : BED {
4040
//Number of reads supporting the junction
4141
unsigned int read_count;
42+
//Reads supporting the junction
43+
unordered_set<string> reads;
4244
//This is the start - max overhang
4345
CHRPOS thick_start;
4446
//This is the end + max overhang

0 commit comments

Comments
 (0)