Skip to content

Commit 9f5e456

Browse files
committed
fix: junctions extractor count overlapping read pairs once
- closes griffithlab#187, implemented by adding set of `reads` to `Junction`, and only incrementing `read_count` if read has not been seen yet - this commit should also increase RegTools speed by removing the barcode updating bottleneck caused by repeatedly copying the barcode map - updated regtools to v1.1.0
1 parent 1e3080c commit 9f5e456

File tree

3 files changed

+37
-31
lines changed

3 files changed

+37
-31
lines changed

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ include(TestHelper)
1111

1212
#versioning stuff
1313
set (regtools_VERSION_MAJOR 1)
14-
set (regtools_VERSION_MINOR 0)
14+
set (regtools_VERSION_MINOR 1)
1515
set (regtools_VERSION_PATCH 0)
1616

1717
configure_file (

src/junctions/junctions_extractor.cc

+34-30
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,8 @@ bool JunctionsExtractor::junction_qc(Junction &j1) {
202202
}
203203

204204
//Add a junction to the junctions map
205-
//The read_count field is the number of reads supporting the junction.
205+
//The read_count field is the number of reads supporting the junction,
206+
// and reads is a set of the read names supporting the junction
206207
int JunctionsExtractor::add_junction(Junction j1) {
207208
//Check junction_qc
208209
if(!junction_qc(j1)) {
@@ -225,44 +226,46 @@ int JunctionsExtractor::add_junction(Junction j1) {
225226
}
226227
string key = j1.chrom + string(":") + start + "-" + end + ":" + strand_proxy;
227228

228-
//Check if new junction
229+
//add new junction
229230
if(!junctions_.count(key)) {
230231
j1.name = get_new_junction_name();
231232
j1.read_count = 1;
232233
j1.score = common::num_to_str(j1.read_count);
233-
} else { //existing junction
234-
Junction j0 = junctions_[key];
235-
234+
junctions_[key] = j1;
235+
236+
} else { //update existing junction
237+
236238
if (output_barcodes_file_ != "NA"){
237-
unordered_map<string, int>::const_iterator it = j0.barcodes.find(j1.barcodes.begin()->first);
238-
239-
if (it != j0.barcodes.end()) {// barcode exists already
240-
j1.barcodes = j0.barcodes;
241-
j1.barcodes[it->first]++;
239+
// NOTE: Junction j1 will always have exactly one barcode,
240+
// that of the current alignment; i.e. j1.barcodes = {<barcode>: 1}
241+
auto it = junctions_[key].barcodes.find(j1.barcodes.begin()->first);
242+
if (it != junctions_[key].barcodes.end()) {// barcode exists already
243+
junctions_[key].barcodes[it->first]++;
242244
} else {
243-
// this block is where the slowness happens - not sure if it's the instantiation or the insertion
244-
// well, tried to get around instantion by just inserting into j0 but that made it like another 2x slower so I don't think it's that
245-
pair<string, int> tmp_barcode = *j1.barcodes.begin();
246-
j1.barcodes = j0.barcodes;
247-
j1.barcodes.insert(tmp_barcode);
245+
junctions_[key].barcodes[it->first] = 1;
248246
}
249247
}
250-
//increment read count
251-
j1.read_count = j0.read_count + 1;
252-
j1.score = common::num_to_str(j1.read_count);
253-
//Keep the same name
254-
j1.name = j0.name;
248+
// following barcodes convention, Junction j1 has one read,
249+
// that of the current alignment; i.e. j1.reads = {<read_name>}
250+
string read_name = *j1.reads.begin();
251+
//avoid counting the same paired-end read twice
252+
//if both segments overlap junction
253+
if (!junctions_[key].reads.count(read_name)) {
254+
junctions_[key].reads.insert(read_name);
255+
junctions_[key].read_count++;
256+
junctions_[key].score = common::num_to_str(junctions_[key].read_count);
257+
}
255258
//Check if thick starts are any better
256-
if(j0.thick_start < j1.thick_start)
257-
j1.thick_start = j0.thick_start;
258-
if(j0.thick_end > j1.thick_end)
259-
j1.thick_end = j0.thick_end;
260-
//preserve min anchor information
261-
j1.has_left_min_anchor = j1.has_left_min_anchor || j0.has_left_min_anchor;
262-
j1.has_right_min_anchor = j1.has_right_min_anchor || j0.has_right_min_anchor;
259+
if(j1.thick_start < junctions_[key].thick_start)
260+
junctions_[key].thick_start = j1.thick_start;
261+
if(j1.thick_end > junctions_[key].thick_end)
262+
junctions_[key].thick_end = j1.thick_end;
263+
//update min anchor information
264+
junctions_[key].has_left_min_anchor =
265+
junctions_[key].has_left_min_anchor || j1.has_left_min_anchor;
266+
junctions_[key].has_right_min_anchor =
267+
junctions_[key].has_right_min_anchor || j1.has_right_min_anchor;
263268
}
264-
//Add junction and check anchor while printing.
265-
junctions_[key] = j1;
266269
return 0;
267270
}
268271

@@ -426,8 +429,9 @@ int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t
426429

427430
Junction j1;
428431
j1.chrom = chr;
432+
j1.thick_start = read_pos; // start pos of read
429433
j1.start = read_pos; //maintain start pos of junction
430-
j1.thick_start = read_pos;
434+
j1.reads.insert(bam_get_qname(aln));
431435
string intron_motif;
432436

433437
if (output_barcodes_file_ != "NA"){

src/junctions/junctions_extractor.h

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ using namespace std;
3939
struct Junction : BED {
4040
//Number of reads supporting the junction
4141
unsigned int read_count;
42+
//Reads supporting the junction
43+
unordered_set<string> reads;
4244
//This is the start - max overhang
4345
CHRPOS thick_start;
4446
//This is the end + max overhang

0 commit comments

Comments
 (0)