Skip to content

Commit 409a492

Browse files
committed
feat: added per-read flag and map quality filtering
- closes griffithlab#183, flag based filtering - closes griffithlab#189, mapping quality based filtering - option '-F' filters reads containing any of these flags - option '-f' filters reads not containing all these flags - option '-q' filters reads below this mapping quality
1 parent 1260bf9 commit 409a492

File tree

4 files changed

+74
-6
lines changed

4 files changed

+74
-6
lines changed

src/cis-splice-effects/cis_splice_effects_identifier.cc

+16-2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ void CisSpliceEffectsIdentifier::usage(ostream& out) {
4646
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
4747
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
4848
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
49+
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
50+
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
51+
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
4952
out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in.\n"
5053
<< "\t\t\t " << "The tool identifies events in variant.start +/- w basepairs.\n"
5154
<< "\t\t\t " << "Default behaviour is to look at the window between previous and next exons." << endl;
@@ -113,7 +116,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
113116
optind = 1; //Reset before parsing again.
114117
stringstream help_ss;
115118
char c;
116-
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:b:C")) != -1) {
119+
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:f:F:q:b:C")) != -1) {
117120
switch(c) {
118121
case 'o':
119122
output_file_ = string(optarg);
@@ -170,6 +173,15 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
170173
case 'M':
171174
max_intron_length_ = atoi(optarg);
172175
break;
176+
case 'f':
177+
require_flags_ = atoi(optarg);
178+
break;
179+
case 'F':
180+
filter_flags_ = atoi(optarg);
181+
break;
182+
case 'q':
183+
min_map_qual_ = atoi(optarg);
184+
break;
173185
case 'b':
174186
output_barcodes_file_ = string(optarg);
175187
break;
@@ -285,7 +297,9 @@ void CisSpliceEffectsIdentifier::identify() {
285297
} else {
286298
ref_to_pass = "NA";
287299
}
288-
JunctionsExtractor je1(bam_, variant_region, strandness_, strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, ref_to_pass);
300+
JunctionsExtractor je1(bam_, variant_region, strandness_,
301+
strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_,
302+
filter_flags_, require_flags_, min_map_qual_, ref_to_pass);
289303
je1.identify_junctions_from_BAM();
290304
vector<Junction> junctions = je1.get_all_junctions();
291305
//Add all the junctions to the unique set

src/cis-splice-effects/cis_splice_effects_identifier.h

+10-1
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,19 @@ class CisSpliceEffectsIdentifier {
8787
//tag used in BAM to denote strand, default "XS"
8888
string strand_tag_;
8989
//Minimum anchor length for junctions
90-
//Junctions need atleast this many bp overlap
90+
//Junctions need at least this many bp overlap
9191
// on both ends.
9292
uint32_t min_anchor_length_;
9393
//Minimum length of an intron, i.e min junction width
9494
uint32_t min_intron_length_;
9595
//Maximum length of an intron, i.e max junction width
9696
uint32_t max_intron_length_;
97+
//filter reads containing any of these flags
98+
uint16_t filter_flags_;
99+
// filter reads not containing all of these flags
100+
uint16_t require_flags_;
101+
// filter reads below the minimum mapping quality
102+
uint8_t min_map_qual_;
97103
//whether to override strand of extracted junctions using intron-motif method
98104
bool override_strand_with_canonical_intron_motif_;
99105
public:
@@ -114,6 +120,9 @@ class CisSpliceEffectsIdentifier {
114120
min_anchor_length_(8),
115121
min_intron_length_(70),
116122
max_intron_length_(500000),
123+
filter_flags_(0),
124+
require_flags_(0),
125+
min_map_qual_(0),
117126
override_strand_with_canonical_intron_motif_(false) {}
118127
//Destructor
119128
~CisSpliceEffectsIdentifier() {

src/junctions/junctions_extractor.cc

+30-2
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
4343
optind = 1; //Reset before parsing again.
4444
int c;
4545
stringstream help_ss;
46-
while((c = getopt(argc, argv, "ha:m:M:o:r:t:s:b:")) != -1) {
46+
while((c = getopt(argc, argv, "ha:m:M:f:F:q:o:r:t:s:b:")) != -1) {
4747
switch(c) {
4848
case 'h':
4949
usage(help_ss);
@@ -57,6 +57,15 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
5757
case 'M':
5858
max_intron_length_ = atoi(optarg);
5959
break;
60+
case 'f':
61+
require_flags_ = atoi(optarg);
62+
break;
63+
case 'F':
64+
filter_flags_ = atoi(optarg);
65+
break;
66+
case 'q':
67+
min_map_qual_ = atoi(optarg);
68+
break;
6069
case 'o':
6170
output_file_ = string(optarg);
6271
break;
@@ -108,10 +117,17 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
108117
throw runtime_error("Strandness mode 'intron-motif' requires a fasta file!\n\n");
109118
}
110119
}
120+
if ( (require_flags_ & filter_flags_) != 0) {
121+
usage();
122+
throw runtime_error("No overlap allowed between '-f' and '-F' options (same flag filtered and required)!\n\n");
123+
}
111124

112125
cerr << "Minimum junction anchor length: " << min_anchor_length_ << endl;
113126
cerr << "Minimum intron length: " << min_intron_length_ << endl;
114127
cerr << "Maximum intron length: " << max_intron_length_ << endl;
128+
cerr << "Require alignment flags: " << require_flags_ << endl;
129+
cerr << "Filter alignment flags: " << filter_flags_ << endl;
130+
cerr << "Minimum alignment mapping quality: " << int(min_map_qual_) << endl;
115131
cerr << "Alignment: " << bam_ << endl;
116132
cerr << "Output file: " << output_file_ << endl;
117133
if (output_barcodes_file_ != "NA"){
@@ -130,6 +146,9 @@ int JunctionsExtractor::usage(ostream& out) {
130146
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
131147
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
132148
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
149+
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
150+
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
151+
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
133152
out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
134153
out << "\t\t" << "-r STR\tThe region to identify junctions \n"
135154
<< "\t\t\t " << "in \"chr:start-end\" format. Entire BAM by default." << endl;
@@ -358,7 +377,7 @@ void JunctionsExtractor::set_junction_strand(bam1_t *aln, Junction& j1, string i
358377
return;
359378
}
360379

361-
//Get the the barcode
380+
//Get the barcode
362381
void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
363382
uint8_t *p = bam_aux_get(aln, barcode_tag_.c_str());
364383
if(p != NULL) {
@@ -373,6 +392,14 @@ void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
373392
}
374393
}
375394

395+
//Filters alignments based on filtering flags and mapping quality
396+
bool JunctionsExtractor::filter_alignment(bam_hdr_t *header, bam1_t *aln) {
397+
if ((aln->core.flag & filter_flags_) != 0) return true;
398+
if ((aln->core.flag | require_flags_) != aln->core.flag) return true;
399+
if (aln->core.qual < min_map_qual_) return true;
400+
return false;
401+
}
402+
376403
//Parse junctions from the read and store in junction map
377404
int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t *aln) {
378405
int n_cigar = aln->core.n_cigar;
@@ -523,6 +550,7 @@ int JunctionsExtractor::identify_junctions_from_BAM() {
523550
//Initiate the alignment record
524551
bam1_t *aln = bam_init1();
525552
while(sam_itr_next(in, iter, aln) >= 0) {
553+
if (filter_alignment(header, aln)) continue;
526554
parse_alignment_into_junctions(header, aln);
527555
}
528556
hts_itr_destroy(iter);

src/junctions/junctions_extractor.h

+18-1
Original file line numberDiff line numberDiff line change
@@ -180,12 +180,21 @@ class JunctionsExtractor {
180180
string strand_tag_;
181181
//tag used in BAM to denote single cell barcode
182182
string barcode_tag_;
183+
//filter reads containing any of these flags
184+
uint16_t filter_flags_;
185+
// filter reads not containing all of these flags
186+
uint16_t require_flags_;
187+
// filter reads below the minimum mapping quality
188+
uint8_t min_map_qual_;
183189
public:
184190
//Default constructor
185191
JunctionsExtractor() {
186192
min_anchor_length_ = 8;
187193
min_intron_length_ = 70;
188194
max_intron_length_ = 500000;
195+
filter_flags_ = 0;
196+
require_flags_ = 0;
197+
min_map_qual_ = 0;
189198
junctions_sorted_ = false;
190199
strandness_ = -1;
191200
strand_tag_ = "XS";
@@ -204,14 +213,20 @@ class JunctionsExtractor {
204213
uint32_t min_anchor_length1,
205214
uint32_t min_intron_length1,
206215
uint32_t max_intron_length1,
216+
uint16_t filter_flags,
217+
uint16_t require_flags,
218+
uint8_t min_map_qual,
207219
string ref1) :
208220
bam_(bam1),
209221
region_(region1),
210222
strandness_(strandness1),
211223
strand_tag_(strand_tag1),
212224
min_anchor_length_(min_anchor_length1),
213-
min_intron_length_(min_intron_length1),
225+
min_intron_length_(min_anchor_length1),
214226
max_intron_length_(max_intron_length1),
227+
filter_flags_(filter_flags),
228+
require_flags_(require_flags),
229+
min_map_qual_(min_map_qual),
215230
ref_(ref1) {
216231
junctions_sorted_ = false;
217232
output_file_ = "NA";
@@ -241,6 +256,8 @@ class JunctionsExtractor {
241256
void create_junctions_vector();
242257
//Pull out the cigar string from the read
243258
int parse_read(bam_hdr_t *header, bam1_t *aln);
259+
//Returns whether alignment should be filtered from junction analysis
260+
bool filter_alignment(bam_hdr_t *header, bam1_t *aln);
244261
//Parse junctions from the read and store in junction map
245262
int parse_cigar_into_junctions(string chr, int read_pos,
246263
uint32_t *cigar, int n_cigar);

0 commit comments

Comments
 (0)