Skip to content

Commit 716a700

Browse files
authored
Merge pull request #247 from HadrienG/isize
Fix for insert size distributions
2 parents fcd3490 + 351a636 commit 716a700

File tree

2 files changed

+23
-13
lines changed

2 files changed

+23
-13
lines changed

Diff for: iss/bam.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def to_model(bam_path, output):
113113
"""
114114
logger = logging.getLogger(__name__)
115115

116-
insert_size_dist = []
116+
template_length_dist = []
117117
qualities_forward = []
118118
qualities_reverse = []
119119
subst_matrix_f = np.zeros([301, 16]) # we dont know the len of the reads
@@ -124,10 +124,10 @@ def to_model(bam_path, output):
124124
# read the bam file and extract info needed for modelling
125125
for read in read_bam(bam_path):
126126
# get insert size distribution
127-
if read.is_proper_pair:
127+
if read.is_paired:
128128
template_length = abs(read.template_length)
129-
i_size = template_length - (2 * len(read.seq))
130-
insert_size_dist.append(i_size)
129+
# i_size = template_length - (2 * len(read.seq))
130+
template_length_dist.append(template_length)
131131

132132
# get qualities
133133
if read.is_read1:
@@ -167,10 +167,6 @@ def to_model(bam_path, output):
167167
elif read.is_read2:
168168
indel_matrix_r[pos, indel] += 1
169169

170-
logger.info("Calculating insert size distribution")
171-
# insert_size = int(np.mean(insert_size_dist))
172-
hist_insert_size = modeller.insert_size(insert_size_dist)
173-
174170
logger.info("Calculating mean and base quality distribution")
175171
quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward)
176172
quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse)
@@ -209,6 +205,10 @@ def to_model(bam_path, output):
209205
ins_f, del_f = modeller.indel_matrix_to_choices(indel_matrix_f, read_length)
210206
ins_r, del_r = modeller.indel_matrix_to_choices(indel_matrix_r, read_length)
211207

208+
logger.info("Calculating insert size distribution")
209+
# insert_size = int(np.mean(insert_size_dist))
210+
hist_insert_size = modeller.insert_size(template_length_dist, read_length)
211+
212212
write_to_file(
213213
"kde",
214214
read_length,

Diff for: iss/modeller.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,29 @@
99
from iss import util
1010

1111

12-
def insert_size(insert_size_distribution):
12+
def insert_size(template_length_dist, read_length):
1313
"""Calculate cumulative distribution function from the raw insert size
1414
distributin. Uses 1D kernel density estimation.
1515
1616
Args:
17-
insert_size_distribution (list): list of insert sizes from aligned
18-
read pairs
17+
template_length_dist (list): List of template lengths from bam file.
18+
read_length (int): The length of the read.
1919
2020
Returns:
2121
1darray: a cumulative density function
2222
"""
23-
kde = stats.gaussian_kde(insert_size_distribution, bw_method=0.2 / np.std(insert_size_distribution, ddof=1))
24-
x_grid = np.linspace(min(insert_size_distribution), max(insert_size_distribution), 1000)
23+
# we want to remove zeroes and outliers
24+
tld = np.asarray(template_length_dist)
25+
min_mask = tld > 0
26+
tld = tld[min_mask]
27+
# 2000 is a common upper limit for template length for illumina sequencing
28+
max_mask = tld < 2000
29+
tld = tld[max_mask]
30+
31+
isd = tld - (2 * read_length) # convert to insert size
32+
33+
kde = stats.gaussian_kde(isd, bw_method=0.2 / np.std(isd, ddof=1))
34+
x_grid = np.linspace(min(isd), max(isd), 2000)
2535
kde = kde.evaluate(x_grid)
2636
cdf = np.cumsum(kde)
2737
cdf = cdf / cdf[-1]

0 commit comments

Comments
 (0)