Merge pull request #247 from HadrienG/isize

HadrienG · web-flow · commit 716a70052bf0 · 2024-01-08T11:27:33.000+01:00
Fix for insert size distributions
diff --git a/iss/bam.py b/iss/bam.py
@@ -113,7 +113,7 @@ def to_model(bam_path, output):
     """
     logger = logging.getLogger(__name__)
 
-    insert_size_dist = []
+    template_length_dist = []
     qualities_forward = []
     qualities_reverse = []
     subst_matrix_f = np.zeros([301, 16])  # we dont know the len of the reads
@@ -124,10 +124,10 @@ def to_model(bam_path, output):
     # read the bam file and extract info needed for modelling
     for read in read_bam(bam_path):
         # get insert size distribution
-        if read.is_proper_pair:
+        if read.is_paired:
             template_length = abs(read.template_length)
-            i_size = template_length - (2 * len(read.seq))
-            insert_size_dist.append(i_size)
+            # i_size = template_length - (2 * len(read.seq))
+            template_length_dist.append(template_length)
 
         # get qualities
         if read.is_read1:
@@ -167,10 +167,6 @@ def to_model(bam_path, output):
                 elif read.is_read2:
                     indel_matrix_r[pos, indel] += 1
 
-    logger.info("Calculating insert size distribution")
-    # insert_size = int(np.mean(insert_size_dist))
-    hist_insert_size = modeller.insert_size(insert_size_dist)
-
     logger.info("Calculating mean and base quality distribution")
     quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward)
     quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse)
@@ -209,6 +205,10 @@ def to_model(bam_path, output):
     ins_f, del_f = modeller.indel_matrix_to_choices(indel_matrix_f, read_length)
     ins_r, del_r = modeller.indel_matrix_to_choices(indel_matrix_r, read_length)
 
+    logger.info("Calculating insert size distribution")
+    # insert_size = int(np.mean(insert_size_dist))
+    hist_insert_size = modeller.insert_size(template_length_dist, read_length)
+
     write_to_file(
         "kde",
         read_length,
diff --git a/iss/modeller.py b/iss/modeller.py
@@ -9,19 +9,29 @@
 from iss import util
 
 
-def insert_size(insert_size_distribution):
+def insert_size(template_length_dist, read_length):
     """Calculate cumulative distribution function from the raw insert size
     distributin. Uses 1D kernel density estimation.
 
     Args:
-        insert_size_distribution (list): list of insert sizes from aligned
-        read pairs
+        template_length_dist (list): List of template lengths from bam file.
+        read_length (int): The length of the read.
 
     Returns:
         1darray: a cumulative density function
     """
-    kde = stats.gaussian_kde(insert_size_distribution, bw_method=0.2 / np.std(insert_size_distribution, ddof=1))
-    x_grid = np.linspace(min(insert_size_distribution), max(insert_size_distribution), 1000)
+    # Drop non-positive template lengths and outliers before fitting the KDE.
+    tld = np.asarray(template_length_dist)
+    min_mask = tld > 0
+    tld = tld[min_mask]
+    # 2000 is a common upper limit for template length in Illumina sequencing.
+    max_mask = tld < 2000
+    tld = tld[max_mask]
+
+    isd = tld - (2 * read_length)  # convert to insert size
+
+    kde = stats.gaussian_kde(isd, bw_method=0.2 / np.std(isd, ddof=1))
+    x_grid = np.linspace(min(isd), max(isd), 2000)
     kde = kde.evaluate(x_grid)
     cdf = np.cumsum(kde)
     cdf = cdf / cdf[-1]