Initial commit.

PolinaBevad · Jun 17, 2019 · e5da7e5 · e5da7e5
commit e5da7e5
Show file tree

Hide file tree

Showing 25 changed files with 332 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+venv/*
+.pytest_cache/*
+Pipfile.lock
+.idea/
diff --git a/LICENSE b/LICENSE
diff --git a/Pipfile b/Pipfile
@@ -0,0 +1,13 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+
+[packages]
+pysam = "*"
+pytest = "*"
+
+[requires]
+python_version = "3.6"
diff --git a/README.md b/README.md
@@ -0,0 +1,14 @@
+TMB (tumor mutation burden) 
+
+### Requirements
+* Python >=3.6
+* pip3
+* pipenv (the recommended way to manage packages and environments)
+* pysam
+
+Example of installation packages and Python 3.6 in Ubuntu:  
+`sudo apt install python3.6`  
+`alias python='/usr/bin/python3.6'`  
+`sudo apt install python3-pip`  
+`pip3 install pipenv`  
+Run `pipenv install pysam` in project directory
diff --git a/data/bed_bam_test/test.bam b/data/bed_bam_test/test.bam
diff --git a/data/bed_bam_test/test.bam.bai b/data/bed_bam_test/test.bam.bai
diff --git a/data/bed_bam_test/test1.bed b/data/bed_bam_test/test1.bed
@@ -0,0 +1,4 @@
+1   87  89  gene
+1   88  92  gene2
+1   89  94  gene2
+1   90  92  gene3
diff --git a/data/bed_bam_test/test2.bed b/data/bed_bam_test/test2.bed
@@ -0,0 +1,4 @@
+1   87  89
+1   88  92
+1   89  94
+1   90  92
diff --git a/data/bed_bam_test/test3.bed b/data/bed_bam_test/test3.bed
@@ -0,0 +1,4 @@
+1   87  89
+1   88
+1   89  94
+1   90  92
diff --git a/data/chr5_665281/dist.bed b/data/chr5_665281/dist.bed
@@ -0,0 +1 @@
+chr5   665279  665338   GENE
diff --git a/data/chr5_665281/normal_chr5_665281.bam b/data/chr5_665281/normal_chr5_665281.bam
diff --git a/data/chr5_665281/normal_chr5_665281.bam.bai b/data/chr5_665281/normal_chr5_665281.bam.bai
diff --git a/data/chr5_665281/tumour_chr5_665281.bam b/data/chr5_665281/tumour_chr5_665281.bam
diff --git a/data/chr5_665281/tumour_chr5_665281.bam.bai b/data/chr5_665281/tumour_chr5_665281.bam.bai
diff --git a/main.py b/main.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+import argparse
+from tmb.config import Config
+from tmb.bedreader import BedReader
+from tmb.distribution import calculate_tmb
+from multiprocessing import Pool
+
+
+def main():
+    """Compute TMB for every BED interval and print one tab-separated row each.
+
+    The intervals are dispatched to a pool of ``Config.th`` worker processes.
+    Rows are printed in the order of the BED file, each one as soon as its
+    result is available.
+    """
+    config = parse_config()
+    exons = BedReader(config.bed).exonList
+
+    # The context manager tears the pool down on exit, so worker processes
+    # are not leaked. Every result is collected before leaving the block.
+    with Pool(processes=Config.th) as pool:
+        results = [pool.apply_async(calculate_tmb, args=(config.tumor, config.normal, exon)) for exon in exons]
+
+        print_header()
+        for output in results:
+            print(output.get())
+
+
+def parse_config():
+    """Parse the command-line arguments and wrap them in a ``Config``.
+
+    The optional thresholds have no argparse default, so an option that is
+    not given on the command line reaches ``Config`` as ``None``.
+
+    Returns:
+        Config: built from the parsed argument namespace.
+    """
+    parser = argparse.ArgumentParser()
+    required = parser.add_argument_group("required")
+    optional = parser.add_argument_group("optional")
+    optional.add_argument('--freq', type=float,
+                          help="Minimum difference of frequencies for base to consider site as somatic. "
+                               "Default: 5%% (0.05)")
+    optional.add_argument('--mapq', type=float, help="Minimum read mapping quality. Default: 10.0")
+    optional.add_argument('--baseq', type=int, help="Minimum base quality. Default: 25")
+    optional.add_argument('--mincov', type=int,
+                          help="Minimum coverage of base for nucleotide to be considered as somatic. "
+                               "Default: 2")
+    optional.add_argument('--th', type=int, help="Number of threads for multiprocessing mode. Default: 1")
+    required.add_argument('--normal', required=True, type=str, help="Path to normal SAM/BAM/CRAM file.")
+    required.add_argument('--tumor', required=True, type=str, help="Path to tumor SAM/BAM/CRAM file.")
+    required.add_argument('--bed', required=True, type=str, help="Path to BED file.")
+
+    args = parser.parse_args()
+    return Config(args)
+
+
+def print_header():
+    """Print the tab-separated column header for the result rows.
+
+    Each result row carries seven fields: the number of somatic sites and
+    the list of their positions are separate columns, so both are named here.
+    """
+    print("\t".join(['Chr', 'Start', 'End', 'Gene', 'SomaticSitesCount', 'SomaticSites', 'TMB']))
+
+
+# Script entry point: run the CLI only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_bamreader_pytest.py b/tests/test_bamreader_pytest.py
@@ -0,0 +1,29 @@
+from tmb.bamreader import BamReader
+from tmb.bedreader import BedReader
+
+import pytest
+
+
+def test_bam_test_read():
+    """Fetching the fixed window of the test BAM must yield four reads."""
+    reader = BamReader('../data/bed_bam_test/test.bam')
+    reads = list(reader.file.fetch('1', 28234090, 28234093))
+    assert len(reads) == 4
+
+
+def test_bed_test_read_correct():
+    """Both the 4-column and the 3-column BED fixtures parse to four intervals."""
+    bed_paths = ('../data/bed_bam_test/test1.bed', '../data/bed_bam_test/test2.bed')
+    for bed_path in bed_paths:
+        parsed = BedReader(bed_path).exonList
+        assert len(parsed) == 4
+
+
+def test_bed_test_read_incorrect():
+    """A BED line without an end coordinate must raise IndexError."""
+    with pytest.raises(IndexError):
+        BedReader('../data/bed_bam_test/test3.bed')
+
diff --git a/tests/test_distribution.py b/tests/test_distribution.py
@@ -0,0 +1,26 @@
+from numpy.testing import assert_almost_equal
+
+from tmb.bedreader import BedReader
+from tmb.distribution import calculate_tmb
+
+import pytest
+
+
+def test_distribution():
+    """Check the somatic sites and the TMB value on the chr5 fixture pair."""
+    tumor_bam = '../data/chr5_665281/tumour_chr5_665281.bam'
+    normal_bam = '../data/chr5_665281/normal_chr5_665281.bam'
+    intervals = BedReader('../data/chr5_665281/dist.bed').exonList
+
+    expected_sites = [665281, 665308, 665336]
+    for interval in intervals:
+        result = calculate_tmb(tumor_bam, normal_bam, interval)
+        assert result.somatic_sites == expected_sites
+        assert_almost_equal(result.tmb, 50847.46)
+
+
+def test_panel_indexes():
+    """Smoke test: the TMB calculation runs over every interval of the panel BED."""
+    panel_bed = '../data/panel_az_600/panel_az600_chr7_MET.bed'
+    tumor_bam = '../data/panel_az_600/Dev_731_GTL_16_5_Pool1-ready.bam'
+    normal_bam = '../data/panel_az_600/Dev_731_NA12878a_Pool1-ready.bam'
+
+    # No value is asserted; the test only requires that no call raises.
+    for interval in BedReader(panel_bed).exonList:
+        calculate_tmb(tumor_bam, normal_bam, interval)
diff --git a/tmb/__init__.py b/tmb/__init__.py
diff --git a/tmb/bamreader.py b/tmb/bamreader.py
@@ -0,0 +1,24 @@
+import pysam
+
+
+class BamReader:
+    """Open an alignment file with pysam, choosing the mode from the extension.
+
+    The opened ``pysam.AlignmentFile`` is exposed as ``self.file``.
+    """
+
+    def __init__(self, path):
+        """Open ``path`` as BAM, CRAM or SAM according to its file extension.
+
+        Raises:
+            ValueError: if the extension is not ``bam``, ``cram`` or ``sam``.
+        """
+        extension = path.strip().split('.')[-1].lower()
+        if extension == 'bam':
+            self.file = self.read_bam(path)
+        elif extension == 'cram':
+            self.file = self.read_cram(path)
+        elif extension == 'sam':
+            # This branch used to test 'cram' a second time, so SAM files
+            # never got a ``file`` attribute and CRAM files were reopened
+            # in SAM mode.
+            self.file = self.read_sam(path)
+        else:
+            raise ValueError("Unsupported alignment file extension: {}".format(path))
+
+    def read_bam(self, path):
+        """Open ``path`` in binary BAM mode ("rb")."""
+        return pysam.AlignmentFile(path, "rb")
+
+    def read_sam(self, path):
+        """Open ``path`` in text SAM mode ("r")."""
+        return pysam.AlignmentFile(path, "r")
+
+    def read_cram(self, path):
+        """Open ``path`` in CRAM mode ("rc")."""
+        return pysam.AlignmentFile(path, "rc")
diff --git a/tmb/bedreader.py b/tmb/bedreader.py
@@ -0,0 +1,22 @@
+from tmb.exome import Exome
+
+
+class BedReader:
+    """Parse a BED file into a list of ``Exome`` intervals (``self.exonList``)."""
+
+    def __init__(self, path):
+        """Read the intervals from ``path``.
+
+        Raises:
+            ValueError: if ``path`` does not end in ``.bed``.
+            IndexError: if a non-blank line has fewer than three columns.
+        """
+        extension = path.split('.')[-1]
+        if extension != 'bed':
+            # Without this, ``exonList`` would silently stay undefined and the
+            # caller would fail later with an AttributeError.
+            raise ValueError("Expected a .bed file, got: {}".format(path))
+        self.exonList = self.read_bed(path)
+
+    def read_bed(self, path):
+        """Return one ``Exome`` per non-blank line of the BED file.
+
+        Columns are whitespace-separated: chromosome, start, end and an
+        optional name. When the name column is missing the gene is '.'.
+        """
+        exonlist = []
+
+        # ``with`` guarantees the handle is closed even if a line fails to parse.
+        with open(path, "r") as bed_file:
+            for raw_line in bed_file:
+                fields = raw_line.split()
+                if not fields:
+                    # Tolerate blank lines, e.g. a trailing newline at end of file.
+                    continue
+                # Lines with more than four columns still carry the name in
+                # the fourth one, so accept any line with at least four.
+                gene = fields[3] if len(fields) >= 4 else '.'
+                exonlist.append(Exome(fields[0], int(fields[1]), int(fields[2]), gene))
+        return exonlist
diff --git a/tmb/config.py b/tmb/config.py
@@ -0,0 +1,28 @@
+# Configuration parameters for thresholds of quality etc.
+
+
+class Config:
+    """Run configuration: input paths per instance, thresholds shared class-wide.
+
+    The thresholds are class attributes so that other modules can read them
+    as ``Config.<name>`` without holding an instance.
+    """
+
+    # Minimum difference of base frequencies for a site to count as somatic.
+    freq = 0.05
+    # Minimum read mapping quality.
+    mapq = 10.0
+    # Minimum base quality.
+    baseq = 25
+    # Minimum per-base coverage.
+    mincov = 2
+    # Number of worker processes.
+    th = 1
+
+    @staticmethod
+    def set_config(args):
+        """Override the class-level defaults with values present on ``args``.
+
+        A threshold is overridden whenever it is not ``None``, so an explicit
+        zero (e.g. ``--mapq 0``) is honoured instead of being mistaken for
+        "not given".
+        """
+        for name in ("freq", "mapq", "baseq", "mincov"):
+            value = getattr(args, name, None)
+            if value is not None:
+                setattr(Config, name, value)
+        # A worker count of zero is meaningless, so a falsy value keeps the default.
+        workers = getattr(args, "th", None)
+        if workers:
+            Config.th = workers
+
+    def __init__(self, args):
+        """Store the input paths from ``args`` and apply its threshold overrides."""
+        self.normal = args.normal
+        self.tumor = args.tumor
+        self.bed = args.bed
+        self.set_config(args)
diff --git a/tmb/distribution.py b/tmb/distribution.py
@@ -0,0 +1,85 @@
+from tmb.tmbresult import TMBResult
+from tmb.bamreader import BamReader
+from tmb.config import Config
+
+
+def calculate_tmb(tumor_path, normal_path, exon):
+    """Return the ``TMBResult`` for one interval of a tumor/normal pair.
+
+    Collects the per-position base counts of both samples over ``exon``,
+    runs the somatic-site test on them and wraps the outcome.
+    """
+    distributions = collect_distributions(tumor_path, normal_path, exon)
+    burden, sites = hypotesis_test(*distributions, exon)
+    return TMBResult(exon, burden, sites)
+
+
+def collect_distributions(tumor_path, normal_path, exon):
+    """Return the per-position base counts of both samples over ``exon``.
+
+    Returns:
+        tuple: ``(tumor_distribution, normal_distribution)``, each the mapping
+        produced by ``fill_maps``.
+    """
+    # Open both alignment files as context managers so their handles are
+    # released once the counts are collected. This function runs once per
+    # interval, so unclosed files would otherwise accumulate.
+    with BamReader(tumor_path).file as tumor, BamReader(normal_path).file as normal:
+        tumor_distribution = fill_maps(tumor, exon)
+        normal_distribution = fill_maps(normal, exon)
+    return tumor_distribution, normal_distribution
+
+
+def fill_maps(bam, exon):
+    """Count the bases observed at every position of ``exon``.
+
+    Args:
+        bam: opened alignment file providing ``pileup``.
+        exon: interval with ``chr``, ``start`` and ``end`` (end exclusive).
+
+    Returns:
+        dict: position -> {"A", "C", "G", "T", "N", "DEL"} -> read count.
+        Positions without any pileup column keep all-zero counts.
+    """
+    bases = ("A", "C", "G", "T", "N", "DEL")
+    positions_to_acgt = {pos: dict.fromkeys(bases, 0) for pos in range(exon.start, exon.end)}
+
+    # Pile can get reads that will cover this position in column-like form
+    for pileupcolumn in bam.pileup(exon.chr, exon.start, exon.end):
+        position = pileupcolumn.pos
+        # Columns outside [start, end) are not part of the interval.
+        if position < exon.start or position >= exon.end:
+            continue
+
+        acgt_to_counts = dict.fromkeys(bases, 0)
+        for pileupread in pileupcolumn.pileups:
+            if bad_read(pileupread):
+                continue
+            query_position = pileupread.query_position
+            if query_position is None:
+                # The read has no base at this column; count it as a deletion.
+                base = 'DEL'
+            else:
+                alignment = pileupread.alignment
+                if alignment.query_qualities[query_position] < Config.baseq:
+                    # Skip only this low-quality base. A ``break`` here would
+                    # also discard every remaining read of the column.
+                    continue
+                base = alignment.query_sequence[query_position]
+            acgt_to_counts[base] += 1
+        positions_to_acgt[position] = acgt_to_counts
+    return positions_to_acgt
+
+
+def hypotesis_test(tumor, normal, exon):
+    """Find somatic sites in ``exon`` and compute its TMB.
+
+    A position is reported when, for at least one base, the tumor and normal
+    frequencies differ by more than ``Config.freq`` and that base is seen
+    more than ``Config.mincov`` times in both samples.
+
+    Args:
+        tumor: position -> base -> count mapping for the tumor sample.
+        normal: the same mapping for the normal sample.
+        exon: interval with ``start`` and ``end``.
+
+    Returns:
+        tuple: ``(tmb, somatic_sites)``. ``tmb`` is the number of somatic
+        sites per million positions rounded to two decimals, and
+        ``somatic_sites`` holds 1-based positions.
+    """
+    exone_len = exon.end - exon.start
+    somatic_sites = []
+    for position, tumor_by_base in tumor.items():
+        tumor_counts = list(tumor_by_base.values())
+        normal_counts = list(normal[position].values())
+
+        tumor_total_coverage = sum(tumor_counts)
+        normal_total_coverage = sum(normal_counts)
+
+        # Frequencies are undefined without coverage in both samples.
+        if tumor_total_coverage == 0 or normal_total_coverage == 0:
+            continue
+
+        for tumor_count, normal_count in zip(tumor_counts, normal_counts):
+            difference = abs(tumor_count / tumor_total_coverage - normal_count / normal_total_coverage)
+            if difference > Config.freq and tumor_count > Config.mincov and normal_count > Config.mincov:
+                # Sam is 0-based, extend position:
+                somatic_sites.append(position + 1)
+                break
+
+    # An empty interval (start == end) has no positions to normalise by.
+    # Report a burden of zero instead of dividing by zero.
+    if exone_len <= 0:
+        return 0.0, somatic_sites
+
+    tmb = round((len(somatic_sites) / exone_len) * 1000000, 2)
+    return tmb, somatic_sites
+
+
+# Read doesn't fit criteria for quality
+def bad_read(read):
+    """Return True when the read's mapping quality is below ``Config.mapq``."""
+    return read.alignment.mapping_quality < Config.mapq
+
diff --git a/tmb/exome.py b/tmb/exome.py
@@ -0,0 +1,16 @@
+class Exome:
+    """One BED interval: chromosome, start, end, gene label and derived length."""
+
+    def __init__(self, chr, start, end, gene):
+        """Store the interval fields and derive ``length`` as ``end - start``."""
+        self.chr, self.start, self.end, self.gene = chr, start, end, gene
+        self.length = end - start
+
+    def __str__(self):
+        """Return a newline followed by comma-separated ``name=value`` pairs."""
+        pairs = ['{}={}'.format(name, value) for name, value in vars(self).items()]
+        return '\n' + ', '.join(pairs)
+
+
+
+
diff --git a/tmb/tmbresult.py b/tmb/tmbresult.py
@@ -0,0 +1,9 @@
+class TMBResult:
+    """TMB outcome for one interval: the interval, its TMB and its somatic sites."""
+
+    def __init__(self, exon, tmb, somatic_sites):
+        """Keep the interval, the TMB value and the list of somatic positions."""
+        self.exon, self.tmb, self.somatic_sites = exon, tmb, somatic_sites
+
+    def __str__(self):
+        """Return the tab-separated row: interval fields, site count, sites, TMB."""
+        interval = self.exon
+        sites = self.somatic_sites
+        columns = [
+            str(interval.chr),
+            str(interval.start),
+            str(interval.end),
+            interval.gene,
+            str(len(sites)),
+            str(sites),
+            str(self.tmb),
+        ]
+        return "\t".join(columns)
-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
+87  89
+88  92
+89  94
+90  92