Skip to content

Commit

Permalink
Initial commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
PolinaBevad committed Jun 17, 2019
0 parents commit e5da7e5
Show file tree
Hide file tree
Showing 25 changed files with 332 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
venv/*
.pytest_cache/*
Pipfile.lock
.idea/
Empty file added LICENSE
Empty file.
13 changes: 13 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
pysam = "*"
pytest = "*"

[requires]
python_version = "3.6"
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
TMB (tumor mutation burden)

### Requirements
* Python >=3.6
* pip3
* pipenv (it is the recommended way to rule packages and environments)
* pysam

Example of installation packages and Python 3.6 in Ubuntu:
`sudo apt install python3.6`
`alias python='usr/bin/python3.6`
`sudo apt install python3-pip`
`pip3 install pipenv`
Run `pipenv install pysam` in project directory
Binary file added data/bed_bam_test/test.bam
Binary file not shown.
Binary file added data/bed_bam_test/test.bam.bai
Binary file not shown.
4 changes: 4 additions & 0 deletions data/bed_bam_test/test1.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
1 87 89 gene
1 88 92 gene2
1 89 94 gene2
1 90 92 gene3
4 changes: 4 additions & 0 deletions data/bed_bam_test/test2.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
1 87 89
1 88 92
1 89 94
1 90 92
4 changes: 4 additions & 0 deletions data/bed_bam_test/test3.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
1 87 89
1 88
1 89 94
1 90 92
1 change: 1 addition & 0 deletions data/chr5_665281/dist.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chr5 665279 665338 GENE
Binary file added data/chr5_665281/normal_chr5_665281.bam
Binary file not shown.
Binary file added data/chr5_665281/normal_chr5_665281.bam.bai
Binary file not shown.
Binary file added data/chr5_665281/tumour_chr5_665281.bam
Binary file not shown.
Binary file added data/chr5_665281/tumour_chr5_665281.bam.bai
Binary file not shown.
49 changes: 49 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python

import argparse
from tmb.config import Config
from tmb.bedreader import BedReader
from tmb.distribution import calculate_tmb
from multiprocessing import Pool


def main():
config = parse_config()
exons = BedReader(config.bed).exonList

pool = Pool(processes=Config.th)
results = [pool.apply_async(calculate_tmb, args=(config.tumor, config.normal, exon)) for exon in exons]

print_header()
for output in results:
tmb = output.get()
print(tmb)


def parse_config():
parser = argparse.ArgumentParser()
required = parser.add_argument_group("required")
optional = parser.add_argument_group("optional")
optional.add_argument('--freq', type=float,
help="Minimum difference of frequencies for base to consider site as somatic. "
"Default: 5%% (0.05)")
optional.add_argument('--mapq', type=float, help="Minimum read mapping quality. Default: 10.0")
optional.add_argument('--baseq', type=int, help="Minimum base quiality. Default: 25")
optional.add_argument('--mincov', type=int,
help="Minimum coverage of base for nucleotide to be considered as somatic. "
"Default: 2")
optional.add_argument('--th', type=int, help="Number of threads for multiprocessing mode. Default: 1")
required.add_argument('--normal', required=True, type=str, help="Path to normal SAM/BAM/CRAM file.")
required.add_argument('--tumor', required=True, type=str, help="Path to tumor SAM/BAM/CRAM file.")
required.add_argument('--bed', required=True, type=str, help="Path to BED file.")

args = parser.parse_args()
return Config(args)


def print_header():
print("\t".join(['Chr', 'Start', 'End', 'Gene', 'SomaticSites', 'TMB']))


if __name__ == "__main__":
main()
Empty file added tests/__init__.py
Empty file.
29 changes: 29 additions & 0 deletions tests/test_bamreader_pytest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from tmb.bamreader import BamReader
from tmb.bedreader import BedReader

import pytest


def test_bam_test_read():
path = '../data/bed_bam_test/test.bam'
bam = BamReader(path).file
bam_iter = bam.fetch('1', 28234090, 28234093)
list1 = [x for x in bam_iter]
assert len(list1) == 4


def test_bed_test_read_correct():
path = '../data/bed_bam_test/test1.bed'
exons = BedReader(path).exonList
assert len(exons) == 4

path = '../data/bed_bam_test/test2.bed'
exons = BedReader(path).exonList
assert len(exons) == 4


def test_bed_test_read_incorrect():
path = '../data/bed_bam_test/test3.bed'
with pytest.raises(IndexError):
BedReader(path)

26 changes: 26 additions & 0 deletions tests/test_distribution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from numpy.testing import assert_almost_equal

from tmb.bedreader import BedReader
from tmb.distribution import calculate_tmb

import pytest


def test_distribution():
normal = '../data/chr5_665281/normal_chr5_665281.bam'
tumor = '../data/chr5_665281/tumour_chr5_665281.bam'
exons = BedReader('../data/chr5_665281/dist.bed').exonList

for exon in exons:
tmb = calculate_tmb(tumor, normal, exon)
assert tmb.somatic_sites == [665281, 665308, 665336]
assert_almost_equal(tmb.tmb, 50847.46)


def test_panel_indexes():
tumor = '../data/panel_az_600/Dev_731_GTL_16_5_Pool1-ready.bam'
normal = '../data/panel_az_600/Dev_731_NA12878a_Pool1-ready.bam'
exons = BedReader('../data/panel_az_600/panel_az600_chr7_MET.bed').exonList

for exon in exons:
tmb = calculate_tmb(tumor, normal, exon)
Empty file added tmb/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions tmb/bamreader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pysam


class BamReader:
def __init__(self, path):
extension = path.strip().split('.')[-1]
if extension == 'bam':
self.file = self.read_bam(path)
if extension == 'cram':
self.file = self.read_cram(path)
if extension == 'cram':
self.file = self.read_sam(path)

def read_bam(self, path):
file = pysam.AlignmentFile(path, "rb")
return file

def read_sam(self, path):
file = pysam.AlignmentFile(path, "r")
return file

def read_cram(self, path):
file = pysam.AlignmentFile(path, "rc")
return file
22 changes: 22 additions & 0 deletions tmb/bedreader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from tmb.exome import Exome


class BedReader:

def __init__(self, path):
extension = path.split('.')[-1]
if extension == 'bed':
self.exonList = self.read_bed(path)

def read_bed(self, path):
exonlist = []

file = open(path, "r")
for x in file:
line = x.split()
if len(line) == 4:
exome = Exome(str(line[0]), int(line[1]), int(line[2]), str(line[3]))
else:
exome = Exome(str(line[0]), int(line[1]), int(line[2]), '.')
exonlist.append(exome)
return exonlist
28 changes: 28 additions & 0 deletions tmb/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Configuration parameters for thresholds of quality etc.


class Config:
freq = 0.05
mapq = 10.0
baseq = 25
mincov = 2
th = 1

@staticmethod
def set_config(args):
if args.freq:
Config.freq = args.freq
if args.mapq:
Config.mapq = args.mapq
if args.baseq:
Config.baseq = args.baseq
if args.mincov:
Config.mincov = args.mincov
if args.th:
Config.th = args.th

def __init__(self, args):
self.normal = args.normal
self.tumor = args.tumor
self.bed = args.bed
self.set_config(args)
85 changes: 85 additions & 0 deletions tmb/distribution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from tmb.tmbresult import TMBResult
from tmb.bamreader import BamReader
from tmb.config import Config


def calculate_tmb(tumor_path, normal_path, exon):
tumor_distribution, normal_distribution = collect_distributions(tumor_path, normal_path, exon)
tmb, somatic_sites = hypotesis_test(tumor_distribution, normal_distribution, exon)
tmb = TMBResult(exon, tmb, somatic_sites)
return tmb


def collect_distributions(tumor_path, normal_path, exon):
tumor = BamReader(tumor_path).file
normal = BamReader(normal_path).file

tumor_distribution = fill_maps(tumor, exon)
# print(tumor_distribution)
normal_distribution = fill_maps(normal, exon)
# print(normal_distribution)
return tumor_distribution, normal_distribution


def fill_maps(bam, exon):
positions_to_acgt = {}
for i in range(exon.start, exon.end):
positions_to_acgt[i] = {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0, "DEL": 0}

# Pile can get reads that will cover this position in column-like form
pile = bam.pileup(exon.chr, exon.start, exon.end)
for pileupcolumn in pile:
position = pileupcolumn.pos
if position < exon.start or position >= exon.end:
continue

acgt_to_counts = {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0, "DEL": 0}
for pileupread in pileupcolumn.pileups:
if bad_read(pileupread):
continue
if pileupread.query_position is None:
base = 'DEL'
else:
base_quality = pileupread.alignment.query_qualities[pileupread.query_position]
if base_quality < Config.baseq:
break
base = pileupread.alignment.query_sequence[pileupread.query_position]
acgt_to_counts[base] += 1
positions_to_acgt[position] = acgt_to_counts
return positions_to_acgt


def hypotesis_test(tumor, normal, exon):
exone_len = exon.end - exon.start
somatic_sites = []
for position in tumor:
tumor_counts = list(tumor[position].values())
normal_counts = list(normal[position].values())

tumor_total_coverage = sum(tumor_counts)
normal_total_coverage = sum(normal_counts)

if tumor_total_coverage == 0 or normal_total_coverage == 0:
continue

tumor_percentage = list([x / tumor_total_coverage for x in tumor_counts])
normal_percentage = list([x / normal_total_coverage for x in normal_counts])

for i in range(len(tumor_counts)):
a = abs(tumor_percentage[i] - normal_percentage[i])
if a > Config.freq and tumor_counts[i] > Config.mincov and normal_counts[i] > Config.mincov:
# Sam is 0-based, extend position:
somatic_sites.append(position + 1)
break

tmb = round((len(somatic_sites) / exone_len) * 1000000, 2)
return tmb, somatic_sites


# Read doesn't fit criteria for quality
def bad_read(read):
if read.alignment.mapping_quality < Config.mapq:
return True

return False

16 changes: 16 additions & 0 deletions tmb/exome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
class Exome:
def __init__(self, chr, start, end, gene):
self.chr = chr
self.start = start
self.end = end
self.gene = gene
self.length = end - start

def __str__(self):
exon_description = ', '.join(['{key}={value}'.format(key=key, value=self.__dict__.get(key))
for key in self.__dict__])
return '\n' + exon_description




9 changes: 9 additions & 0 deletions tmb/tmbresult.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
class TMBResult:
def __init__(self, exon, tmb, somatic_sites):
self.exon = exon
self.tmb = tmb
self.somatic_sites = somatic_sites

def __str__(self):
return "\t".join([str(self.exon.chr), str(self.exon.start), str(self.exon.end), self.exon.gene,
str(len(self.somatic_sites)), str(self.somatic_sites), str(self.tmb)])

0 comments on commit e5da7e5

Please sign in to comment.