-
Notifications
You must be signed in to change notification settings - Fork 2
/
detect_union_bin_errors.py
89 lines (80 loc) · 3.83 KB
/
detect_union_bin_errors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Copyright 2013-2015 James S Blachly, MD and The Ohio State University
#
# This file is part of Mucor.
#
# Mucor is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mucor is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mucor. If not, see <http://www.gnu.org/licenses/>.
import argparse
import HTSeq
import itertools
import os
from collections import defaultdict
def FindProblems(featureDict):
multipleCopies = defaultdict(list) # keys are gene, values are lists of affected contigs
# note that not all contigs need to be excluded for an affected gene
for feature, locs in featureDict.items():
for contig in locs.keys():
if len(locs[contig]) > 1:
# this feature has multiple copies on this contig
if ThereAreCopies(locs[contig]):
# these copies are problematic, ie: they are not overlapping features of the same name
multipleCopies[feature].append(contig)
else:
# although there are copies of this feature on the same contig, they overlap each other enough to be 1 bin
pass
return multipleCopies
def ThereAreCopies(locs):
# if the lowest ending point is less than the highest starting point, these are separate copies
locLowestEnd = float('inf')
locHighestStart = 0
for loc in locs:
if loc[1] < locLowestEnd:
locLowestEnd = loc[1]
if loc[0] > locHighestStart:
locHighestStart = loc[0]
if locLowestEnd < locHighestStart:
return True
return False
def WriteProblems(problems, outputDir):
output = open(outputDir + '/union_incompatible_genes.txt', 'w')
for gene in problems.keys():
output.write(str(gene + '\n'))
for contig in problems[gene]:
output.write(str(gene + '.' + contig + '\n'))
return True
def main():
'''
This tool was designed to aid users running Mucor with --union while using feature type = gene_name.
It will find features with the same name on the same contig, which cause large, problematic feature bins when running Mucor with --union.
The output is a text document to be placed into the current working directory where Mucor will be executed
It will be read automatically if present at runtimme
'''
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("-o", "--output_directory", required=True, help="Directory in which to place output")
parser.add_argument("-g", "--gff", required=True, help="Annotation GFF/GTF for feature binning")
parser.add_argument("-f", "--featuretype", required=True, help="Feature type into which to bin. Gencode GTF example: gene_name, gene_id, transcript_name, transcript_id, etc. ")
args = parser.parse_args()
gffFile = HTSeq.GFF_Reader(args.gff)
features = defaultdict(dict)
for feature in itertools.islice(gffFile, 0, None):
if feature.type == "gene":
name = feature.attr[args.featuretype]
try:
features[name][feature.iv.chrom].append((feature.iv.start,feature.iv.end))
except KeyError:
features[name][feature.iv.chrom] = [(feature.iv.start,feature.iv.end)]
problems = FindProblems(features)
WriteProblems(problems, os.path.expanduser(args.output_directory))
if __name__ == "__main__":
main()