-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractsnps.py
executable file
·165 lines (126 loc) · 4.06 KB
/
extractsnps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
# coding: utf-8
import os
import argparse
import glob
from itertools import combinations
from functools import partial
import multiprocessing
import subprocess
class Pairwise:
"""Calculated pairwise differnce counts between two isolate multifasta files
Attributes:
isolate1: String representing the name of the first isolate in the comparison
snpDiff: Integer representing the number of snp differnces calculated by GATK HaplotypeCaller
"""
def __init__(self,isolate1,snpDiff):
self.isolate1 = isolate1
self.snpDiff = snpDiff
"""
Returns a new Pairwise object
"""
def vcfOutputParser(logList,baseName):
"""Parses vcf output file
Params
------
logList: String
A list of the lines in the log file created by vcf filtering command
baseName: String
Basename of the isolate
Returns
-------
snpCount: String
The number of SNPs after filtering
"""
snps = 0
#baseName = baseName.split('/')[-1]
for line in logList:
if line.startswith('After filtering'):
if 'possible' in line:
snps = int(line.split(' ')[3])
snpCount = Pairwise(baseName,snps)
return(snpCount)
def extractSNPs(vcfFile, baseName):
"""Performs vcf filtering
Params
------
vcfFile: String
Input vcf file
Returns
-------
snpCount: String
The number of SNPs after filtering
"""
r1 = subprocess.Popen(('vcftools','--vcf',vcfFile,'--freq'),stderr=subprocess.PIPE)
r2 =(r1.stderr.read().decode('ascii'))
r2 = r2.split('\n')
snpCount = vcfOutputParser(r2,baseName)
return(snpCount)
def listener_process(q):
"""Sets the logging file for the listener process
Params
------
q: queue
queue for logging events
Returns
-------
"""
with open(logFile,'w') as f:
while 1:
msg = q.get()
if msg == 'kill':
break
f.write(str(msg)+'\n')
f.flush()
def bcftoolsParallelFunctions(vcfFile, q):
"""Run SNP calling functions
Params
------
vcfFile: String
File path of the input vcf file
Returns
-------
snpCount: String
Count of SNP differences between isolate and reference
"""
baseName = os.path.splitext(os.path.basename(vcfFile))[0]
snpCount = extractSNPs(vcfFile, baseName)
return(snpCount)
def ParallelRunner(parallelFunctions, inputFiles):
""" Run functions on multiple files in parallel
Params
------
myFunction: String
Name of the function to run
inputFiles: String
list of input files to run in parallel
Returns
-------
"""
manager = multiprocessing.Manager()
q = manager.Queue()
pool = multiprocessing.Pool(processes=args.numcores)
watcher = pool.apply_async(listener_process,(q,))
parallelStatic = partial(parallelFunctions,q=q)
result_list = pool.map(parallelStatic,inputFiles)
q.put('kill')
pool.close()
pool.join()
return(result_list)
parser = argparse.ArgumentParser(description='Script takes bam files and performs SNP calling against a reference sequence using bcftools mpileup')
parser.add_argument('-d', '--directory',type=str,help='Enter the path to the folder containing the bam files to be used')
parser.add_argument('-n', '--numcores',type=int,help='Enter the number of cores you would like to use during processing')
parser.add_argument('-o', '--output',type=str,help='Enter a name for the output file')
args = parser.parse_args()
# Starting from the vcf files
vcfFiles = sorted(glob.glob(args.directory+'/'+'*deDup.vcf'))
logFile = 'snpcaller.log'
# Use parallel processing from Sean's functions to process the samFiles
if __name__ == '__main__':
# Run mpileup here and tabulate the SNP counts
results = ParallelRunner(bcftoolsParallelFunctions, vcfFiles)
# Output isolate SNP count to a file
outFile = open(args.output, 'w')
for i in results:
print(i.isolate1+'\t'+str(i.snpDiff), file=outFile)
outFile.close()