-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathconstruct_DESeq2_input.py
127 lines (93 loc) · 4.07 KB
/
construct_DESeq2_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
### Boas Pucker ###
### [email protected] ###
### v0.1 ###
__usage__ = """
python construct_DESeq2_input.py
--counts <FULL_PATH_TO_OUTPUT_COUNT_TABLE>
--info <FULL_PATH_TO_SAMPLE_INFO_FILE>
--out <FULL_PATH_TO_OUTPUT_DIRECTORY>
bug reports and feature requests: [email protected]
"""
import glob, sys, re, os
from datetime import date
from operator import itemgetter
# --- end of imports --- #
def load_expression_values( filename, ID ):
"""! @brief load all expression values from featureCounts result file and return raw counts """
expression_data = {}
with open( filename, "r" ) as f:
header = f.readline().strip().split('\t')
idx_of_interest = header.index( ID )
line = f.readline()
while line:
parts = line.strip().split('\t')
expression_data.update( { parts[0]: float( parts[ idx_of_interest ] ) } )
line = f.readline()
return expression_data
def load_sample_information( sample_information_file, combined_count_table ):
"""! @brief load information about all samples into list of dictionaries """
sample_information = []
with open( sample_information_file, "r" ) as f:
f.readline() #header
line = f.readline()
while line:
parts = line.strip().split('\t')
sample_information.append( { 'name': parts[0],
'expression':load_expression_values( combined_count_table, parts[0] ),
'replicate': parts[2],
'genotype': parts[1],
} )
line = f.readline()
print "number of loaded sample information data sets: " + str( len( sample_information ) )
return sample_information
def construct_sample_table( sample_information, sample_table ):
"""! @brief construct sample table for DeSeq2 """
# --- construct sample table --- #
ordered_valid_samples = sorted( sample_information, key=itemgetter( 'genotype', 'replicate' ) )
with open( sample_table, "w" ) as out:
out.write( '\t'.join( [ "", 'genotype', 'replicate' ] ) + '\n' )
for idx, sample in enumerate( ordered_valid_samples ):
new_line = "\t".join( [ sample['name'], #sample name
sample['genotype'],
sample['replicate'] #replicate
] ) + '\n'
out.write( new_line )
return ordered_valid_samples
def construct_data_matrix( ordered_valid_samples, data_matrix_file ):
"""! @brief construct the data matrix for DESeq2 based on featureCount result files """
with open( data_matrix_file, "w" ) as out:
# --- construct header line (all sample IDs in order) --- #
ordered_valid_ids = []
for sample in ordered_valid_samples:
ordered_valid_ids.append( sample['name'] )
out.write( "\t".join( [ "" ] + ordered_valid_ids ) + '\n' )
# --- construct data body (expression data per sample in one column) --- #
gene_IDs = sorted( ordered_valid_samples[0]['expression'].keys() )
for gene in gene_IDs:
new_line = [ gene ] #add geneID
for sample in ordered_valid_samples:
new_line.append( int( sample[ 'expression' ][ gene ] ) )
out.write( '\t'.join( map( str, new_line ) ) + '\n' )
def main( arguments ):
"""! @brief run all parts of this script """
combined_count_table = arguments[ arguments.index('--counts')+1 ] #path to data table
sample_information_file = arguments[ arguments.index('--info')+1 ] #path to table with meta information
prefix = arguments[ arguments.index('--out')+1 ] #prefix for output
if prefix[ -1 ] != "/":
prefix += "/"
if not os.path.exists( prefix ):
os.makedirs( prefix )
# --- nothing needs to be changed beyond this point --- #
if prefix[-1] != '/':
prefix += "/"
if not os.path.exists( prefix ):
os.makedirs( prefix )
sample_table = prefix + "clean_sample_table.txt"
data_matrix_file = prefix + "clean_data_matrix.txt"
sample_information = load_sample_information( sample_information_file, combined_count_table )
ordered_valid_samples = construct_sample_table( sample_information, sample_table )
construct_data_matrix( ordered_valid_samples, data_matrix_file )
if '--counts' in sys.argv and '--info' in sys.argv and '--out' in sys.argv:
main( sys.argv )
else:
sys.exit( __usage__ )