-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest.py
224 lines (180 loc) · 9.48 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def test():
print test
Snippet code:
now = datetime.now()
current_month = str(now.month)
current_day = str(now.day)
current_year = str(now.year)
print current_month + "/" + current_day + "/" + current_year
import sqlite3 as lite
import csv
import numpy as np
import scipy
"""
This script will use a file containing RNA-RPKM (Reads Per Kilobase per Million mapped reads)
values for 22 different human tissues. Each tissue have measurement of the RNA-transcript
levels for 23115 ensg IDs (gene identifiers).
The script will normalize the values across the samples for each and every gene and finally
a CSV file with 22 rows (one for each tissue) and 23115 columns (one for each gene).
So the output of the script is a normalized RNA transcript vector for each sample (tissue).
"""
# Please note, before you can run the code below you need to create a database from
# the shell command-line. To create a new empty database named "rpkm.db" run the following:
"""
sqlite3 rpkm.db
.tables
.exit
"""
# Create a connection to the database
con = lite.connect('rpkm.db')
# cur is a "cursor" object that can be used to traverse the records from the result,
# the curser is bound to a connection
cur = con.cursor()
# Get the database version
cur.execute('SELECT SQLITE_VERSION()')
# Fetch the result of the query
data = cur.fetchone()
# Print the result
print "SQLite version: %s" % data
"""This is what the 2 first lines of the "raw data" (the file WangSandberg.rpkm.txt) looks like:
Gene UHRLowcov brainLowcov adipose brain breast colon heart liver lymphNode skelMuscle testes cerebellum1 cerebellum2 cerebellum3 cerebellum4 cerebellum5 cerebellum6 MCF7 BT474 HME MB435 T47D Symbol Description
ENSG00000000003 18.51 1.33 5.91 2.41 4.25 3.90 2.51 17.43 1.40 0.51 8.95 0.58 0.33 0.35 0.36 0.33 0.44 7.25 0.81 3.09 5.02 3.00 TSPAN6 transmembrane 4 superfamily member 6
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
23115 lines
"""
# Creat a table that looks as your data ( I am skipping the description column)
cur.execute("""CREATE TABLE wangsandberg (
ensg TEXT,
gene_name TEXT,
uhrlowcov REAL,
brainlowcov REAL,
adipose REAL,
brain REAL,
breast REAL,
colon REAL,
heart REAL,
liver REAL,
lymphnode REAL,
skelmuscle REAL,
testes REAL,
cerebellum1 REAL,
cerebellum2 REAL,
cerebellum3 REAL,
cerebellum4 REAL,
cerebellum5 REAL,
cerebellum6 REAL,
mcf7 REAL,
bt474 REAL,
hme REAL,
mb435 REAL,
t47d REAL)""")
# Open the data file for reading
ifile = open("WangSandberg.rpkm.txt", "rb")
# It is "tab-separated"
reader = csv.reader(ifile, delimiter='\t')
# Skip the first line of the file since that is the header
next(reader)
iCount = 0
for row in reader:
if (iCount % 500) == 0:
# For every 500 lines we commit the transaction to the database, otherwise the redo log will grow
con.commit()
print "At line: " + str(iCount)
sSql = """insert into wangsandberg (gene_name, ensg, uhrlowcov, brainlowcov, adipose, brain, breast,
colon, heart, liver, lymphnode, skelmuscle, testes, cerebellum1, cerebellum2, cerebellum3, cerebellum4,
cerebellum5, cerebellum6, mcf7, bt474, hme, mb435, t47d) values
('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s',
'%s', '%s', '%s', '%s', '%s', '%s', '%s')""" % (row[23], row[0], row[1], row[2], row[3], row[4], row[5],
row[6], row[7], row[8], row[9], row[10], row[11], row[12], row[13], row[14], row[15], row[16], row[17],
row[18], row[19], row[20], row[21], row[22])
slask = cur.execute(sSql)
iCount += 1
# One more commit for the last lines
con.commit()
# Create an unique index on the ensg column, this will speed up inserts and selects about 100-1000 times
# important that we create the index AFTER we have inserted all the ensg IDs
cur.execute('CREATE UNIQUE INDEX ensg_unique ON wangsandberg (ensg ASC)')
# Create a new column for each tissue type
cur.execute("alter table wangsandberg add column uhrlowcov_rel REAL")
cur.execute("alter table wangsandberg add column brainlowcov_rel REAL")
cur.execute("alter table wangsandberg add column adipose_rel REAL")
cur.execute("alter table wangsandberg add column brain_rel REAL")
cur.execute("alter table wangsandberg add column breast_rel REAL")
cur.execute("alter table wangsandberg add column colon_rel REAL")
cur.execute("alter table wangsandberg add column heart_rel REAL")
cur.execute("alter table wangsandberg add column liver_rel REAL")
cur.execute("alter table wangsandberg add column lymphnode_rel REAL")
cur.execute("alter table wangsandberg add column skelmuscle_rel REAL")
cur.execute("alter table wangsandberg add column testes_rel REAL")
cur.execute("alter table wangsandberg add column cerebellum1_rel REAL")
cur.execute("alter table wangsandberg add column cerebellum2_rel REAL")
cur.execute("alter table wangsandberg add column cerebellum3_rel REAL")
cur.execute("alter table wangsandberg add column cerebellum4_rel REAL")
cur.execute("alter table wangsandberg add column cerebellum5_rel REAL")
cur.execute("alter table wangsandberg add column cerebellum6_rel REAL")
cur.execute("alter table wangsandberg add column mcf7_rel REAL")
cur.execute("alter table wangsandberg add column bt474_rel REAL")
cur.execute("alter table wangsandberg add column hme_rel REAL")
cur.execute("alter table wangsandberg add column mb435_rel REAL")
cur.execute("alter table wangsandberg add column t47d_rel REAL")
# Next step is to add a normalized value for each tissue and each gene
sSql = """select ensg, uhrlowcov, brainlowcov, adipose, brain, breast,
colon, heart, liver, lymphnode, skelmuscle, testes, cerebellum1, cerebellum2, cerebellum3, cerebellum4,
cerebellum5, cerebellum6, mcf7, bt474, hme, mb435, t47d from wangsandberg order by ensg"""
# Fetch all the data in the table
cur.execute(sSql)
rows = cur.fetchall()
iCount = 0
lenOfRow = len(rows[0])
for row in rows:
# This loop will calculate normalized rpkm values for each gene
# and insert the result into the new columns we just created
iCount += 1
if (iCount % 500) == 0:
# Commit everey 500 insert to spare the redo log
con.commit()
print iCount
fMax = np.asarray(row[1:lenOfRow]).max() # highest rpkm value for this gene
if fMax == 0:
# If there are no values for the gene, then we skip it and take the next one
continue
row2 = [float(x) for x in row[1:lenOfRow]] # copy the values for this gene
row2 /= fMax # Divide all values with highest, row2 is now normalized
# Update the database with the normalized values
sSql = """update wangsandberg set uhrlowcov_rel='%s', brainlowcov_rel='%s', adipose_rel='%s', brain_rel='%s',
breast_rel='%s', colon_rel='%s', heart_rel='%s', liver_rel='%s', lymphnode_rel='%s', skelmuscle_rel='%s',
testes_rel='%s', cerebellum1_rel='%s', cerebellum2_rel='%s', cerebellum3_rel='%s', cerebellum4_rel='%s',
cerebellum5_rel='%s', cerebellum6_rel='%s', mcf7_rel='%s', bt474_rel='%s', hme_rel='%s', mb435_rel='%s',
t47d_rel='%s' where ensg = '%s' """ % (row2[0], row2[1], row2[2], row2[3], row2[4], row2[5], row2[6],
row2[7], row2[8], row2[9], row2[10], row2[11], row2[12], row2[13], row2[14], row2[15], row2[16], row2[17],
row2[18], row2[19], row2[20], row2[21], row[0])
slask = cur.execute(sSql)
# Commit the last bunch of inserts
con.commit()
# Export the normalized values to a text file
sSql = """select uhrlowcov_rel, brainlowcov_rel, adipose_rel, brain_rel, breast_rel,
colon_rel, heart_rel, liver_rel, lymphnode_rel, skelmuscle_rel, testes_rel, cerebellum1_rel,
cerebellum2_rel, cerebellum3_rel, cerebellum4_rel, cerebellum5_rel, cerebellum6_rel,
mcf7_rel, bt474_rel, hme_rel, mb435_rel, t47d_rel from wangsandberg order by ensg"""
# Fetch all the data in the table
cur.execute(sSql)
rows = cur.fetchall()
iNrOfGenes = len(rows)
# All the sample-names in one long string:
sSamples = "uhrlowcov_rel,brainlowcov_rel,adipose_rel,brain_rel,breast_rel,colon_rel,heart_rel,liver_rel,lymphnode_rel,skelmuscle_rel,testes_rel,cerebellum1_rel,cerebellum2_rel,cerebellum3_rel,cerebellum4_rel,cerebellum5_rel,cerebellum6_rel,mcf7_rel,bt474_rel,hme_rel,mb435_rel,t47d_rel"
arraySamples = sSamples.split(',')
iNrOfSamples = len(arraySamples)
# Create a matrix with as many rows as we have samples (22) and as many columns as we have ensg (23115)
D = scipy.zeros([iNrOfSamples, iNrOfGenes])
# These 2 loops just populates the D matrix with the all the values that we got from the database
for i in range(iNrOfSamples):
for j in range(iNrOfGenes):
D[i, j] = rows[j][i]
with open('normalizedRPKM.csv', 'wb') as outputFile:
# For each sample we construct a row of ';' separated values (23115, one for each ensg)
for i in range(iNrOfSamples):
sTmpStr = []
sTmpStr.append([str(x) for x in D[i]])
sStr = arraySamples[i] + ';' + ';'.join(sTmpStr[0])
outputFile.write(sStr + '\n')
outputFile.close()