-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpaxdb.py
93 lines (69 loc) · 3.2 KB
/
paxdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import csv
import numpy as np
from data_helpers import getSpeciesProperty, SpeciesCDSSource, CDSHelper
def parsePaxDbFile(filename, taxId = None):
abundanceData = {}
# External protein-ids appear prefixed by the tax-id; this should be removed
if not taxId is None:
fixedPrefixToRemove = "{}.".format(taxId)
else:
fixedPrefixToRemove = ""
with open(filename, 'r') as csvfile:
for row in csv.reader(csvfile, delimiter='\t'):
if not row: continue
if row[0][0]=='#': continue
assert( len(row) >= 3 )
# Extract the protein-id
protId = row[1]
if protId[:len(fixedPrefixToRemove)]==fixedPrefixToRemove: # remove common prefix
protId = protId[len(fixedPrefixToRemove):]
# Extract the abundance
protAbundance = float( row[2] )
# Store the results
abundanceData[protId] = protAbundance
return abundanceData
def getSpeciesPaxdbDataFromFile( taxId : int ) -> dict:
(paxfn, _) = getSpeciesProperty( taxId, 'paxdb-path' )
if paxfn is None: return {}
return parsePaxDbFile( paxfn, taxId=taxId )
def convertValuesDictToPercentiles( rawdata : dict ) -> dict:
if not rawdata: return {}
k = list(rawdata.keys())
v = list(rawdata.values())
vv = np.array( v )
percentileRanks = np.vectorize( lambda x, a: (x>a).sum()/len(a), excluded=frozenset((1,)) )(vv, vv) # no built-in for percentile ranks...
assert(vv.shape==percentileRanks.shape)
return dict(zip(k, percentileRanks))
_paxdbData = {}
def getSpeciesPaxdbData( taxId : int, convertToPercentiles : bool = True ) -> dict:
ret = _paxdbData.get( (taxId,convertToPercentiles), None )
if not ret is None: return ret
newdata = getSpeciesPaxdbDataFromFile( taxId )
if convertToPercentiles:
newdata = convertValuesDictToPercentiles( newdata )
_paxdbData[ (taxId,convertToPercentiles) ] = newdata
return newdata
def testSpecies(taxId):
paData = getSpeciesPaxdbData( taxId )
countFound = 0
countNotFound = 0
for protId in SpeciesCDSSource(taxId):
cds = CDSHelper( taxId=taxId, protId=protId )
geneId = cds.getGeneId()
if geneId in paData:
countFound += 1
else:
countNotFound += 1
print("Species: {} -> Found: {} ({:.3}%) Not found: {}".format(taxId, countFound, countFound/(countFound+countNotFound)*100, countNotFound))
return( countFound, countNotFound)
def testAll():
#ret = parsePaxDbFile( '/tamir1/mich1/termfold/data/PaxDB/722438-Mycoplasma_pneumoniae_M129_Kuhner_et_al_Science2009.txt', taxId=722438 )
#print(ret)
#allSpeciedWithPaxdbData = frozenset((1148,158878,160490,169963,192222,208964,224308,243159,267671,272623,283166,449447,511145,546414,593117,64091,722438,83332,85962,882,99287))
allSpeciedWithPaxdbData = frozenset((1148,158878,160490,169963,192222,208964,224308,243159,267671,272623,283166,449447,511145,546414,64091,83332,85962,882,99287,722438,196627,298386))
for taxId in allSpeciedWithPaxdbData:
testSpecies(taxId)
return 0
if __name__=="__main__":
import sys
sys.exit(testAll())