-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathncbi_querying.py
129 lines (103 loc) · 2.95 KB
/
ncbi_querying.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
'''
Getting IDs and stuff from NCBI.
'''
from Bio import Entrez
Entrez.email = '[email protected]'
import re
def get_ncbi_assembly_id(aid):
handle2 = Entrez.efetch(db="nucleotide", id=aid, rettype="gb", retmode="text")
return_str = handle2.read()
re_str = 'Assembly: (?P<assemb>[A-Za-z0-9\._]+)'
assemb_re = re.compile(re_str)
assid = assemb_re.search(return_str)
if assid is not None and assid.group('assemb'):
return assid.group('assemb')
else:
return None
"""
The part below contains functions for getting NCBI taxonomy information from
NCBI accession IDs.
Written by Erin K. Molloy ([email protected]) in September 2016.
"""
def get_ncbi_taxon_id(aid):
"""
(Written by Erin)
Takes NCBI Accession ID and returns NCBI Taxon ID.
Parameters
-----------
aid : string
NCBI Accession ID
Returns
--------
xid : string
NCBI Accession ID
"""
import subprocess
pad = 100
p = subprocess.Popen(["curl", "-s",
"https://www.ncbi.nlm.nih.gov/nuccore/" + aid],
stdout=subprocess.PIPE)
(content, error) = p.communicate()
start = "ORGANISM="
end = "&"
s = content.find(start) + len(start)
e = s + content[s:s+pad].find(end)
return content[s:e]
def get_ncbi_taxonomy(aids, ofile):
"""
Takes NCBI Accession ID and returns NCBI taxonomic lineage.
Parameters
----------
xid : list of strings
NCBI Accession IDs
Returns
-------
df : dataframe
NCBI taxonomic lineage
"""
from ete3 import NCBITaxa
import pandas as pd
of = open(ofile, 'w')
ncbi = NCBITaxa()
# keys = ["accid", "taxid",
# "superkingdom", "phylum", "class",
# "order", "family", "genus", "species"]
keys = ['accid', 'taxid']
rows = []
# print "starting the list of accession ids"
ct = 0
for aid in aids:
xid = get_ncbi_taxon_id(aid)
# lineage = ncbi.get_lineage(xid)
# ranks = ncbi.get_rank(lineage)
# names = ncbi.get_taxid_translator(lineage)
tax = {}
tax["accid"] = aid
tax["taxid"] = xid
of.write('%s,%s\n' % (aid, xid))
# for k in keys[2:]:
# try:
# i = ranks.values().index(k)
# except ValueError:
# i = -1
# if i == -1:
# tax[k] = "NA"
# else:
# tax[k] = names[ranks.keys()[i]].replace(' ', '_')
# rows.append(tax)
# if ct % 100 ==0 :
# ct +=1
# print "%s done." % ct
of.close()
# df = pd.DataFrame(rows, columns=keys)
# return df
if __name__ == "__main__":
import sys
ifile = str(sys.argv[1])
ofile = str(sys.argv[2])
with open(ifile, 'r') as f:
aids = f.readlines()
aids = [aid.rstrip('\n') for aid in aids]
get_ncbi_taxonomy(aids, ofile)
# df = get_ncbi_taxonomy(aids)
# df.to_csv(ofile, header=True, index=False)