-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilter_genes.py
133 lines (107 loc) · 3.78 KB
/
filter_genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import sys
from traceback import print_exc
import time
import socket
try:
from urllib2 import urlopen
import urllib
except:
from urllib.request import urlopen
import urllib.request as urllib
import xml.etree.ElementTree as ET
ORGANISM = "Mus%20musculus"
DESCRIPTION = "splicing factor"
def try_send_request(url):
attempts = 0
response = None
connection_errors = 0
while attempts < 3:
try:
request = urlopen(url)
connection_errors = 0
response = request.read()
if not isinstance(response, str):
response = response.decode('utf-8')
if response is None or 'ERROR' in response:
request.close()
raise Exception
break
except Exception:
attempts += 1
if attempts >= 3:
connection_errors += 1
if connection_errors >= 3:
print('Cannot establish connection to NCBI')
return None
# NCBI recommends users post no more than three URL requests per second, so adding artificial 1-sec delay
time.sleep(1)
return response
def get_gene_ids(gene_name):
query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=({gene_name}[gene])%20AND%20({organism}[orgn])%20AND%20alive[prop]%20NOT%20newentry[gene]".\
format(gene_name=gene_name, organism=ORGANISM)
response = try_send_request(query)
if response is None:
print('Failed to retrieve gene information')
return None
xml_tree = ET.fromstring(response)
if xml_tree.find('Count').text == '0': # Organism is not found
print("Gene " + gene_name + " was not found")
return None
gene_id_list = xml_tree.find('IdList').findall('Id')
gene_ids = []
for gid in gene_id_list:
gene_ids.append(gid.text)
return gene_ids
def check_gene_description(ncbi_gene_id, gene_name):
query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id={gene_id}".format(gene_id=ncbi_gene_id)
response = try_send_request(query)
gene_block_found = False
description = ""
locus = ""
in_syn_block = False
synonyms = ""
for l in response.split('\n'):
if not gene_block_found and l.find("gene {") != -1:
gene_block_found = True
continue
if not gene_block_found:
continue
#print(l)
if l.find("desc") != -1:
description = l
if l.find("locus") != -1:
locus = l
if l.find("syn") != -1:
in_syn_block = True
continue
if in_syn_block:
if l.find('}') != -1:
in_syn_block = False
continue
synonyms += l.strip()
if len(description) > 0 and len(locus) > 0 and len(synonyms) > 0 and not in_syn_block:
break
return description.lower().find(DESCRIPTION.lower()) != -1 and (locus.lower().find(gene_name.lower()) != -1 or synonyms.lower().find(gene_name.lower()) != -1)
def main():
if len(sys.argv) != 2:
print("Usage: " + sys.argv[0] + " <file with gene list one per line> > <genes that have " + DESCRIPTION + " in their description>")
exit(0)
for l in open(sys.argv[1]):
gene_name = l.strip()
if len(gene_name) == 0:
continue
ids = get_gene_ids(gene_name)
if ids is None:
continue
for gid in ids:
if check_gene_description(gid, gene_name):
print(gene_name)
if __name__ == "__main__":
# stuff only to run when not called via 'import' here
try:
main()
except SystemExit:
raise
except:
print_exc()
sys.exit(-1)