import sys, os, pickle, datetime
import snp, PyRSS2Gen
from Bio import Entrez
from glob import glob
# This script crawls PubMed for all SNPs included in a 23andme raw-data file
# and publishes the resulting hits as an RSS feed. The script is meant to run
# once a month via cron or similar.
# New items will be added to the RSS feed while old items are kept.
# To reduce load on the NCBI servers, old items are stored locally.
# Dependencies:
# BioPython http://www.biopython.org/
# PyRSS2Gen http://www.dalkescientific.com/Python/PyRSS2Gen.html
# The SNP class found at https://github.com/gedankenstuecke/genotyping_23andme
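# Example usage (the file name is only an illustration; any 23andme raw-data
# file works, and the optional second argument resumes a crawl at the given
# SNP position):
#   python pubmed_crawler.py genome_Bastian_Greshake_Full_20110503120911.txt
#   python pubmed_crawler.py genome_Bastian_Greshake_Full_20110503120911.txt 2000
# A possible crontab entry for a monthly run (1st of each month, 4 am):
#   0 4 1 * * python /path/to/pubmed_crawler.py /path/to/raw_data.txt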
# CONFIG, EDIT FOR YOUR NEEDS! #
Entrez.email = "your.name@example.com" # Your email (placeholder), so NCBI can contact you if needed
raw_data = "genome_Bastian_Greshake_Full_20110503120911.txt" # The genotyping raw-data file of interest
last_update = "last.update" # The file where the last-update time will be saved
old_items = "pubmed_rss.backup" # Where old feed items shall be stored
feed_location = "23andme_feed.xml" # Where the RSS feed shall be published (e.g. /var/www/younameit)
pickleFile = 'pubmed_crawler.pickle' # Checkpoint file for resuming an interrupted crawl
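# Note: 23andme raw-data files are tab-separated text with one line per SNP
# (columns: rsid, chromosome, position, genotype) plus comment lines starting
# with "#"; parsing them is handled entirely by the external snp module.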
def store(data, filename): # Pickle data to disk, abort the run on failure
    try:
        with open(filename, "wb") as f:
            pickle.dump(data, f)
    except Exception as e:
        print("error saving:", e)
        sys.exit(1)
def load(filename): # Load pickled data, or return None if the file is missing
    if os.path.exists(filename):
        with open(filename, "rb") as f:
            return pickle.load(f)
    return None
def paper_getter(snps, last_update, startAt=1):
    new_hits = []
    if glob(last_update) == []: # no record of a last update: this is the first run
        today = datetime.date.today().strftime("%Y %b")
        updated_on = datetime.datetime.strptime(today, "%Y %b")
        with open(last_update, "wb") as pickle_out:
            pickle.dump(updated_on, pickle_out) # create a new "last update" record
        updated_on = "" # empty marker: report all papers, regardless of age
    else: # otherwise load the time of the last update
        with open(last_update, "rb") as pickle_in:
            updated_on = pickle.load(pickle_in)
    if os.path.exists(pickleFile): # checkpoint found: resume an interrupted run
        data = load(pickleFile)
        if startAt == 1:
            startAt = data[0]
        new_hits = data[1]
        print("RESTARTING AT POSITION", startAt, "WITH", len(new_hits), "HITS")
    snp_counter = 0
    for single_snp in snps: # iterate over all SNPs
        try:
            snp_counter += 1
            if snp_counter < startAt: # skip SNPs already handled before a restart
                print("skipping snp", snp_counter, "till", startAt)
                continue
            handle = Entrez.esearch(db="pubmed", term=single_snp.name)
            results = Entrez.read(handle) # get all papers for the SNP
            print(snp_counter, end=" ")
            if results["IdList"] != []: # if papers were found for this SNP:
                for single_paper in results["IdList"]:
                    try:
                        summary_handle = Entrez.esummary(db="pubmed", id=single_paper) # get the summary
                        summary = Entrez.read(summary_handle)
                        # only papers published since the last update are of interest;
                        # an empty updated_on marker means "report everything"
                        if updated_on == "" or updated_on < datetime.datetime.strptime(summary[0]["PubDate"][:8], "%Y %b"):
                            paper = Papers(single_paper) # create a new paper with basic information about it
                            paper.snp = single_snp.name
                            paper.title = summary[0]["Title"]
                            paper.author = summary[0]["AuthorList"][0]
                            paper.journal = summary[0]["Source"]
                            try:
                                detail_handle = Entrez.efetch(db="pubmed", id=single_paper, retmode="xml")
                                details = Entrez.read(detail_handle)
                                try:
                                    paper.abstract = str(details[0]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"])
                                except (KeyError, IndexError):
                                    paper.abstract = "no abstract available"
                                print(paper.snp + ": " + paper.title)
                                print("Written by " + paper.author + " et al")
                                print("Published in " + paper.journal)
                                print(paper.abstract)
                                new_hits.append(paper)
                            except Exception:
                                print("NO DETAILS") # efetch failed, skip this paper
                    except Exception:
                        print("NO SUMMARY") # esummary failed or PubDate was unparsable
            else:
                print("NONE")
            data = [snp_counter + 1, new_hits] # checkpoint after every SNP so an
            store(data, pickleFile)            # interrupted run can be resumed
        except Exception:
            pass # never let a single bad SNP kill the whole monthly run
    if os.path.exists(pickleFile): # the run finished, the checkpoint is obsolete
        os.remove(pickleFile)
    return new_hits
def item_creator(new_paper): # Create RSS items out of all new papers
    items = []
    for paper in new_paper:
        heading = paper.snp + ": " + paper.title
        item = PyRSS2Gen.RSSItem(title=heading)
        description = "Written by " + paper.author + "\n"
        description += "Published in " + paper.journal + "\n"
        description += "Abstract: " + paper.abstract + "\n"
        description += "PMID: " + paper.pmid
        link = "http://www.ncbi.nlm.nih.gov/pubmed?term=" + paper.pmid
        item.description = description
        item.link = link
        item.guid = PyRSS2Gen.Guid(link)
        items.append(item)
    return items
def item_import(rss_items, backup): # Merge in old RSS items, write the combined list back
    if glob(backup) != []:
        with open(backup, "rb") as pickle_in:
            old_rss_items = pickle.load(pickle_in)
        for old_item in old_rss_items:
            rss_items.append(old_item)
    with open(backup, "wb") as pickle_out:
        pickle.dump(rss_items, pickle_out)
    return rss_items
def create_feed(all_rss_items, feed_location): # Create the RSS feed
    rss = PyRSS2Gen.RSS2(
        title = "Papers on SNPs analyzed by 23andme",
        link = "https://github.com/gedankenstuecke/genotyping_23andme",
        description = "Monthly updated feed on all new papers listed in PubMed about SNPs which are analyzed by 23andme",
        lastBuildDate = datetime.datetime.utcnow(),
        items = all_rss_items,
    )
    with open(feed_location, "w") as out:
        rss.write_xml(out)
class Papers: # Simple class for storing basic information about papers
    """Basic information about a single PubMed paper, keyed by its PMID."""
    def __init__(self, pmid):
        self.pmid = pmid
        self.title = ""
        self.abstract = ""
        self.author = ""
        self.snp = ""
        self.journal = ""
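# Note: the Papers fields are filled in paper_getter from NCBI esummary/efetch
# results; the keys used there ("Title", "AuthorList", "Source", "PubDate")
# follow the PubMed DocSum format that Entrez.read returns for esummary queries.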
if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
    raw_data = sys.argv[1] # The genotyping raw-data file of interest
startPos = 1
if len(sys.argv) > 2 and sys.argv[2].isdigit() and int(sys.argv[2]) > 0:
    startPos = int(sys.argv[2])
sys.stderr.write("INPUT FILE " + raw_data + " STARTING AT " + str(startPos) + "\n")
sys.stderr.write("READING SNPS\n")
snps = snp.reader( raw_data)
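# Note: snp.reader comes from the external genotyping_23andme module; judging
# from its use here it is assumed to yield SNP objects whose .name attribute
# holds the rsid (e.g. "rs1234") that is used as the PubMed search term.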
sys.stderr.write("GETTING PAPER\n")
new_paper = paper_getter( snps, last_update, startPos)
sys.stderr.write("CREATING RSS\n")
rss_items = item_creator( new_paper)
sys.stderr.write("IMPORT RSS\n")
all_rss_items = item_import( rss_items, old_items)
sys.stderr.write("CREATING FEED\n")
create_feed( all_rss_items,feed_location)
sys.stderr.write("FINISHED\n")
# todo: backup new items, import old items, create feed