-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutils.py
81 lines (57 loc) · 1.69 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import time
import os
import gzip
import hashlib
import json
import requests
from pathlib import Path
from wasabi import msg
def days_since_modified(f):
    """
    Compute the age of a file, measured from its last modification
    time until now, expressed in (fractional) days.
    """
    day_in_seconds = 24 * 60 * 60
    # Same value os.path.getmtime reports: the st_mtime field of stat().
    age_in_seconds = time.time() - os.stat(f).st_mtime
    return age_in_seconds / day_in_seconds
def mkdir(dest):
    """
    Create the directory `dest`, building any missing parent
    directories along the way. A directory that is already present
    is left alone and no error is raised.
    """
    target = Path(dest)
    target.mkdir(exist_ok=True, parents=True)
def file_md5sum(f, chunksize=4096):
    """
    Compute the MD5 hex digest of the contents of a file.
    The file is consumed `chunksize` bytes at a time, so the whole
    file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(f, "rb") as handle:
        while True:
            block = handle.read(chunksize)
            # An empty read marks the end of the file.
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
def iterate_pubmed_xml(f0):
    """
    Iterate over the articles stored in a gzipped PubMed XML file.

    Yields one (pmid, xml) tuple per article, where `xml` is the text
    running from the line containing <PubmedArticle> through the line
    containing </PubmedArticle>, and `pmid` is the integer read from the
    first PMID line found inside that article (None if there is none).

    Tags are located by substring match on whole lines. Lines that fall
    outside of a PubmedArticle (file header, other record types, trailing
    content) are skipped, and an input without any article yields nothing.

    Raises ValueError if an article's first PMID line does not hold an
    integer.
    """
    article_start = "<PubmedArticle>"
    article_end = "</PubmedArticle>"
    pmid_end = "</PMID>"

    with gzip.GzipFile(f0, "rb") as FIN:
        # `xml` is None while we are between articles and a list of the
        # collected lines while we are inside one.
        xml = None
        pmid = None

        for line in FIN:
            line = line.decode("utf-8")

            if xml is None:
                # Skip everything until the next article opens, so that
                # unrelated records never leak into an article's text
                # or supply its PMID.
                if article_start not in line:
                    continue
                xml = []

            # Only the first PMID of an article identifies it; later ones
            # belong to nested elements.
            if pmid is None and pmid_end in line:
                # The value sits between the end of the opening tag and
                # the closing </PMID>.
                pmid = int(line.partition(pmid_end)[0].rpartition(">")[2])

            xml.append(line)

            if article_end in line:
                yield (pmid, "".join(xml))
                xml = None
                pmid = None