Skip to content

Commit a466a45

Browse files
committed
added adjusted files for further work on OAMI2
1 parent 1841042 commit a466a45

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+6580
-612
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1+
env
12
*.pyc
3+
userconfig

LICENSE.md

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
This program is free software: you can redistribute it and/or modify
2+
it under the terms of the GNU General Public License as published by
3+
the Free Software Foundation, either version 3 of the License, or
4+
(at your option) any later version.
5+
6+
This program is distributed in the hope that it will be useful,
7+
but WITHOUT ANY WARRANTY; without even the implied warranty of
8+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9+
GNU General Public License for more details.
10+
11+
You should have received a copy of the GNU General Public License
12+
along with this program. If not, see <http://www.gnu.org/licenses/>.
13+
14+
Dieses Programm ist Freie Software: Sie können es unter den Bedingungen
15+
der GNU General Public License, wie von der Free Software Foundation,
16+
Version 3 der Lizenz oder (nach Ihrer Wahl) jeder neueren
17+
veröffentlichten Version, weiter verteilen und/oder modifizieren.
18+
19+
Dieses Programm wird in der Hoffnung bereitgestellt, dass es nützlich sein wird, jedoch
20+
OHNE JEDE GEWÄHR; sogar ohne die implizite
21+
Gewähr der MARKTFÄHIGKEIT oder EIGNUNG FÜR EINEN BESTIMMTEN ZWECK.
22+
Siehe die GNU General Public License für weitere Einzelheiten.
23+
24+
Sie sollten eine Kopie der GNU General Public License zusammen mit diesem
25+
Programm erhalten haben. Wenn nicht, siehe <https://www.gnu.org/licenses/>.

OAMI/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.pyc
File renamed without changes.

README OAMI/README

File renamed without changes.
File renamed without changes.

doi_pref.tsv OAMI/doi_pref.tsv

File renamed without changes.

OAMI/helpers/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from autovividict import autovividict
2+
from urllib2 import quote
3+
4+
def make_datestring(year, month, day):
    """
    Build a date string with as much precision as the arguments allow.

    Returns "YYYY", "YYYY-MM" or "YYYY-MM-DD". `month` and `day` may be
    None; a day given without a month is ignored.
    """
    datestring = "%04d" % year  # YYYY
    if month is not None:
        datestring += "-%02d" % month  # YYYY-MM
        # The day is only appended once the month is known; otherwise the
        # result would be "YYYY-DD", which reads like a year-month string.
        if day is not None:
            datestring += "-%02d" % day  # YYYY-MM-DD
    return datestring
11+
12+
def filename_from_url(url):
    """
    Turn a URL into a string that can serve as a single file name.

    No character is treated as safe, so "/" and ":" are percent-encoded
    along with everything else that quote() escapes.
    """
    escaped_url = quote(url, safe='')
    return escaped_url

OAMI/helpers/autovividict.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from collections import defaultdict
2+
3+
# counting dictionary that serves as 0 for addition
4+
# this allows incrementing unknown keys: d['k'] += 1
5+
class countdict(defaultdict):
    """
    defaultdict that acts like the number 0 when used in an addition.

    A freshly auto-created entry can therefore be incremented directly:
    d['k'] += 1 replaces the placeholder countdict with the integer 1.
    """
    def __init__(self, *args, **kwargs):
        # Numeric value this dictionary contributes to an addition.
        self.value = 0
        super(countdict, self).__init__(*args, **kwargs)

    def __repr__(self):
        # Show the contents like a plain dict, without the defaultdict wrapper.
        return str(dict(self))

    def __add__(self, x):
        return self.value + x

    # Also support the reflected form (1 + d['k'], sum(...)), so the
    # placeholder behaves as 0 on either side of "+".
    __radd__ = __add__
13+
14+
# autovivicatious counting dictionary, allowing dynamic creation of keys
15+
# explained at <http://en.wikipedia.org/wiki/Autovivification#Python>
16+
def autovividict():
    """
    Create a counting dictionary whose missing keys spring into existence.

    The function passes itself as the default factory, so every missing
    key is filled with another dictionary of the same kind.
    """
    nested_counter = countdict(autovividict)
    return nested_counter

OAMI/helpers/config.py

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
from os import makedirs, path
5+
from sys import stderr, exit
6+
from xdg import BaseDirectory
7+
from ConfigParser import RawConfigParser, NoSectionError, NoOptionError
8+
9+
# Directory name used below each XDG base directory.
APPLICATION_NAME = "open-access-media-importer"

# Per-application cache, configuration and data directories.
cache_path = path.join(BaseDirectory.xdg_cache_home, APPLICATION_NAME)
config_path = path.join(BaseDirectory.xdg_config_home, APPLICATION_NAME)
data_path = path.join(BaseDirectory.xdg_data_home, APPLICATION_NAME)
13+
14+
def database_path(source):
    """
    Return the path of the SQLite file for `source` in the data directory.
    """
    database_filename = '%s.sqlite' % source
    return path.join(data_path, database_filename)
16+
17+
def ensure_directory_exists(directory):
    """
    Create `directory`, including missing parents, if it does not exist.

    Raises OSError when creation fails for any reason other than the
    path already existing.
    """
    from errno import EEXIST  # local import: only this function needs it
    try:
        makedirs(directory)
    except OSError as error:
        # Creating first and tolerating "already exists" avoids the race
        # where another process creates the directory between a check
        # and the makedirs() call.
        if error.errno != EEXIST:
            raise
20+
21+
# Make sure the cache, config and data directories exist as soon as this
# module is imported.
for required_directory in (cache_path, config_path, data_path):
    ensure_directory_exists(required_directory)
23+
24+
# Cache layout: <cache>/metadata/{raw,refined} and <cache>/media/{raw,refined}.
_metadata_path = path.join(cache_path, 'metadata')
_metadata_raw_path = path.join(_metadata_path, 'raw')
_metadata_refined_path = path.join(_metadata_path, 'refined')

_media_path = path.join(cache_path, 'media')
_media_raw_path = path.join(_media_path, 'raw')
_media_refined_path = path.join(_media_path, 'refined')


def _get_source_path(base_path, source_name):
    """
    Return the directory for `source_name` below `base_path`.

    The directory is created first if it does not exist yet.
    """
    p = path.join(base_path, source_name)
    ensure_directory_exists(p)
    return p


def get_metadata_raw_source_path(source_name):
    """Return (and create if needed) the raw metadata directory of a source."""
    return _get_source_path(_metadata_raw_path, source_name)


def get_metadata_refined_source_path(source_name):
    """Return (and create if needed) the refined metadata directory of a source."""
    return _get_source_path(_metadata_refined_path, source_name)


def get_media_raw_source_path(source_name):
    """Return (and create if needed) the raw media directory of a source."""
    return _get_source_path(_media_raw_path, source_name)


def get_media_refined_source_path(source_name):
    """Return (and create if needed) the refined media directory of a source."""
    return _get_source_path(_media_refined_path, source_name)
51+
52+
# License URLs listed as free: Creative Commons BY and BY-SA in
# versions 2.0 to 4.0, plus CC0.
free_license_urls = [
    # version 2.0
    'http://creativecommons.org/licenses/by/2.0/',
    'http://creativecommons.org/licenses/by-sa/2.0/',
    # version 2.5
    'http://creativecommons.org/licenses/by/2.5/',
    'http://creativecommons.org/licenses/by-sa/2.5/',
    # version 3.0
    'http://creativecommons.org/licenses/by/3.0/',
    'http://creativecommons.org/licenses/by-sa/3.0/',
    # version 4.0
    'http://creativecommons.org/licenses/by/4.0/',
    'http://creativecommons.org/licenses/by-sa/4.0/',
    # public domain dedication
    'http://creativecommons.org/publicdomain/zero/1.0/',
]
63+
64+
# The user configuration file lives in the application's config directory.
USERCONFIG_FILENAME = "userconfig"
userconfig_file = path.join(config_path, USERCONFIG_FILENAME)
userconfig = RawConfigParser()
# The parser hook is called `optionxform`. Assigning to the misspelled
# `optionsxform` only created an unused attribute, so option names were
# still lower-cased despite the stated intent.
userconfig.optionxform = str # case sensitivity
userconfig.read(userconfig_file)
69+
70+
def get_userconfig(section, option):
    """
    Return the value of `option` in `section` of the user configuration.

    If the section or the option is missing, an explanation naming the
    configuration file is written to stderr and the program exits with
    status 127.
    """
    try:
        return userconfig.get(section, option)
    except (NoSectionError, NoOptionError) as error:
        if isinstance(error, NoSectionError):
            message = "“%s” does not contain a “%s” section.\n" % \
                (userconfig_file, section)
        else:
            message = "“%s” does not contain a “%s” option in the “%s” section.\n" % \
                (userconfig_file, option, section)
    stderr.write(message)
    exit(127)
81+
82+
# Wiki access settings, read once at import time. get_userconfig() ends
# the program if a section or option is missing.
api_url = get_userconfig('wiki', 'api_url')
username = get_userconfig('wiki', 'username')
password = get_userconfig('wiki', 'password')

# The whitelist option holds whitespace-separated entries.
whitelist_doi = get_userconfig('whitelist', 'doi').split()

OAMI/helpers/efetch.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
from urllib2 import urlopen, urlparse, Request, HTTPError
5+
from xml.etree.cElementTree import dump, ElementTree
6+
from sys import stderr
7+
8+
def _get_file_from_url(url):
    """
    Open `url` and return the file-like response object.

    The request carries a fixed User-Agent header. On an HTTP error a
    message is written to stderr and the process exits with status 255.
    """
    req = Request(url, None, {'User-Agent' : 'oa-put/2012-08-15'})
    try:
        return urlopen(req)
    except HTTPError as e:
        stderr.write('When trying to download <%s>, the following error occured: “%s”.\n' % \
            (url, str(e)))
        # This module never imports `exit`; the name only exists when the
        # `site` module injects its interactive helper. Raising
        # SystemExit directly has the same effect without relying on it.
        raise SystemExit(255)
17+
18+
def get_pmcid_from_doi(doi):
    """
    Look up the PubMed Central ID for a DOI with the NCBI esearch service.

    Returns the first ID of the result as an int, or None when the
    response has no IdList/Id element. Raises TypeError if `doi` is not
    a unicode string.
    """
    from urllib2 import quote  # local import: not among the module imports
    if not isinstance(doi, unicode):
        raise TypeError("Cannot get PMCID for DOI %s of type %s." % (doi, type(doi)))
    # DOIs may contain characters such as "#", "&", "<" or non-ASCII
    # letters that would corrupt the query string, so the UTF-8 encoded
    # DOI is percent-encoded before it is inserted.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=%s' % \
        quote(doi.encode('utf-8'), safe='')
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    id_element = tree.find('IdList/Id')
    if id_element is None:
        # The search returned no ID.
        return None
    return int(id_element.text)
29+
30+
def get_pmid_from_doi(doi):
    """
    Look up the PubMed ID for a DOI with the NCBI esearch service.

    Returns the first ID of the result as an int, or None when the
    response has no IdList/Id element. Raises TypeError if `doi` is not
    a unicode string.
    """
    from urllib2 import quote  # local import: not among the module imports
    if not isinstance(doi, unicode):
        raise TypeError("Cannot get PMID for DOI %s of type %s." % (doi, type(doi)))
    # DOIs may contain characters such as "#", "&", "<" or non-ASCII
    # letters that would corrupt the query string, so the UTF-8 encoded
    # DOI is percent-encoded before it is inserted.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%s' % \
        quote(doi.encode('utf-8'), safe='')
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    id_element = tree.find('IdList/Id')
    if id_element is None:
        # The search returned no ID.
        return None
    return int(id_element.text)
41+
42+
def get_categories_from_pmid(pmid):
    """
    Fetch the MeSH headings of a PubMed record and return the narrow ones.

    A heading is kept when it has a QualifierName element, or when its
    descriptor text contains a space but not the substring "and".
    Raises TypeError if `pmid` is not an int.
    """
    if type(pmid) is not int:
        raise TypeError("Cannot get Categories for PMID %s of type %s." % (pmid, type(pmid)))
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % pmid
    tree = ElementTree()
    tree.parse(_get_file_from_url(url))
    categories = []
    heading_path = 'PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading'
    for heading in tree.iterfind(heading_path):
        descriptor_text = heading.find('DescriptorName').text
        if heading.find('QualifierName') is None:
            # Without a qualifier, only multi-word descriptors that do
            # not contain "and" are kept.
            if ' ' not in descriptor_text or 'and' in descriptor_text:
                continue
        categories.append(descriptor_text)
    return categories

OAMI/helpers/media.py

+138
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
import gobject, pygst
4+
pygst.require("0.10")
5+
6+
import gst
7+
import progressbar
8+
9+
from sys import exit, stderr
10+
11+
class Media(object):
    """
    A media file that can be probed for streams and converted to Ogg.

    Both operations build a GStreamer 0.10 pipeline and run a gobject
    main loop until the pipeline reports completion or an error.
    """
    def __init__(self, filename):
        self.filename = filename
        self.has_audio = False
        self.has_video = False
        # Last position reported by the conversion pipeline.
        self.position = 0
        self.lastposition = 0

    def find_streams(self):
        """
        Determines if media file has audio and / or video streams.

        Sets self.has_audio / self.has_video from the codec tags the
        decoder posts on the bus.
        """
        loop = gobject.MainLoop()
        pipeline = gst.parse_launch("filesrc name=source ! decodebin2 ! fakesink")

        source = pipeline.get_by_name("source")
        source.set_property("location", self.filename)

        bus = pipeline.get_bus()
        def on_message(bus, message):
            t = message.type
            if t == gst.MESSAGE_TAG:
                pipeline.set_state(gst.STATE_PAUSED)
                keys = message.structure.keys()
                if 'audio-codec' in keys:
                    self.has_audio = True
                if 'video-codec' in keys:
                    self.has_video = True
            if t == gst.MESSAGE_ASYNC_DONE:
                pipeline.set_state(gst.STATE_NULL)
                loop.quit()
            elif t == gst.MESSAGE_ERROR: # error
                err, debug = message.parse_error()
                pipeline.set_state(gst.STATE_NULL)
                loop.quit()
                if str(err) in [
                    'Your GStreamer installation is missing a plug-in.',
                    'Could not demultiplex stream.'
                ]:
                    # NOTE(review): this is raised inside a bus callback;
                    # confirm it actually reaches the caller of
                    # find_streams().
                    raise RuntimeError(str(err))
                else:
                    stderr.write('ERROR: %s\n' %str(err))

        bus.add_signal_watch()
        bus.connect('message', on_message)

        pipeline.set_state(gst.STATE_PLAYING)
        pipeline.get_state()

        loop.run()

    def convert(self, outfile):
        """
        Converts media file to Ogg Theora or Ogg Theora+Vorbis.

        The pipeline is chosen from self.has_video / self.has_audio, so
        find_streams() has to have set them. Raises RuntimeError when
        neither flag is set.
        """
        loop = gobject.MainLoop()

        if self.has_video and self.has_audio:
            pipeline = gst.parse_launch("""
                filesrc name=source ! decodebin2 name=decoder
                decoder. ! queue ! theoraenc ! queue ! oggmux name=muxer
                decoder. ! queue ! audioconvert ! audioresample ! vorbisenc ! progressreport name=report ! muxer.
                muxer. ! filesink name=sink
                """)
        elif self.has_video and not self.has_audio:
            pipeline = gst.parse_launch("""
                filesrc name=source ! decodebin2 name=decoder
                decoder. ! queue ! ffmpegcolorspace ! theoraenc ! progressreport name=report ! oggmux name=muxer
                muxer. ! filesink name=sink
                """)
        elif not self.has_video and self.has_audio:
            pipeline = gst.parse_launch("""
                filesrc name=source ! decodebin2 name=decoder
                decoder. ! queue ! audioconvert ! audioresample ! vorbisenc ! progressreport name=report ! oggmux name=muxer
                muxer. ! filesink name=sink
                """)
        else:
            raise RuntimeError('Unknown audio/video stream combination.')

        source = pipeline.get_by_name('source')
        source.set_property('location', self.filename)

        sink = pipeline.get_by_name('sink')
        sink.set_property('location', outfile)

        # The progress bar below reports progress, so the pipeline's own
        # progress report is silenced unless the bar cannot be used.
        report = pipeline.get_by_name('report')
        report.set_property('silent', True)

        bus = pipeline.get_bus()
        def on_message(bus, message):
            t = message.type
            if t == gst.MESSAGE_EOS: # end of stream
                pipeline.set_state(gst.STATE_NULL)
                loop.quit()
            elif t == gst.MESSAGE_ERROR: # error
                err, debug = message.parse_error()
                stderr.write('ERROR: %s\n' %str(err))
                pipeline.set_state(gst.STATE_NULL)
                loop.quit()

        bus.add_signal_watch()
        bus.connect("message", on_message)

        pipeline.set_state(gst.STATE_PLAYING)
        pipeline.get_state()

        # `progress` stays None when the duration cannot be queried.
        # Previously the name was simply left unbound and the resulting
        # NameError was hidden by a bare except in update_progress().
        progress = None
        try:
            duration = pipeline.query_duration(gst.FORMAT_TIME, None)[0]
            progress = progressbar.ProgressBar(maxval=duration).start()
        except gst.QueryError:
            pass

        def update_progress():
            # Timer callback: returning False stops it, True keeps it running.
            try:
                self.position = pipeline.query_position(gst.FORMAT_TIME, \
                    None)[0]
            except Exception:
                # `Exception` rather than a bare except, so that
                # KeyboardInterrupt and SystemExit are not swallowed.
                return False # stop loop
            if progress is None:
                # no progress bar available
                # fall back to pipeline reporting
                report.set_property('silent', False)
                return True # continue loop
            try:
                progress.update(self.position)
            except Exception:
                # progressbar fails on >100% progress
                # fall back to pipeline reporting
                report.set_property('silent', False)
            return True # continue loop

        gobject.timeout_add(100, update_progress)
        loop.run()

0 commit comments

Comments
 (0)