22import os
33import shutil
44import time
5+ import tempfile
56import zipfile
67from datetime import datetime , timedelta , timezone
78from urllib .request import urlretrieve
9+ from urllib .parse import urlparse , urlunparse
810
911from .base import ContentProviderException
1012from .doi import DoiProvider
13+ from ..utils import is_doi
1114
1215
1316class Hydroshare (DoiProvider ):
1417 """Provide contents of a Hydroshare resource."""
1518
16- def _fetch_version (self , host ):
17- """Fetch resource modified date and convert to epoch"""
18- json_response = self .session .get (host ["version" ].format (self .resource_id )).json ()
19+ HYDROSHARE_DOMAINS = ["www.hydroshare.org" ]
20+
21+ def get_version (self , resource_id : str ) -> str :
22+ """
23+ Get current version of given resource_id
24+ """
25+ api_url = f"https://{ self .HYDROSHARE_DOMAIN } /hsapi/resource/{ resource_id } /scimeta/elements"
26+
27+ json_response = self .session .get (api_url ).json ()
1928 date = next (
2029 item for item in json_response ["dates" ] if item ["type" ] == "modified"
2130 )["start_date" ]
@@ -26,7 +35,7 @@ def _fetch_version(self, host):
2635 # truncate the timestamp
2736 return str (int (epoch ))
2837
29- def detect (self , doi , ref = None , extra_args = None ):
38+ def detect (self , spec , ref = None , extra_args = None ):
3039 """Trigger this provider for things that resolve to a Hydroshare resource"""
3140 hosts = [
3241 {
@@ -35,30 +44,33 @@ def detect(self, doi, ref=None, extra_args=None):
3544 "http://www.hydroshare.org/resource/" ,
3645 ],
3746 "django_irods" : "https://www.hydroshare.org/django_irods/download/bags/" ,
38- "version" : "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements " ,
47+ "version" : "" ,
3948 }
4049 ]
41- url = self .doi2url (doi )
42-
43- for host in hosts :
44- if any ([url .startswith (s ) for s in host ["hostname" ]]):
45- self .resource_id = url .strip ("/" ).rsplit ("/" , maxsplit = 1 )[1 ]
46- self .version = self ._fetch_version (host )
47- return {
48- "resource" : self .resource_id ,
49- "host" : host ,
50- "version" : self .version ,
51- }
50+
51+ # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL
52+ if is_doi (spec ):
53+ url = self .doi2url (spec )
54+ else :
55+ url = spec
56+
57+ parsed = urlparse (url )
58+
59+ print (url )
60+ if parsed .netloc in self .HYDROSHARE_DOMAINS :
61+ return url
5262
def _urlretrieve(self, bag_url):
    """Download ``bag_url`` with ``urllib.request.urlretrieve``.

    Returns whatever ``urlretrieve`` returns, unchanged. ``fetch``
    unpacks that result as a 2-tuple and opens the first element as a
    zip archive.
    """
    # NOTE(review): presumably kept as a separate method so the download
    # step can be overridden in tests -- confirm against the test suite.
    retrieved = urlretrieve(bag_url)
    return retrieved
5565
5666 def fetch (self , spec , output_dir , yield_output = False , timeout = 120 ):
5767 """Fetch and unpack a Hydroshare resource"""
58- resource_id = spec ["resource" ]
59- host = spec ["host" ]
68+ url = spec
69+ print (url )
70+ parts = urlparse (url )
71+ self .resource_id = parts .path .strip ("/" ).rsplit ("/" , maxsplit = 1 )[1 ]
6072
61- bag_url = f' { host [ "django_irods" ] } { resource_id } '
73+ bag_url = urlunparse ( parts . _replace ( path = f "django_irods/download/bags/ { self . resource_id } " ))
6274
6375 yield f"Downloading { bag_url } .\n "
6476
@@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
8799 filehandle , _ = self ._urlretrieve (bag_url )
88100 zip_file_object = zipfile .ZipFile (filehandle , "r" )
89101 yield "Downloaded, unpacking contents.\n "
90- zip_file_object .extractall ("temp" )
91- # resources store the contents in the data/contents directory, which is all we want to keep
92- contents_dir = os .path .join ("temp" , self .resource_id , "data" , "contents" )
93- files = os .listdir (contents_dir )
94- for f in files :
95- shutil .move (os .path .join (contents_dir , f ), output_dir )
96- yield "Finished, cleaning up.\n "
97- shutil .rmtree ("temp" )
102+
103+ with tempfile .TemporaryDirectory () as d :
104+ zip_file_object .extractall (d )
105+ # resources store the contents in the data/contents directory, which is all we want to keep
106+ contents_dir = os .path .join (d , self .resource_id , "data" , "contents" )
107+ files = os .listdir (contents_dir )
108+ for f in files :
109+ shutil .move (os .path .join (contents_dir , f ), output_dir )
110+ yield "Finished, cleaning up.\n "
98111
@property
def content_id(self):
    """Return the HydroShare resource ID as the content identifier.

    The value is ``self.resource_id`` formatted as a string, with no
    version suffix appended. ``resource_id`` is assigned in ``fetch``.
    """
    resource = self.resource_id
    return f"{resource}"
0 commit comments