Skip to content

Commit

Permalink
Add support for Pangaea datasets (#45)
Browse files Browse the repository at this point in the history
  • Loading branch information
J535D165 authored Sep 21, 2023
1 parent c892377 commit 3ca48ed
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 4 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Datahugger is a tool to download scientific datasets, software, and code from a

## Supported repositories

Datahugger offers support for more than [<!-- count -->376<!-- count --> generic and specific (scientific) repositories](https://j535d165.github.io/datahugger/repositories) (and more to come!).
Datahugger offers support for more than [<!-- count -->377<!-- count --> generic and specific (scientific) repositories](https://j535d165.github.io/datahugger/repositories) (and more to come!).

[![Datahugger supports Zenodo, Dataverse, DataOne, GitHub, FigShare, HuggingFace, Mendeley Data, Dryad, OSF, and many more](https://github.com/J535D165/datahugger/raw/main/docs/images/logos.png)](https://j535d165.github.io/datahugger/repositories)

Expand Down
5 changes: 4 additions & 1 deletion datahugger/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,10 @@ def _get(
):
if (
len(self.files) == 1
and self.files[0]["link"].endswith(".zip")
and (
self.files[0]["link"].endswith(".zip")
or self.files[0]["name"].endswith(".zip")
)
and self.unzip
):
self._unpack_single_folder(self.files[0]["link"], output_folder)
Expand Down
3 changes: 2 additions & 1 deletion datahugger/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from datahugger.services import HuggingFaceDataset
from datahugger.services import MendeleyDataset
from datahugger.services import OSFDataset
from datahugger.services import PangaeaDataset
from datahugger.services import ZenodoDataset

# fast lookup
Expand Down Expand Up @@ -40,7 +41,7 @@
"get.iedadata.org": DataOneDataset,
"usap-dc.org": DataOneDataset,
"iys.hakai.org": DataOneDataset,
# "doi.pangaea.de": DataOneDataset,
"doi.pangaea.de": PangaeaDataset,
"rvdata.us": DataOneDataset,
"sead-published.ncsa.illinois.edu": DataOneDataset,
# DataVerse repositories (extracted from re3data)
Expand Down
40 changes: 40 additions & 0 deletions datahugger/services.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import io
import re
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
Expand Down Expand Up @@ -142,6 +143,45 @@ def files(self):
return self._files


class PangaeaDataset(DatasetDownloader):
    """Downloader for datasets hosted on the PANGAEA repository.

    The record identifier is taken from a ``doi.pangaea.de/<record_id>`` URL.
    The list of files is derived from the record's JSON-LD metadata, which is
    requested from ``API_URL`` with ``?format=metadata_jsonld``.
    """

    REGEXP_ID = r"doi\.pangaea\.de/(?P<record_id>.*)"

    # the base entry point of the REST API
    API_URL = "https://doi.pangaea.de/"

    # encoding formats of "distribution" entries that are offered as files
    _FILE_FORMATS = ("text/tab-separated-values", "application/zip")

    @staticmethod
    def _filename_for(url):
        """Return the file name the server advertises for ``url``.

        A HEAD request is issued and the name is read from the
        ``Content-Disposition`` header. Surrounding quotes and any trailing
        header parameters are dropped. If the header is missing or carries no
        ``filename=`` part, the last path segment of ``url`` is used instead
        of raising.
        """
        r_filename = requests.head(url)
        content_d = r_filename.headers.get("content-disposition", "")

        match = re.search(r'filename="?([^";]+)"?', content_d)
        if match:
            name = match.group(1).strip()
            if name:
                return name

        # fallback: derive the name from the URL path (query string removed)
        return url.split("?", 1)[0].rstrip("/").rsplit("/", 1)[-1]

    @property
    def files(self):
        """List of downloadable files for the record.

        Returns:
            list[dict]: one dict per file with the keys ``link``, ``name``,
            ``size``, ``hash`` and ``hash_type``. Only ``link`` and ``name``
            are populated; the remaining values are ``None``.

        Raises:
            requests.HTTPError: if the metadata request is answered with an
                error status.
            KeyError: if the metadata has no ``distribution`` entry.
        """
        # The property is read several times during a single download, so the
        # result is kept on the instance to avoid repeating the HTTP requests.
        cached = getattr(self, "_files", None)
        if cached is not None:
            return cached

        r = requests.get(
            f"{self.API_URL}{self._params['record_id']}?format=metadata_jsonld"
        )
        r.raise_for_status()
        dists = r.json()["distribution"]

        # "distribution" is either a single object or a list of objects
        if isinstance(dists, dict):
            dists = [dists]

        files = []
        for d in dists:
            # entries without a known file format are not downloadable files
            if d.get("encodingFormat") not in self._FILE_FORMATS:
                continue

            files.append(
                {
                    "link": d["contentUrl"],
                    "name": self._filename_for(d["contentUrl"]),
                    "size": None,
                    "hash": None,
                    "hash_type": None,
                }
            )

        self._files = files
        return self._files


class DSpaceDataset(DatasetDownloader):
"""Downloader for DSpaceDataset repositories."""

Expand Down
2 changes: 1 addition & 1 deletion docs/repositories.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Supported repositories

Datahugger offers support for more than <!-- count -->376<!-- count --> generic and specific (scientific) repositories (and more to come!).
Datahugger offers support for more than <!-- count -->377<!-- count --> generic and specific (scientific) repositories (and more to come!).

![Datahugger supports Zenodo, Dataverse, DataOne, GitHub, FigShare, HuggingFace, Mendeley Data, Dryad, OSF, and many more](images/logos.png)

Expand Down
1 change: 1 addition & 0 deletions scripts/estimate_repos_supported.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def count_repos():
"dryad": 1,
"github": 1,
"huggingface": 1,
"pangaea": 1,
}

print(counts)
Expand Down
3 changes: 3 additions & 0 deletions tests/test_repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@
),
# huggingface
# ("10.57967/hf/0034", "test.csv"),
# Pangaea
("https://doi.org/10.1594/PANGAEA.954547", "Gubbio_age.tab"),
("https://doi.pangaea.de/10.1594/PANGAEA.954543", "AA_age.tab"),
]


Expand Down

0 comments on commit 3ca48ed

Please sign in to comment.