Skip to content

Commit 89f7131

Browse files
committed
add replace_curie function
1 parent 832400e commit 89f7131

File tree

5 files changed

+58
-10
lines changed

5 files changed

+58
-10
lines changed

commonmeta/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"""
1111

1212
__title__ = "commonmeta-py"
13-
__version__ = "0.74"
13+
__version__ = "0.75"
1414
__author__ = "Martin Fenner"
1515
__license__ = "MIT"
1616

commonmeta/utils.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -1116,16 +1116,33 @@ def extract_curie(string: Optional[str]) -> Optional[str]:
11161116
"""Extract CURIE"""
11171117
if string is None:
11181118
return None
1119-
match = re.search(r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))", string)
1119+
match = re.search(
1120+
r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))", string
1121+
)
11201122
if match is None:
11211123
return None
11221124
return doi_as_url(match.group(2))
11231125

1124-
1126+
1127+
def replace_curie(string: Optional[str]) -> Optional[str]:
    """Replace CURIE with DOI expressed as URL.

    Every ``doi:``/``DOI:`` CURIE found in ``string`` (optionally followed by
    a single whitespace character) is rewritten as
    ``https://doi.org/<identifier>``; all other text is left untouched.

    Args:
        string: Text that may contain DOI CURIEs, or None.

    Returns:
        None if ``string`` is None, otherwise the text with every DOI CURIE
        replaced. Text without any CURIE is returned unchanged.
    """
    if string is None:
        return None
    # re.sub always returns a str (the input unchanged when nothing matches),
    # so the result never needs a None check.
    return re.sub(
        r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
        r"https://doi.org/\2",
        string,
    )
1137+
1138+
11251139
def extract_url(string: str) -> list:
11261140
"""Extract urls from string, including markdown and html."""
11271141

1128-
match = re.search(r"((?:http|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))", string)
1142+
match = re.search(
1143+
r"((?:http|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
1144+
string,
1145+
)
11291146
if match is None:
11301147
return None
11311148
return normalize_url(match.group(1))
@@ -1134,7 +1151,10 @@ def extract_url(string: str) -> list:
11341151
def extract_urls(string: str) -> list:
11351152
"""Extract urls from string, including markdown and html."""
11361153

1137-
urls = re.findall(r"((?:http|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))", string)
1154+
urls = re.findall(
1155+
r"((?:http|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
1156+
string,
1157+
)
11381158
return py_.uniq(urls)
11391159

11401160

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "commonmeta-py"
3-
version = "0.74"
3+
version = "0.75"
44
description = "Library for conversions to/from the Commonmeta scholarly metadata format"
55
authors = [{ name = "Martin Fenner", email = "[email protected]" }]
66
requires-python = ">=3.9,<4.0"

tests/test-utils.py

+31-3
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
github_as_repo_url,
4141
from_curie,
4242
extract_curie,
43+
replace_curie,
4344
get_language,
4445
validate_url,
4546
format_name_identifier,
@@ -815,6 +816,7 @@ def test_from_curie():
815816
"ISNI:0000000121099845"
816817
)
817818

819+
818820
def test_extract_curie():
819821
"""extract_curie"""
820822
string = """Melstrom, Keegan M., Michael D. D’Emic, Daniel Chure and Jeffrey A.
@@ -835,14 +837,31 @@ def test_extract_curie_doi_space():
835837
assert "https://doi.org/10.1080/02724634.2016.1111898" == extract_curie(string)
836838

837839

840+
def test_replace_curie():
    """replace_curie: a trailing doi: CURIE in a citation becomes a doi.org URL"""
    # Citation text ending in a DOI CURIE.
    citation = """Melstrom, Keegan M., Michael D. D’Emic, Daniel Chure and Jeffrey A.
Wilson. 2016. A juvenile sauropod dinosaur from the Late Jurassic of
Utah, USA, presents further evidence of an avian style air-sac
system. Journal of Vertebrate Paleontology 36(4):e1111898. doi:10.1080/02724634.2016.1111898"""
    # Same citation with the CURIE expressed as a URL; everything else is unchanged.
    expected = """Melstrom, Keegan M., Michael D. D’Emic, Daniel Chure and Jeffrey A.
Wilson. 2016. A juvenile sauropod dinosaur from the Late Jurassic of
Utah, USA, presents further evidence of an avian style air-sac
system. Journal of Vertebrate Paleontology 36(4):e1111898. https://doi.org/10.1080/02724634.2016.1111898"""
    result = replace_curie(citation)
    assert expected == result
850+
851+
838852
def test_extract_urls():
839853
"""extract_urls"""
840854
string = """Zauner, H. (2025, January 9). Sex chromosome madness in the iconic echidna. GigaBlog. https://doi.org/10.59350/9509z-ns663
841855
842856
Willighagen, E. (2024, December 30). FAIR blog-to-blog citations. Chem-Bla-Ics. https://doi.org/10.59350/er1mn-m5q69
843857
844858
Marcum, C. S. (2024, August 27). Drinking from the Firehose? Write More and Publish Less. Upstream. https://doi.org/10.54900/r8zwg-62003"""
845-
assert ['https://doi.org/10.59350/9509z-ns663','https://doi.org/10.59350/er1mn-m5q69','https://doi.org/10.54900/r8zwg-62003',] == extract_urls(string)
859+
assert [
860+
"https://doi.org/10.59350/9509z-ns663",
861+
"https://doi.org/10.59350/er1mn-m5q69",
862+
"https://doi.org/10.54900/r8zwg-62003",
863+
] == extract_urls(string)
864+
846865

847866
def test_extract_urls_markdown():
848867
"""extract_urls markdown"""
@@ -854,12 +873,21 @@ def test_extract_urls_markdown():
854873
855874
Marcum, C. S. (2024, August 27). Drinking from the Firehose? Write More
856875
and Publish Less. *Upstream*. <https://doi.org/10.54900/r8zwg-62003>"""
857-
assert ['https://doi.org/10.59350/9509z-ns663', 'https://doi.org/10.59350/er1mn-m5q69', 'https://doi.org/10.54900/r8zwg-62003'] == extract_urls(string)
876+
assert [
877+
"https://doi.org/10.59350/9509z-ns663",
878+
"https://doi.org/10.59350/er1mn-m5q69",
879+
"https://doi.org/10.54900/r8zwg-62003",
880+
] == extract_urls(string)
881+
858882

859883
def test_extract_urls_html():
860884
"""extract_urls html"""
861885
string = """<p>Zauner, H. (2025, January 9). Sex chromosome madness in the iconic echidna. <em>GigaBlog</em>. <a href="https://doi.org/10.59350/9509z-ns663">https://doi.org/10.59350/9509z-ns663</a></p><p>Willighagen, E. (2024, December 30). FAIR blog-to-blog citations. <em>Chem-Bla-Ics</em>. <a href="https://doi.org/10.59350/er1mn-m5q69">https://doi.org/10.59350/er1mn-m5q69</a></p><p>Marcum, C. S. (2024, August 27). Drinking from the Firehose? Write More and Publish Less. <em>Upstream</em>. <a href="https://doi.org/10.54900/r8zwg-62003">https://doi.org/10.54900/r8zwg-62003</a></p>"""
862-
assert ['https://doi.org/10.59350/9509z-ns663', 'https://doi.org/10.59350/er1mn-m5q69', 'https://doi.org/10.54900/r8zwg-62003'] == extract_urls(string)
886+
assert [
887+
"https://doi.org/10.59350/9509z-ns663",
888+
"https://doi.org/10.59350/er1mn-m5q69",
889+
"https://doi.org/10.54900/r8zwg-62003",
890+
] == extract_urls(string)
863891

864892

865893
def test_id_from_url():

uv.lock

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)