Skip to content

Commit

Permalink
[wikimedia] generalize (mikf#1443)
Browse files Browse the repository at this point in the history
- support mediawiki.org
- support mariowiki.com (mikf#3660)

- combine code into a single extractor
  (use prefix as subcategory)
- handle non-wiki instances
- unescape titles
  • Loading branch information
mikf authored and bradenhilton committed Feb 5, 2024
1 parent 47cb4dd commit e49776b
Show file tree
Hide file tree
Showing 14 changed files with 126 additions and 47 deletions.
30 changes: 21 additions & 9 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -1484,55 +1484,67 @@ Consider all listed sites to potentially be NSFW.
<tr>
<td>Wikipedia</td>
<td>https://www.wikipedia.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wiktionary</td>
<td>https://www.wiktionary.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikiquote</td>
<td>https://www.wikiquote.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikibooks</td>
<td>https://www.wikibooks.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikisource</td>
<td>https://www.wikisource.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikinews</td>
<td>https://www.wikinews.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikiversity</td>
<td>https://www.wikiversity.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikispecies</td>
<td>https://species.wikimedia.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikimedia Commons</td>
<td>https://commons.wikimedia.org/</td>
<td>Articles, Categories</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>MediaWiki</td>
<td>https://www.mediawiki.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Super Mario Wiki</td>
<td>https://www.mariowiki.com/</td>
<td>Articles</td>
<td></td>
</tr>

Expand Down
80 changes: 51 additions & 29 deletions gallery_dl/extractor/wikimedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Wikimedia and Wikipedia"""
"""Extractors for Wikimedia sites"""

from .common import BaseExtractor, Message
from .. import text
Expand All @@ -22,7 +22,41 @@ class WikimediaExtractor(BaseExtractor):

def __init__(self, match):
    """Derive title, API path and query parameters from the matched URL.

    The last regex group of 'match' is taken as the URL path.
    A leading "wiki/" is removed from it and decides which default
    API path is stored in 'self.api_path'.  The remaining, URL-decoded
    text becomes 'self.title'.  If the title contains a colon, the
    lowercased text in front of it is used as subcategory; a
    "category" prefix selects the 'categorymembers' generator,
    anything else the 'images' generator.
    """
    BaseExtractor.__init__(self, match)
    path = match.group(match.lastindex)

    if path.startswith("wiki/"):
        path = path[5:]
        self.api_path = "/w/api.php"
    else:
        self.api_path = "/api.php"

    # Decode first, so that a percent-encoded colon ("Category%3AName")
    # is recognized as a prefix separator as well.
    self.title = path = text.unquote(path)

    pre, sep, _ = path.partition(":")
    prefix = pre.lower() if sep else None

    # Only override the class-level subcategory when there is a prefix;
    # assigning None would discard the subclass default.
    if prefix:
        self.subcategory = prefix

    if prefix == "category":
        self.params = {
            "generator": "categorymembers",
            "gcmtitle" : path,
            "gcmtype"  : "file",
        }
    else:
        self.params = {
            "generator": "images",
            "titles"   : path,
        }

def _init(self):
api_path = self.config_instance("api-path")
if api_path:
if api_path[0] == "/":
self.api_url = self.root + api_path
else:
self.api_url = api_path
else:
self.api_url = self.root + self.api_path

def items(self):
for info in self._pagination(self.params):
Expand Down Expand Up @@ -51,9 +85,14 @@ def _pagination(self, params):
https://opendata.stackexchange.com/questions/13381
"""

url = self.root + "/w/api.php"
url = self.api_url
params["action"] = "query"
params["format"] = "json"
params["prop"] = "imageinfo"
params["iiprop"] = (
"timestamp|user|userid|comment|canonicaltitle|url|size|"
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
)

while True:
data = self.request(url, params=params).json()
Expand Down Expand Up @@ -109,36 +148,19 @@ def _pagination(self, params):
"root": "https://commons.wikimedia.org",
"pattern": r"commons\.wikimedia\.org",
},
"mediawiki": {
"root": "https://www.mediawiki.org",
"pattern": r"(?:www\.)?mediawiki\.org",
},
"mariowiki": {
"root": "https://www.mariowiki.com",
"pattern": r"(?:www\.)?mariowiki\.com",
},
})


class WikimediaArticleExtractor(WikimediaExtractor):
    """Extractor for wikimedia articles"""
    subcategory = "article"
    # Capture the whole path after the host (except "static/...").
    # WikimediaExtractor.__init__ splits it into title and prefix and
    # builds the query parameters, so no _init() override is needed
    # here; an override would also skip the base _init() that sets
    # 'api_url'.
    pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
    example = "https://en.wikipedia.org/wiki/TITLE"


class WikimediaCategoryExtractor(WikimediaExtractor):
    """Extractor for wikimedia category pages"""
    subcategory = "category"
    # The captured group includes the leading "wiki/" so that
    # WikimediaExtractor.__init__ selects "/w/api.php" for these URLs.
    # That method also builds the 'categorymembers' parameters for a
    # "Category:" prefix, so no _init() override is needed here; an
    # override would skip the base _init() that sets 'api_url'.
    pattern = BASE_PATTERN + r"/(wiki/Category:[^/?#]+)"
    example = "https://commons.wikimedia.org/wiki/Category:NAME"
2 changes: 2 additions & 0 deletions scripts/supportedsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@
"mangapark" : "MangaPark",
"mangaread" : "MangaRead",
"mangasee" : "MangaSee",
"mariowiki" : "Super Mario Wiki",
"mastodon.social": "mastodon.social",
"mediawiki" : "MediaWiki",
"micmicidol" : "MIC MIC IDOL",
"myhentaigallery": "My Hentai Gallery",
"myportfolio" : "Adobe Portfolio",
Expand Down
19 changes: 19 additions & 0 deletions test/results/mariowiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

from gallery_dl.extractor import wikimedia


__tests__ = (
{
    "#url"     : "https://www.mariowiki.com/Rabbit",
    # The middle element names the matched instance; this URL belongs
    # to mariowiki.com, not to wikibooks.
    "#category": ("wikimedia", "mariowiki", "article"),
    "#class"   : wikimedia.WikimediaArticleExtractor,
    "#pattern" : r"https://mario\.wiki\.gallery/images/.+",
    "#count"   : range(20, 50),
},

)
24 changes: 24 additions & 0 deletions test/results/mediawiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

from gallery_dl.extractor import wikimedia


# Expected results for a mediawiki.org page whose title carries a
# "Help:" prefix.
__tests__ = (
{
"#url" : "https://www.mediawiki.org/wiki/Help:Navigation",
# NOTE(review): presumably (base category, instance, subcategory);
# "help" would then be the lowercased title prefix - confirm.
"#category": ("wikimedia", "mediawiki", "help"),
"#class" : wikimedia.WikimediaArticleExtractor,
# Exact file URLs the extractor is expected to yield for this page.
"#urls" : (
"https://upload.wikimedia.org/wikipedia/commons/e/ec/OOjs_UI_icon_information-progressive.svg",
"https://upload.wikimedia.org/wikipedia/commons/6/62/PD-icon.svg",
"https://upload.wikimedia.org/wikipedia/commons/0/0e/Vector_Sidebar.png",
"https://upload.wikimedia.org/wikipedia/commons/7/77/Vector_page_tabs.png",
"https://upload.wikimedia.org/wikipedia/commons/6/6e/Vector_user_links.png",
),
},

)
2 changes: 1 addition & 1 deletion test/results/wikibooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
{
"#url" : "https://en.wikibooks.org/wiki/Category:Title",
"#category": ("wikimedia", "wikibooks", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)
2 changes: 1 addition & 1 deletion test/results/wikimediacommons.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
{
"#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro",
"#category": ("wikimedia", "wikimediacommons", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)
2 changes: 1 addition & 1 deletion test/results/wikinews.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
{
"#url" : "https://en.wikinews.org/wiki/Category:Title",
"#category": ("wikimedia", "wikinews", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)
2 changes: 1 addition & 1 deletion test/results/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
{
"#url" : "https://en.wikipedia.org/wiki/Category:Physics",
"#category": ("wikimedia", "wikipedia", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)
2 changes: 1 addition & 1 deletion test/results/wikiquote.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
{
"#url" : "https://en.wikiquote.org/wiki/Category:Title",
"#category": ("wikimedia", "wikiquote", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)
2 changes: 1 addition & 1 deletion test/results/wikisource.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
{
"#url" : "https://en.wikisource.org/wiki/Category:Title",
"#category": ("wikimedia", "wikisource", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)
2 changes: 1 addition & 1 deletion test/results/wikispecies.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
{
"#url" : "https://species.wikimedia.org/wiki/Category:Names",
"#category": ("wikimedia", "wikispecies", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)
2 changes: 1 addition & 1 deletion test/results/wikiversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
{
"#url" : "https://en.wikiversity.org/wiki/Category:Title",
"#category": ("wikimedia", "wikiversity", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)
2 changes: 1 addition & 1 deletion test/results/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
{
"#url" : "https://en.wiktionary.org/wiki/Category:Words",
"#category": ("wikimedia", "wiktionary", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
"#class" : wikimedia.WikimediaArticleExtractor,
},

)

0 comments on commit e49776b

Please sign in to comment.