Skip to content

Commit

Permalink
[ao3] add initial support (#6013)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikf committed Sep 15, 2024
1 parent 7d6520e commit 638a676
Show file tree
Hide file tree
Showing 6 changed files with 418 additions and 0 deletions.
14 changes: 14 additions & 0 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1279,6 +1279,20 @@ Extractor-specific Options
==========================


extractor.ao3.formats
---------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"pdf"``
Example
* ``"azw3,epub,mobi,pdf,html"``
* ``["azw3", "epub", "mobi", "pdf", "html"]``
Description
Format(s) to download. A requested format that a work does not offer is skipped with a warning.


extractor.artstation.external
-----------------------------
Type
Expand Down
6 changes: 6 additions & 0 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ Consider all listed sites to potentially be NSFW.
<td>Firms, Projects</td>
<td></td>
</tr>
<tr>
<td>Archive of Our Own</td>
<td>https://archiveofourown.org/</td>
<td>Search Results, Series, Tag Searches, User Profiles, Bookmarks, Works</td>
<td></td>
</tr>
<tr>
<td>ArtStation</td>
<td>https://www.artstation.com/</td>
Expand Down
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"8muses",
"adultempire",
"agnph",
"ao3",
"architizer",
"artstation",
"aryion",
Expand Down
200 changes: 200 additions & 0 deletions gallery_dl/extractor/ao3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://archiveofourown.org/"""

from .common import Extractor, Message
from .. import text, util

# URL prefix shared by every ao3 extractor pattern.
# Both dots are escaped so they match a literal '.' only.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?archiveofourown\.org"


class Ao3Extractor(Extractor):
    """Shared behaviour for all ao3 extractors"""
    category = "ao3"
    root = "https://archiveofourown.org"
    categorytransfer = True
    request_interval = (0.5, 1.5)

    def items(self):
        """Queue one Ao3WorkExtractor job for each ID from works()"""
        queue_data = {"_extractor": Ao3WorkExtractor}
        prefix = self.root + "/works/"

        for wid in self.works():
            yield Message.Queue, prefix + wid, queue_data

    def works(self):
        """Return an iterator over the work IDs of this listing"""
        listing_path = self.groups[0]
        return self._pagination(listing_path)

    def _pagination(self, path, needle='<li id="work_'):
        """Yield the text between 'needle' and the next '"' on every page

        Starts at 'path' (relative to 'root') and keeps following the
        page's rel="next" link until a page has none.
        """
        next_marker = '<a rel="next" href="'

        while True:
            html = self.request(self.root + path).text
            for found in text.extract_iter(html, needle, '"'):
                yield found

            next_path = text.extr(html, next_marker, '"')
            if not next_path:
                break
            path = text.unescape(next_path)


class Ao3WorkExtractor(Ao3Extractor):
    """Extractor for an AO3 work"""
    subcategory = "work"
    directory_fmt = ("{category}", "{author}")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}.{extension}"
    pattern = BASE_PATTERN + r"/works/(\d+)"
    example = "https://archiveofourown.org/works/12345"

    def _init(self):
        """Read the 'formats' option and set the 'view_adult' cookie

        'formats' may be a comma-separated string or a list. Both forms
        are lower-cased and stripped of spaces so that they compare equal
        to the lower-cased format names collected in items().
        An unset option selects ("pdf",); any other falsy value selects
        no format at all.
        """
        formats = self.config("formats")
        if formats is None:
            self.formats = ("pdf",)
        elif not formats:
            self.formats = ()
        else:
            if isinstance(formats, str):
                formats = formats.split(",")
            # normalize list entries the same way as the string form
            self.formats = [
                str(fmt).lower().replace(" ", "")
                for fmt in formats
            ]

        self.cookies.set("view_adult", "true", domain="archiveofourown.org")

    def items(self):
        """Yield the work's metadata and one URL per requested format

        Yields a Message.Directory with the scraped metadata, followed by
        a Message.Url for every entry of 'self.formats' that the page's
        download section lists. Entries it does not list are reported
        with a warning and skipped.
        """
        work_id = self.groups[0]
        url = "{}/works/{}".format(self.root, work_id)
        extr = text.extract_from(self.request(url).text)

        # lower-cased link label -> link target
        fmts = {}
        download = extr(' class="download"', "</ul>")
        for dl in text.extract_iter(download, ' href="', "</"):
            # 'dl' has the form 'PATH">LABEL'; split at the last '">'
            path, _, label = dl.rpartition('">')
            fmts[label.lower()] = path

        # NOTE(review): presumably extr() continues searching after its
        # previous match, so the order of these calls matters - confirm
        # before reordering any entry
        data = {
            "id"           : text.parse_int(work_id),
            "rating"       : text.split_html(
                extr('<dd class="rating tags">', "</dd>")),
            "warnings"     : text.split_html(
                extr('<dd class="warning tags">', "</dd>")),
            "categories"   : text.split_html(
                extr('<dd class="category tags">', "</dd>")),
            "fandom"       : text.split_html(
                extr('<dd class="fandom tags">', "</dd>")),
            "relationships": text.split_html(
                extr('<dd class="relationship tags">', "</dd>")),
            "characters"   : text.split_html(
                extr('<dd class="character tags">', "</dd>")),
            "tags"         : text.split_html(
                extr('<dd class="freeform tags">', "</dd>")),
            "lang"         : extr('<dd class="language" lang="', '"'),
            "series"       : extr('<dd class="series">', "</dd>"),
            "date"         : text.parse_datetime(
                extr('<dd class="published">', "<"), "%Y-%m-%d"),
            "words"        : text.parse_int(
                extr('<dd class="words">', "<").replace(",", "")),
            "chapters"     : text.parse_int(
                extr('<dd class="chapters">', "/")),
            "comments"     : text.parse_int(
                extr('<dd class="comments">', "<").replace(",", "")),
            "likes"        : text.parse_int(
                extr('<dd class="kudos">', "<").replace(",", "")),
            "bookmarks"    : text.parse_int(text.remove_html(
                extr('<dd class="bookmarks">', "</dd>")).replace(",", "")),
            "views"        : text.parse_int(
                extr('<dd class="hits">', "<").replace(",", "")),
            "title"        : text.unescape(
                extr(' class="title heading">', "<").strip()),
            "author"       : text.unescape(text.remove_html(
                extr(' class="byline heading">', "</h3>"))),
            "summary"      : text.split_html(
                extr(' class="heading">Summary:</h3>', "</div>")),
        }
        data["language"] = util.code_to_language(data["lang"])

        yield Message.Directory, data
        for fmt in self.formats:
            try:
                url = text.urljoin(self.root, fmts[fmt])
            except KeyError:
                self.log.warning("%s: Format '%s' not available", work_id, fmt)
            else:
                yield Message.Url, url, text.nameext_from_url(url, data)


class Ao3SeriesExtractor(Ao3Extractor):
    """Extractor for the works listed on an AO3 series page"""
    subcategory = "series"
    example = "https://archiveofourown.org/series/12345"
    # group 1 is the whole "/series/ID" path, group 2 the numeric ID
    pattern = BASE_PATTERN + r"(/series/(\d+))"


class Ao3TagExtractor(Ao3Extractor):
    """Extractor for the AO3 works filed under a tag"""
    subcategory = "tag"
    example = "https://archiveofourown.org/tags/TAG/works"
    # group 1 is the listing path including an optional query string,
    # group 2 the tag name
    pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)"


class Ao3SearchExtractor(Ao3Extractor):
    """Extractor for the works of an AO3 search result listing"""
    subcategory = "search"
    example = "https://archiveofourown.org/works/search?work_search[query]=air"
    # group 1 is the search path together with its mandatory query string
    pattern = BASE_PATTERN + r"(/works/search/?\?.+)"


class Ao3UserExtractor(Ao3Extractor):
    """Extractor for an AO3 user profile"""
    subcategory = "user"
    example = "https://archiveofourown.org/users/USER"
    # group 1 is the user name, optionally followed by "/pseuds/PSEUD"
    pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
               r"(?:/profile)?/?(?:$|\?|#)")

    def initialize(self):
        """Intentionally do nothing"""

    def items(self):
        """Hand the user's works, series, and bookmarks listings over to
        their dedicated extractors"""
        user_root = "{}/users/{}/".format(self.root, self.groups[0])
        targets = (
            (Ao3UserWorksExtractor   , user_root + "works"),
            (Ao3UserSeriesExtractor  , user_root + "series"),
            (Ao3UserBookmarkExtractor, user_root + "bookmarks"),
        )
        # NOTE(review): presumably the subcategories used when nothing
        # else is configured - confirm against _dispatch_extractors()
        subcategories = ("user-works", "user-series")
        return self._dispatch_extractors(targets, subcategories)


class Ao3UserWorksExtractor(Ao3Extractor):
    """Extractor for the works listing of an AO3 user"""
    subcategory = "user-works"
    example = "https://archiveofourown.org/users/USER/works"
    # group 1 is the whole listing path, group 2 the user name,
    # group 3 the optional pseud
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"works(?:/?\?.+)?)")


class Ao3UserSeriesExtractor(Ao3Extractor):
    """Extractor for series of an AO3 user"""
    subcategory = "user-series"
    # group 1 is the whole listing path, group 2 the user name,
    # group 3 the optional pseud; the query string is not captured
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"series(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/series"

    def items(self):
        """Queue one Ao3SeriesExtractor job for each ID from series()"""
        base = self.root + "/series/"
        data = {"_extractor": Ao3SeriesExtractor}

        for series_id in self.series():
            yield Message.Queue, base + series_id, data

    def series(self):
        """Return an iterator over the series IDs of the listing

        Only the first group (the whole listing path) is used.
        'self.groups' is deliberately not unpacked: 'pattern' defines
        three capturing groups, so unpacking into four names would
        raise ValueError, and none of the other groups are needed here.
        """
        return self._pagination(self.groups[0], '<li id="series_')


class Ao3UserBookmarkExtractor(Ao3Extractor):
    """Extractor for the works an AO3 user has bookmarked"""
    subcategory = "user-bookmark"
    example = "https://archiveofourown.org/users/USER/bookmarks"
    # group 1 is the whole listing path, group 2 the user name,
    # group 3 the optional pseud
    # NOTE(review): this class defines no items() of its own, so it
    # relies on bookmark pages containing '<li id="work_' entries -
    # confirm against the actual page markup
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"bookmarks(?:/?\?.+)?)")
6 changes: 6 additions & 0 deletions scripts/supportedsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"adultempire" : "Adult Empire",
"agnph" : "AGNPH",
"allgirlbooru" : "All girl",
"ao3" : "Archive of Our Own",
"archivedmoe" : "Archived.Moe",
"archiveofsins" : "Archive of Sins",
"artstation" : "ArtStation",
Expand Down Expand Up @@ -181,6 +182,11 @@
"related-pin" : "related Pins",
"related-board": "",

"ao3": {
"user-works" : "",
"user-series" : "",
"user-bookmark": "Bookmarks",
},
"artstation": {
"artwork": "Artwork Listings",
"collections": "",
Expand Down
Loading

1 comment on commit 638a676

@AtomicTEM
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:D

Please sign in to comment.