merge #5037: [hatenablog] add support (#5036)

mikf committed Jan 12, 2024
2 parents b1c175f + 9f53daa commit 71e2c3e
Showing 5 changed files with 324 additions and 1 deletion.
6 changes: 6 additions & 0 deletions docs/supportedsites.md
@@ -259,6 +259,12 @@ Consider all listed sites to potentially be NSFW.
<td>Folders</td>
<td></td>
</tr>
<tr>
<td>HatenaBlog</td>
<td>https://hatenablog.com</td>
<td>Archive, Individual Posts, Home Feed, Search Results</td>
<td></td>
</tr>
<tr>
<td>HBrowse</td>
<td>https://www.hbrowse.com/</td>
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
@@ -53,6 +53,7 @@
"gelbooru_v01",
"gelbooru_v02",
"gofile",
"hatenablog",
"hbrowse",
"hentai2read",
"hentaicosplays",
167 changes: 167 additions & 0 deletions gallery_dl/extractor/hatenablog.py
@@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://hatenablog.com"""

import re
from .common import Extractor, Message
from .. import text


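# Descriptive comment added for clarity: group 1 captures the host from an
# explicit "hatenablog:URL" prefix (any domain), group 2 a blog's own
# subdomain on hatenablog.com/.jp, hatenadiary.com, or hateblo.jp.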
BASE_PATTERN = (
r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
r"|hatenadiary\.com|hateblo\.jp)))"
)
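# Optional query string (captured) followed by a discarded fragment.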
QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"


class HatenablogExtractor(Extractor):
"""Base class for HatenaBlog extractors"""
category = "hatenablog"
directory_fmt = ("{category}", "{domain}")
filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
archive_fmt = "{filename}"

def __init__(self, match):
Extractor.__init__(self, match)
self.domain = match.group(1) or match.group(2)

def _init(self):
self._find_img = re.compile(r'<img +([^>]+)').finditer

def _handle_article(self, article: str):
extr = text.extract_from(article)
date = text.parse_datetime(extr('<time datetime="', '"'))
entry_link = text.unescape(extr('<a href="', '"'))
entry = entry_link.partition("/entry/")[2]
title = text.unescape(extr('>', '<'))
content = extr(
'<div class="entry-content hatenablog-entry">', '</div>')

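        # keep only images hosted on Hatena Fotolife
        # (marked with class="hatena-fotolife")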
images = []
for i in self._find_img(content):
attributes = i.group(1)
if 'class="hatena-fotolife"' not in attributes:
continue
image = text.unescape(text.extr(attributes, 'src="', '"'))
images.append(image)

data = {
"domain": self.domain,
"date": date,
"entry": entry,
"title": title,
"count": len(images),
}
yield Message.Directory, data
for data["num"], url in enumerate(images, 1):
yield Message.Url, url, text.nameext_from_url(url, data)


class HatenablogEntriesExtractor(HatenablogExtractor):
"""Base class for a list of entries"""
allowed_parameters = ()

def __init__(self, match):
HatenablogExtractor.__init__(self, match)
self.path = match.group(3)
self.query = {key: value for key, value in text.parse_query(
match.group(4)).items() if self._acceptable_query(key)}

def _init(self):
HatenablogExtractor._init(self)
self._find_pager_url = re.compile(
r' class="pager-next">\s*<a href="([^"]+)').search

def items(self):
url = "https://" + self.domain + self.path
query = self.query

while url:
page = self.request(url, params=query).text

extr = text.extract_from(page)
attributes = extr('<body ', '>')
if "page-archive" in attributes:
yield from self._handle_partial_articles(extr)
else:
yield from self._handle_full_articles(extr)

match = self._find_pager_url(page)
url = text.unescape(match.group(1)) if match else None
query = None

def _handle_partial_articles(self, extr):
while True:
section = extr('<section class="archive-entry', '</section>')
if not section:
break

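            # queue the entry URL with the "hatenablog:" prefix so it is
            # handed back to HatenablogEntryExtractor regardless of domain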
url = "hatenablog:" + text.unescape(text.extr(
section, '<a class="entry-title-link" href="', '"'))
data = {"_extractor": HatenablogEntryExtractor}
yield Message.Queue, url, data

def _handle_full_articles(self, extr):
while True:
attributes = extr('<article ', '>')
if not attributes:
break
if "no-entry" in attributes:
continue

article = extr('', '</article>')
yield from self._handle_article(article)

def _acceptable_query(self, key):
return key == "page" or key in self.allowed_parameters


class HatenablogEntryExtractor(HatenablogExtractor):
"""Extractor for a single entry URL"""
subcategory = "entry"
pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
example = "https://BLOG.hatenablog.com/entry/PATH"

def __init__(self, match):
HatenablogExtractor.__init__(self, match)
self.path = match.group(3)

def items(self):
url = "https://" + self.domain + "/entry/" + self.path
page = self.request(url).text

extr = text.extract_from(page)
while True:
attributes = extr('<article ', '>')
if "no-entry" in attributes:
continue
article = extr('', '</article>')
return self._handle_article(article)


class HatenablogHomeExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's home page"""
subcategory = "home"
pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
example = "https://BLOG.hatenablog.com"


class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's archive page"""
subcategory = "archive"
pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
r"|/category/[^?#]+)?)" + QUERY_RE)
example = "https://BLOG.hatenablog.com/archive/2024"


class HatenablogSearchExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
example = "https://BLOG.hatenablog.com/search?q=QUERY"
allowed_parameters = ("q",)
7 changes: 6 additions & 1 deletion scripts/supportedsites.py
@@ -50,6 +50,7 @@
"fanbox" : "pixivFANBOX",
"fashionnova" : "Fashion Nova",
"furaffinity" : "Fur Affinity",
"hatenablog" : "HatenaBlog",
"hbrowse" : "HBrowse",
"hentai2read" : "Hentai2Read",
"hentaicosplays" : "Hentai Cosplay",
@@ -102,7 +103,6 @@
"pornimagesxxx" : "Porn Image",
"pornpics" : "PornPics.com",
"pornreactor" : "PornReactor",
"postmill" : "Postmill",
"readcomiconline": "Read Comic Online",
"rbt" : "RebeccaBlackTech",
"redgifs" : "RedGIFs",
@@ -189,6 +189,11 @@
"fapello": {
"path": "Videos, Trending Posts, Popular Videos, Top Models",
},
"hatenablog": {
"archive": "Archive",
"entry" : "Individual Posts",
"home" : "Home Feed",
},
"hentaifoundry": {
"story": "",
},
144 changes: 144 additions & 0 deletions test/results/hatenablog.py
@@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

from gallery_dl.extractor import hatenablog


__tests__ = (
{
"#url" : "https://cosmiclatte.hatenablog.com/entry/2020/05/28/003227",
"#category": ("", "hatenablog", "entry"),
"#class" : hatenablog.HatenablogEntryExtractor,
"#count" : 20,
},

{
"#url" : "https://moko0908.hatenablog.jp/entry/2023/12/31/083846",
"#category": ("", "hatenablog", "entry"),
"#class" : hatenablog.HatenablogEntryExtractor,
},

{
"#url" : "https://p-shirokuma.hatenadiary.com/entry/20231227/1703685600",
"#category": ("", "hatenablog", "entry"),
"#class" : hatenablog.HatenablogEntryExtractor,
},

{
"#url" : "https://urakatahero.hateblo.jp/entry/2ndlife",
"#category": ("", "hatenablog", "entry"),
"#class" : hatenablog.HatenablogEntryExtractor,
},

{
"#url" : "hatenablog:https://blog.hyouhon.com/entry/2023/12/22/133549",
"#category": ("", "hatenablog", "entry"),
"#class" : hatenablog.HatenablogEntryExtractor,
},

{
"#url" : "https://cetriolo.hatenablog.com",
"#category": ("", "hatenablog", "home"),
"#class" : hatenablog.HatenablogHomeExtractor,
"#range" : "1-7",
"#count" : 7,
},

{
"#url" : "https://moko0908.hatenablog.jp/",
"#category": ("", "hatenablog", "home"),
"#class" : hatenablog.HatenablogHomeExtractor,
},

{
"#url" : "https://p-shirokuma.hatenadiary.com/",
"#category": ("", "hatenablog", "home"),
"#class" : hatenablog.HatenablogHomeExtractor,
},

{
"#url" : "https://urakatahero.hateblo.jp/",
"#category": ("", "hatenablog", "home"),
"#class" : hatenablog.HatenablogHomeExtractor,
},

{
"#url" : "hatenablog:https://blog.hyouhon.com/",
"#category": ("", "hatenablog", "home"),
"#class" : hatenablog.HatenablogHomeExtractor,
},

{
"#url" : ("https://8saki.hatenablog.com/archive/category/%E3%82%BB%E3"
"%83%AB%E3%83%95%E3%82%B8%E3%82%A7%E3%83%AB%E3%83%8D%E3%82"
"%A4%E3%83%AB"),
"#category": ("", "hatenablog", "archive"),
"#class" : hatenablog.HatenablogArchiveExtractor,
"#range" : "1-30",
"#count" : 30,
},

{
"#url" : "https://moko0908.hatenablog.jp/archive/2023",
"#category": ("", "hatenablog", "archive"),
"#class" : hatenablog.HatenablogArchiveExtractor,
"#count" : 13,
},

{
"#url" : "https://p-shirokuma.hatenadiary.com/archive/2023/01",
"#category": ("", "hatenablog", "archive"),
"#class" : hatenablog.HatenablogArchiveExtractor,
"#count" : 5,
},

{
"#url" : "https://urakatahero.hateblo.jp/archive",
"#category": ("", "hatenablog", "archive"),
"#class" : hatenablog.HatenablogArchiveExtractor,
"#range" : "1-30",
"#count" : 30,
},

{
"#url" : "hatenablog:https://blog.hyouhon.com/archive/2024/01/01",
"#category": ("", "hatenablog", "archive"),
"#class" : hatenablog.HatenablogArchiveExtractor,
},

{
"#url" : "hatenablog:https://blog.hyouhon.com/search?q=a",
"#category": ("", "hatenablog", "search"),
"#class" : hatenablog.HatenablogSearchExtractor,
"#range" : "1-30",
"#count" : 30,
},

{
"#url" : "https://cosmiclatte.hatenablog.com/search?q=a",
"#category": ("", "hatenablog", "search"),
"#class" : hatenablog.HatenablogSearchExtractor,
},

{
"#url" : "https://moko0908.hatenablog.jp/search?q=a",
"#category": ("", "hatenablog", "search"),
"#class" : hatenablog.HatenablogSearchExtractor,
},

{
"#url" : "https://p-shirokuma.hatenadiary.com/search?q=a",
"#category": ("", "hatenablog", "search"),
"#class" : hatenablog.HatenablogSearchExtractor,
},

{
"#url" : "https://urakatahero.hateblo.jp/search?q=a",
"#category": ("", "hatenablog", "search"),
"#class" : hatenablog.HatenablogSearchExtractor,
},

)

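A minimal usage sketch of the feature added by this commit, assuming gallery-dl is installed with this change; extractor.find() is gallery-dl's standard URL-to-extractor lookup, and BLOG/PATH are the placeholder values from the example strings above, not a real blog:

# resolve a HatenaBlog entry URL to the new extractor (no network access needed)
from gallery_dl import extractor

url = "https://BLOG.hatenablog.com/entry/PATH"
extr = extractor.find(url)
print(extr.__class__.__name__)  # HatenablogEntryExtractor

From the command line, the equivalent would simply be: gallery-dl "https://BLOG.hatenablog.com/entry/PATH"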