[poipiku] add 'user' and 'post' extractors (#1602)

mikf · Jun 20, 2022 · 27e8078 · 27e8078
1 parent 1aae9f2
commit 27e8078
Show file tree

Hide file tree

Showing 3 changed files with 174 additions and 0 deletions.
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -631,6 +631,12 @@ Consider all sites to be NSFW unless otherwise known.
     <td>Posts, Timelines</td>
     <td></td>
 </tr>
+<tr>
+    <td>Poipiku</td>
+    <td>https://poipiku.com/</td>
+    <td>Posts, User Profiles</td>
+    <td></td>
+</tr>
 <tr>
     <td>Porn Image</td>
     <td>https://porn-images-xxx.com/</td>

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -107,6 +107,7 @@
     "pixiv",
     "pixnet",
     "plurk",
+    "poipiku",
     "pornhub",
     "pururin",
     "reactor",

diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://poipiku.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+# URL prefix shared by the extractor patterns below; the scheme is optional
+BASE_PATTERN = r"(?:https?://)?poipiku\.com"
+
+
+class PoipikuExtractor(Extractor):
+    """Base class for poipiku extractors
+
+    Subclasses implement posts() and return post URLs or paths relative
+    to 'root'. items() loads each post page and emits one Directory
+    message per post, followed by one Url message per image.
+    """
+    category = "poipiku"
+    root = "https://poipiku.com"
+    directory_fmt = ("{category}", "{user_id} {user_name}")
+    filename_fmt = "{post_id}_{num}.{extension}"
+    archive_fmt = "{post_id}_{num}"
+    request_interval = (0.5, 1.5)
+
+    def items(self):
+        """Yield Directory and Url messages for all posts from posts()"""
+        for post_url in self.posts():
+            segments = post_url.split("/")
+            if post_url[0] == "/":
+                post_url = self.root + post_url
+            extr = text.extract_from(self.request(post_url).text)
+
+            # extr() is called again and again on the same page further
+            # down, so it is stateful; keep these lookups in this order
+            post_category = extr("<title>[", "]")
+            count = extr("(", " ")
+            user_name = extr('<h2 class="UserInfoUserName">', '</')
+            description = extr('class="IllustItemDesc" >', '<')
+
+            post = {
+                "post_category": post_category,
+                "count": count,
+                "post_id": segments[-1].partition(".")[0],
+                "user_id": segments[-2],
+                "user_name": text.unescape(user_name.rpartition(">")[2]),
+                "description": text.unescape(description),
+            }
+
+            yield Message.Directory, post
+            post["num"] = 0
+
+            # images embedded directly in the post page
+            while True:
+                thumb = extr('class="IllustItemThumbImg" src="', '"')
+                if not thumb:
+                    break
+                # "/img/..." sources are skipped; presumably these are
+                # site placeholder graphics, not uploads (unverified)
+                if not thumb.startswith("/img/"):
+                    yield self._file_message(post, thumb)
+
+            # a "show all" marker means the remaining images have to be
+            # requested separately
+            if extr('</i> show all', '<'):
+                yield from self._appended_files(post, post_url)
+
+    def _appended_files(self, post, referer):
+        """Yield Url messages for the images returned by the
+        ShowAppendFileF.jsp endpoint for 'post'"""
+        endpoint = self.root + "/f/ShowAppendFileF.jsp"
+        headers = {
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "X-Requested-With": "XMLHttpRequest",
+            "Origin": self.root,
+            "Referer": referer,
+        }
+        form = {
+            "UID": post["user_id"],
+            "IID": post["post_id"],
+            "PAS": "",
+            "MD": "0",
+            "TWF": "-1",
+        }
+        response = self.request(
+            endpoint, method="POST", headers=headers, data=form)
+        html = response.json()["html"]
+
+        for thumb in text.extract_iter(
+                html, 'class="IllustItemThumbImg" src="', '"'):
+            yield self._file_message(post, thumb)
+
+    @staticmethod
+    def _file_message(post, thumb):
+        """Count one more file in 'post' and build its Url message"""
+        post["num"] += 1
+        # drop the last 8 characters of the thumbnail address
+        # (presumably a size suffix such as "_640.jpg" - unverified)
+        url = text.ensure_http_scheme(thumb[:-8])
+        return Message.Url, url, text.nameext_from_url(url, post)
+
+
+class PoipikuUserExtractor(PoipikuExtractor):
+    """Extractor for all posts of a poipiku user
+
+    Matches profile URLs (/<user id>/) as well as IllustListPcV.jsp
+    listing URLs; the PG value of the latter is used as the first page.
+    """
+    subcategory = "user"
+    pattern = (BASE_PATTERN + r"/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?"
+               r"(\d+)/?(?:$|[?&#])")
+    test = (
+        ("https://poipiku.com/25049/", {
+            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+                       r"/\d+_\w+\.(jpe?g|png)$",
+            "range": "1-10",
+            "count": 10,
+        }),
+        ("https://poipiku.com/IllustListPcV.jsp?PG=1&ID=25049&KWD="),
+    )
+
+    def __init__(self, match):
+        PoipikuExtractor.__init__(self, match)
+        # per 'pattern': optional PG number first, numeric user ID second
+        start_page, user_id = match.groups()
+        self._page = start_page
+        self.user_id = user_id
+
+    def posts(self):
+        """Yield the link of every post found on the listing pages"""
+        list_url = self.root + "/IllustListPcV.jsp"
+        query = {
+            "PG": text.parse_int(self._page, 0),
+            "ID": self.user_id,
+            "KWD": "",
+        }
+
+        while True:
+            html = self.request(list_url, params=query).text
+
+            found = 0
+            for found, path in enumerate(text.extract_iter(
+                    html, 'class="IllustInfo" href="', '"'), 1):
+                yield path
+
+            # fewer than 48 links is treated as the final page
+            # (48 is presumably the site's page size - unverified)
+            if found < 48:
+                return
+            query["PG"] += 1
+
+
+class PoipikuPostExtractor(PoipikuExtractor):
+    """Extractor for a single poipiku post
+
+    Matches /<user id>/<post id> URLs and hands exactly that one post
+    to the shared items() implementation.
+    """
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
+    test = (
+        ("https://poipiku.com/25049/5864576.html", {
+            "pattern": r"https://img\.poipiku\.com/user_img03/000025049"
+                       r"/005864576_EWN1Y65gQ\.png$",
+            "keyword": {
+                "count": "1",
+                "description": "",
+                "extension": "png",
+                "filename": "005864576_EWN1Y65gQ",
+                "num": 1,
+                "post_category": "DOODLE",
+                "post_id": "5864576",
+                "user_id": "25049",
+                "user_name": "ユキウサギ",
+            },
+        }),
+        ("https://poipiku.com/2166245/6411749.html", {
+            "pattern": r"https://img\.poipiku\.com/user_img01/002166245"
+                       r"/006411749_\w+\.jpeg$",
+            "count": 4,
+            "keyword": {
+                "count": "4",
+                "description": "絵茶の産物ネタバレあるやつ",
+                "num": int,
+                "post_category": "SPOILER",
+                "post_id": "6411749",
+                "user_id": "2166245",
+                "user_name": "wadahito",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        PoipikuExtractor.__init__(self, match)
+        # per 'pattern': user ID first, post ID second
+        user_id, post_id = match.groups()
+        self.user_id = user_id
+        self.post_id = post_id
+
+    def posts(self):
+        """Return the post's root-relative path as a one-element tuple"""
+        path = "/{}/{}.html".format(self.user_id, self.post_id)
+        return (path,)