[deviantart] improvements (#392)

- consistent 'filename' entries, at least as far as possible - GIFs and SWFs don't have a <title>_by_<artist>_<id> anywhere in their metadata - Generating <id> (from 'deviationid'?) might be something that needs to be figured out, so we can build those filenames ourselves - better code structure etc. - tests for videos, archives, and flash animations
mikf · Aug 23, 2019 · 63daa68 · 63daa68
1 parent d1db518
commit 63daa68
Showing 1 changed file with 114 additions and 82 deletions.
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
@@ -140,8 +140,13 @@ def prepare(self, deviation):
     @staticmethod
     def commit(deviation, target):
         url = target["src"]
-        deviation["target"] = text.nameext_from_url(url, target.copy())
-        deviation["extension"] = deviation["target"]["extension"]
+        thumb = deviation["thumbs"][0]["src"] if "thumbs" in deviation else url
+        target = text.nameext_from_url(thumb, target.copy())
+        if target["filename"].endswith("-150"):
+            target["filename"] = target["filename"][:-4]
+        deviation["target"] = target
+        deviation["filename"] = target["filename"]
+        deviation["extension"] = target["extension"] = text.ext_from_url(url)
         return Message.Url, url, deviation
 
     def _commit_journal_html(self, deviation, journal):
@@ -570,126 +575,152 @@ def items(self):
         }
 
         yield Message.Version, 1
-        for params["deviationid"], params["username"], params["type"] \
-                in self.deviations():
+        for deviation in self.deviations():
+            params["deviationid"] = deviation["deviationId"]
+            params["username"] = deviation["author"]["username"]
+            params["type"] = "journal" if deviation["isJournal"] else "art"
             data = self.request(url, params=params, headers=headers).json()
+
             if "deviation" not in data:
                 self.log.warning("Skipping %s", params["deviationid"])
                 continue
-            deviation = data["deviation"]
-            extended = deviation["extended"]
-            del deviation["extended"]
-
-            deviation["stats"] = extended["stats"]
-            deviation["stats"]["comments"] = data["comments"]["total"]
-            deviation["description"] = extended.get("description", "")
-            deviation["tags"] = [t["name"] for t in extended.get("tags") or ()]
-            deviation["index"] = deviation["deviationId"]
-            deviation["username"] = params["username"].lower()
-            deviation["date"] = text.parse_datetime(
-                deviation["publishedTime"])
-            deviation["published_time"] = int(deviation["date"].timestamp())
-
-            deviation["category_path"] = "/".join(
-                extended[key]["displayNameEn"]
-                for key in ("typeFacet", "contentFacet", "categoryFacet")
-                if key in extended
-            )
-
-            lfile = deviation["files"][-1]
-            if lfile["type"] == "gif":
-                deviation["target"] = lfile
-
-            elif lfile["type"] == "video":
-                # select largest video
-                deviation["target"] = max(
-                    deviation["files"],
-                    key=lambda x: text.parse_int(x.get("quality", "")[:-1])
-                )
-
-            elif lfile["type"] == "flash":
-                if lfile["src"].startswith("https://sandbox.deviantart.com"):
-                    # extract SWF file from "sandbox"
-                    lfile["src"] = text.extract(
-                        self.request(lfile["src"]).text,
-                        'id="sandboxembed" src="', '"',
-                    )[0]
-                deviation["target"] = lfile
-
-            elif "download" in extended:
-                deviation["target"] = extended["download"]
-                deviation["target"]["src"] = extended["download"]["url"]
+            deviation = self._extract(data)
 
-            else:
-                deviation["target"] = lfile
-
-            src = deviation["target"]["src"]
-            if src.startswith("https://images-wixmp-"):
-                if deviation["index"] <= 790677560:
-                    # https://github.com/r888888888/danbooru/issues/4069
-                    src = re.sub(
-                        r"(/f/[^/]+/[^/]+)/v\d+/.*",
-                        r"/intermediary\1", src)
-                if self.quality:
-                    src = re.sub(
-                        r"q_\d+", self.quality, src)
-                deviation["target"]["src"] = src
-
-            text.nameext_from_url(src, deviation)
             yield Message.Directory, deviation
-            yield Message.Url, src, deviation
-
+            yield Message.Url, deviation["target"]["src"], deviation
             if self.extra:
                 for match in DeviantartStashExtractor.pattern.finditer(
                         deviation["description"]):
                     deviation["_extractor"] = DeviantartStashExtractor
                     yield Message.Queue, match.group(0), deviation
 
+    def _extract(self, data):
+        deviation = data["deviation"]
+        extended = deviation["extended"]
+        files = deviation["files"]
+        del deviation["extended"]
+        del deviation["files"]
+
+        # prepare deviation metadata
+        deviation["description"] = extended.get("description", "")
+        deviation["username"] = self.user.lower()
+        deviation["stats"] = extended["stats"]
+        deviation["stats"]["comments"] = data["comments"]["total"]
+        deviation["index"] = deviation["deviationId"]
+        deviation["tags"] = [t["name"] for t in extended.get("tags") or ()]
+        deviation["date"] = text.parse_datetime(
+            deviation["publishedTime"])
+        deviation["category_path"] = "/".join(
+            extended[key]["displayNameEn"]
+            for key in ("typeFacet", "contentFacet", "categoryFacet")
+            if key in extended
+        )
+
+        # extract download target
+        target = files[-1]
+        name = files[0]["src"]
+
+        if target["type"] == "gif":
+            pass
+        elif target["type"] == "video":
+            # select largest video
+            target = max(
+                files, key=lambda x: text.parse_int(x.get("quality", "")[:-1]))
+            name = target["src"]
+        elif target["type"] == "flash":
+            if target["src"].startswith("https://sandbox.deviantart.com"):
+                # extract SWF file from "sandbox"
+                target["src"] = text.extract(
+                    self.request(target["src"]).text,
+                    'id="sandboxembed" src="', '"',
+                )[0]
+        elif "download" in extended:
+            target = extended["download"]
+            target["src"] = target["url"]
+            del target["url"]
+
+        # url rewrites
+        if target["src"].startswith("https://images-wixmp-"):
+            if deviation["index"] <= 790677560:
+                # https://github.com/r888888888/danbooru/issues/4069
+                target["src"] = re.sub(
+                    r"(/f/[^/]+/[^/]+)/v\d+/.*",
+                    r"/intermediary\1", target["src"])
+            if self.quality:
+                target["src"] = re.sub(
+                    r"q_\d+", self.quality, target["src"])
+
+        text.nameext_from_url(name, target)
+        if target["filename"].endswith("-150"):
+            target["filename"] = target["filename"][:-4]
+        deviation["target"] = target
+        deviation["filename"] = target["filename"]
+        deviation["extension"] = target["extension"] = (
+            text.ext_from_url(target["src"]))
+        return deviation
+
 
 class DeviantartDeviationExtractor(DeviantartExtractorV2):
     """Extractor for single deviations"""
     subcategory = "deviation"
     archive_fmt = "{index}.{extension}"
     pattern = BASE_PATTERN + r"/(art|journal)/(?:[^/?&#]+-)?(\d+)"
     test = (
-        (("https://www.deviantart.com/shimoda7/art/"
-          "For-the-sake-of-a-memory-10073852"), {
+        (("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), {
             "options": (("original", 0),),
             "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
         }),
         ("https://www.deviantart.com/zzz/art/zzz-1234567890", {
             "count": 0,
         }),
-        (("https://www.deviantart.com/myria-moon/art/"
-          "Aime-Moi-part-en-vadrouille-261986576"), {
+        (("https://www.deviantart.com/myria-moon/art/Aime-Moi-261986576"), {
             "pattern": (r"https://www.deviantart.com/download/261986576"
                         r"/[\w-]+\.jpg\?token=\w+&ts=\d+"),
         }),
         # wixmp URL rewrite
-        (("https://www.deviantart.com/citizenfresh/art/"
-          "Hverarond-14-the-beauty-of-the-earth-789295466"), {
+        (("https://www.deviantart.com/citizenfresh/art/Hverarond-789295466"), {
             "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
                         r"/intermediary/f/[^/]+/[^.]+\.jpg$")
         }),
         # wixmp URL rewrite v2 (#369)
-        (("https://www.deviantart.com/josephbiwald/art/"
-          "Destiny-2-Warmind-Secondary-Keyart-804940104"), {
+        (("https://www.deviantart.com/josephbiwald/art/Destiny-2-804940104"), {
             "pattern": r"https://images-wixmp-\w+\.wixmp\.com/.*,q_100,"
         }),
         # non-download URL for GIFs (#242)
-        (("https://www.deviantart.com/skatergators/art/"
-          "COM-Monique-Model-781571783"), {
+        (("https://www.deviantart.com/skatergators/art/COM-Moni-781571783"), {
             "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
                         r"/f/[^/]+/[^.]+\.gif\?token="),
         }),
         # external URLs from description (#302)
-        (("https://www.deviantart.com/uotapo/art/"
-          "INANAKI-Memorial-Humane7-590297498"), {
+        (("https://www.deviantart.com/uotapo/art/INANAKI-Memo-590297498"), {
             "options": (("extra", 1), ("original", 0)),
             "pattern": r"https?://sta\.sh/\w+$",
             "range": "2-",
             "count": 4,
         }),
+        # video
+        ("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", {
+            "url": "3b6e6e761d2d393fa61a4dc3ed6e7db51b14d07b",
+            "keyword": {
+                "target": {
+                    "duration": 306,
+                    "extension": "mp4",
+                    "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5",
+                    "filesize": 9963639,
+                    "quality": "1080p",
+                    "src": str,
+                    "type": "video",
+                },
+            }
+        }),
+        # archive
+        ("https://www.deviantart.com/itsvenue/art/-brush-pngs-14-763300948", {
+            "pattern": r"https://.+deviantart.com/download/763300948/.*\.rar",
+        }),
+        # swf
+        ("https://www.deviantart.com/ikatxfruti/art/Bang-Bang-528130222", {
+            "pattern": r"https://images-wixmp-.*wixmp.com/f/.*\.swf",
+        }),
         # old-style URLs
         ("https://shimoda7.deviantart.com"
          "/art/For-the-sake-of-a-memory-10073852"),
@@ -706,7 +737,11 @@ def __init__(self, match):
         self.deviation_id = match.group(4)
 
     def deviations(self):
-        return ((self.deviation_id, self.user, self.type),)
+        return ({
+            "deviationId": self.deviation_id,
+            "author"     : {"username": self.user},
+            "isJournal"  : self.type == "journal",
+        },)
 
 
 class DeviantartScrapsExtractor(DeviantartExtractorV2):
@@ -739,10 +774,7 @@ def deviations(self):
             data = self.request(url, params=params, headers=headers).json()
 
             for obj in data["results"]:
-                deviation = obj["deviation"]
-                user = deviation["author"]["username"]
-                type = "journal" if deviation["isJournal"] else "art"
-                yield deviation["deviationId"], user, type
+                yield obj["deviation"]
 
             if not data["hasMore"]:
                 return