Skip to content

Commit

Permalink
[instagram] Add support for GraphSidecar media types (#201)
Browse files Browse the repository at this point in the history
* [instagram] Add support for GraphSidecar media types

Refactor _extract_postpage() to always return a list of media items.

Fetch common keywords and gracefully handle the GraphSidecar media type
by extracting each individual media item and adding `sidecar_media_id' and
`sidecar_shortcode' keywords to indicate the parent of the sidecar
children.

While here, join the copyright comment lines into a single one.

Closes #178.

* [instagram] Use `yield from' instead of `for ... yield' (thanks @mikf)!

* [instagram] Adjust filename for GraphSidecar media

For GraphSidecar media, prefix the filename with the `media_id' of the
parent sidecar when it is present.

Thanks to @mikf for the suggestion!

* [instagram] Add extra metadata for youtube-dl in GraphSidecar children

When consumed by youtube-dl, the ytdl: URLs of GraphSidecar children
redirect to the URL of their parent.  For a GraphSidecar with
multiple GraphVideo children, this leads to downloading the same video
multiple times.

Add a `_ytdl_index' field to indicate the index in the youtube-dl
playlist that corresponds to each video child of the sidecar.

This will be used by the `ytdl' downloader.
  • Loading branch information
iamleot authored and mikf committed Mar 24, 2019
1 parent e7d0d98 commit 1e38f65
Showing 1 changed file with 42 additions and 25 deletions.
67 changes: 42 additions & 25 deletions gallery_dl/extractor/instagram.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2018 Leonardo Taccari
# Copyright 2019 Mike Fährmann
# Copyright 2018-2019 Leonardo Taccari, Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
Expand All @@ -19,7 +18,7 @@ class InstagramExtractor(Extractor):
"""Base class for instagram extractors"""
category = "instagram"
directory_fmt = ("{category}", "{username}")
filename_fmt = "{media_id}.{extension}"
filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}"
archive_fmt = "{media_id}"
root = "https://www.instagram.com"

Expand All @@ -32,11 +31,6 @@ def items(self):
if data['typename'] == 'GraphImage':
yield Message.Url, data['display_url'], \
text.nameext_from_url(data['display_url'], data)
elif data['typename'] == 'GraphSidecar':
# TODO: Extract all images in edge_sidecar_to_children
# TODO: instead of just extracting the main one!
yield Message.Url, data['display_url'], \
text.nameext_from_url(data['display_url'], data)
elif data['typename'] == 'GraphVideo':
yield Message.Url, \
'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data
Expand All @@ -50,20 +44,48 @@ def _extract_postpage(self, url):
shared_data = self._extract_shared_data(page)
media = shared_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']

return {
'media_id': media['id'],
'shortcode': media['shortcode'],
'typename': media['__typename'],
'display_url': media['display_url'],
'height': text.parse_int(media['dimensions']['height']),
'width': text.parse_int(media['dimensions']['width']),
common = {
'comments': text.parse_int(media['edge_media_to_comment']['count']),
'likes': text.parse_int(media['edge_media_preview_like']['count']),
'owner_id': media['owner']['id'],
'username': media['owner']['username'],
'fullname': media['owner']['full_name'],
}

medias = []
if media['__typename'] == 'GraphSidecar':
yi = 0
for n in media['edge_sidecar_to_children']['edges']:
children = n['node']
ytdl_metadata = {}
if children['__typename'] == 'GraphVideo':
yi += 1
ytdl_metadata['_ytdl_index'] = yi
medias.append({
'media_id': children['id'],
'shortcode': children['shortcode'],
'typename': children['__typename'],
'display_url': children['display_url'],
'height': text.parse_int(children['dimensions']['height']),
'width': text.parse_int(children['dimensions']['width']),
'sidecar_media_id': media['id'],
'sidecar_shortcode': media['shortcode'],
**common,
**ytdl_metadata,
})
else:
medias.append({
'media_id': media['id'],
'shortcode': media['shortcode'],
'typename': media['__typename'],
'display_url': media['display_url'],
'height': text.parse_int(media['dimensions']['height']),
'width': text.parse_int(media['dimensions']['width']),
**common,
})

return medias

def _extract_profilepage(self, url):
page = self.request(url).text
shared_data = self._extract_shared_data(page)
Expand All @@ -86,7 +108,7 @@ def _extract_profilepage(self, url):

for s in shortcodes:
url = '{}/p/{}/'.format(self.root, s)
yield self._extract_postpage(url)
yield from self._extract_postpage(url)

if not has_next_page:
break
Expand Down Expand Up @@ -133,18 +155,13 @@ class InstagramImageExtractor(InstagramExtractor):

# GraphSidecar
("https://www.instagram.com/p/BoHk1haB5tM/", {
"pattern": r"https://[^/]+\.(cdninstagram\.com|fbcdn\.net)"
r"/vp/[0-9a-f]+/[0-9A-F]+/t51.2885-15/e35"
r"/40758827_2138611023072230_4073975203662780931_n.jpg",
"count": 5,
"keyword": {
"sidecar_media_id": "1875629777499953996",
"sidecar_shortcode": "BoHk1haB5tM",
"comments": int,
"height": int,
"likes": int,
"media_id": "1875629777499953996",
"shortcode": "BoHk1haB5tM",
"typename": "GraphSidecar",
"username": "instagram",
"width": int,
}
}),

Expand All @@ -170,7 +187,7 @@ def __init__(self, match):

def instagrams(self):
url = '{}/p/{}/'.format(self.root, self.shortcode)
return (self._extract_postpage(url),)
return self._extract_postpage(url)


class InstagramUserExtractor(InstagramExtractor):
Expand Down

0 comments on commit 1e38f65

Please sign in to comment.