Skip to content

Commit

Permalink
Add search downloading to twitter.py (#448)
Browse files Browse the repository at this point in the history
Adds the ability to download search results from twitter.com/search. Since Twitter only allows downloading up to 3,200 of a user's most recent tweets, you are unable to download old images from users with a lot of tweets. To work around this, you can use the Twitter search to get the tweets from the time ranges you could not reach. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because, for search results, it would not get the next page when using the last "data-tweet-id" as the position. The API would return the same JSON again, but with a "min_position" string added. Using this string as the "max_position" param from the second page onwards correctly returns the next pages. As far as I know, this change does not interfere with how the other extractors work. The two regex patterns in the existing extractors had to be changed so that they do not match the search URL.
  • Loading branch information
alice945 authored and mikf committed Oct 16, 2019
1 parent 1693d97 commit bcddcca
Showing 1 changed file with 22 additions and 6 deletions.
28 changes: 22 additions & 6 deletions gallery_dl/extractor/twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,18 +140,23 @@ def _tweets_from_api(self, url):
if not data["has_more_items"]:
return

position = text.parse_int(text.extract(
tweet, 'data-tweet-id="', '"')[0])
if "max_position" in params and position >= params["max_position"]:
return
if "min_position" in data:
position = data["min_position"]
if "max_position" in params and position == params["max_position"]:
return
else:
position = text.parse_int(text.extract(
tweet, 'data-tweet-id="', '"')[0])
if "max_position" in params and position >= params["max_position"]:
return
params["max_position"] = position


class TwitterTimelineExtractor(TwitterExtractor):
"""Extractor for all images from a user's timeline"""
subcategory = "timeline"
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
r"/([^/?&#]+)/?(?:$|[?#])")
r"/((?!search)[^/?&#]+)/?(?:$|[?#])")
test = (
("https://twitter.com/supernaturepics", {
"range": "1-40",
Expand All @@ -171,7 +176,7 @@ class TwitterMediaExtractor(TwitterExtractor):
"""Extractor for all images from a user's Media Tweets"""
subcategory = "media"
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
r"/([^/?&#]+)/media(?!\w)")
r"/((?!search)[^/?&#]+)/media(?!\w)")
test = (
("https://twitter.com/supernaturepics/media", {
"range": "1-40",
Expand All @@ -185,6 +190,17 @@ def tweets(self):
self.root, self.user)
return self._tweets_from_api(url)

class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for all images from a search timeline"""
    subcategory = "search"
    # Capture the value of the 'q' query parameter.
    # 'q' has to directly follow '?' or '&', so it is found regardless of
    # its position in the query string and regardless of the content of
    # other parameters (e.g. 'src=typed_query'). The captured value ends
    # at the next '&', at a '#' fragment, or at the end of the URL.
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/search/?\?(?:[^&#]+&)*q=([^&#]+)")
    test = ()

    def tweets(self):
        """Request the search timeline via _tweets_from_api()"""
        # NOTE(review): 'self.user' presumably holds the first pattern
        # group, i.e. the (still URL-encoded) search query -- confirm
        # against TwitterExtractor.__init__
        url = "{}/i/search/timeline?f=tweets&q={}".format(
            self.root, self.user)
        return self._tweets_from_api(url)

class TwitterTweetExtractor(TwitterExtractor):
"""Extractor for images from individual tweets"""
Expand Down

0 comments on commit bcddcca

Please sign in to comment.