forked from lchlnd/comp90024-project-2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
harvesterSearch.py
97 lines (80 loc) · 3.14 KB
/
harvesterSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Use the Twitter Search API to find tweets from a specific location.
Raw tweets are stored in the specified CouchDB database.
"""
import logging
import tweepy
import couchdb
class TwitterSearcher():
    """Use the Twitter Search API to find tweets from a specific location.

    Matching tweets are fetched page by page, newest first, and every tweet
    that carries usable location data is saved to the supplied database.
    """

    def __init__(self, api, db, geo, query):
        """Set variables required by the Twitter Search API.

        Args:
            api: Client object exposing ``search(**params)``; ``search()``
                below handles ``tweepy.TweepError`` raised by it.
            db: Database handle exposing ``save(doc)``; ``search()`` below
                handles ``couchdb.http.ResourceConflict`` raised by it.
            geo: Value sent as the ``geocode`` search parameter.
            query: Value sent as the ``q`` search parameter.
        """
        self.api = api
        self.db = db
        self.geo = geo
        self.query = query
        # Number of tweets requested per search call (sent as ``count``).
        self.limit = 100

    def search(self):
        """Search for tweets via the Twitter Search API and store them.

        Pages backwards through the results: after each page, the next
        request is bounded with ``max_id`` set to one less than the id of the
        last tweet received, so the same tweets are not requested again.
        Stops when a page comes back empty or the API raises
        ``tweepy.TweepError`` (which is logged, not re-raised).

        Only tweets with ``coordinates`` or ``place`` set are saved; each is
        stored under ``_id`` equal to its tweet id as a string, and tweets
        already present in the database are skipped.
        """
        # Upper bound (inclusive) on tweet ids for the next page; None means
        # "start from the most recent tweets".
        max_id = None
        # Track number of tweets returned in total.
        tweet_count = 0

        # Pull tweets until error or no more to process.
        while True:
            params = {
                "q": self.query,
                "geocode": self.geo,
                "count": self.limit,
            }
            if max_id is not None:
                # The Search API's paging parameter is ``max_id``; any other
                # name is not applied as a bound and the first page would be
                # returned over and over.
                params["max_id"] = str(max_id)

            try:
                new_tweets = self.api.search(**params)
            except tweepy.TweepError as e:
                # Exit upon error.
                logging.error(str(e))
                break

            # Exit when no new tweets are found.
            if not new_tweets:
                logging.info("No more tweets to read.")
                break

            # Process received tweets.
            for tweet in new_tweets:
                # Only store tweets that have location we can use.
                if not (tweet.coordinates or tweet.place):
                    continue
                jtweet = tweet._json
                # Use the tweet id as the document id so that saving the same
                # tweet twice is reported as a conflict instead of duplicated.
                jtweet['_id'] = str(jtweet['id'])
                try:
                    self.db.save(jtweet)
                except couchdb.http.ResourceConflict:
                    logging.info("Ignored duplicate tweet.")

            # Output current number of tweets.
            tweet_count += len(new_tweets)
            logging.info("Downloaded %d tweets", tweet_count)

            # Next page: everything strictly older than the last tweet seen.
            max_id = new_tweets[-1].id - 1