twitterdataUGC.py
#!/usr/bin/env python3
import tweepy
import spacy
import pandas as pd
import json
# INITIALIZING SPACY AND ITS ENGLISH MODEL
nlp = spacy.load("en_core_web_sm")

# OPENING JSON SENTIMENT DICTIONARY (AFINN-165)
with open('afinn-165.json') as f:
    data = json.load(f)
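# NOTE (assumption): afinn-165.json maps lowercase words to integer valence
# scores from -5 to +5, e.g. {"good": 3, "bad": -3}; the AFINN fallback in
# calculate_sentiments() relies on that shape.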
# CHARACTERS TREATED AS DELIMITERS WHEN CLEANING/TOKENIZING TWEETS
avoid = ['@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '=', '+', '[', ']', '|', '\n', '\t', '<', '>', '/']
# READING GLOSSARY EXCEL
def read_standards():
    sheet = pd.read_excel("Standards.xlsx")
    return sheet

# READING SENTIMENTS EXCEL
def read_sentiments():
    sheet = pd.read_excel("Sentiments.xlsx")
    return sheet

standards_data = read_standards()
sentiments_data = read_sentiments()
glossary = standards_data.values.tolist()

# Build the search keyword list from the second and third columns of the glossary
keywords = []
for item in glossary:
    if item[2] not in keywords:
        keywords.append(item[2])
    if item[1] not in keywords:
        keywords.append(item[1])
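# ASSUMED WORKBOOK LAYOUTS (inferred from the lookups in this script, not verified):
#   Standards.xlsx  -> columns 'standards', 'sub-standard' and 'text', in that order;
#                      select_standards() reads the first two positionally and
#                      matches token lemmas against 'text'.
#   Sentiments.xlsx -> a 'keyword' column with a numeric score in the next column,
#                      read positionally in calculate_sentiments().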
# Authenticate to Twitter
auth = tweepy.OAuthHandler("oGMSF0ZE60EPmq63CnJ5OVy56", "hoqrp3VgWWvzE3OjAT6iCD8upY4vc3UwcswH3Fk5Q4x46xllGK")
auth.set_access_token("847417106-WmqvtW8uv0HSHy0MXMpOjLEesVHoq450uXODLG9E", "ek7td0b383tqC75ZZ3bcuOqzl9ooxOrkpUAdjjHs5qp14")
# Create API object
api = tweepy.API(auth)
# api.update_status(status='Test')
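# NOTE: tweepy 3.x is assumed here; in tweepy 4.x API.search was renamed to
# API.search_tweets, so the Cursor call in read_tweets() would need updating
# on newer versions.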
class reports:
    def __init__(self, name):
        self.company = name
        self.subject = ""        # most recent explicit subject seen (used by track_subjects)
        self.stakeholders = 0    # count of pronoun subjects other than "it"

    # TOKENIZING A SINGLE TWEET
    def tokenify_tweet(self, tweet):
        buff = ''
        sentences = []
        for letter in tweet:
            letter = letter.lower()
            if letter in avoid:
                if buff != '':
                    sentences.append(buff)
                buff = ''
            else:
                buff += letter
        if buff != '':
            sentences.append(buff)
        return sentences
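    # Example (illustrative): tokenify_tweet("Great service @Marico #happy")
    # returns ['great service ', 'marico ', 'happy'] -- everything is lower-cased
    # and only the characters in `avoid` split the text, so spaces are kept.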
    # READING TWEETS
    def read_tweets(self):
        master_list = pd.DataFrame(columns=['standards', 'sub-standard', 'sentence', 'sentiment', 'time', 'retweets'])
        self.stakeholders = 0
        materiality_count = []
        for keyword in keywords:
            # Search for the glossary keyword together with the company name
            search_query = str(keyword) + " " + self.company
            try:
                for tweet in tweepy.Cursor(api.search, q=search_query, lang="en").items(5000000):
                    # Skip retweets so each opinion is only counted once
                    if (not tweet.retweeted) and ('RT @' not in tweet.text):
                        tweet_time = "-"
                        tweet_retweet_count = "-"
                        try:
                            tweet_time = tweet.created_at
                            tweet_retweet_count = tweet.retweet_count
                        except Exception:
                            pass
                        text = str(tweet.text.encode('utf-8'))
                        sentences = self.tokenify_tweet(text)
                        for sentence in sentences:
                            doc = nlp(sentence)
                            try:
                                # The root is the token whose head is itself
                                root = [token for token in doc if token.head == token][0]
                            except IndexError:
                                continue
                            root = [root]
                            tree = []
                            materiality = self.select_standards(doc)
                            if materiality:
                                tree = self.create_tree(root, doc, tree)
                                sentiment = self.calculate_sentiments(tree)
                                for sub_standard in materiality:
                                    data_point = [sub_standard[0], sub_standard[1], sentence,
                                                  sentiment, tweet_time, tweet_retweet_count]
                                    master_list.loc[len(master_list)] = data_point
                                    if sub_standard[1] not in materiality_count:
                                        materiality_count.append(sub_standard[1])
            except Exception:
                # On API errors or rate limits, move on to the next keyword
                continue
        print(self.stakeholders)
        print(materiality_count)
        return master_list
    # BUILD A FLAT LIST OF CONTENT TOKENS BY WALKING THE DEPENDENCY TREE
    def create_tree(self, temp_head, doc, TREE):
        for sub_head in temp_head:
            self.track_subjects(sub_head)
            if not self.ignore_fillers(sub_head):
                TREE.append(sub_head)
            sub_tree = [child for child in sub_head.children]
            if sub_tree:
                self.create_tree(sub_tree, doc, TREE)
        return TREE
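    # Illustrative only: for a sentence like "the service was not good" the walk
    # might produce roughly [was, service, not, good] (with "the" dropped as a
    # determiner), though the exact contents depend on spaCy's parse.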
    # TRACK SUBJECTS AND PRONOUNS
    def track_subjects(self, sub_head):
        if sub_head.dep_ == "nsubj" or sub_head.dep_ == "csubj":
            # Remember the most recent explicit subject
            self.subject = sub_head.text
        elif sub_head.pos_ == "PRON" and sub_head.text.lower() == "it":
            # spaCy tokens are immutable, so the pronoun cannot be rewritten in
            # place; the most recent explicit subject stays in self.subject
            pass
        elif sub_head.pos_ == "PRON" and sub_head.text.lower() != "it":
            self.stakeholders = self.stakeholders + 1
    # IGNORE AUXILIARIES, DETERMINERS, PUNCTUATION AND PREPOSITIONS
    def ignore_fillers(self, sub_head):
        if sub_head.dep_ == "aux" or sub_head.pos_ == "DET" or sub_head.pos_ == "PUNCT" or sub_head.dep_ == "preconj" or sub_head.dep_ == "prep":
            return 1
        else:
            return 0
    # IDENTIFY MATERIALITY
    def select_standards(self, doc):
        materiality = []
        for token in doc:
            # First try to match the raw token text against the 'sub-standard' column
            found = standards_data.loc[standards_data['sub-standard'] == token.text].head(1).values.tolist()
            if found:
                sub_standard = [found[0][0], found[0][1]]
                materiality.append(sub_standard)
                break
            else:
                # Fall back to matching the token lemma against the 'text' column
                found = standards_data.loc[standards_data['text'] == token.lemma_].head(1).values.tolist()
                if found:
                    sub_standard = [found[0][0], found[0][1]]
                    materiality.append(sub_standard)
                else:
                    continue
        if not materiality:
            return 0
        return materiality
    # CALCULATE SENTIMENTS
    def calculate_sentiments(self, TREE):
        count_descriptive_words = 0
        sentiment = 0
        for item in reversed(TREE):
            item_sentiment = 0
            try:
                # Prefer the custom Sentiments.xlsx score, then fall back to AFINN
                found = sentiments_data.loc[sentiments_data["keyword"] == item.lemma_].head(1).values.tolist()
                if found:
                    item_sentiment = item_sentiment + found[0][1]
                    count_descriptive_words = count_descriptive_words + 1
                elif item.lemma_ in data:
                    item_sentiment = item_sentiment + data[item.lemma_]
                    count_descriptive_words = count_descriptive_words + 1
            except TypeError:
                pass
            if not item_sentiment:
                item_sentiment = 1
            if str(item) == "not" or str(item) == "nor":
                # Negations flip the sign of the running total
                sentiment = item_sentiment + (-1) * sentiment
            else:
                sentiment = item_sentiment + sentiment
        if count_descriptive_words:
            sentiment = sentiment / count_descriptive_words
        return sentiment
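    # Worked example (hypothetical, assuming "service" and "was" are in neither
    # lexicon): for a tree [was, service, not, good] processed in reverse, "good"
    # scores +3 from AFINN, "not" flips the running total to 1 + (-3) = -2,
    # "service" and "was" default to +1 each giving 0, and dividing by the single
    # scored word leaves a sentiment of 0.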
    # CREATE DATABASE
    def create_database(self, df_master_list):
        # Write the collected data points to an Excel workbook named after the company
        with pd.ExcelWriter(self.company + "tweets.xlsx") as writer:
            df_master_list.to_excel(writer)


ITC = reports("Marico")
ITC.create_database(ITC.read_tweets())
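# Possible follow-up (hypothetical, not part of this script): load the generated
# workbook and average the sentiment per sub-standard, e.g.
#   df = pd.read_excel("Maricotweets.xlsx")
#   print(df.groupby('sub-standard')['sentiment'].mean())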