Skip to content

Commit

Permalink
sentiment roberta
Browse files Browse the repository at this point in the history
  • Loading branch information
lmBored committed May 27, 2024
1 parent 5c5c835 commit 971b5f9
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 33 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ users_dataset.csv
config/.DS_Store
.vscode/
preprocess/__pycache__/
Untitled0.ipynb
Untitled0.ipynb
word_counts.csv
70 changes: 38 additions & 32 deletions preprocess/preprocessor.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,48 @@
import re
import logging
# import time
# from googletrans import Translator

airlines_dict = {"KLM": 56377143 ,
"AirFrance": 106062176 ,
"British_Airways": 18332190 ,
"AmericanAir": 22536055 ,
"Lufthansa": 124476322 ,
"AirBerlin": 26223583 ,
"AirBerlin assist": 2182373406 ,
"easyJet": 38676903 ,
"RyanAir": 1542862735 ,
"SingaporeAir": 253340062 ,
"Qantas": 218730857 ,
"EtihadAirways": 45621423 ,
"VirginAtlantic": 20626359
"AirFrance": 106062176 ,
"British_Airways": 18332190 ,
"AmericanAir": 22536055 ,
"Lufthansa": 124476322 ,
"AirBerlin": 26223583 ,
"AirBerlin assist": 2182373406 ,
"easyJet": 38676903 ,
"RyanAir": 1542862735 ,
"SingaporeAir": 253340062 ,
"Qantas": 218730857 ,
"EtihadAirways": 45621423 ,
"VirginAtlantic": 20626359
}

tweets_keys = ['id',
'text',
'in_reply_to_status_id',
'coordinates',
'timestamp_ms',
'quoted_status_id']
'text',
'in_reply_to_status_id',
'coordinates',
'timestamp_ms',
'quoted_status_id']

users_keys = ['id',
'verified',
'followers_count',
'statuses_count']
'verified',
'followers_count',
'statuses_count']

airlines_list_dict = {"KLM": ['klm'],
"AirFrance": ['airfrance', 'air france'],
"British_Airways": ['british_airways', 'british airways'],
"AmericanAir": ['americanair', 'american airlines', 'american air'],
"Lufthansa": ['lufthansa'],
"AirBerlin": ['airberlin', 'air berlin'],
"AirBerlin assist": ['airberlin assist', 'air berlin assist', 'airberlinassist'],
"easyJet": ['easyjet', 'easy jet'],
"RyanAir": ['ryanair', 'ryan air'],
"SingaporeAir": ['singaporeair', 'singapore airlines', 'singapore air'],
"Qantas": ['qantas'],
"EtihadAirways": ['etihad airways', 'etihadairways', 'etihad'],
"VirginAtlantic": ['virgin atlantic', 'virginatlantic']}
"AirFrance": ['airfrance', 'air france'],
"British_Airways": ['british_airways', 'british airways'],
"AmericanAir": ['americanair', 'american airlines', 'american air'],
"Lufthansa": ['lufthansa'],
"AirBerlin": ['airberlin', 'air berlin'],
"AirBerlin assist": ['airberlin assist', 'air berlin assist', 'airberlinassist'],
"easyJet": ['easyjet', 'easy jet'],
"RyanAir": ['ryanair', 'ryan air'],
"SingaporeAir": ['singaporeair', 'singapore airlines', 'singapore air'],
"Qantas": ['qantas'],
"EtihadAirways": ['etihad airways', 'etihadairways', 'etihad'],
"VirginAtlantic": ['virgin atlantic', 'virginatlantic']}

languages_list = ['en', 'de', 'es', 'fr', 'in', 'nl', 'it', 'pt']

Expand Down Expand Up @@ -107,6 +108,7 @@ def preprocess_users_in_retweeted_status(tweet):
def preprocessor_tweets(tweet):
try:
if 'delete' not in tweet:
# start_time = time.time()
# Get the text from the tweet
text = tweet['text']
if 'retweeted_status' in tweet:
Expand Down Expand Up @@ -158,6 +160,8 @@ def preprocessor_tweets(tweet):
if tweet.get('entities') and tweet['entities'].get('user_mentions'): # Check if 'entities' and 'user_mentions' exist and are not None
mentioned_id = [i['id'] for i in tweet['entities']['user_mentions']] # Get the IDs of mentioned users

# score = sentiment_score.roberta(text) # Get the sentiment score of the tweet

# Initialize a dictionary to store extended tweet information
extended_tweets = {'text':text, 'language':lang, 'mentioned_airlines':airlines_mentioned, 'user_mentions':mentioned_id}
tweets_info.update(extended_tweets) # Update the tweet information dictionary with extended tweet information
Expand Down Expand Up @@ -228,6 +232,8 @@ def preprocessor_tweets(tweet):
if tweets_info[i] == None:
tweets_info[i] = 'NULL' # Set nullable values to 'NULL'

# print(f"Preprocessor time taken: {time.time() - start_time}")

return tweets_info # Return the processed tweet information

except Exception as e:
Expand Down
26 changes: 26 additions & 0 deletions remnant/words_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import csv
import re
from collections import Counter
import mysql.connector
from config import config

# Count word frequencies over all stored tweets and dump them to word_counts.csv.
# NOTE(review): assumes the tweet text is column index 1 of the `tweets` table — verify schema.
connection = mysql.connector.connect(host=config.get('HOST'), user=config.get('USERNAME'), password=config.get('PASSWORD'), database=config.get('DATABASE'), allow_local_infile=True)
try:
    cursor = connection.cursor()
    try:
        cursor.execute("SELECT * FROM tweets ORDER BY timestamp_ms DESC")
        tweets = cursor.fetchall()
    finally:
        cursor.close()  # fix: cursor was never closed (resource leak)
finally:
    connection.close()  # fix: connection was never closed (resource leak)

word_count = Counter()
word_pattern = re.compile(r'\b\w+\b')  # hoisted: compile once, not per tweet

# Iterate through the tweets and update the word count
for tweet in tweets:
    text = tweet[1]
    # Lowercase and split on word boundaries to get words (drops punctuation)
    word_count.update(word_pattern.findall(text.lower()))

# Write the word counts to a CSV file
with open('word_counts.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Word', 'Count'])
    for word, count in word_count.items():
        csvwriter.writerow([word, count])
38 changes: 38 additions & 0 deletions sentiment/sentiment_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from scipy.special import softmax
import re
# import time

# Load the Twitter-tuned RoBERTa sentiment model once at import time;
# transformers downloads/caches the weights on first use.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

def transform_text(text):
    """Normalize a tweet before tokenization.

    Replaces @-mentions with the generic '@user' token, collapses runs of
    three or more repeated letters, and strips remaining special characters.
    """
    # Fix: replace mentions FIRST. The original ran this substitution after
    # stripping every '@' character, so the pattern could never match.
    text = re.sub(r'@\S+', '@user', text)
    text = re.sub(r'([A-Za-z])\1{2,}', r'\1', text)  # e.g. 'soooo' -> 'so'
    # Keep '@' in the allowed set so the '@user' placeholder survives cleanup.
    text = re.sub(r'[^A-Za-z@ ]', '', text)
    return text

def roberta(text):
    """Score *text* with the Twitter-RoBERTa sentiment model.

    Returns P(positive) - P(negative) as a float in [-1, 1]; the neutral
    class contributes nothing to the score.
    """
    cleaned = transform_text(text)
    encoded = tokenizer(cleaned, return_tensors='pt')
    with torch.no_grad():
        logits = model(**encoded).logits
    # Label order is [negative, neutral, positive].
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    return probs[2].item() - probs[0].item()

0 comments on commit 971b5f9

Please sign in to comment.