Skip to content

Commit

Permalink
sentiment roberta
Browse files Browse the repository at this point in the history
  • Loading branch information
lmBored committed May 27, 2024
1 parent 5c5c835 commit 971b5f9
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 33 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ users_dataset.csv
config/.DS_Store
.vscode/
preprocess/__pycache__/
Untitled0.ipynb
Untitled0.ipynb
word_counts.csv
70 changes: 38 additions & 32 deletions preprocess/preprocessor.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,48 @@
import re
import logging
# import time
# from googletrans import Translator

airlines_dict = {"KLM": 56377143 ,
"AirFrance": 106062176 ,
"British_Airways": 18332190 ,
"AmericanAir": 22536055 ,
"Lufthansa": 124476322 ,
"AirBerlin": 26223583 ,
"AirBerlin assist": 2182373406 ,
"easyJet": 38676903 ,
"RyanAir": 1542862735 ,
"SingaporeAir": 253340062 ,
"Qantas": 218730857 ,
"EtihadAirways": 45621423 ,
"VirginAtlantic": 20626359
"AirFrance": 106062176 ,
"British_Airways": 18332190 ,
"AmericanAir": 22536055 ,
"Lufthansa": 124476322 ,
"AirBerlin": 26223583 ,
"AirBerlin assist": 2182373406 ,
"easyJet": 38676903 ,
"RyanAir": 1542862735 ,
"SingaporeAir": 253340062 ,
"Qantas": 218730857 ,
"EtihadAirways": 45621423 ,
"VirginAtlantic": 20626359
}

tweets_keys = ['id',
'text',
'in_reply_to_status_id',
'coordinates',
'timestamp_ms',
'quoted_status_id']
'text',
'in_reply_to_status_id',
'coordinates',
'timestamp_ms',
'quoted_status_id']

users_keys = ['id',
'verified',
'followers_count',
'statuses_count']
'verified',
'followers_count',
'statuses_count']

airlines_list_dict = {"KLM": ['klm'],
"AirFrance": ['airfrance', 'air france'],
"British_Airways": ['british_airways', 'british airways'],
"AmericanAir": ['americanair', 'american airlines', 'american air'],
"Lufthansa": ['lufthansa'],
"AirBerlin": ['airberlin', 'air berlin'],
"AirBerlin assist": ['airberlin assist', 'air berlin assist', 'airberlinassist'],
"easyJet": ['easyjet', 'easy jet'],
"RyanAir": ['ryanair', 'ryan air'],
"SingaporeAir": ['singaporeair', 'singapore airlines', 'singapore air'],
"Qantas": ['qantas'],
"EtihadAirways": ['etihad airways', 'etihadairways', 'etihad'],
"VirginAtlantic": ['virgin atlantic', 'virginatlantic']}
"AirFrance": ['airfrance', 'air france'],
"British_Airways": ['british_airways', 'british airways'],
"AmericanAir": ['americanair', 'american airlines', 'american air'],
"Lufthansa": ['lufthansa'],
"AirBerlin": ['airberlin', 'air berlin'],
"AirBerlin assist": ['airberlin assist', 'air berlin assist', 'airberlinassist'],
"easyJet": ['easyjet', 'easy jet'],
"RyanAir": ['ryanair', 'ryan air'],
"SingaporeAir": ['singaporeair', 'singapore airlines', 'singapore air'],
"Qantas": ['qantas'],
"EtihadAirways": ['etihad airways', 'etihadairways', 'etihad'],
"VirginAtlantic": ['virgin atlantic', 'virginatlantic']}

languages_list = ['en', 'de', 'es', 'fr', 'in', 'nl', 'it', 'pt']

Expand Down Expand Up @@ -107,6 +108,7 @@ def preprocess_users_in_retweeted_status(tweet):
def preprocessor_tweets(tweet):
try:
if 'delete' not in tweet:
# start_time = time.time()
# Get the text from the tweet
text = tweet['text']
if 'retweeted_status' in tweet:
Expand Down Expand Up @@ -158,6 +160,8 @@ def preprocessor_tweets(tweet):
if tweet.get('entities') and tweet['entities'].get('user_mentions'): # Check if 'entities' and 'user_mentions' exist and are not None
mentioned_id = [i['id'] for i in tweet['entities']['user_mentions']] # Get the IDs of mentioned users

# score = sentiment_score.roberta(text) # Get the sentiment score of the tweet

# Initialize a dictionary to store extended tweet information
extended_tweets = {'text':text, 'language':lang, 'mentioned_airlines':airlines_mentioned, 'user_mentions':mentioned_id}
tweets_info.update(extended_tweets) # Update the tweet information dictionary with extended tweet information
Expand Down Expand Up @@ -228,6 +232,8 @@ def preprocessor_tweets(tweet):
if tweets_info[i] == None:
tweets_info[i] = 'NULL' # Set nullable values to 'NULL'

# print(f"Preprocessor time taken: {time.time() - start_time}")

return tweets_info # Return the processed tweet information

except Exception as e:
Expand Down
26 changes: 26 additions & 0 deletions remnant/words_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import csv
import re
from collections import Counter
import mysql.connector
from config import config

# Count word frequencies over all stored tweets and dump them to word_counts.csv.
# NOTE(review): assumes the tweet text is column index 1 of the `tweets` table — verify schema.
connection = mysql.connector.connect(host=config.get('HOST'), user=config.get('USERNAME'), password=config.get('PASSWORD'), database=config.get('DATABASE'), allow_local_infile=True)
try:
    cursor = connection.cursor()
    try:
        cursor.execute("SELECT * FROM tweets ORDER BY timestamp_ms DESC")
        tweets = cursor.fetchall()
    finally:
        cursor.close()  # fix: cursor was never closed (resource leak)
finally:
    connection.close()  # fix: connection was never closed (resource leak)

word_count = Counter()
word_pattern = re.compile(r'\b\w+\b')  # hoisted: compile once, not per tweet

# Iterate through the tweets and update the word count
for tweet in tweets:
    text = tweet[1]
    # Lowercase and split on word boundaries to get words (drops punctuation)
    word_count.update(word_pattern.findall(text.lower()))

# Write the word counts to a CSV file
with open('word_counts.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Word', 'Count'])
    for word, count in word_count.items():
        csvwriter.writerow([word, count])
38 changes: 38 additions & 0 deletions sentiment/sentiment_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from scipy.special import softmax
import re
# import time

# Load the Twitter-tuned RoBERTa sentiment model once at import time;
# transformers downloads/caches the weights on first use.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

def transform_text(text):
    """Normalize a tweet before tokenization.

    Replaces @-mentions with the generic '@user' token, collapses runs of
    three or more repeated letters, and strips remaining special characters.
    """
    # Fix: replace mentions FIRST. The original ran this substitution after
    # stripping every '@' character, so the pattern could never match.
    text = re.sub(r'@\S+', '@user', text)
    text = re.sub(r'([A-Za-z])\1{2,}', r'\1', text)  # e.g. 'soooo' -> 'so'
    # Keep '@' in the allowed set so the '@user' placeholder survives cleanup.
    text = re.sub(r'[^A-Za-z@ ]', '', text)
    return text

def roberta(text):
    """Score *text* with the Twitter-RoBERTa sentiment model.

    Returns P(positive) - P(negative) as a float in [-1, 1]; the neutral
    class contributes nothing to the score.
    """
    cleaned = transform_text(text)
    encoded = tokenizer(cleaned, return_tensors='pt')
    with torch.no_grad():
        logits = model(**encoded).logits
    # Label order is [negative, neutral, positive].
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    return probs[2].item() - probs[0].item()

0 comments on commit 971b5f9

Please sign in to comment.