Showing 4 changed files with 104 additions and 33 deletions.
@@ -0,0 +1,26 @@
import csv
import re
from collections import Counter

import mysql.connector

from config import config

# Connect to MySQL using credentials from config and fetch all stored tweets, newest first.
connection = mysql.connector.connect(
    host=config.get('HOST'),
    user=config.get('USERNAME'),
    password=config.get('PASSWORD'),
    database=config.get('DATABASE'),
    allow_local_infile=True,
)
cursor = connection.cursor()
cursor.execute("SELECT * FROM tweets ORDER BY timestamp_ms DESC")
tweets = cursor.fetchall()

word_count = Counter()

# Iterate through the tweets and update the word count
for tweet in tweets:
    text = tweet[1]  # tweet text is assumed to be the second column of the tweets table
    # Lowercase, drop special characters, and split on word boundaries to get words
    words = re.findall(r'\b\w+\b', text.lower())
    word_count.update(words)

# Write the word counts to a CSV file
with open('word_counts.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Word', 'Count'])
    for word, count in word_count.items():
        csvwriter.writerow([word, count])
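For illustration, a minimal sketch of how the generated word_counts.csv could be inspected afterwards; the top-10 listing below is an assumption for this note and not part of the commit:

import csv
from collections import Counter

# Read the CSV produced by the script above and print the ten most common words.
counts = Counter()
with open('word_counts.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        counts[row['Word']] = int(row['Count'])

for word, count in counts.most_common(10):
    print(f"{word}: {count}")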
@@ -0,0 +1,38 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from scipy.special import softmax
import re
# import time

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

def transform_text(text):
    # Replace user mentions before stripping special characters; otherwise the '@'
    # is already removed and the mention pattern can never match.
    text = re.sub(r'@\S+', '@user', text)            # replace user mentions
    text = re.sub(r'([A-Za-z])\1{2,}', r'\1', text)  # collapse repeated letters (normalization)
    text = re.sub(r'[^A-Za-z@ ]', '', text)          # remove special characters (keep '@' so '@user' survives)
    return text

def roberta(text):
    # start_time = time.time()
    ptext = transform_text(text)
    inputs = tokenizer(ptext, return_tensors='pt')
    with torch.no_grad():
        output = model(**inputs)
    score = torch.nn.functional.softmax(output.logits, dim=-1)

    # labels = ["negative", "neutral", "positive"]
    # sentiment_mapping = {"negative": -1, "neutral": 0, "positive": 1}
    # sentiment_probabilities = [(labels[i], prob.item()) for i, prob in enumerate(score[0])]
    # P(positive) - P(negative): a single score in [-1, 1]
    sentiment_score = score[0][2].item() - score[0][0].item()
    # sentiment_score = sum(sentiment_mapping[label] * prob.item() for label, prob in zip(labels, score[0]))

    # Log-odds alternative (kept for reference)
    # odds_pos = score[0][2].item() / (1 - score[0][2].item() + 1e-6)
    # odds_neg = score[0][0].item() / (1 - score[0][0].item() + 1e-6)
    # log_odds_score = np.log(odds_pos) - np.log(odds_neg)
    # normalized_score = 1 / (1 + np.exp(-log_odds_score))
    # print(f"Roberta time taken: {time.time() - start_time}")

    return sentiment_score
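A minimal usage sketch of roberta() follows; the sample strings are assumptions for illustration only. The returned value is P(positive) minus P(negative), so it falls in the range [-1, 1]:

# Example calls (illustrative only): values near 1 indicate strongly positive
# sentiment, values near -1 strongly negative sentiment.
print(roberta("I loooove this, thanks @someone!"))  # expected: close to 1
print(roberta("This is terrible."))                 # expected: close to -1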