From e1c4397200f27efe799c534d76b0b21ec718cc7c Mon Sep 17 00:00:00 2001 From: "Nguyen Hoang Bao Khoi, Khoi" Date: Wed, 5 Jun 2024 14:46:01 +0200 Subject: [PATCH] remove backslash, update file splitting based on os and sentiment analysis --- .gitignore | 1 + file_splitting/split_files.py | 9 ++++++-- preprocess/old_tweet_loader.py | 2 ++ preprocess/preprocessor.py | 2 ++ preprocess/test_loading_gpu.py | 2 ++ preprocess/tweets_gpu_testing.ipynb | 2 ++ preprocess/ultimate_tweet_loader.py | 2 ++ sentiment/sentiment_analysis.ipynb | 6 +++--- sentiment_evolution.ipynb | 32 ----------------------------- 9 files changed, 21 insertions(+), 37 deletions(-) delete mode 100644 sentiment_evolution.ipynb diff --git a/.gitignore b/.gitignore index 89ce50f..c41a360 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ tweets_dataset_gpu.csv tweets_dataset_all.csv json_files.txt combined_dataset.csv +sentiment_evolution.ipynb diff --git a/file_splitting/split_files.py b/file_splitting/split_files.py index 5bcea6f..62217f3 100644 --- a/file_splitting/split_files.py +++ b/file_splitting/split_files.py @@ -8,6 +8,11 @@ data = [Path("data/"+file) for file in os.listdir('data')] data.sort(key=lambda x: x.name) -with open('json_files.txt', 'w') as f: - f.write('\n'.join(str(file).replace('data/', '') for file in data)) +if os.name == 'nt': # Windows + with open('json_files.txt', 'w') as f: + f.write('\n'.join(str(file).replace('data\\', '') for file in data)) +else: # Other operating systems + with open('json_files.txt', 'w') as f: + f.write('\n'.join(str(file).replace('data/', '') for file in data)) + print("Files written to json_files.txt.") \ No newline at end of file diff --git a/preprocess/old_tweet_loader.py b/preprocess/old_tweet_loader.py index fa32eb5..d281e90 100644 --- a/preprocess/old_tweet_loader.py +++ b/preprocess/old_tweet_loader.py @@ -141,6 +141,8 @@ def text_transformer(text): # text = re.sub(r'[^A-Za-z ]', '', text) # remove special characters text = re.sub(r'\n', 
'', text) text = re.sub(r'[,.!?]', '', text) + text = re.sub(r"\'", "", text) + text = re.sub(r'\\', '', text) text = text.strip() text = text.lower() return text diff --git a/preprocess/preprocessor.py b/preprocess/preprocessor.py index 864f574..7fd75f4 100644 --- a/preprocess/preprocessor.py +++ b/preprocess/preprocessor.py @@ -69,6 +69,8 @@ def text_transformer(text): # text = re.sub(r'[^A-Za-z ]', '', text) # remove special characters text = re.sub(r'\n', '', text) text = re.sub(r'[,.!?]', '', text) + text = re.sub(r"\'", "", text) + text = re.sub(r'\\', '', text) text = text.strip() text = text.lower() return text diff --git a/preprocess/test_loading_gpu.py b/preprocess/test_loading_gpu.py index e4b30b8..7d5d26e 100644 --- a/preprocess/test_loading_gpu.py +++ b/preprocess/test_loading_gpu.py @@ -169,6 +169,8 @@ def text_transformer(text): # text = re.sub(r'[^A-Za-z ]', '', text) # remove special characters text = re.sub(r'\n', '', text) text = re.sub(r'[,.!?]', '', text) + text = re.sub(r"\'", "", text) + text = re.sub(r'\\', '', text) text = text.strip() text = text.lower() return text diff --git a/preprocess/tweets_gpu_testing.ipynb b/preprocess/tweets_gpu_testing.ipynb index 26c7ba7..e6b6d9c 100644 --- a/preprocess/tweets_gpu_testing.ipynb +++ b/preprocess/tweets_gpu_testing.ipynb @@ -256,6 +256,8 @@ " # text = re.sub(r'[^A-Za-z ]', '', text) # remove special characters\n", " text = re.sub(r'\\n', '', text)\n", " text = re.sub(r'[,.!?]', '', text)\n", + " text = re.sub(r\"\\'\", \"\", text)\n", + " text = re.sub(r'\\\\', '', text)\n", " text = text.strip()\n", " text = text.lower()\n", " return text" diff --git a/preprocess/ultimate_tweet_loader.py b/preprocess/ultimate_tweet_loader.py index 12bcfd8..43f816c 100644 --- a/preprocess/ultimate_tweet_loader.py +++ b/preprocess/ultimate_tweet_loader.py @@ -141,6 +141,8 @@ def text_transformer(text): # text = re.sub(r'[^A-Za-z ]', '', text) # remove special characters text = re.sub(r'\n', '', text) text = 
re.sub(r'[,.!?]', '', text) + text = re.sub(r"\'", "", text) + text = re.sub(r'\\', '', text) text = text.strip() text = text.lower() return text diff --git a/sentiment/sentiment_analysis.ipynb b/sentiment/sentiment_analysis.ipynb index 80085fc..b9ff2ba 100644 --- a/sentiment/sentiment_analysis.ipynb +++ b/sentiment/sentiment_analysis.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 6, "metadata": {}, "outputs": [ { diff --git a/sentiment_evolution.ipynb b/sentiment_evolution.ipynb deleted file mode 100644 index 7a7836a..0000000 --- a/sentiment_evolution.ipynb +++ /dev/null @@ -1,32 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "jbg030", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}