-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_glove_to_my_dataset.py
72 lines (56 loc) · 1.93 KB
/
train_glove_to_my_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re
import string
import nltk
from nltk.corpus import stopwords
from glove import Corpus, Glove
from pprint import pprint
# One-time setup: uncomment and run once to fetch the NLTK data this script
# needs (the stopword corpus loaded below and the 'punkt' tokenizer models
# used by nltk.word_tokenize).
# nltk.download('stopwords')
# nltk.download('punkt')
# English stopwords from NLTK; preprocess_text drops tokens found in here.
stops = stopwords.words('english')
def preprocess_text(text: str, remove_stopwords: bool) -> list:
    """
    Clean a raw document and split it into lowercase word tokens.

    The cleaning steps are, in order:
    - remove links
    - remove literal backslash-n / backslash-t escape sequences
    - replace every run of non-letter characters (digits, punctuation,
      whitespace) with a single space
    - tokenize and convert to lowercase
    - optionally remove English stopwords

    Arguments:
        text (str): text to clean
        remove_stopwords (bool): whether to remove stopwords
    Returns:
        list: the cleaned, lowercased tokens (a list of str), in document
        order. A token list (not a joined string) is returned because the
        GloVe corpus builder consumes one list of tokens per document.
    """
    # remove links
    text = re.sub(r'http\S+', "", text)
    # Drop *literal* two-character "\n" and "\t" sequences (backslash + letter)
    # so the letter is not left behind as a stray token; real newline and tab
    # characters are handled by the non-letter regex below.
    text = text.replace(r"\n", " ")
    text = text.replace(r"\t", " ")
    # remove numbers and special characters
    text = re.sub("[^A-Za-z]+", " ", text)
    # Always tokenize and lowercase, so the function returns a token list
    # whether or not stopword removal is requested (previously `tokens` was
    # only defined when remove_stopwords was True).
    tokens = [w.lower() for w in nltk.word_tokenize(text)]
    if remove_stopwords:
        # Build the set once per call: O(1) membership per token instead of
        # scanning the stopword list for every token.
        stop_set = set(stops)
        tokens = [w for w in tokens if w not in stop_set]
    return tokens
import pandas as pd
# Load the dataset; the raw text of each record is read from its "Document" column.
df = pd.read_csv("Final_Dataset_Misinformation.csv")

# Tokenize every document (cast to str first so non-string cells do not break
# the regex-based cleaning) and keep the result alongside the raw text.
df["cleaned"] = df["Document"].map(
    lambda doc: preprocess_text(str(doc), remove_stopwords=True)
)

# One token list per document: the input format used to fit the GloVe corpus.
texts = df["cleaned"].tolist()

# Build the word co-occurrence matrix that GloVe is trained on.
corpus = Corpus()
corpus.fit(texts, window=10)

# Train the embeddings on the co-occurrence matrix.
glove = Glove(no_components=5, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

# Attach the word -> id dictionary to the model, then persist it to disk.
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')