-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
46 lines (40 loc) · 1.27 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re, string, unicodedata
# import contractions
# import inflect
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer, WordNetLemmatizer
def remove_html(text):
    """Strip HTML markup from *text*, returning only the visible text content."""
    return BeautifulSoup(text, "html.parser").get_text()
def remove_links_characters(text):
    """Remove URL lines and square-bracketed spans from *text*.

    - Any line beginning with http:// or https:// is deleted along with its
      trailing newline(s) (MULTILINE anchors ^ at each line start).
    - Any ``[...]`` span (no nested brackets) is deleted.

    Returns the cleaned string.
    """
    # Raw strings fix the invalid escape sequences of the original patterns
    # ('\/' and '\[' in plain strings); the matched text is unchanged.
    text = re.sub(r'^https?://.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\[[^]]*\]', '', text)
    return text
def regular_preprocess(text):
    """Apply the standard cleanup pipeline: strip HTML, then drop links/brackets."""
    return remove_links_characters(remove_html(text))
def remove_stopwords(docs, stopwords):
    """Lowercase each document and drop every token found in *stopwords*.

    Returns a new list of cleaned document strings (one per input document).
    """
    cleaned = []
    for doc in docs:
        kept = [tok for tok in doc.lower().split() if tok not in stopwords]
        cleaned.append(' '.join(kept))
    return cleaned
def stem_words(docs):
    """Porter-stem every token across all documents.

    Each document is lowercased and whitespace-tokenized; every token is
    stemmed and all stems are joined into one space-separated string.

    Returns that single joined string ('' for empty *docs*).

    Fixes vs. the original:
    - The original appended each intermediate ``' '.join(stems)`` back into
      ``stems``, so with more than one document earlier stems were duplicated
      in the final output. Single-document output is unchanged.
    - The original raised NameError on an empty *docs* (``stems_str`` was
      never bound); this now returns ''.
    """
    stemmer = PorterStemmer()
    stems = []
    for doc in docs:
        # Lowercase first so the stemmer sees normalized tokens.
        for word in doc.lower().split():
            stems.append(stemmer.stem(word))
    return ' '.join(stems)
def preprocess(data):
    """Run regular_preprocess over every item in *data*; return the cleaned list."""
    return [regular_preprocess(item) for item in data]