text_preprocessing.py

# Imports: NLTK provides tokenization, stop words, POS tagging, and lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Download the required NLTK resources (skipped if already installed)
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Read the custom stop-word lists from text files, one entry per line, lower-cased
names = []
with open('names.txt', 'r') as f:
    for line in f:
        names.append(line.rstrip('\n').lower())

business = []
with open('business.txt', 'r') as f:
    for line in f:
        business.append(line.rstrip('\n').lower())

# Spelled-out numbers from one to one hundred, in both concatenated and
# hyphenated forms, to be filtered out as stop words
numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve",
"thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "twentyone",
"twentytwo", "twentythree", "twentyfour", "twentyfive", "twentysix", "twentyseven", "twentyeight",
"twentynine", "thirty", "thirtyone", "thirtytwo", "thirtythree", "thirtyfour", "thirtyfive", "thirtysix",
"thirtyseven", "thirtyeight", "thirtynine", "forty", "fortyone", "fortytwo", "fortythree", "fortyfour",
"fortyfive", "fortysix", "fortyseven", "fortyeight", "fortynine", "fifty", "fiftyone", "fiftytwo",
"fiftythree", "fiftyfour", "fiftyfive", "fiftysix", "fiftyseven", "fiftyeight", "fiftynine",
"sixty", "sixtyone", "sixtytwo", "sixtythree", "sixtyfour", "sixtyfive", "sixtysix", "sixtyseven",
"sixtyeight", "sixtynine", "seventy", "seventyone", "seventytwo", "seventythree", "seventyfour",
"seventyfive",
"seventysix", "seventyseven", "seventyeight", "seventynine", "eighty", "eightyone", "eightytwo",
"eightythree",
"eightyfour", "eightyfive", "eightysix", "eightyseven", "eightyeight", "eightynine", "ninety", "ninetyone",
"ninetytwo", "ninetythree", "ninetyfour", "ninetyfive", "ninetysix", "ninetyseven", "ninetyeight",
"ninetynine", "onehundred", "twenty-one", "twenty-two", "twenty-three", "twenty-four", "twenty-five",
"twenty-six", "twenty-seven", "twenty-eight", "twenty-nine", "thirty-one", "thirty-two", "thirty-three",
"thirty-four", "thirty-five", "thirty-six", "thirty-seven", "thirty-eight", "thirty-nine", "forty-one",
"forty-two", "forty-three", "forty-four", "forty-five", "forty-six", "forty-seven", "forty-eight",
"forty-nine", "fifty-one", "fifty-two", "fifty-three", "fifty-four", "fifty-five", "fifty-six",
"fifty-seven", "fifty-eight", "fifty-nine", "sixty-one", "sixty-two", "sixty-three", "sixty-four",
"sixty-five", "sixty-six", "sixty-seven", "sixty-eight", "sixty-nine", "seventy-one", "seventy-two",
"seventy-three", "seventy-four", "seventy-five", "seventy-six", "seventy-seven", "seventy-eight",
"seventy-nine", "eighty-one", "eighty-two", "eighty-three", "eighty-four", "eighty-five", "eighty-six",
"eighty-seven", "eighty-eight", "eighty-nine", "ninety-one", "ninety-two", "ninety-three", "ninety-four",
"ninety-five", "ninety-six", "ninety-seven", "ninety-eight", "ninety-nine", "one-hundred"]

# ASCII punctuation marks to be filtered out as stop words
punctuation = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?',
               '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

lemmatizer = WordNetLemmatizer()

# Build the full stop-word list: NLTK's English stop words extended with the
# spelled-out numbers, the punctuation marks, and the custom name/business lists
stopwords_en = stopwords.words('english')
stopwords_en.extend(numbers)
stopwords_en.extend(punctuation)
stopwords_en.extend(names)
stopwords_en.extend(business)

def print_stopword():
    print(stopwords_en)
    print(len(stopwords_en))

def remove_whitespace(text):
    # collapse runs of whitespace into single spaces
    return " ".join(text.split())

def tokenization(text):
    # lower-case the text, then split it into a list of tokens
    lower_text = text.lower()
    return nltk.word_tokenize(lower_text)

def stopword_removing(tokenized_text):
    # keep every token that does not appear in the stop-word list
    stopword_cleaned = []
    for token in tokenized_text:
        if token not in stopwords_en:
            stopword_cleaned.append(token)
    return stopword_cleaned

def simpler_pos_tag(nltk_tag):
    # map a Penn Treebank tag to the WordNet POS code the lemmatizer expects
    if nltk_tag.startswith('J'):
        return "a"  # adjective
    elif nltk_tag.startswith('V'):
        return "v"  # verb
    elif nltk_tag.startswith('N'):
        return "n"  # noun
    elif nltk_tag.startswith('R'):
        return "r"  # adverb
    else:
        return None

def checkInt(s):
    # True if the string parses as an integer, False otherwise
    try:
        int(s)
        return True
    except ValueError:
        return False

def pos_tagging(stopword_cleaned):
    # nltk.pos_tag returns a list of (token, POS) tuples
    tagged = nltk.pos_tag(stopword_cleaned)
    cleaned_POS_text = []
    for token, tag in tagged:
        # a one-character POS tag marks punctuation; also drop curly quotes
        # and any token whose first character is a digit
        if len(tag) > 1 and token not in ('’', '“', '”') and not checkInt(token[0]):
            cleaned_POS_text.append((token, tag))
    # replace each Penn Treebank tag with its simplified WordNet code via
    # simpler_pos_tag(), keeping only the tokens tagged as nouns
    simpler_POS_text = []
    for token, tag in cleaned_POS_text:
        POS_tuple = (token, simpler_pos_tag(tag))
        if POS_tuple[1] == "n":
            simpler_POS_text.append(POS_tuple)
    return simpler_POS_text

def lemmatization(simpler_POS_text):
    # lemmatize each token, passing the WordNet POS code when one is available
    lemmatized_text = []
    for token, pos in simpler_POS_text:
        if pos is None:
            lemmatized_text.append(lemmatizer.lemmatize(token))
        else:
            lemmatized_text.append(lemmatizer.lemmatize(token, pos=pos))
    return lemmatized_text
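
# A minimal usage sketch (not part of the original script) chaining the
# functions above into the full pipeline. The sample sentence is invented for
# illustration, and the script still requires names.txt and business.txt to be
# present, as it does at import time.
if __name__ == "__main__":
    sample = "The twenty-three engineers   were reviewing the quarterly reports."
    text = remove_whitespace(sample)
    tokens = tokenization(text)
    content_tokens = stopword_removing(tokens)
    noun_tuples = pos_tagging(content_tokens)
    lemmas = lemmatization(noun_tuples)
    # expected: noun lemmas only, e.g. ['engineer', 'report']
    # (exact output depends on the tagger's choices)
    print(lemmas)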