-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleaner.py
51 lines (47 loc) · 1.72 KB
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# -*- coding: utf-8 -*-
import preprocessor as p
import re
from emoji_list import emojis
class cleaning(emojis):
    """Clean a collection of tweets for word clouds and sentiment analysis.

    Each tweet object is expected to expose a ``full_text`` attribute. During
    construction every tweet whose basic-cleaned text does not start with
    ``RT`` gets two attributes attached:

    * ``clean_text`` -- lower-cased, punctuation-split text for sentiment
      analysis;
    * ``words`` -- list of lower-cased tokens for a word cloud.

    The emoji substitution tables ``emojis_sub_word`` and
    ``emojis_sub_sentence`` are read from ``self``; they are presumably
    provided by the ``emojis`` base class (not visible here -- confirm).

    Attributes:
        tweets: the tweet objects passed to the constructor.
        basic_clean_tweets: tweets kept after the retweet filter.
        clean_tweets: fully processed tweets (same objects, annotated).
    """

    # Delimiters used to tokenise text into words ("." is a delimiter here).
    _WORD_SPLIT_RE = re.compile(r'[`\-=~!$%^&*()_+\[\]{};\\\:"|<,./<>? ]')
    # Same delimiters except ".", which is kept in sentence text.
    _SENTENCE_SPLIT_RE = re.compile(r'[`\-=~!$%^&*()_+\[\]{};\\\:"|<,/<>? ]')
    # Runs of two or more whitespace characters, collapsed to a single space.
    _MULTI_SPACE_RE = re.compile(r"\s\s+")

    def __init__(self, tweets):
        """Filter out retweets, then run the complete cleaning pipeline.

        Args:
            tweets: iterable of tweet objects with a ``full_text`` attribute.
                The objects are annotated in place.
        """
        super(cleaning, self).__init__()
        self.tweets = tweets
        self.clean_tweets = []
        self.basic_clean_tweets = []
        for tweet in self.tweets:
            text = self.basic_clean(tweet.full_text)
            # Remove retweets for better word cloud and sentiment analysis.
            if not text.startswith('RT'):
                tweet.clean_text = text
                self.basic_clean_tweets.append(tweet)
        self.complete_cleaner()

    @staticmethod
    def _substitute(text, table):
        """Replace every literal key of *table* found in *text* by its value."""
        for key, val in table.items():
            text = text.replace(key, val)
        return text

    @staticmethod
    def _tokenize(pattern, text):
        """Split *text* on *pattern*; return the non-empty pieces lower-cased."""
        return [piece.lower() for piece in pattern.split(text) if piece != ""]

    # Removing URLs and smileys
    def basic_clean(self, sentence):
        """Return *sentence* with URLs and smileys removed.

        Also rewrites ``"via @"`` to ``"@"``.
        """
        # set_options configures the preprocessor module globally, so it is
        # re-applied before every clean() call.
        p.set_options(p.OPT.URL, p.OPT.SMILEY)
        text = p.clean(sentence)
        return re.sub(r"via @", "@", text)

    # Separating words for WordCloud
    def word_clean(self, sentence):
        """Return the lower-cased word tokens of *sentence*.

        Hashtags and mentions are removed, emojis are substituted using
        ``self.emojis_sub_word``, and the result is split on punctuation
        and spaces.
        """
        p.set_options(p.OPT.HASHTAG, p.OPT.MENTION)
        text = p.clean(sentence)
        # Read the table without mutating it: it is shared across calls.
        text = self._substitute(text, self.emojis_sub_word)
        # The whitespace rule is a regex and must go through re, not
        # str.split, which would treat it as a literal string.
        text = self._MULTI_SPACE_RE.sub(" ", text)
        return self._tokenize(self._WORD_SPLIT_RE, text)

    # Preprocessing tweets for sentiment analysis
    def sentence_clean(self, sentence):
        """Return *sentence* prepared for sentiment analysis.

        Mentions are removed, emojis are substituted using
        ``self.emojis_sub_sentence``, ``#`` characters are dropped (the
        hashtag word itself is kept) and whitespace runs are collapsed.
        """
        p.set_options(p.OPT.MENTION)
        text = p.clean(sentence)
        # Read the table without mutating it: it is shared across calls.
        text = self._substitute(text, self.emojis_sub_sentence)
        text = text.replace("#", "")
        return self._MULTI_SPACE_RE.sub(" ", text)

    # Creating class object with words and clean tweets
    def complete_cleaner(self):
        """Attach ``words`` and the final ``clean_text`` to each kept tweet.

        Every tweet in ``self.basic_clean_tweets`` is annotated in place and
        appended to ``self.clean_tweets``.
        """
        for tweet in self.basic_clean_tweets:
            # words must be derived before clean_text is overwritten below.
            tweet.words = self.word_clean(tweet.clean_text)
            sentence = self.sentence_clean(tweet.clean_text)
            tweet.clean_text = " ".join(
                self._tokenize(self._SENTENCE_SPLIT_RE, sentence)
            )
            self.clean_tweets.append(tweet)