EmotionStats.py
"""Compute bigram, trigram, and word frequency statistics for utterance transcripts."""
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
import os
import re


def main():
    input_dir = "emotions/utterances"
    output_dir = "emotions/frequencies"
    os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists
    for dir_file in os.listdir(input_dir):
        input_file = os.path.join(input_dir, dir_file)
        file_name = os.path.splitext(os.path.basename(input_file))[0]
        output_file = os.path.join(output_dir, file_name + "_freqs.txt")
        get_output_stats(input_file, output_file)


def get_output_stats(input_file, output_file):
    input_text = ''
    #bigrams = []
    #trigrams = []
    #words = []
    #collocations = []
    with open(input_file, 'r') as in_f:
        utterance_list = []
        asr_list = []
        for line in in_f.readlines():
            # Each line is pipe-delimited; field 3 is the ASR hypothesis,
            # field 4 is the transcribed utterance.
            split_line = line.split('|')
            asr = split_line[2]
            utterance = split_line[3]
            asr_list.append(asr)
            utterance_list.append(utterance)
            clean_utterance = clean_text(utterance)
            input_text += clean_utterance + ' '
    bigrams = get_freq_bigrams(input_text.lower())
    trigrams = get_freq_trigrams(input_text.lower())
    words = get_freq_words(input_text.lower())
    #collocations = get_collocations(input_text)
    with open(output_file, 'w') as out_f:
        out_f.write('Bigrams:\n')
        out_f.write(str(bigrams[0:10]))
        out_f.write('\n')
        out_f.write('Trigrams:\n')
        out_f.write(str(trigrams[0:10]))
        out_f.write('\n')
        out_f.write('Words:\n')
        out_f.write(str(words))
        out_f.write('\n')
        #out_f.write(str(collocations[0:10]))


# remove quotes, bracketed annotations, and stray punctuation from an utterance
def clean_text(text):
    text = text.replace('"', '')
    text = text.replace("'", "")
    #text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"[/\[].*?[/\]]", "", text)
    text = re.sub(r"[-+\[\]]+", "", text)
    return text
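
# Illustrative example (not in the original file): with the rules above,
# clean_text('I [laughs] can\'t go "now"') returns 'I  cant go now' -
# the quotes are stripped and the bracketed annotation is removed.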


# method to return the top bigram collocations (ranked by PMI, minimum frequency 3)
def get_freq_bigrams(text):
    tokens = nltk.word_tokenize(text)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)  # ignore bigrams that occur fewer than 3 times
    most_freq_bigrams = finder.nbest(bigram_measures.pmi, 20)
    return most_freq_bigrams


# method to return the top trigram collocations (ranked by PMI, minimum frequency 3)
def get_freq_trigrams(text):
    tokens = nltk.word_tokenize(text)
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)  # ignore trigrams that occur fewer than 3 times
    most_freq_trigrams = finder.nbest(trigram_measures.pmi, 20)
    return most_freq_trigrams


# method to return the most frequent words, excluding stop words
def get_freq_words(text):
    text_list = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))  # build the stop-word set once
    keep_words = []
    for t in text_list:
        if t not in stop_words:
            keep_words.append(t)
    fdist = nltk.FreqDist(keep_words)
    return fdist.most_common(20)


# method to print the collocations found in a list of tokens
# (note: nltk.Text.collocations() prints to stdout and returns None)
def get_collocations(tokens):
    text = nltk.Text(tokens)
    return text.collocations()


# method to get the number of modal verbs
def count_modals(text):
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    modal_counts = {}
    tokens = nltk.word_tokenize(text)
    fdist = nltk.FreqDist(tokens)
    for m in modals:
        modal_counts[m] = fdist[m]
    return modal_counts


# method to get the length of the longest utterance
def get_longest_utterance(utterance_list):
    longest_len = max(len(utt) for utt in utterance_list)
    return longest_len


# method to get the average length of utterances
def get_average_utterance_length(utterance_list):
    total_length = 0
    for utt in utterance_list:
        total_length += len(utt)
    average = total_length / len(utterance_list)
    return average


# method to get the count of misrecognized utterances
def get_misrecognition_count(asr_list, utterance_list):
    sent_count = 0
    for i in range(0, len(asr_list)):
        asr = clean_text(asr_list[i]).lower()
        utt = clean_text(utterance_list[i]).lower()
        if asr != utt:
            sent_count += 1
    #sent_percentage = float(sent_count) / len(asr_list)
    return sent_count


#def get_sentiment(text):


if __name__ == '__main__':
    main()
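

# Illustrative sketch (not part of the original script): count_modals,
# get_longest_utterance, get_average_utterance_length, and get_misrecognition_count
# are defined above but never called from get_output_stats(). Assuming the same
# pipe-delimited input (ASR hypothesis in field 3, transcript in field 4), they
# could be reported alongside the n-gram statistics roughly as follows:
#
#     stats = {
#         'longest_utterance': get_longest_utterance(utterance_list),
#         'average_utterance_length': get_average_utterance_length(utterance_list),
#         'misrecognized_utterances': get_misrecognition_count(asr_list, utterance_list),
#         'modal_counts': count_modals(input_text.lower()),
#     }
#     out_f.write(str(stats) + '\n')
#
# The exact output format is an assumption; the original file does not specify one.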