-
Notifications
You must be signed in to change notification settings - Fork 0
/
alice.py
156 lines (135 loc) · 5.17 KB
/
alice.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#! /usr/bin/python
#
# A program to analyze the text of Alice in Wonderland and do
# interesting things with the data.
from string import punctuation
from random import choice, randint
class Text(object):
    """Word statistics and simple random text generation for one text.

    Construct with the raw text, call ``PrepareData()`` to fill the word
    tables, then use the query and generation methods.
    """

    # Upper bound on chain steps per generated sentence, so generation
    # terminates even when no sentence-ending punctuation is reachable.
    _MAX_SENTENCE_STEPS = 1000

    def __init__(self, text):
        """Store *text* and create the (still empty) lookup tables."""
        self.text = text
        self.all_words = []       # every bare word, in reading order
        self.word_frequency = {}  # {word: count}
        self.second_words = {}    # {word: [follower, follower, ...]}
        self.end_words = {}       # {bareword: [punct, punct, ...]}
        self.distribution = {}    # result of the latest Distribution* call
        self.my_bigrams = {}      # {'first second': count}
        # The attribute used to be created under this misspelled name while
        # Bigrams() used ``my_bigrams``; keep the old name as an alias.
        self.my_biagrams = self.my_bigrams

    def PrepareData(self):
        """Fill the word tables from ``self.text``.

        The text is lower-cased, ``--`` is treated as a space and the result
        is split on whitespace.  For each token the bare word (all
        surrounding punctuation stripped) is appended to ``all_words`` and
        counted in ``word_frequency``.  A token that keeps a trailing ``.``,
        ``!`` or ``?`` records that character in ``end_words``; any other
        word gets a follower list in ``second_words``.  Followers keep their
        sentence-ending punctuation.

        Tokens consisting only of punctuation are skipped, and empty text
        simply leaves the tables unchanged.
        """
        # Every punctuation character except the sentence enders.
        soft_punct = ''.join(c for c in punctuation if c not in '.!?')
        prev = None
        for token in self.text.lower().replace('--', ' ').split():
            word = token.strip(punctuation)
            if not word:
                continue
            # Same token, but a trailing '.', '!' or '?' survives.
            tail = token.lstrip(punctuation).rstrip(soft_punct)
            self.all_words.append(word)
            self.word_frequency[word] = self.word_frequency.get(word, 0) + 1
            if word == tail and word not in self.second_words:
                self.second_words[word] = []
            if prev in self.second_words:
                self.second_words[prev].append(tail)
            if word != tail:
                self.end_words.setdefault(word, []).append(tail[-1])
            prev = word

    def GetUniqueWords(self):
        """Return the keys of ``second_words`` as a list.

        NOTE: words that only ever occur directly before ``.``, ``!`` or
        ``?`` have no ``second_words`` entry and are therefore not listed.
        """
        return list(self.second_words)

    def GetAllWords(self):
        """Return the list of all bare words in reading order."""
        return self.all_words

    def GetSecondWords(self):
        """Return the ``{word: [followers]}`` table."""
        return self.second_words

    def GetWordFreq(self):
        """Return the ``{word: count}`` table."""
        return self.word_frequency

    def Freq(self, my_dict, type, top=10):
        """Return up to *top* keys of *my_dict* with extreme counts.

        ``type`` is ``'most'`` for the highest counts or ``'least'`` for the
        lowest (the name shadows the builtin but is kept for callers that
        pass it by keyword).  Ties are broken in favour of keys that come
        first in *my_dict*; the result is listed in *my_dict*'s own order.

        Raises ValueError for any other ``type``.
        """
        if type not in ('most', 'least'):
            raise ValueError(
                "type must be 'most' or 'least', got {0!r}".format(type))
        if top <= 0:
            return []
        # sorted() is stable (also with reverse=True), so equal counts keep
        # their dict order and the cut-off between ties is deterministic.
        ranked = sorted(my_dict, key=my_dict.get, reverse=(type == 'most'))
        chosen = set(ranked[:top])
        return [word for word in my_dict if word in chosen]

    def GenerateText(self, sentences):
        """Generate *sentences* random sentences from the follower table.

        Each sentence starts with a random capitalised word and repeatedly
        appends a random follower of the current word.  When the current
        word has no followers a random word is picked as the new current
        word without being appended.  A sentence stops after at least three
        steps once it ends in ``.``, ``!`` or ``?`` or the current word is
        known to end sentences; in the latter case one of its recorded end
        marks is appended.  If that never happens within
        ``_MAX_SENTENCE_STEPS`` steps the sentence is closed with ``'.'``.

        Raises IndexError (from ``random.choice``) if ``all_words`` is empty
        and *sentences* is positive.
        """
        pieces = []
        for _ in range(sentences):
            word = choice(self.all_words)
            sentence = word[0].upper() + word[1:]
            steps = 0
            while steps < self._MAX_SENTENCE_STEPS:
                finished = sentence[-1] in '.!?' or word in self.end_words
                if finished and steps >= 3:
                    break
                followers = self.second_words.get(word)
                if followers:
                    word = choice(followers)
                    sentence += ' ' + word
                else:
                    # Dead end (unknown word or empty follower list).
                    word = choice(self.all_words)
                steps += 1
            if sentence[-1] not in punctuation:
                # Fall back to '.' when the step cap was hit on a word that
                # has no recorded end mark.
                sentence += choice(self.end_words.get(word) or ['.'])
            pieces.append(sentence)
        return ' '.join(pieces)

    def GenerateText2(self, sentences):
        """Generate text from a token chain that treats punctuation as tokens.

        ``. , ! ? :`` are split off as separate tokens and a space is added
        after every ``"``.  The walk starts with a capitalised token that
        followed a ``.`` in the text and runs until *sentences* terminator
        tokens have been emitted.  Tokens are joined with single spaces.  A
        token with no recorded follower restarts the walk at a sentence
        start.  The walk gives up after ``_MAX_SENTENCE_STEPS`` steps per
        requested sentence.

        Raises KeyError if the text contains no ``.`` followed by a token.
        """
        text = self.text
        for mark in ['.', ',', '!', '?', ':']:
            text = text.replace(mark, ' ' + mark)
        text = text.replace('"', '" ')
        tokens = text.split()

        followers = {}
        for current, following in zip(tokens, tokens[1:]):
            followers.setdefault(current, []).append(following)

        starts = followers['.']
        word = choice(starts)
        generated = word[0].upper() + word[1:]
        finished = 0
        budget = self._MAX_SENTENCE_STEPS * sentences
        while finished < sentences and budget > 0:
            word = choice(followers.get(word) or starts)
            generated += ' ' + word
            if word in ('.', '?', '!', '...'):
                finished += 1
            budget -= 1
        return generated

    def Palindrome(self):
        """Return the words from GetUniqueWords() that are palindromes.

        Single-character words are excluded.
        """
        return [w for w in self.GetUniqueWords()
                if len(w) > 1 and w == w[::-1]]

    def DistributionLength(self):
        """Return ``{word length: total occurrences}``.

        The result is rebuilt on every call and stored in
        ``self.distribution``.
        """
        dist = {}
        for word, count in self.word_frequency.items():
            dist[len(word)] = dist.get(len(word), 0) + count
        self.distribution = dist
        return dist

    def DistributionULength(self):
        """Return ``{word length: number of distinct words}``.

        The result is rebuilt on every call and stored in
        ``self.distribution``.
        """
        dist = {}
        for word in self.word_frequency:
            dist[len(word)] = dist.get(len(word), 0) + 1
        self.distribution = dist
        return dist

    def DistributionFreq(self):
        """Return ``{occurrence count: number of words with that count}``.

        The result is rebuilt on every call and stored in
        ``self.distribution``.
        """
        dist = {}
        for amount in self.word_frequency.values():
            dist[amount] = dist.get(amount, 0) + 1
        self.distribution = dist
        return dist

    def Bigrams(self, text):
        """Count adjacent pairs of *text* into ``self.my_bigrams``.

        *text* is a sliceable sequence (e.g. a list of words).  Each pair is
        keyed as ``'first second'``.  Counts accumulate across calls; the
        shared table is returned.
        """
        for first, second in zip(text, text[1:]):
            bigram = '{0} {1}'.format(first, second)
            self.my_bigrams[bigram] = self.my_bigrams.get(bigram, 0) + 1
        return self.my_bigrams