-
Notifications
You must be signed in to change notification settings - Fork 1
/
gen_gutenberg.py
executable file
·119 lines (96 loc) · 4.26 KB
/
gen_gutenberg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
import os
import re
import string
from sklearn.model_selection import train_test_split as tts
def is_all_upper(l):
return all([c.isupper() or c == " " for c in l])
def expand(word):
return word + "_" * (phrase_word_bounds[1] - len(word))
phrase_size = 3
phrase_word_bounds = 3, 6
encoding = 'abcdefghijklmnopqrstuvwxyz_'
def encode(phrase):
out = np.zeros([1, 27, phrase_size*phrase_word_bounds[1]], dtype=np.float32)
for i, word in enumerate(phrase):
for j, letter in enumerate(word):
out[0, encoding.index(letter), j+(phrase_word_bounds[1]*i)] = 1.0
return out
def test_train_split(author_data, author_idx, return_metainfo=False):
author_splits = {}
for author in author_data.keys():
idxs = np.arange(len(author_data[author]['encodings']))
train, test = tts(idxs, train_size=11100, test_size=1000)
author_splits[author] = train, test
train_xs, train_ys, test_xs, test_ys = [], [], [], []
metainfo = []
for k, v in author_splits.items():
train_embeddings = [author_data[k]['encodings'][i] for i in v[0]]
train_phrases = [author_data[k]['phrases'][i] for i in v[0]]
test_embeddings = [author_data[k]['encodings'][i] for i in v[1]]
train_xs += train_embeddings
train_ys += [author_idx[k]]*len(train_embeddings)
metainfo += train_phrases
test_xs += test_embeddings
test_ys += [author_idx[k]]*len(test_embeddings)
train_xs = np.array(train_xs).astype(np.float32)
train_ys = np.array(train_ys).astype(np.long)
test_xs = np.array(test_xs).astype(np.float32)
test_ys = np.array(test_ys).astype(np.long)
if return_metainfo:
return (train_xs, train_ys), (test_xs, test_ys), metainfo
else:
return (train_xs, train_ys), (test_xs, test_ys)
def load_gutenberg(metainfo=False):
texts = {}
authors = {}
for text in os.listdir('texts'):
with open('texts/' + text, "r") as f:
author = text.split(".")[0][:-1]
if author in ['aquinas', 'confucius', 'hawthorne', 'plato', 'shakespeare', 'tolstoy']:
texts[text] = f.readlines()[95:]
authors[text] = author
all_authors = sorted(list(set(authors.values())))
author_idx = {a: float(i) for i, a in enumerate(all_authors)}
lat_letters = 'abcdefghijklmnopqrstuvwxyz '
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
punc = string.punctuation
invalids = ['prologue', 'epilogue', 'chapter', 'scene', 'act', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx']
text_proc = {}
for k, v in texts.items():
t = [l.strip().translate(str.maketrans(punc, " " * len(punc))) for l in v]
t = [l for l in t if l and not is_all_upper(l)]
t = " ".join(t).lower()
t = "".join([c for c in t if c in lat_letters])
t = _RE_COMBINE_WHITESPACE.sub(" ", t).strip()
t = [w for w in t.split() if w not in invalids]
text_proc[k] = t
# extract matching phrases
author_phrase_set = {}
for k, v in text_proc.items():
author = authors[k]
if author not in author_phrase_set:
author_phrase_set[author] = set()
for word in range(0, len(v) - phrase_size):
phrase = [w for w in v[word:word + phrase_size]]
if all([phrase_word_bounds[0] <= len(w) <= phrase_word_bounds[1] for w in phrase]):
exp_phrase = tuple([expand(w) for w in phrase])
author_phrase_set[author].add(exp_phrase)
# filter overlapping phrases
author_unique_phrases = {}
for a, ws in author_phrase_set.items():
no_overlap = ws
for other_a, other_ws in author_phrase_set.items():
if a != other_a:
no_overlap = no_overlap.difference(other_ws)
author_unique_phrases[a] = no_overlap
# encode phrases
author_data = {}
for k, v in author_unique_phrases.items():
if k not in author_data:
author_data[k] = {'encodings':[], 'phrases':[]}
for phrase in v:
author_data[k]['encodings'].append(encode(phrase))
author_data[k]['phrases'].append(phrase)
return test_train_split(author_data, author_idx, metainfo)