# functions to convert wikipedia articles saved as
#   (article_as_string, title)
# to
#   (title, {wordid: trainwordct}, {wordid: testwordct})
# and save them back to disk
from __future__ import division
from collections import Counter, deque
from itertools import islice
from wikirandom import get_random_wikipedia_article
import cPickle as pickle
import re


def online_wiki_docs():
    # Yield (article_as_string, title) pairs fetched live from Wikipedia.
    # This is slow, for illustration only.
    while True:
        yield get_random_wikipedia_article()


def load_list(pkl_file, num_items=3300000):
    # Stream up to num_items items from a file of back-to-back pickle records.
    with open(pkl_file, 'rb') as f:
        while num_items > 0:
            try:
                yield pickle.load(f)
                num_items -= 1
            except EOFError:
                return  # fewer than num_items records in the file
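
# Illustrative usage (the file name is an assumption): stream articles that
# were written as back-to-back pickle records by save_data (defined below).
#   for article, title in load_list('articles.pkl', num_items=100):
#       print title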


def clean_doc(doc, vocab):
    # Lowercase, split hyphenated words, strip everything but letters and
    # spaces, collapse whitespace, then keep only in-vocabulary words.
    doc = doc.lower()
    doc = re.sub(r'-', ' ', doc)
    doc = re.sub(r'[^a-z ]', '', doc)
    doc = re.sub(r' +', ' ', doc)
    return [word for word in doc.split() if word in vocab]
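
# Example (with an illustrative vocab): given vocab = {'online': 0, 'lda': 1},
# clean_doc('Online-LDA!! rocks', vocab) lowercases the text, turns the hyphen
# into a space, strips the punctuation, and returns ['online', 'lda']
# ('rocks' is dropped because it is out of vocabulary).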


def parse_docs(name_docs, vocab):
    for doc, name in name_docs:
        words = clean_doc(doc, vocab)
        # Hold back every 10th word to get an online estimate of perplexity
        train_words = [vocab[w] for (i, w) in enumerate(words) if i % 10 != 0]
        test_words = [vocab[w] for (i, w) in enumerate(words) if i % 10 == 0]
        train_cntr = Counter(train_words)
        test_cntr = Counter(test_words)
        yield (name, train_cntr, test_cntr)
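
# Example (illustrative): for a document that cleans to 20 in-vocabulary
# words, the tokens at indices 0 and 10 land in test_cntr and the other 18
# in train_cntr, i.e. roughly 10% of each article is held out to score
# perplexity online.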


def take_every(n, iterable):
    # Yield successive lists of n items; the last piece may be shorter.
    i = iter(iterable)
    piece = list(islice(i, n))
    while piece:
        yield piece
        piece = list(islice(i, n))
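
# Example: list(take_every(3, range(7))) == [[0, 1, 2], [3, 4, 5], [6]]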


def group_up(n, N, iterable):
    # Yield lists of n items until at least N items have been taken.
    i = iter(iterable)
    ctr = 0
    while ctr < N:
        piece = list(islice(i, n))
        if not piece:
            return  # iterable exhausted before N items were seen
        ctr += n
        yield piece


def consume(iterator, n):
    "Advance the iterator n steps ahead. If n is None, consume entirely."
    # Use functions that consume iterators at C speed.
    if n is None:
        # feed the entire iterator into a zero-length deque
        deque(iterator, maxlen=0)
    else:
        # advance to the empty slice starting at position n
        next(islice(iterator, n, n), None)
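
# Illustrative usage (n_done is a hypothetical count of records handled by
# an earlier run): skip past them before resuming processing.
#   docs = load_list('articles.pkl')
#   consume(docs, n_done)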


def save_data(source, out_file):
    # Write each item as its own pickle record; protocol=-1 selects the
    # highest protocol available.
    with open(out_file, 'wb') as out:
        for item in source:
            pickle.dump(item, out, protocol=-1)


def create_vocab(vocab_file):
    # Map each cleaned, lowercased vocabulary word to its line number.
    vocab = dict()
    with open(vocab_file) as f:
        for idx, word in enumerate(f):
            word = word.lower()
            word = re.sub(r'[^a-z]', '', word)
            vocab[word] = idx
    return vocab
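

if __name__ == '__main__':
    # A minimal end-to-end sketch; the file names here are assumptions, not
    # part of the original pipeline. Build the vocabulary, stream saved
    # (article, title) records, convert each to (title, train counts,
    # test counts), and save the results back to disk.
    vocab = create_vocab('dictnostops.txt')
    articles = load_list('articles.pkl', num_items=10000)
    save_data(parse_docs(articles, vocab), 'parsed_articles.pkl')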