parse_google_ngram.py
#!/usr/bin/env python3
#
# Copyright 2012 B. J. Potter
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Process Google ngram data.
Google ngram data can be found at
http://books.google.com/ngrams/datasets
This was used to generate the word list included with this code. It
generates a list of the most commonly used words in the chosen set by
inputing 1-gram files from Google. This can be used to generate other
word lists suitable for use in making passphrases.
Example usage:
google_ngram.py -o word_list.txt googlebooks-eng-fiction-all-1gram-*
processing: googlebooks-eng-fiction-all-1gram-20120701-a
Reading line 1 million
Reading line 2 million
Reading line 3 million
Reading line 4 million
Reading line 5 million
Reading line 6 million
Reading line 7 million
Reading line 8 million
Reading line 9 million
Reading line 10 million
processing: googlebooks-eng-fiction-all-1gram-20120701-b
Reading line 1 million
Reading line 2 million
Reading line 3 million
Reading line 4 million
Reading line 5 million
Reading line 6 million
Reading line 7 million
Reading line 8 million
Reading line 9 million
This will write 'ngram_processed.txt' with all of the words from the
input ngram files and the 'word_list.txt' with only the top 10,000
words, excluding words less than three letters and those with
punctuation.
"""
import os
import sys
import string
from optparse import OptionParser
from operator import itemgetter

# Don't analyze words from books before this year
START_YEAR = 1980
# How many words in the output file
MAX_WORDS = 10000
# Exclude words shorter than this from the final list
MIN_WORD_LENGTH = 3

def unwanted_characters_in_word(word):
    """Return True if the word contains unwanted characters."""
    for letter in word:
        # Anything outside the ASCII letters (punctuation, digits,
        # accented characters) makes the word unwanted.
        if letter not in string.ascii_letters:
            return True
    return False

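
# Note on the input format (inferred from the parsing below, so treat the
# sample row as illustrative rather than authoritative): each line of a
# Google 1-gram file is whitespace-separated, and the first three fields
# are the word, the year, and the number of occurrences in that year, e.g.
#   circumvallate   1978   335   91
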
def process_ngram_file(file_name, word_list):
    """Accumulate word counts from a Google ngram file into word_list.

    The ngram data comes from http://books.google.com/ngrams/datasets
    """
    with open(file_name) as ngram_file:
        for line_count, line in enumerate(ngram_file, start=1):
            if line_count % 1000000 == 0:
                print("Reading line {0:3d} million".format(line_count // 1000000))
            fields = line.split()
            word = fields[0].lower()
            year = int(fields[1])
            occurrences = int(fields[2])
            if year < START_YEAR or unwanted_characters_in_word(word):
                continue
            word_list[word] = word_list.get(word, 0) + occurrences

def build_word_list(input_word_list, word_list_file_name):
    """Write the final list of words from the processed ngram words.

    input_word_list: (word, count) pairs sorted with the most frequent
        words at the beginning and less frequent words later.
    word_list_file_name: The file to write. It will contain only the
        most common MAX_WORDS words.
    """
    final_word_list = []
    for word, _count in input_word_list:
        if len(final_word_list) >= MAX_WORDS:
            break
        if len(word) < MIN_WORD_LENGTH:
            continue
        final_word_list.append(word)
    with open(word_list_file_name, 'w') as out_file:
        for word in final_word_list:
            out_file.write('{0}\n'.format(word))

# Running from the command line (main)
usage = "usage: %prog -o word_list.txt input_ngram_file[s]"
parser = OptionParser(usage)
parser.add_option("-o", "--out", default='',
                  help="Output file name")
(options, args) = parser.parse_args()

if len(args) == 0:
    print('ERROR: Must have an input file')
    sys.exit(1)
if options.out == '':
    print('ERROR: Must have an output file "-o word_list.txt"')
    sys.exit(1)

# Accumulate counts for every acceptable word across all input files.
word_list = {}
for file_name in args:
    print('processing:', file_name)
    assert os.path.isfile(file_name)
    process_ngram_file(file_name, word_list)

# Sort by total occurrences, most frequent first, then write the list.
sorted_list = sorted(word_list.items(), key=itemgetter(1), reverse=True)
build_word_list(sorted_list, options.out)
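
# The generated list is meant for making passphrases (see the module
# docstring). A minimal sketch of that use, assuming a previous run
# produced word_list.txt (not part of this script):
#
#   import secrets
#   with open('word_list.txt') as word_file:
#       words = word_file.read().split()
#   print(' '.join(secrets.choice(words) for _ in range(6)))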