-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwiki-wordlist.py
executable file
·139 lines (109 loc) · 3.7 KB
/
wiki-wordlist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/bin/python
"""Wiki Wordlists
Usage:
wiki-wordlists.py <file> [options]
wiki-wordlists.py -h
Options:
-h --help Show this screen.
-n <number>, --number <number> Specify number of words [default: 1000]
-c <config>, --config <config> Use config file
"""
import xml.etree.ElementTree as etree
from collections import Counter
import regex
import string
from sys import argv, maxunicode
from os import fstat
import unicodedata
import logging
import json
from docopt import docopt
#@profile
def __main__():
    """Print the most frequent words found in the <text> elements of an XML file.

    Command-line arguments are parsed by docopt from the module docstring:
    ``<file>`` is the XML input (presumably a wiki export -- the cleanup
    patterns target wiki markup), ``--number`` is how many words to print and
    ``--config`` optionally names a JSON file.  The config may provide
    ``excluded`` (words never reported) and ``robottag`` (text containing a
    ``{{<robottag>...}}`` template is skipped entirely).

    The words are printed one per line, most frequent first.  Progress and
    summary messages are emitted through ``logging.warning``.

    Raises:
        SystemExit: with status 1 when ``--number`` is not a positive integer,
            when the config file cannot be loaded, or when ``<file>`` cannot
            be opened.
    """
    logging.basicConfig(format='%(message)s')
    arguments = docopt(__doc__, version='Wiki Wordlists 0.5')

    # Process flags.  A non-numeric value is reported like any other invalid
    # count instead of surfacing as a traceback.
    try:
        words_to_output = int(arguments['--number'])
    except ValueError:
        words_to_output = 0
    if words_to_output < 1:
        logging.warning('Error: The number of words to output must be more than 0')
        raise SystemExit(1)

    config = _load_config(arguments['--config'])

    # Translation table that deletes punctuation, currency/math symbols and
    # decimal digits in a single pass over each text.
    punctuation_table = _build_punctuation_table()

    # Try to open main file.  Binary mode lets the XML parser honour the
    # encoding declared by the document itself and makes tell() a plain byte
    # offset that is comparable with the file size.
    filename = arguments['<file>']
    try:
        xmlfile = open(filename, 'rb')
    except IOError:
        logging.warning('Error: {:s} could not be read'.format(filename))
        raise SystemExit(1)

    # Regex to clean everything up
    whitespace_regex = regex.compile(r'\s+')
    quotes_regex = regex.compile(r'\'\'\'?')
    cleanup_regex = regex.compile(r"""\[\[.+?\]\]|\{\{.+?\}\} # markup tags like [[]] or {{}}
        | ^[ |*{!=}].*?$ # tables, links and headers
        | <!--.*?--> # comment tags
        | <.*?>.*?</.*?> # xml tags
        | <.*?/> # self-closing xml tag
        | <.*?>.*?$ # xml tags spanning a line
        | ^.*?</.*?> # xml tags spanning a line
        """, regex.VERBOSE | regex.M)

    # If a robot tag was set in a config file, look for it, too.
    # NOTE: the tag is interpolated unescaped, so any regex metacharacters it
    # contains are interpreted as pattern syntax.
    robot_tag = config.get('robottag')
    if robot_tag is not None:
        robot_regex = regex.compile(r'\{\{' + robot_tag + r'.*?\}\}')
    else:
        robot_regex = None

    counter = Counter()
    tags_found = 0
    robot_tags_found = 0

    with xmlfile:
        context = etree.iterparse(xmlfile, events=('start', 'end'))
        # The first event is the start of the root element; keep a handle on
        # it so already-processed children can be discarded as we go.
        event, root = next(context)
        # Length of the "{namespace}" prefix that ElementTree puts on tags.
        url_length = root.tag.find('}') + 1

        xmlfilesize = float(fstat(xmlfile.fileno()).st_size)
        update_interval = 100 if xmlfilesize < 1e7 else 250 if xmlfilesize < 1e10 else 350

        for event, elem in context:
            # Only completed, non-empty <text> elements carry words.
            if event != 'end' or elem.tag[url_length:] != 'text' or elem.text is None:
                continue
            text = elem.text

            # Update tag count
            tags_found += 1

            # Skip texts that contain the robot tag
            if robot_regex is not None and robot_regex.search(text) is not None:
                robot_tags_found += 1
                root.clear()
                continue

            text = cleanup_regex.sub('', text)
            text = quotes_regex.sub(' ', text)
            text = text.translate(punctuation_table)
            # Splitting on whitespace runs may yield empty strings at either
            # end; '' is removed from the counter before reporting.
            words = whitespace_regex.split(text)

            if tags_found % update_interval == 0:
                logging.warning("Tags found: {0:8n} Progress: {1:6.4f}".format(tags_found, float(xmlfile.tell())/xmlfilesize))

            counter.update(words)
            # Drop the parsed subtree so memory use stays bounded.
            root.clear()

    # Remove configured exclusions plus the artefacts left by splitting and
    # quote stripping.  Deleting a missing key from a Counter is a no-op.
    excluded = list(config.get('excluded') or []) + ['', "''", "'"]
    for word in excluded:
        del counter[word]

    logging.warning("Total tags found:\t{0:10n}\nTotal robot tags:\t{1:10n}\nTotal words found:\t{2:10n}".format(tags_found, robot_tags_found, len(counter)))

    for word, _count in counter.most_common(words_to_output):
        print(word)


def _load_config(config_filename):
    """Return the configuration dict, read from JSON when a filename is given.

    Without a filename the built-in default (excluding 'Wikipedia') is used.
    Exits with status 1 if the file cannot be read, is not valid JSON, or does
    not contain a JSON object.
    """
    if config_filename is None:
        return {'excluded': ['Wikipedia']}
    try:
        with open(config_filename, 'r') as config_file:
            config = json.load(config_file)
        if not isinstance(config, dict):
            raise ValueError('config must be a JSON object')
    except (OSError, ValueError):
        # ValueError also covers json.JSONDecodeError and UnicodeDecodeError.
        logging.warning("Error: {0} could not be opened as a config file".format(config_filename))
        raise SystemExit(1)
    return config


def _build_punctuation_table():
    """Build a str.translate table deleting punctuation, symbols and digits.

    Every code point whose Unicode category is punctuation (P*), currency
    symbol (Sc), math symbol (Sm) or decimal digit (Nd) maps to None, except
    the two apostrophes, which are kept so contractions stay intact.
    """
    table = {}
    # maxunicode is itself a valid code point, hence the + 1.
    for codepoint in range(maxunicode + 1):
        category = unicodedata.category(chr(codepoint))
        if category[0] == 'P' or category in ('Sc', 'Sm', 'Nd'):
            table[codepoint] = None
    del table[ord("'")]  # Dumb apostrophe
    del table[ord("’")]  # Smart apostrophe
    return table
# Script entry point: runs unconditionally whenever this file is executed
# (there is no ``if __name__ == "__main__"`` guard).
__main__()