forked from Morail/wiki-network
-
Notifications
You must be signed in to change notification settings - Fork 10
/
signature2graph.py
executable file
·159 lines (124 loc) · 5.09 KB
/
signature2graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python
##########################################################################
# #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
##########################################################################
import logging
from bz2 import BZ2File

## PROJECT LIBS
from sonet.edgecache import EdgeCache
import sonet.mediawiki as mwlib
from sonet.mediawiki import PageProcessor
from sonet import lib
from sonet.timr import Timr
class CurrentPageProcessor(PageProcessor):
    """
    Inherits PageProcessor to process "current" dumps of wikipedia to find
    signatures on UTP.

    NOTE(review): "UTP" presumably stands for "User Talk Pages"; the
    attributes ``lang``, ``search``, ``user_talk_names``, ``ecache`` and
    ``count`` read below are not defined in this class and are assumed to
    be set by ``PageProcessor.__init__`` from its keyword arguments --
    confirm against sonet.mediawiki.
    """
    # Set to True when the current page must be ignored; reset to False by
    # process_page() at the end of every page.
    _skip = False
    # Second ':'-separated piece of the title of the last accepted page.
    user = None
    # mwlib.SignatureFinder instance, created in __init__.
    sig_finder = None

    def __init__(self, *args, **kwargs):
        """
        Initialise the base processor and build the signature finder.

        All arguments are forwarded unchanged to PageProcessor. If a
        ``signature`` keyword is present it is also handed to
        mwlib.SignatureFinder; otherwise the finder is built without it.
        """
        super(CurrentPageProcessor, self).__init__(*args, **kwargs)
        sf_kwargs = {'lang': self.lang}
        # Only forward the keyword when the caller supplied it, so that
        # SignatureFinder can apply its own default otherwise.
        if 'signature' in kwargs:
            sf_kwargs['signature'] = kwargs['signature']
        self.sig_finder = mwlib.SignatureFinder(self.search, **sf_kwargs)

    def process_title(self, elem):
        """
        Decide from the page title whether the page must be processed.

        The part of the title before the first '/' is split on ':'. The
        page is accepted when there is more than one piece, the first
        piece (UTF-8 encoded) is one of ``self.user_talk_names`` and the
        second piece is not empty; in that case the second piece is stored
        in ``self.user``. In every other case the page is flagged to be
        skipped.
        """
        text = elem.text
        if not text:
            self._skip = True
            return

        # Drop any sub-page suffix, then separate namespace from page name.
        a_title = text.split('/')[0].split(':')
        if (len(a_title) > 1
                and a_title[0].encode("utf8") in self.user_talk_names
                and a_title[1]):
            self.user = a_title[1]
        else:
            self._skip = True

    def process_text(self, elem):
        """
        Look for signatures in the page text and record them in the cache.

        Nothing is recorded when the text is empty or when mwlib reports
        it as a hard or soft redirect. Otherwise the result of the
        signature finder is added to ``self.ecache`` under the normalized
        name of ``self.user`` and the page counter is incremented; every
        500 pages the counter is logged at INFO level.

        Raises AssertionError if no user has been set by process_title().
        """
        assert self.user, "User still not defined"

        text = elem.text
        if not (text and self.user):
            return

        if (mwlib.isHardRedirect(text) or mwlib.isSoftRedirect(text)):
            return

        try:
            talks = self.sig_finder.find(text)
            self.ecache.add(mwlib.normalize_pagename(self.user), talks)
        # Checks if self.user is a valid pagename
        except AttributeError:
            self._skip = True
            return

        self.count += 1
        if not self.count % 500:
            # `logging` must be the module-level import: this method runs
            # outside main(), where the name used to be imported locally.
            logging.info("Counter: %d", self.count)

    def process_page(self, _):
        """
        Called at the end of every <page> tag.
        """
        self._skip = False

    def end(self):
        """
        Flush the edge cache once processing is finished.
        """
        self.ecache.flush()
def main():
    """
    Command line entry point: build a network from signatures in a dump.

    Expects one positional argument, the path of a bz2-compressed dump.
    Options:
      -v  configure logging on stderr at DEBUG level
      -s  signature word passed to the page processor

    The resulting network is written in pickle format to
    "<lang>wiki-<date><type>.pickle" in the current directory, where the
    three parts come from mwlib.explode_dump_filename().
    """
    import optparse

    p = optparse.OptionParser(usage="usage: %prog file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-s', action="store", dest="signature", default=None,
                 help="Signature in this language (e.g. sig, firma..)")
    opts, files = p.parse_args()

    # Only the logging *configuration* is conditional. The module itself is
    # imported at file level, so the logging calls below also work (and are
    # simply filtered out by the default level) when -v is not given.
    if opts.verbose:
        import sys
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    try:
        xml = files[0]
    except IndexError:
        # p.error() prints the message and exits.
        p.error("Give me one file, please")

    en_user, en_user_talk = u"User", u"User talk"

    lang, date, type_ = mwlib.explode_dump_filename(xml)

    # First pass over the dump: read the tags and namespace translations.
    src = BZ2File(xml)

    tag = mwlib.get_tags(src)

    ns_translation = mwlib.get_translations(src)
    lang_user, lang_user_talk = ns_translation['User'], \
        ns_translation['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    lang_user = unicode(lang_user, "utf8")
    en_user = unicode(en_user)

    # open dump with an external process to use multiple cores
    _fast = True
    if _fast:
        src.close()
        src = lib.BZ2FileExt(xml)

    processor_kwargs = {
        'ecache': EdgeCache(),
        'tag': tag,
        'user_talk_names': (lang_user_talk, en_user_talk),
        'search': (lang_user, en_user),
        'lang': lang,
    }
    # Pass the signature only when given on the command line, so that the
    # processor does not receive an explicit None.
    if opts.signature is not None:
        processor_kwargs['signature'] = opts.signature
    processor = CurrentPageProcessor(**processor_kwargs)

    with Timr('Processing'):
        processor.start(src)

    with Timr('Create network'):
        g = processor.ecache.get_network()

    # The extra argument needs a matching %d placeholder in the message.
    logging.info("Len: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    g.write("%swiki-%s%s.pickle" % (lang, date, type_), format="pickle")
if __name__ == "__main__":
    # Run the command line tool. To profile a run instead, replace the
    # call below with:
    #   import cProfile as profile
    #   profile.run('main()', 'mainprof')
    main()