textual_feature.py
import pickle
import os
import re
from inspect import signature
from os.path import dirname, abspath, join
from collections import OrderedDict
from io import StringIO
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars

# Registry mapping feature name -> wrapped feature function (populated by the textual_feature decorator)
decorated_features = OrderedDict()
# This file must live in the same directory as the tokenizers/ directory
sentence_tokenizer_dir = join(dirname(abspath(__file__)), 'tokenizers')
word_tokenizer = None
sentence_tokenizers = None

def setup_tokenizers(terminal_punctuation):
    """Initialize the module-level word and sentence tokenizers, treating the
    characters in terminal_punctuation as sentence-ending punctuation."""
    PunktLanguageVars.sent_end_chars = terminal_punctuation
    PunktLanguageVars.re_boundary_realignment = re.compile(r'[›»》’”\'\")\)\]\}\>]+?(?:\s+|(?=--)|$)', re.MULTILINE)
    global word_tokenizer
    global sentence_tokenizers
    # Accessing private variables of PunktLanguageVars because nltk has a faulty design pattern that necessitates it.
    # Issue reported here: https://github.com/nltk/nltk/issues/2068
    word_tokenizer = PunktLanguageVars()
    word_tokenizer._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % {
        'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'MultiChar': PunktLanguageVars._re_multi_char_punct,
        'WordStart': r"[^\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]",
    }, re.UNICODE | re.VERBOSE)
    word_tokenizer._re_period_context = re.compile(PunktLanguageVars._period_context_fmt % {
        'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'SentEndChars': word_tokenizer._re_sent_end_chars,
    }, re.UNICODE | re.VERBOSE)
    # Second set of language vars: the same regexes but without digits and '.' in the
    # non-word class; these are applied to the sentence tokenizers below.
    x = PunktLanguageVars()
    x._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % {
        'NonWord': r"(?:[\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'MultiChar': PunktLanguageVars._re_multi_char_punct,
        'WordStart': r"[^\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]",
    }, re.UNICODE | re.VERBOSE)
    x._re_period_context = re.compile(PunktLanguageVars._period_context_fmt % {
        'NonWord': r"(?:[\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'SentEndChars': x._re_sent_end_chars,
    }, re.UNICODE | re.VERBOSE)
    # Read tokenizers from pickle files (also include an untrained tokenizer). Mapping from language name to tokenizer.
    sentence_tokenizers = dict({None: PunktSentenceTokenizer(lang_vars=PunktLanguageVars())}, **{
        current_file_name[:current_file_name.index('.')]: pickle.load(open(join(current_path, current_file_name), mode='rb'))
        for current_path, current_dir_names, current_file_names in os.walk(sentence_tokenizer_dir)
        for current_file_name in current_file_names if current_file_name.endswith('.pickle')
    })
    for s in sentence_tokenizers.values():
        s._lang_vars._re_period_context = x._re_period_context
        s._lang_vars._re_word_tokenizer = x._re_word_tokenizer

def reset_tokenizers():
    global word_tokenizer
    global sentence_tokenizers
    word_tokenizer = None
    sentence_tokenizers = None

# Maps each tokenize type to its tokenizing function plus a one-slot cache
# ('prev_filename' / 'tokens') holding the most recently tokenized file:
#   None             -> the raw file text, untokenized
#   'sentences'      -> a list of sentence strings
#   'words'          -> a list of word tokens
#   'sentence_words' -> a list of sentences, each a list of word tokens
tokenize_types = {
    None: {
        'func': lambda lang, file: file,
        'prev_filename': None,
        'tokens': None,
    },
    'sentences': {
        'func': lambda lang, file: sentence_tokenizers[lang].tokenize(file),
        'prev_filename': None,
        'tokens': None,
    },
    'words': {
        'func': lambda lang, file: word_tokenizer.word_tokenize(file),
        'prev_filename': None,
        'tokens': None,
    },
    'sentence_words': {
        'func': lambda lang, file: [word_tokenizer.word_tokenize(s) for s in sentence_tokenizers[lang].tokenize(file)],
        'prev_filename': None,
        'tokens': None,
    },
}

debug_output = StringIO()

def clear_cache(cache, debug):
    for k, v in cache.items():
        v['prev_filename'] = None
        v['tokens'] = None
    debug.truncate(0)
    debug.seek(0)

def textual_feature(tokenize_type=None, lang=None, debug=False):
    """Decorator: tokenize the decorated function's 'file' argument according to
    tokenize_type and lang (caching the tokens by filename), pass the tokens to
    the decorated function, and register it in decorated_features."""
    if not (word_tokenizer and sentence_tokenizers):
        raise ValueError('Tokenizers not initialized: Use "setup_tokenizers(<collection of punctuation>)"')
    if tokenize_type not in tokenize_types:
        raise ValueError('"' + str(tokenize_type) + '" is not a valid tokenize type: Choose from among ' +
                         str(list(tokenize_types.keys())))
    if lang not in sentence_tokenizers:
        raise ValueError('"' + str(lang) + '" is not an available language: Choose from among ' +
                         str(list(sentence_tokenizers.keys())))
    def decor(f):
        # TODO make this more extensible. Use keyword args somehow instead of 'file' parameter?
        if 'file' not in signature(f).parameters:
            raise ValueError('Decorated functions must take a "file" parameter')
        def wrapper(file, filename=None):
            if filename:
                # Cache the tokenized version of this file if this filename is new
                if tokenize_types[tokenize_type]['prev_filename'] != filename:
                    tokenize_types[tokenize_type]['prev_filename'] = filename
                    tokenize_types[tokenize_type]['tokens'] = tokenize_types[tokenize_type]['func'](lang, file)
                elif debug:
                    debug_output.write('Cache hit! ' + 'function: <' + f.__name__ + '>, filename: ' + filename + '\n')
                return f(tokenize_types[tokenize_type]['tokens'])
            else:
                return f(tokenize_types[tokenize_type]['func'](lang, file))
        decorated_features[f.__name__] = wrapper
        return wrapper
    return decor
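
# Illustrative usage sketch, not part of the original module: it assumes nltk is
# installed and that running this file directly as a script is acceptable. The
# feature name 'mean_sentence_length', the punctuation set, the sample text, and
# the filename are hypothetical, chosen only to show the intended call pattern.
if __name__ == '__main__':
    # Treat '.', '!', and '?' as sentence-terminal punctuation.
    setup_tokenizers(terminal_punctuation=('.', '!', '?'))

    @textual_feature(tokenize_type='sentences', lang=None)
    def mean_sentence_length(file):
        # 'file' arrives already tokenized into a list of sentence strings.
        return sum(len(sentence) for sentence in file) / len(file)

    text = 'First sentence. Second, longer sentence! A third?'
    # Passing a filename enables the per-filename token cache.
    print(mean_sentence_length(text, filename='sample.txt'))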