-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathduckling.py
288 lines (250 loc) · 10.5 KB
/
duckling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import os
import imp
import jpype
import socket
import threading
from six import string_types
from distutils.util import strtobool
from dateutil import parser
from .dim import Dim
from .language import Language
# Module-level side effect: set a 15-second default timeout for socket objects
# created after this module is imported.
socket.setdefaulttimeout(15)
class Duckling(object):
    """Python wrapper for Duckling by wit.ai.

    The wrapper drives the Duckling Clojure library through JPype: it looks
    up Clojure vars via ``clojure.java.api.Clojure`` and converts the returned
    Java/Clojure collections into plain Python lists and dicts.

    Args:
        jvm_started: Set to True if the JVM has already been started (with
            all Java dependencies loaded). When False, the classpath is
            built from the bundled jars and the JVM is started if needed.
        parse_datetime: If True, values of the time dimension are parsed
            with ``dateutil.parser.parse``; otherwise they are returned
            unchanged. Default is False.
        minimum_heap_size: Initial and minimum JVM heap size (``-Xms``).
            Default is 128m.
        maximum_heap_size: Maximum JVM heap size (``-Xmx``). Default is
            2048m.
    """

    def __init__(self,
                 jvm_started=False,
                 parse_datetime=False,
                 minimum_heap_size='128m',
                 maximum_heap_size='2048m'):
        """Initializes Duckling and requires the ``duckling.core`` namespace."""
        self.parse_datetime = parse_datetime
        self._is_loaded = False
        self._lock = threading.Lock()

        if not jvm_started:
            self._classpath = self._create_classpath()
            self._start_jvm(minimum_heap_size, maximum_heap_size)

        self._attach_current_thread()
        # The lock is held through a context manager so it is only released
        # if it was actually acquired; an error raised while attaching the
        # thread is therefore not masked by a failing release().
        with self._lock:
            self.clojure = jpype.JClass('clojure.java.api.Clojure')
            # require the duckling Clojure lib
            require = self.clojure.var("clojure.core", "require")
            require.invoke(self.clojure.read("duckling.core"))

    def _attach_current_thread(self):
        """Attaches the calling thread to the JVM when several threads run."""
        if threading.active_count() > 1 and not jpype.isThreadAttachedToJVM():
            jpype.attachThreadToJVM()

    def _start_jvm(self, minimum_heap_size, maximum_heap_size):
        """Starts the JVM with the given heap sizes unless it already runs.

        Args:
            minimum_heap_size: Value appended to ``-Xms``.
            maximum_heap_size: Value appended to ``-Xmx``.
        """
        if jpype.isJVMStarted():
            return

        jvm_options = [
            '-Xms{minimum_heap_size}'.format(
                minimum_heap_size=minimum_heap_size),
            '-Xmx{maximum_heap_size}'.format(
                maximum_heap_size=maximum_heap_size),
            '-Djava.class.path={classpath}'.format(
                classpath=self._classpath),
        ]
        jpype.startJVM(jpype.getDefaultJVMPath(), *jvm_options)

    def _create_classpath(self):
        """Returns all ``.jar`` files below the package's ``jars`` directory.

        The paths are joined with ``os.pathsep`` so the result can be used
        as a Java class path.
        """
        # NOTE(review): `imp` is deprecated and was removed in Python 3.12;
        # the lookup is kept as-is because the import lives at file level.
        jar_root = os.path.join(imp.find_module('duckling')[1], 'jars')
        jars = []
        for top, _dirs, files in os.walk(jar_root):
            for file_name in files:
                if file_name.endswith('.jar'):
                    jars.append(os.path.join(top, file_name))
        return os.pathsep.join(jars)

    def load(self, languages=None):
        """Loads the Duckling corpus.

        Languages can be specified, defaults to all.

        Args:
            languages: Optional list of languages, e.g. [Language.ENGLISH]
                or supported ISO 639-1 codes (e.g. ["en", "fr"]). ``None``
                or an empty list loads every language.
        """
        duckling_load = self.clojure.var("duckling.core", "load!")
        clojure_hashmap = self.clojure.var("clojure.core", "hash-map")
        clojure_list = self.clojure.var("clojure.core", "list")

        if languages:
            # Duckling's load function expects ISO 639-1 Language Codes
            # (e.g. "en")
            iso_languages = [
                Language.convert_to_iso(lang) for lang in languages
            ]
            duckling_load.invoke(
                clojure_hashmap.invoke(
                    self.clojure.read(':languages'),
                    clojure_list.invoke(*iso_languages)
                )
            )
        else:
            duckling_load.invoke()

        self._is_loaded = True

    def parse(self, input_str, language=Language.ENGLISH, dim_filter=None,
              reference_time=''):
        """Parses datetime information out of string input.

        It invokes the Duckling.parse() function in Clojure.
        A language can be specified, default is English.

        Args:
            input_str: The input as string that has to be parsed.
            language: Optional parameter to specify language,
                e.g. Language.ENGLISH or supported ISO 639-1 Code
                (e.g. "en")
            dim_filter: Optional parameter to specify a single filter or
                list of filters for dimensions in Duckling.
            reference_time: Optional reference time for Duckling.

        Returns:
            A list of dicts with the result from the Duckling.parse() call.

        Raises:
            RuntimeError: An error occurs when the Duckling model is not
                loaded via load().
        """
        if not self._is_loaded:
            raise RuntimeError(
                'Please load the model first by calling load()')

        self._attach_current_thread()

        language = Language.convert_to_duckling_language_id(language)
        duckling_parse = self.clojure.var("duckling.core", "parse")
        duckling_time = self.clojure.var("duckling.time.obj", "t")
        clojure_hashmap = self.clojure.var("clojure.core", "hash-map")

        filter_str = self._build_filter_str(dim_filter)

        if reference_time:
            duckling_result = duckling_parse.invoke(
                language,
                input_str,
                self.clojure.read(filter_str),
                clojure_hashmap.invoke(
                    self.clojure.read(':reference-time'),
                    duckling_time.invoke(
                        *self._parse_reference_time(reference_time))
                )
            )
        else:
            duckling_result = duckling_parse.invoke(
                language, input_str, self.clojure.read(filter_str))

        return self._parse_result(duckling_result)

    @staticmethod
    def _build_filter_str(dim_filter):
        """Renders ``dim_filter`` as a Clojure vector literal of keywords.

        A single string yields ``[:name]`` and a list yields
        ``[:a :b ...]``. Every element gets a leading colon (one already
        supplied by the caller is not doubled). Anything else, including
        ``None`` and empty input, yields ``[]``.
        """
        if not dim_filter:
            return '[]'
        if isinstance(dim_filter, string_types):
            names = [dim_filter]
        elif isinstance(dim_filter, list):
            names = dim_filter
        else:
            return '[]'
        keywords = [
            ':{name}'.format(name=name.lstrip(':')) for name in names
        ]
        return '[{filter}]'.format(filter=' '.join(keywords))

    def _parse_reference_time(self, reference_time):
        """Splits a reference time string into the arguments for Duckling.

        Returns:
            A tuple ``(utc_offset, year, month, day, hour, minute, second)``
            where ``utc_offset`` is in whole hours (0 when the parsed value
            has no or a zero UTC offset).
        """
        date_info = parser.parse(reference_time)
        offset = date_info.utcoffset()
        # NOTE(review): floor division drops the minutes of offsets such as
        # +05:30; confirm whether Duckling accepts fractional hours.
        utc_offset = int(offset.total_seconds()) // 3600 if offset else 0
        return (utc_offset, date_info.year,
                date_info.month, date_info.day,
                date_info.hour, date_info.minute,
                date_info.second)

    def _parse_result(self, duckling_result):
        """Converts the Duckling result collection into a list of dicts."""
        _functions = {
            u'dim': self._parse_symbol,
            u'body': self._parse_string,
            u'start': self._parse_int,
            u'end': self._parse_int,
            u'latent': self._parse_boolean
        }

        result = []
        for duckling_entry in duckling_result.iterator():
            entry = {}
            raw_value = None
            has_value = False
            for field in duckling_entry.iterator():
                key = field.getKey().toString()[1:]
                if key == u'value':
                    # Parsing the value needs the entry's dim, which may
                    # not have been visited yet; defer it until every field
                    # has been read and only reserve the key's position.
                    raw_value = field.getValue()
                    has_value = True
                    entry[key] = None
                else:
                    entry[key] = _functions[key](field.getValue())
            if has_value:
                entry[u'value'] = self._parse_dict(raw_value, entry[u'dim'])
            result.append(entry)
        return result

    def _parse_dict(self, java_dict, dim=None):
        """Converts a Java map into a dict, parsing each field by its key.

        Args:
            java_dict: The Java map to convert.
            dim: Dimension of the enclosing entry; forwarded to the parsers
                whose result depends on it.
        """
        _functions = {
            u'day': self._parse_int,
            u'grain': self._parse_symbol,
            u'hour': self._parse_int,
            u'minute': self._parse_int,
            u'month': self._parse_int,
            u'product': self._parse_string,
            u'quarter': self._parse_int,
            u'second': self._parse_int,
            u'type': self._parse_string,
            u'week': self._parse_int,
            u'year': self._parse_int
        }
        # Parsers in this table additionally receive the dimension.
        _functions_with_dim = {
            u'from': self._parse_dict,
            u'normalized': self._parse_dict,
            u'to': self._parse_dict,
            u'unit': self._parse_keyword,
            u'value': self._parse_value,
            u'values': self._parse_list
        }

        result = {}
        for field in java_dict.iterator():
            key = field.getKey().toString()[1:]
            if key in _functions_with_dim:
                result[key] = _functions_with_dim[key](field.getValue(), dim)
            else:
                result[key] = _functions[key](field.getValue())
        return result

    def _parse_list(self, java_list, dim=None):
        """Converts a Java list of maps into a list of dicts."""
        return [
            self._parse_dict(entry, dim) for entry in java_list.iterator()
        ]

    def _parse_float(self, java_number):
        """Converts a Java number to float via its string representation."""
        return float(java_number.toString())

    def _parse_int(self, java_number):
        """Converts a Java number to int via its string representation."""
        return int(java_number.toString())

    def _parse_value(self, java_value, dim=None):
        """Parses a ``value`` field with the parser registered for ``dim``.

        Unknown or missing dimensions fall back to ``_parse_string``. If the
        selected parser raises AttributeError, a string prefixed with
        ``'ERROR: '`` is returned instead of raising.
        """
        _dims = {
            Dim.AMOUNTOFMONEY: self._parse_float,
            Dim.CYCLE: self._parse_string,
            Dim.DISTANCE: self._parse_float,
            Dim.DURATION: self._parse_float,
            Dim.EMAIL: self._parse_string,
            Dim.LEVENPRODUCT: self._parse_string,
            Dim.LEVENUNIT: self._parse_string,
            Dim.NUMBER: self._parse_float,
            Dim.ORDINAL: self._parse_int,
            Dim.PHONENUMBER: self._parse_string,
            Dim.QUANTITY: self._parse_string,
            Dim.TEMPERATURE: self._parse_float,
            Dim.TIME: self._parse_time,
            Dim.TIMEZONE: self._parse_string,
            Dim.UNITOFDURATION: self._parse_string,
            Dim.URL: self._parse_string,
            Dim.VOLUME: self._parse_float
        }
        if not dim or dim not in _dims:
            return self._parse_string(java_value)
        try:
            return _dims[dim](java_value)
        except AttributeError:
            return 'ERROR: {msg}'.format(msg=self._parse_string(java_value))

    def _parse_time(self, time):
        """Returns the time value, parsed when ``parse_datetime`` is set.

        With ``parse_datetime`` enabled the value goes through
        ``dateutil.parser.parse`` and None is returned on ValueError;
        otherwise the value is returned unchanged.
        """
        if not self.parse_datetime:
            return self._parse_string(time)
        try:
            return parser.parse(time)
        except ValueError:
            return None

    def _parse_string(self, java_string):
        """Returns the value unchanged."""
        return java_string

    def _parse_keyword(self, java_keyword, dim=None):
        """Parses a ``unit`` field.

        For the duration dimension a non-string value is treated as a
        symbol (leading character stripped); in every other case the value
        is returned unchanged.
        """
        if dim == Dim.DURATION and not isinstance(java_keyword, string_types):
            return self._parse_symbol(java_keyword)
        return self._parse_string(java_keyword)

    def _parse_symbol(self, java_symbol):
        """Returns the string form of the value without its first character."""
        return java_symbol.toString()[1:]

    def _parse_boolean(self, java_boolean):
        """Converts a Java boolean to bool via its string representation."""
        return bool(strtobool(java_boolean.toString()))