@@ -153,6 +153,25 @@ impl Extractor {
153
153
} )
154
154
}
155
155
156
+
157
+ /// Extracts text from a file path. Returns a tuple with stream of the extracted text
158
+ /// the stream is decoded using the extractor's `encoding` and tika metadata.
159
+ pub fn extract_file_with_metadata ( & self , filename : & str ) -> PyResult < ( StreamReader , PyObject ) > {
160
+ let ( reader, metadata) = self . 0
161
+ . extract_file_with_metadata ( filename)
162
+ . map_err ( |e| PyErr :: new :: < PyTypeError , _ > ( format ! ( "{:?}" , e) ) ) ?;
163
+
164
+ // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
165
+ Python :: with_gil ( |py| {
166
+ let py_metadata = metadata_hashmap_to_pydict ( py, & metadata) ?;
167
+ Ok ( ( StreamReader {
168
+ reader,
169
+ buffer : Vec :: with_capacity ( ecore:: DEFAULT_BUF_SIZE ) ,
170
+ py_bytes : None ,
171
+ } , py_metadata. into ( ) ) )
172
+ } )
173
+ }
174
+
156
175
/// Extracts text from a file path. Returns a string that is of maximum length
157
176
/// of the extractor's `extract_string_max_length`
158
177
pub fn extract_file_to_string ( & self , filename : & str ) -> PyResult < String > {
@@ -169,7 +188,7 @@ impl Extractor {
169
188
. map_err ( |e| PyErr :: new :: < PyTypeError , _ > ( format ! ( "{:?}" , e) ) ) ?;
170
189
171
190
Python :: with_gil ( |py| {
172
- let py_metadata = hashmap_to_pydict ( py, & metadata) ;
191
+ let py_metadata = metadata_hashmap_to_pydict ( py, & metadata) ? ;
173
192
Ok ( ( content, py_metadata. into ( ) ) )
174
193
} )
175
194
}
@@ -191,6 +210,25 @@ impl Extractor {
191
210
} )
192
211
}
193
212
213
+ /// Extracts text from a bytearray. Returns a tuple with stream of the extracted text
214
+ /// the stream is decoded using the extractor's `encoding` and tika metadata.
215
+ pub fn extract_bytes_with_metadata ( & self , buffer : & Bound < ' _ , PyByteArray > ) -> PyResult < ( StreamReader , PyObject ) > {
216
+ let slice = buffer. to_vec ( ) ;
217
+ let ( reader, metadata) = self . 0
218
+ . extract_bytes_with_metadata ( & slice)
219
+ . map_err ( |e| PyErr :: new :: < PyTypeError , _ > ( format ! ( "{:?}" , e) ) ) ?;
220
+
221
+ // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
222
+ Python :: with_gil ( |py| {
223
+ let py_metadata = metadata_hashmap_to_pydict ( py, & metadata) ?;
224
+ Ok ( ( StreamReader {
225
+ reader,
226
+ buffer : Vec :: with_capacity ( ecore:: DEFAULT_BUF_SIZE ) ,
227
+ py_bytes : None ,
228
+ } , py_metadata. into ( ) ) )
229
+ } )
230
+ }
231
+
194
232
/// Extracts text from a url. Returns a stream (`StreamReader`) of the
195
233
/// extracted text.
196
234
pub fn extract_url ( & self , url : & str ) -> PyResult < StreamReader > {
@@ -207,15 +245,34 @@ impl Extractor {
207
245
} )
208
246
}
209
247
248
+ /// Extracts text from a url. Returns a tuple with string that is of maximum length
249
+ /// of the extractor's `extract_string_max_length` and tika metdata.
250
+ pub fn extract_url_with_metadata ( & self , url : & str ) -> PyResult < ( StreamReader , PyObject ) > {
251
+ let ( reader, metadata) = self . 0
252
+ . extract_url_with_metadata ( & url)
253
+ . map_err ( |e| PyErr :: new :: < PyTypeError , _ > ( format ! ( "{:?}" , e) ) ) ?;
254
+
255
+ // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
256
+ Python :: with_gil ( |py| {
257
+ let py_metadata = metadata_hashmap_to_pydict ( py, & metadata) ?;
258
+ Ok ( ( StreamReader {
259
+ reader,
260
+ buffer : Vec :: with_capacity ( ecore:: DEFAULT_BUF_SIZE ) ,
261
+ py_bytes : None ,
262
+ } , py_metadata. into ( ) ) )
263
+ } )
264
+ }
265
+
210
266
fn __repr__ ( & self ) -> String {
211
267
format ! ( "{:?}" , self . 0 )
212
268
}
213
269
}
214
270
215
- fn hashmap_to_pydict < ' py > ( py : Python < ' py > , hashmap : & HashMap < String , String > ) -> & ' py PyDict {
271
+ fn metadata_hashmap_to_pydict < ' py > ( py : Python < ' py > , hashmap : & HashMap < String , Vec < String > > ) -> Result < & ' py PyDict , PyErr > {
216
272
let pydict = PyDict :: new ( py) ;
217
273
for ( key, value) in hashmap {
218
- pydict. set_item ( key, value) . unwrap ( ) ;
274
+ pydict. set_item ( key, value)
275
+ . map_err ( |e| PyErr :: new :: < PyTypeError , _ > ( format ! ( "{:?}" , e) ) ) ?;
219
276
}
220
- pydict
277
+ Ok ( pydict)
221
278
}
0 commit comments