Skip to content

Commit 9d8971e

Browse files
authored
Merge pull request #1 from s4zuk3/3-return-tika-metadata-extend
Changed HashMap to Tika Metadata. Extended functionality for bytes, f…
2 parents 1b8039c + f2901cf commit 9d8971e

33 files changed

+1256
-2853
lines changed

bindings/extractous-python/src/extractor.rs

+61-4
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,25 @@ impl Extractor {
153153
})
154154
}
155155

156+
157+
/// Extracts text from a file path. Returns a tuple with stream of the extracted text
158+
/// the stream is decoded using the extractor's `encoding` and tika metadata.
159+
pub fn extract_file_with_metadata(&self, filename: &str) -> PyResult<(StreamReader, PyObject)> {
160+
let (reader, metadata) = self.0
161+
.extract_file_with_metadata(filename)
162+
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
163+
164+
// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
165+
Python::with_gil(|py| {
166+
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
167+
Ok((StreamReader {
168+
reader,
169+
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
170+
py_bytes: None,
171+
}, py_metadata.into()))
172+
})
173+
}
174+
156175
/// Extracts text from a file path. Returns a string that is of maximum length
157176
/// of the extractor's `extract_string_max_length`
158177
pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
@@ -169,7 +188,7 @@ impl Extractor {
169188
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
170189

171190
Python::with_gil(|py| {
172-
let py_metadata = hashmap_to_pydict(py, &metadata);
191+
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
173192
Ok((content, py_metadata.into()))
174193
})
175194
}
@@ -191,6 +210,25 @@ impl Extractor {
191210
})
192211
}
193212

213+
/// Extracts text from a bytearray. Returns a tuple with stream of the extracted text
214+
/// the stream is decoded using the extractor's `encoding` and tika metadata.
215+
pub fn extract_bytes_with_metadata(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<(StreamReader, PyObject)> {
216+
let slice = buffer.to_vec();
217+
let (reader, metadata) = self.0
218+
.extract_bytes_with_metadata(&slice)
219+
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
220+
221+
// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
222+
Python::with_gil(|py| {
223+
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
224+
Ok((StreamReader {
225+
reader,
226+
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
227+
py_bytes: None,
228+
}, py_metadata.into()))
229+
})
230+
}
231+
194232
/// Extracts text from a url. Returns a string that is of maximum length
195233
/// of the extractor's `extract_string_max_length`
196234
pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
@@ -207,15 +245,34 @@ impl Extractor {
207245
})
208246
}
209247

248+
/// Extracts text from a url. Returns a tuple with string that is of maximum length
249+
/// of the extractor's `extract_string_max_length` and tika metdata.
250+
pub fn extract_url_with_metadata(&self, url: &str) -> PyResult<(StreamReader, PyObject)> {
251+
let (reader, metadata) = self.0
252+
.extract_url_with_metadata(&url)
253+
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
254+
255+
// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
256+
Python::with_gil(|py| {
257+
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
258+
Ok((StreamReader {
259+
reader,
260+
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
261+
py_bytes: None,
262+
}, py_metadata.into()))
263+
})
264+
}
265+
210266
fn __repr__(&self) -> String {
211267
format!("{:?}", self.0)
212268
}
213269
}
214270

215-
fn hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, String>) -> &'py PyDict {
271+
fn metadata_hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, Vec<String>>) -> Result<&'py PyDict, PyErr> {
216272
let pydict = PyDict::new(py);
217273
for (key, value) in hashmap {
218-
pydict.set_item(key, value).unwrap();
274+
pydict.set_item(key, value)
275+
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
219276
}
220-
pydict
277+
Ok(pydict)
221278
}

bindings/extractous-python/tests/test_extract_bytes_to_stream.py

+31-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
import json
12
import pytest
23

34
from extractous import Extractor
4-
from utils import cosine_similarity, read_to_string, read_file_to_bytearray
5+
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray
56

67
TEST_CASES = [
78
("2022_Q3_AAPL.pdf", 0.9),
@@ -38,3 +39,32 @@ def test_extract_bytes_to_stream(file_name, target_dist):
3839
assert cosine_similarity(result, expected) > target_dist, \
3940
f"Cosine similarity is less than {target_dist} for file: {file_name}"
4041

42+
43+
TEST_CASES_METADATA = [
44+
("2022_Q3_AAPL.pdf", 0.9),
45+
("science-exploration-1p.pptx", 0.9),
46+
("simple.odt", 0.9),
47+
("table-multi-row-column-cells-actual.csv", 0.6),
48+
("vodafone.xlsx", 0.8),
49+
("category-level.docx", 0.9),
50+
("simple.doc", 0.9),
51+
("simple.pptx", 0.9),
52+
("table-multi-row-column-cells.png", 0.9),
53+
("winter-sports.epub", 0.8),
54+
("bug_16.docx", 0.9),
55+
]
56+
57+
58+
@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
def test_extract_bytes_to_stream_with_metadata(file_name, similarity_percent):
    """Test that metadata extracted from bytes is close enough to the stored expectation.

    The name is deliberately distinct from ``test_extract_bytes_to_stream``: reusing
    that name would rebind it at module level, so pytest would stop collecting the
    text-similarity test defined earlier in this module.
    """
    original_filepath = f"../../test_files/documents/{file_name}"
    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
    file_bytes = read_file_to_bytearray(original_filepath)
    extractor = Extractor()
    # Only the metadata is checked here; the text stream is covered by the test above.
    _reader, metadata = extractor.extract_bytes_with_metadata(file_bytes)
    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
        expected_metadata = json.load(file)
    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
    assert percent_similarity > similarity_percent, \
        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

bindings/extractous-python/tests/test_extract_file_to_string.py

+29-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
import json
12
import pytest
23

34
from extractous import Extractor
4-
from utils import cosine_similarity
5+
from utils import calculate_similarity_percent, cosine_similarity
56

67
TEST_CASES = [
78
("2022_Q3_AAPL.pdf", 0.9),
@@ -32,3 +33,30 @@ def test_extract_file_to_string(file_name, target_dist):
3233
assert cosine_similarity(result, expected) > target_dist, \
3334
f"Cosine similarity is less than {target_dist} for file: {file_name}"
3435

36+
37+
TEST_CASES_METADATA = [
38+
("2022_Q3_AAPL.pdf", 0.9),
39+
("science-exploration-1p.pptx", 0.9),
40+
("simple.odt", 0.9),
41+
("table-multi-row-column-cells-actual.csv", 0.6),
42+
("vodafone.xlsx", 0.8),
43+
("category-level.docx", 0.9),
44+
("simple.doc", 0.9),
45+
("simple.pptx", 0.9),
46+
("table-multi-row-column-cells.png", 0.9),
47+
("winter-sports.epub", 0.8),
48+
("bug_16.docx", 0.9),
49+
]
50+
51+
@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
def test_extract_file_to_string_with_metadata(file_name, similarity_percent):
    """Check that metadata extracted from a file path is close enough to the stored expectation."""
    source_path = f"../../test_files/documents/{file_name}"
    expected_json_path = f"../../test_files/expected_result/{file_name}.metadata.json"

    # Only the metadata half of the returned tuple is examined by this test.
    _text, metadata = Extractor().extract_file_to_string_with_metadata(source_path)

    with open(expected_json_path, 'r', encoding="utf8") as handle:
        expected_metadata = json.load(handle)

    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
    assert percent_similarity > similarity_percent, \
        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

bindings/extractous-python/tests/test_extract_file_to_string_with_metadata.py

-35
This file was deleted.

bindings/extractous-python/tests/test_extract_url.py

+5
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,8 @@ def test_extract_url():
88
result = read_to_string(reader)
99

1010
assert "Google" in result
11+
12+
def test_extract_url_with_metadata():
    """Extracting a url together with its metadata should yield a non-empty metadata mapping."""
    url_extractor = Extractor()
    extraction = url_extractor.extract_url_with_metadata("https://www.google.com")
    # The reader half of the tuple is not inspected here.
    _reader, metadata = extraction
    assert len(metadata.keys()) > 0

bindings/extractous-python/tests/utils.py

+18
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,21 @@ def read_file_to_bytearray(file_path: str):
4545
with open(file_path, 'rb') as file:
4646
file_content = bytearray(file.read())
4747
return file_content
48+
49+
50+
def calculate_similarity_percent(expected, current):
    """Return the fraction of shared keys whose values are equal in both mappings.

    Only keys present in both ``expected`` and ``current`` are counted; a key that
    appears in just one of them is ignored. The result is a ratio between 0 and 1
    (not multiplied by 100), and ``0.0`` when the mappings share no key.
    """
    # Pair every shared key with its value from 'expected'.
    shared = [(key, value) for key, value in expected.items() if key in current]
    if not shared:
        return 0.0

    equal_count = sum(1 for key, value in shared if value == current[key])
    return equal_count / len(shared)

0 commit comments

Comments
 (0)