Skip to content

Commit

Permalink
Merge pull request #1 from s4zuk3/3-return-tika-metadata-extend
Browse files Browse the repository at this point in the history
Changed HashMap to Tika Metadata. Extended functionality for bytes, f…
  • Loading branch information
s4zuk3 authored Nov 14, 2024
2 parents 1b8039c + f2901cf commit 9d8971e
Show file tree
Hide file tree
Showing 33 changed files with 1,256 additions and 2,853 deletions.
65 changes: 61 additions & 4 deletions bindings/extractous-python/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,25 @@ impl Extractor {
})
}


/// Extracts text and tika metadata from the file at `filename`.
///
/// Returns a tuple of a [`StreamReader`] over the extracted text (the stream is
/// decoded using the extractor's `encoding`) and the metadata converted to a
/// Python dict.
///
/// # Errors
///
/// An extraction failure is raised as a Python `TypeError` whose message is the
/// debug rendering of the underlying error. A failure while building the
/// metadata dict is propagated as-is.
pub fn extract_file_with_metadata(&self, filename: &str) -> PyResult<(StreamReader, PyObject)> {
    let (reader, metadata) = match self.0.extract_file_with_metadata(filename) {
        Ok(extracted) => extracted,
        Err(e) => return Err(PyErr::new::<PyTypeError, _>(format!("{:?}", e))),
    };

    // Building the Python dict requires the GIL; only the conversion happens inside it.
    let py_metadata = Python::with_gil(|py| -> PyResult<PyObject> {
        Ok(metadata_hashmap_to_pydict(py, &metadata)?.into())
    })?;

    // The reader starts with an empty buffer pre-sized to ecore::DEFAULT_BUF_SIZE bytes
    let stream = StreamReader {
        reader,
        buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
        py_bytes: None,
    };
    Ok((stream, py_metadata))
}

/// Extracts text from a file path. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
Expand All @@ -169,7 +188,7 @@ impl Extractor {
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

Python::with_gil(|py| {
let py_metadata = hashmap_to_pydict(py, &metadata);
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((content, py_metadata.into()))
})
}
Expand All @@ -191,6 +210,25 @@ impl Extractor {
})
}

/// Extracts text and tika metadata from the contents of a Python `bytearray`.
///
/// Returns a tuple of a [`StreamReader`] over the extracted text (the stream is
/// decoded using the extractor's `encoding`) and the metadata converted to a
/// Python dict.
///
/// # Errors
///
/// An extraction failure is raised as a Python `TypeError` whose message is the
/// debug rendering of the underlying error. A failure while building the
/// metadata dict is propagated as-is.
pub fn extract_bytes_with_metadata(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<(StreamReader, PyObject)> {
    // Copy the bytearray into an owned Vec so the extractor works on a plain byte slice
    let payload = buffer.to_vec();
    let (reader, metadata) = self
        .0
        .extract_bytes_with_metadata(&payload)
        .map_err(|err| PyErr::new::<PyTypeError, _>(format!("{:?}", err)))?;

    // The reader starts with an empty buffer pre-sized to ecore::DEFAULT_BUF_SIZE bytes
    let stream = StreamReader {
        reader,
        buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
        py_bytes: None,
    };

    // The dict conversion needs the GIL; pair the result with the already-built stream.
    Python::with_gil(|py| {
        metadata_hashmap_to_pydict(py, &metadata).map(|dict| (stream, dict.into()))
    })
}

/// Extracts text from a url. Returns a `StreamReader` over the extracted text
/// (a stream, not a length-limited string).
pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
Expand All @@ -207,15 +245,34 @@ impl Extractor {
})
}

/// Extracts text and tika metadata from the document at `url`.
///
/// Returns a tuple of a [`StreamReader`] over the extracted text and the tika
/// metadata converted to a Python dict. The text is returned as a stream, not
/// as a length-limited string.
///
/// # Errors
///
/// An extraction failure is raised as a Python `TypeError` whose message is the
/// debug rendering of the underlying error. A failure while building the
/// metadata dict is propagated as-is.
pub fn extract_url_with_metadata(&self, url: &str) -> PyResult<(StreamReader, PyObject)> {
    // `url` is already a `&str`; pass it through without an extra borrow.
    let (reader, metadata) = self.0
        .extract_url_with_metadata(url)
        .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

    // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
    Python::with_gil(|py| {
        let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
        Ok((StreamReader {
            reader,
            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
            py_bytes: None,
        }, py_metadata.into()))
    })
}

/// Python `repr()` hook: returns the `Debug` rendering of the wrapped extractor.
fn __repr__(&self) -> String {
    let inner = &self.0;
    format!("{:?}", inner)
}
}

fn hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, String>) -> &'py PyDict {
fn metadata_hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, Vec<String>>) -> Result<&'py PyDict, PyErr> {
let pydict = PyDict::new(py);
for (key, value) in hashmap {
pydict.set_item(key, value).unwrap();
pydict.set_item(key, value)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
}
pydict
Ok(pydict)
}
32 changes: 31 additions & 1 deletion bindings/extractous-python/tests/test_extract_bytes_to_stream.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import json
import pytest

from extractous import Extractor
from utils import cosine_similarity, read_to_string, read_file_to_bytearray
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand Down Expand Up @@ -38,3 +39,32 @@ def test_extract_bytes_to_stream(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"


# Each entry is (document file name, threshold). The threshold is the value the
# metadata similarity returned by calculate_similarity_percent must exceed for
# the parametrized metadata test to pass.
TEST_CASES_METADATA = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.6),
("vodafone.xlsx", 0.8),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", 0.9),
("winter-sports.epub", 0.8),
("bug_16.docx", 0.9),
]


@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
def test_extract_bytes_with_metadata(file_name, similarity_percent):
    """Check the metadata returned by `extract_bytes_with_metadata` against the stored expectation.

    Named distinctly from `test_extract_bytes_to_stream`: a second function with
    that name in the same module would replace the text-extraction test, so
    pytest would silently stop running it.

    `similarity_percent` is the threshold (a 0-1 ratio) that the metadata
    similarity must exceed.
    """
    original_filepath = f"../../test_files/documents/{file_name}"
    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
    file_bytes = read_file_to_bytearray(original_filepath)
    extractor = Extractor()
    # Only the metadata is checked here; the text stream is intentionally unused.
    _reader, metadata = extractor.extract_bytes_with_metadata(file_bytes)
    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
        expected_metadata = json.load(file)
    # Arguments follow the helper's declared (expected, current) order.
    percent_similarity = calculate_similarity_percent(expected_metadata, metadata)
    # The helper returns a ratio in [0, 1]; format it as a real percentage in the message.
    assert percent_similarity > similarity_percent, \
        f"The metadata similarity is lower than expected. Current {percent_similarity:.1%} | filename: {file_name}"
30 changes: 29 additions & 1 deletion bindings/extractous-python/tests/test_extract_file_to_string.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import json
import pytest

from extractous import Extractor
from utils import cosine_similarity
from utils import calculate_similarity_percent, cosine_similarity

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand Down Expand Up @@ -32,3 +33,30 @@ def test_extract_file_to_string(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"


# Each entry is (document file name, threshold). The threshold is the value the
# metadata similarity returned by calculate_similarity_percent must exceed for
# the parametrized metadata test to pass.
TEST_CASES_METADATA = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.6),
("vodafone.xlsx", 0.8),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", 0.9),
("winter-sports.epub", 0.8),
("bug_16.docx", 0.9),
]

@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
def test_extract_file_to_string_with_metadata(file_name, similarity_percent):
    """Check the metadata returned by `extract_file_to_string_with_metadata` against the stored expectation.

    `similarity_percent` is the threshold (a 0-1 ratio) that the metadata
    similarity must exceed.
    """
    original_filepath = f"../../test_files/documents/{file_name}"
    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
    extractor = Extractor()
    # Only the metadata is checked here; the extracted text is intentionally unused.
    _result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)
    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
        expected_metadata = json.load(file)
    # Arguments follow the helper's declared (expected, current) order.
    percent_similarity = calculate_similarity_percent(expected_metadata, metadata)
    # The helper returns a ratio in [0, 1]; format it as a real percentage in the message.
    assert percent_similarity > similarity_percent, \
        f"The metadata similarity is lower than expected. Current {percent_similarity:.1%} | filename: {file_name}"

This file was deleted.

5 changes: 5 additions & 0 deletions bindings/extractous-python/tests/test_extract_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ def test_extract_url():
result = read_to_string(reader)

assert "Google" in result

def test_extract_url_with_metadata():
    """Extracting from a URL should yield at least one metadata entry."""
    extractor = Extractor()
    # Only the metadata is checked here; the text stream is intentionally unused.
    _reader, metadata = extractor.extract_url_with_metadata("https://www.google.com")
    assert len(metadata) > 0
18 changes: 18 additions & 0 deletions bindings/extractous-python/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,21 @@ def read_file_to_bytearray(file_path: str):
with open(file_path, 'rb') as file:
file_content = bytearray(file.read())
return file_content


def calculate_similarity_percent(expected, current):
    """Return the fraction of shared keys whose values are equal in both mappings.

    Despite the name, the result is a ratio in the range [0.0, 1.0], not a
    value scaled to 100.

    Only keys present in BOTH mappings are considered; a key that appears in
    `expected` but not in `current` is ignored rather than counted as a
    mismatch.

    Args:
        expected: mapping of metadata keys to their reference values.
        current: mapping of metadata keys to the values being checked.

    Returns:
        matches / shared-key-count as a float, or 0.0 when the mappings have
        no key in common (including when either one is empty).
    """
    # Keys of `expected` that `current` also has; only these are compared.
    shared_keys = [key for key in expected if key in current]
    if not shared_keys:
        return 0.0

    matches = sum(1 for key in shared_keys if expected[key] == current[key])
    return matches / len(shared_keys)
Loading

0 comments on commit 9d8971e

Please sign in to comment.