Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/3 return tika metadata #26

Merged
merged 12 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,23 @@ result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pd
print(result)
```

* Extracting with metadata:

Append `_with_metadata` to an extraction method name (e.g. `extract_file_to_string_with_metadata`) to additionally return the file's metadata.

```python
from extractous import Extractor

# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)

# Extract text from a file
result, metadata = extractor.extract_file_to_string_with_metadata("README.md")
print(result)
print(metadata)
```

#### Rust
* Extract a file content to a string:
```rust
Expand All @@ -129,6 +146,22 @@ fn main() {
}
```

* Extracting with metadata:

```rust
use extractous::Extractor;

fn main() {
// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);

// Extract text from a file
let (text, metadata) = extractor.extract_file_to_string_with_metadata("README.md").unwrap();
println!("{}", text);
println!("{:?}", metadata);
}
```

* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading
```rust
use std::io::{BufReader, Read};
Expand Down
2 changes: 1 addition & 1 deletion bindings/extractous-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ doc = false

[dependencies]
# "abi3-py310" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.10
pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38"] }
pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38", "gil-refs"] }
extractous = { path = "../../extractous-core" }
114 changes: 109 additions & 5 deletions bindings/extractous-python/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use crate::{ecore, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig};
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3::types::PyByteArray;
use pyo3::types::PyDict;
use std::collections::HashMap;
use std::io::Read;

// PyO3 supports unit-only enums (which contain only unit variants)
Expand Down Expand Up @@ -80,13 +82,12 @@ impl StreamReader {
/// Fills the given Python `bytearray` with bytes from the underlying reader,
/// returning the number of bytes read. Maps any I/O failure to a Python
/// `IOError`.
pub fn readinto<'py>(&mut self, buf: Bound<'py, PyByteArray>) -> PyResult<usize> {
    // SAFETY: we hold the GIL for the duration of this call and do not
    // execute any Python code while the mutable view of `buf` is alive,
    // so the bytearray cannot be resized or aliased concurrently.
    let dest = unsafe { buf.as_bytes_mut() };

    match self.reader.read(dest) {
        Ok(n) => Ok(n),
        Err(e) => Err(PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string())),
    }
}

}

/// `Extractor` is the entry for all extract APIs
Expand Down Expand Up @@ -151,6 +152,30 @@ impl Extractor {
})
}

/// Extracts text from a file path.
///
/// Returns a tuple of (`StreamReader`, `dict`): a stream of the extracted
/// text, decoded using the extractor's `encoding`, and the Tika metadata as
/// a Python dict mapping each metadata key to a list of string values.
pub fn extract_file_with_metadata<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let (reader, metadata) = self
.0
.extract_file_with_metadata(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((
StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
},
py_metadata.into(),
))
}

/// Extracts text from a file path. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
Expand All @@ -159,6 +184,22 @@ impl Extractor {
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}

/// Extracts text from a file path.
///
/// Returns a tuple of (`str`, `dict`): the extracted text, truncated to the
/// extractor's `extract_string_max_length`, and the Tika metadata as a
/// Python dict mapping each metadata key to a list of string values.
pub fn extract_file_to_string_with_metadata<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(String, PyObject)> {
let (content, metadata) = self
.0
.extract_file_to_string_with_metadata(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((content, py_metadata.into()))
}

/// Extracts text from a bytearray. Returns a stream of the extracted text
/// the stream is decoded using the extractor's `encoding`
pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
Expand All @@ -176,6 +217,31 @@ impl Extractor {
})
}

/// Extracts text from a bytearray.
///
/// Returns a tuple of (`StreamReader`, `dict`): a stream of the extracted
/// text, decoded using the extractor's `encoding`, and the Tika metadata as
/// a Python dict mapping each metadata key to a list of string values.
pub fn extract_bytes_with_metadata<'py>(
    &self,
    buffer: &Bound<'_, PyByteArray>,
    py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
    // Copy the Python bytearray into an owned Vec so the core extractor can
    // read it without borrowing the Python object.
    let bytes = buffer.to_vec();
    let (reader, metadata) = self
        .0
        .extract_bytes_with_metadata(&bytes)
        .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

    let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
    // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
    Ok((
        StreamReader {
            reader,
            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
            py_bytes: None,
        },
        py_metadata.into(),
    ))
}

/// Extracts text from a url. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
Expand All @@ -192,7 +258,45 @@ impl Extractor {
})
}

/// Extracts text from a URL.
///
/// Returns a tuple of (`StreamReader`, `dict`): a stream of the extracted
/// text, decoded using the extractor's `encoding`, and the Tika metadata as
/// a Python dict mapping each metadata key to a list of string values.
pub fn extract_url_with_metadata<'py>(
    &self,
    url: &str,
    py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
    // `url` is already a `&str`; pass it through directly (no extra `&`).
    let (reader, metadata) = self
        .0
        .extract_url_with_metadata(url)
        .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

    let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
    // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
    Ok((
        StreamReader {
            reader,
            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
            py_bytes: None,
        },
        py_metadata.into(),
    ))
}

/// Python `repr()` support: delegates to the wrapped core extractor's
/// `Debug` representation.
fn __repr__(&self) -> String {
format!("{:?}", self.0)
}
}

/// Converts HashMap<String, Vec<String> to PyDict
fn metadata_hashmap_to_pydict<'py>(
py: Python<'py>,
hashmap: &HashMap<String, Vec<String>>,
) -> Result<Bound<'py, PyDict>, PyErr> {
let pydict = PyDict::new_bound(py);
for (key, value) in hashmap {
pydict
.set_item(key, value)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
}
Ok(pydict)
}
32 changes: 31 additions & 1 deletion bindings/extractous-python/tests/test_extract_bytes_to_stream.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import json
import pytest

from extractous import Extractor
from utils import cosine_similarity, read_to_string, read_file_to_bytearray
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand Down Expand Up @@ -38,3 +39,32 @@ def test_extract_bytes_to_stream(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"


TEST_CASES_METADATA = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.6),
("vodafone.xlsx", 0.8),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", 0.9),
("winter-sports.epub", 0.8),
("bug_16.docx", 0.9),
]


@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
def test_extract_bytes_to_stream_with_metadata(file_name, similarity_percent):
    """Test metadata extraction from bytes of various file types.

    NOTE: this function was previously also named ``test_extract_bytes_to_stream``,
    which shadowed the content-similarity test of the same name above so pytest
    never ran it. Renamed so both tests are collected.
    """
    original_filepath = f"../../test_files/documents/{file_name}"
    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
    file_bytes = read_file_to_bytearray(original_filepath)
    extractor = Extractor()
    _reader, metadata = extractor.extract_bytes_with_metadata(file_bytes)
    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
        expected_metadata = json.load(file)
    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
    assert percent_similarity > similarity_percent, \
        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
30 changes: 29 additions & 1 deletion bindings/extractous-python/tests/test_extract_file_to_string.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import json
import pytest

from extractous import Extractor
from utils import cosine_similarity
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand Down Expand Up @@ -32,3 +33,30 @@ def test_extract_file_to_string(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"


TEST_CASES_METADATA = [
"2022_Q3_AAPL.pdf",
"science-exploration-1p.pptx",
"simple.odt",
"table-multi-row-column-cells-actual.csv",
"vodafone.xlsx",
"category-level.docx",
"simple.doc",
"simple.pptx",
"table-multi-row-column-cells.png",
"winter-sports.epub",
"bug_16.docx",
]

@pytest.mark.parametrize("file_name", TEST_CASES_METADATA)
def test_extract_file_to_string_with_metadata(file_name):
    """Extract each file and verify the expected metadata is a subset of the result."""
    original_filepath = f"../../test_files/documents/{file_name}"
    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
    extractor = Extractor()
    _result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)
    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
        expected_metadata = json.load(file)

    assert is_expected_metadata_contained(expected_metadata, metadata)
5 changes: 5 additions & 0 deletions bindings/extractous-python/tests/test_extract_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ def test_extract_url():
result = read_to_string(reader)

assert "Google" in result

def test_extract_url_with_metadata():
    # Network-dependent test: fetches a live URL and only checks that at least
    # one metadata entry comes back (exact keys vary with the HTTP response).
    extractor = Extractor()
    _reader, metadata = extractor.extract_url_with_metadata("https://www.google.com")
    assert len(metadata.keys()) > 0
33 changes: 33 additions & 0 deletions bindings/extractous-python/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,36 @@ def read_file_to_bytearray(file_path: str):
with open(file_path, 'rb') as file:
file_content = bytearray(file.read())
return file_content


def is_expected_metadata_contained(expected: dict, current: dict) -> bool:
    """Return True iff every key of `expected` is present in `current` with an
    identical value; on the first mismatch, print a diagnostic and return False.

    A key whose value in `current` is None is treated as missing.
    """
    for key, wanted in expected.items():
        found = current.get(key)
        if found is None:
            print(f"\nexpected key = {key} not found !!")
            return False
        if found != wanted:
            print(f"\nvalues for key = {key} differ!! expected = {wanted} and actual = {found}")
            return False
    return True


def calculate_similarity_percent(expected, current):
    """Return the fraction (0.0 to 1.0) of keys present in both dicts whose
    values match exactly.

    Note: despite the name, the result is a ratio, not a percentage. Keys that
    appear in only one of the two dicts are excluded from the denominator;
    if the dicts share no keys at all, the result is 0.0.
    """
    shared_keys = [key for key in expected if key in current]
    if not shared_keys:
        return 0.0
    matches = sum(1 for key in shared_keys if expected[key] == current[key])
    return matches / len(shared_keys)
Loading
Loading