Feature/3 return tika metadata #26

Merged
merged 12 commits on Nov 15, 2024
33 changes: 33 additions & 0 deletions README.md
@@ -114,6 +114,23 @@ result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pd
print(result)
```

* Extracting with metadata:

You can use the `_with_metadata` variant to also return the file's metadata.

```python
from extractous import Extractor

# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)

# Extract text from a file
result, metadata = extractor.extract_file_to_string_with_metadata("README.md")
print(result)
print(metadata)
```

#### Rust
* Extract a file content to a string:
```rust
@@ -129,6 +146,22 @@ fn main() {
}
```

* Extracting with metadata:

```rust
use extractous::Extractor;

fn main() {
    // Create a new extractor. Note it uses a consuming builder pattern
    let mut extractor = Extractor::new().set_extract_string_max_length(1000);

    // Extract text from a file
    let (text, metadata) = extractor.extract_file_to_string_with_metadata("README.md").unwrap();
    println!("{}", text);
    println!("{:?}", metadata);
}
```

* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading
```rust
use std::io::{BufReader, Read};
2 changes: 1 addition & 1 deletion bindings/extractous-python/Cargo.toml
@@ -18,5 +18,5 @@ doc = false

[dependencies]
# "abi3-py310" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.10
pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38"] }
pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38", "gil-refs"] }
extractous = { path = "../../extractous-core" }
23 changes: 23 additions & 0 deletions bindings/extractous-python/src/extractor.rs
@@ -1,7 +1,9 @@
use std::collections::HashMap;
use crate::{ecore, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig};
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3::types::PyByteArray;
use pyo3::types::PyDict;
use std::io::Read;

// PyO3 supports unit-only enums (which contain only unit variants)
@@ -159,6 +161,19 @@ impl Extractor {
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}

/// Extracts text from a file path. Returns a tuple of the extracted string, truncated to the
/// extractor's `extract_string_max_length`, and a dict containing the file's metadata.
pub fn extract_file_to_string_with_metadata(&self, filename: &str) -> PyResult<(String, PyObject)> {
let (content, metadata) = self.0
.extract_file_to_string_with_metadata(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

Python::with_gil(|py| {
let py_metadata = hashmap_to_pydict(py, &metadata);
Ok((content, py_metadata.into()))
})
}

/// Extracts text from a bytearray. Returns a stream of the extracted text
/// the stream is decoded using the extractor's `encoding`
pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
@@ -196,3 +211,11 @@ impl Extractor {
format!("{:?}", self.0)
}
}

fn hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, String>) -> &'py PyDict {
let pydict = PyDict::new(py);
for (key, value) in hashmap {
pydict.set_item(key, value).unwrap();
}
pydict
}
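The converter above relies on PyO3's GIL-ref API (`&'py PyDict`), which is presumably why the `gil-refs` feature was added to the `pyo3` dependency in `bindings/extractous-python/Cargo.toml`. A minimal sketch of an alternative using PyO3 0.22's Bound API, which would avoid the deprecated GIL-ref types and propagate `set_item` errors instead of unwrapping (the function name is illustrative, not part of this PR):

```rust
use std::collections::HashMap;
use pyo3::prelude::*;
use pyo3::types::PyDict;

// Hypothetical alternative to hashmap_to_pydict: builds the dict through the
// Bound API, so no GIL-ref types (and no `gil-refs` feature) are needed.
fn hashmap_to_pydict_bound<'py>(
    py: Python<'py>,
    hashmap: &HashMap<String, String>,
) -> PyResult<Bound<'py, PyDict>> {
    let pydict = PyDict::new_bound(py);
    for (key, value) in hashmap {
        // Propagate conversion errors instead of panicking with unwrap()
        pydict.set_item(key, value)?;
    }
    Ok(pydict)
}
```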
@@ -0,0 +1,35 @@
import pytest
import json
from extractous import Extractor
from utils import cosine_similarity

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.9),
("vodafone.xlsx", 0.4),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
("deu-ocr.pdf", 0.9),
]

@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_string_with_metadata(file_name, target_dist):
    """Test the extraction and comparison of various file types."""
    original_filepath = f"../../test_files/documents/{file_name}"
    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
    extractor = Extractor()
    result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)
    with open(expected_result_filepath, "r", encoding="utf8") as file:
        expected_content = file.read()
    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
        expected_metadata = json.load(file)
    assert cosine_similarity(result, expected_content) > target_dist, \
        f"Cosine similarity is less than {target_dist} for file: {file_name}"
    assert metadata == expected_metadata, "The metadata are not equal!"
2 changes: 2 additions & 0 deletions extractous-core/Cargo.toml
@@ -34,6 +34,8 @@ strum_macros = { version = "0.26.2" }
textdistance = "1.1.0"
test-case = "3.0"
criterion = "0.5.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

[build-dependencies]
fs_extra = { version = "1.3.0" }
21 changes: 19 additions & 2 deletions extractous-core/src/extractor.rs
@@ -1,3 +1,4 @@
use std::collections::HashMap;
use crate::errors::ExtractResult;
use crate::tika;
use crate::tika::JReaderInputStream;
@@ -159,6 +160,18 @@ impl Extractor {
&self.ocr_config,
)
}

/// Extracts text from a file path. Returns a tuple of the extracted string, truncated to the
/// extractor's `extract_string_max_length`, and a metadata `HashMap`.
pub fn extract_file_to_string_with_metadata(&self, file_path: &str) -> ExtractResult<(String, HashMap<String, String>)> {
tika::parse_file_to_string_with_metadata(
file_path,
self.extract_string_max_length,
&self.pdf_config,
&self.office_config,
&self.ocr_config,
)
}
}

#[cfg(test)]
@@ -208,9 +221,13 @@ mod tests {

// Parse the files using extractous
let extractor = Extractor::new();
let result = extractor.extract_file_to_string(TEST_FILE);
let content = result.unwrap();
let result = extractor.extract_file_to_string_with_metadata(TEST_FILE);
let (content, metadata) = result.unwrap();
assert_eq!(content.trim(), expected_content.trim());
assert!(
metadata.len() > 0,
"Metadata should contain at least one entry"
);
}

fn read_file_as_bytes(path: &str) -> io::Result<Vec<u8>> {
20 changes: 19 additions & 1 deletion extractous-core/src/tika/jni_utils.rs
@@ -1,9 +1,10 @@
use std::os::raw::{c_char, c_void};

use jni::errors::jni_error_code_to_result;
use jni::objects::{JByteBuffer, JObject, JString, JValue, JValueOwned};
use jni::objects::{JMap, JByteBuffer, JObject, JString, JValue, JValueOwned};
use jni::{sys, JNIEnv, JavaVM};

use std::collections::HashMap;
use crate::errors::{Error, ExtractResult};

/// Calls a static method and prints any thrown exceptions to stderr
@@ -92,6 +93,23 @@ pub fn jni_jobject_to_string<'local>(
Ok(output_str.to_string())
}

/// Converts a Java `HashMap` into a Rust `HashMap`
pub fn jni_jobject_hashmap_to_hashmap<'local>(
env: &mut JNIEnv<'local>,
jobject: JObject<'local>,
) -> ExtractResult<HashMap<String, String>> {
let jmap = JMap::from_env(env, &jobject)?;
let mut metadata = HashMap::new();
let mut iter = jmap.iter(env)?;
while let Ok(Some(entry)) = iter.next(env) {
let (key_object, value_object) = entry;
let key = jni_jobject_to_string(env, key_object)?;
let value = jni_jobject_to_string(env, value_object)?;
metadata.insert(key, value);
}
Ok(metadata)
}
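One detail worth noting: the `while let Ok(Some(entry))` loop ends silently if `iter.next(env)` returns an error, so a JNI failure mid-iteration would produce a truncated map rather than an error. A sketch of a stricter variant that propagates iteration errors instead (same imports as above; the `_strict` name is illustrative, not part of this PR):

```rust
/// Hypothetical stricter variant: iteration errors are propagated with `?`
/// instead of silently ending the loop.
pub fn jni_jobject_hashmap_to_hashmap_strict<'local>(
    env: &mut JNIEnv<'local>,
    jobject: JObject<'local>,
) -> ExtractResult<HashMap<String, String>> {
    let jmap = JMap::from_env(env, &jobject)?;
    let mut metadata = HashMap::new();
    let mut iter = jmap.iter(env)?;
    while let Some((key_object, value_object)) = iter.next(env)? {
        metadata.insert(
            jni_jobject_to_string(env, key_object)?,
            jni_jobject_to_string(env, value_object)?,
        );
    }
    Ok(metadata)
}
```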

/// Checks if there is an exception in the jni environment, describes it to
/// the stderr and finally clears it
pub fn jni_check_exception(env: &mut JNIEnv) -> ExtractResult<bool> {
31 changes: 27 additions & 4 deletions extractous-core/src/tika/parse.rs
@@ -2,7 +2,7 @@ use std::sync::OnceLock;

use jni::objects::JValue;
use jni::{AttachGuard, JavaVM};

use std::collections::HashMap;
use crate::errors::ExtractResult;
use crate::tika::jni_utils::*;
use crate::tika::wrappers::*;
@@ -89,14 +89,14 @@ pub fn parse_file(
)
}

/// Parses a file to a string using the Apache Tika library.
pub fn parse_file_to_string(
/// Parses a file to a JStringResult using the Apache Tika library.
pub fn parse_file_to_j_string_result(
file_path: &str,
max_length: i32,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
) -> ExtractResult<String> {
) -> ExtractResult<JStringResult> {
let mut env = get_vm_attach_current_thread()?;

// Create a new Java string from the Rust string
@@ -124,10 +124,33 @@

// Create and process the JStringResult
let result = JStringResult::new(&mut env, call_result_obj)?;
Ok(result)
}

/// Parses a file to a string using the Apache Tika library.
pub fn parse_file_to_string(
file_path: &str,
max_length: i32,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
) -> ExtractResult<String> {
let result = parse_file_to_j_string_result(file_path, max_length, pdf_conf, office_conf, ocr_conf)?;
Ok(result.content)
}

/// Parses a file to a tuple (string, metadata) using the Apache Tika library.
pub fn parse_file_to_string_with_metadata(
file_path: &str,
max_length: i32,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
) -> ExtractResult<(String, HashMap<String, String>)> {
let result = parse_file_to_j_string_result(file_path, max_length, pdf_conf, office_conf, ocr_conf)?;
Ok((result.content, result.metadata))
}

pub fn parse_bytes(
buffer: &[u8],
char_set: &CharSet,
16 changes: 12 additions & 4 deletions extractous-core/src/tika/wrappers.rs
@@ -1,11 +1,15 @@
use crate::errors::{Error, ExtractResult};
use crate::tika::jni_utils::{jni_call_method, jni_jobject_to_string, jni_new_string_as_jvalue};
use crate::tika::jni_utils::{jni_call_method, jni_jobject_hashmap_to_hashmap, jni_jobject_to_string, jni_new_string_as_jvalue};
use crate::tika::vm;
use crate::{OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, DEFAULT_BUF_SIZE};
use bytemuck::cast_slice_mut;
use jni::objects::{GlobalRef, JByteArray, JObject, JValue};
use jni::sys::jsize;
use jni::JNIEnv;
use std::collections::HashMap;

/// Alias for the metadata map
type Metadata = HashMap<String, String>;

/// Wrapper for [`JObject`]s that contain `org.apache.commons.io.input.ReaderInputStream`
/// It saves a GlobalRef to the java object, which is cleared when the last GlobalRef is dropped
@@ -104,8 +108,9 @@ impl Drop for JReaderInputStream {

/// Wrapper for the Java class `ai.yobix.StringResult`
/// Upon creation it parses the java StringResult object and saves the converted Rust string
pub(crate) struct JStringResult {
pub(crate) content: String,
pub struct JStringResult {
pub content: String,
pub metadata: Metadata,
}

impl<'local> JStringResult {
@@ -130,7 +135,10 @@ impl<'local> JStringResult {

let content = jni_jobject_to_string(env, call_result_obj)?;

Ok(Self { content })
let metadata_obj_hashmap: JObject = env.call_method(&obj, "getMetadata", "()Ljava/util/HashMap;", &[])?
.l()?;
let metadata = jni_jobject_hashmap_to_hashmap(env, metadata_obj_hashmap)?;
Ok(Self { content, metadata })
}
}
}
40 changes: 39 additions & 1 deletion extractous-core/tests/extract_to_string_tests.rs
@@ -1,6 +1,6 @@
extern crate test_case;
extern crate textdistance;

use std::collections::HashMap;
use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig};
use std::fs;
use test_case::test_case;
@@ -39,6 +39,44 @@ fn test_extract_file_to_string(file_name: &str, target_dist: f64) {
println!("{}: {}", file_name, dist);
}

#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")]
#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")]
#[test_case("simple.odt", 0.8; "Test ODT file")]
#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")]
#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")]
#[test_case("category-level.docx", 0.9; "Test DOCX file")]
#[test_case("simple.doc", 0.9; "Test DOC file")]
#[test_case("simple.pptx", 0.9; "Test another PPTX file")]
#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
#[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
fn test_extract_file_to_string_with_metadata(file_name: &str, target_dist: f64) {
let extractor = Extractor::new().set_extract_string_max_length(1000000);
// extract file with extractor
let (extracted_content, extracted_metadata) = extractor
.extract_file_to_string_with_metadata(&format!("../test_files/documents/{}", file_name))
.unwrap();
// read expected content
let expected_content =
fs::read_to_string(format!("../test_files/expected_result/{}.txt", file_name)).unwrap();
// read expected metadata
let expected_metadata_string =
fs::read_to_string(format!("../test_files/expected_result/{}.metadata.json", file_name)).unwrap();
let expected_metadata: HashMap<String, String> = serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted");

let dist = cosine(&expected_content, &extracted_content);
assert!(
dist > target_dist,
"Cosine similarity is less than {} for file: {}, dist: {}",
target_dist,
file_name,
dist
);
assert_eq!(extracted_metadata, expected_metadata, "The HashMaps do not contain the same data!");
//println!("{:?}", extracted_metadata)
}
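The expected `*.metadata.json` fixtures used above (and by the Python test) are plain JSON objects mapping metadata keys to string values; the test parses them with `serde_json`, which is why `serde` and `serde_json` appear in `extractous-core/Cargo.toml`. A hypothetical helper for (re)generating such a fixture from an extraction result, not part of this PR; the name and output path are illustrative:

```rust
use std::collections::HashMap;
use std::fs;

// Hypothetical fixture generator: writes extracted metadata as pretty-printed
// JSON next to the other expected results.
fn write_metadata_fixture(file_name: &str, metadata: &HashMap<String, String>) {
    let path = format!("../test_files/expected_result/{}.metadata.json", file_name);
    let json = serde_json::to_string_pretty(metadata).expect("metadata should serialize to JSON");
    fs::write(&path, json).expect("fixture file should be writable");
}
```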

#[test]
fn test_extract_file_to_string_ara_ocr_png() {
let extractor = Extractor::new()