Skip to content

Commit

Permalink
Added function to return tika metadata for basic string result
Browse files Browse the repository at this point in the history
  • Loading branch information
s4zuk3 committed Nov 13, 2024
1 parent a08e218 commit 1b8039c
Show file tree
Hide file tree
Showing 29 changed files with 672 additions and 16 deletions.
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,23 @@ result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pd
print(result)
```

* Extracting with metadata:

Append `_with_metadata` to the extraction method name to get the file's metadata returned alongside the extracted text.

```python
from extractous import Extractor

# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)

# Extract the text and the metadata of a file in a single call
result, metadata = extractor.extract_file_to_string_with_metadata("README.md")
print(result)
print(metadata)
```

#### Rust
* Extract a file content to a string:
```rust
Expand All @@ -129,6 +146,22 @@ fn main() {
}
```

* Extracting with metadata:

```rust
use extractous::Extractor;

fn main() {
    // Create a new extractor. Note it uses a consuming builder pattern, so the
    // configured value is bound once and never mutated afterwards (no `mut` needed)
    let extractor = Extractor::new().set_extract_string_max_length(1000);

    // Extract the text and the metadata of a file in a single call
    let (text, metadata) = extractor.extract_file_to_string_with_metadata("README.md").unwrap();
    println!("{}", text);
    println!("{:?}", metadata);
}
```

* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading
```rust
use std::io::{BufReader, Read};
Expand Down
2 changes: 1 addition & 1 deletion bindings/extractous-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ doc = false

[dependencies]
# "abi3-py38" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.8
pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38"] }
pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38", "gil-refs"] }
extractous = { path = "../../extractous-core" }
23 changes: 23 additions & 0 deletions bindings/extractous-python/src/extractor.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use std::collections::HashMap;
use crate::{ecore, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig};
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3::types::PyByteArray;
use pyo3::types::PyDict;
use std::io::Read;

// PyO3 supports unit-only enums (which contain only unit variants)
Expand Down Expand Up @@ -159,6 +161,19 @@ impl Extractor {
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}

/// Extracts the text and the metadata of the file at `filename`.
///
/// Returns a `(text, metadata)` tuple: the text is a string of maximum length
/// of the extractor's `extract_string_max_length`, the metadata is a Python dict.
/// Any extraction error is reported to Python as a `TypeError` carrying the
/// debug representation of the underlying error.
pub fn extract_file_to_string_with_metadata(&self, filename: &str) -> PyResult<(String, PyObject)> {
    match self.0.extract_file_to_string_with_metadata(filename) {
        Err(err) => Err(PyErr::new::<PyTypeError, _>(format!("{:?}", err))),
        Ok((text, meta)) => Python::with_gil(|py| {
            // The dict is built while the GIL is held, then turned into an owned object
            let meta_dict: PyObject = hashmap_to_pydict(py, &meta).into();
            Ok((text, meta_dict))
        }),
    }
}

/// Extracts text from a bytearray. Returns a stream of the extracted text
/// the stream is decoded using the extractor's `encoding`
pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
Expand Down Expand Up @@ -196,3 +211,11 @@ impl Extractor {
format!("{:?}", self.0)
}
}

/// Builds a new Python dict holding every key/value pair of `hashmap`.
///
/// # Panics
/// Panics if inserting an entry into the dict fails.
fn hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, String>) -> &'py PyDict {
    let dict = PyDict::new(py);
    hashmap
        .iter()
        .for_each(|(name, text)| dict.set_item(name, text).unwrap());
    dict
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pytest
import json
from extractous import Extractor
from utils import cosine_similarity

# Each entry is (file_name, target_dist): the document to extract and the cosine
# similarity that the extracted text must strictly exceed when compared with the
# stored expected text.
# NOTE(review): the -1.0 threshold presumably disables the text check for that
# file (assuming cosine_similarity never returns less than -1) — confirm.
TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.9),
("vodafone.xlsx", 0.4),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
("deu-ocr.pdf", 0.9),
]

@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_string_with_metadata(file_name, target_dist):
    """Check the extracted text and metadata of one document against stored expectations.

    The text must have a cosine similarity above ``target_dist`` with the expected
    text, and the metadata must equal the expected JSON document exactly.
    """
    source_path = f"../../test_files/documents/{file_name}"
    content_path = f"../../test_files/expected_result/{file_name}.txt"
    metadata_path = f"../../test_files/expected_result/{file_name}.metadata.json"

    text, meta = Extractor().extract_file_to_string_with_metadata(source_path)

    with open(content_path, "r", encoding="utf8") as handle:
        wanted_text = handle.read()
    with open(metadata_path, 'r', encoding="utf8") as handle:
        wanted_meta = json.load(handle)

    similarity = cosine_similarity(text, wanted_text)
    assert similarity > target_dist, \
        f"Cosine similarity is less than {target_dist} for file: {file_name}"
    assert meta == wanted_meta, "The metadata are not equal!"
2 changes: 2 additions & 0 deletions extractous-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ strum_macros = { version = "0.26.2" }
textdistance = "1.1.0"
test-case = "3.0"
criterion = "0.5.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

[build-dependencies]
fs_extra = { version = "1.3.0" }
Expand Down
21 changes: 19 additions & 2 deletions extractous-core/src/extractor.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::collections::HashMap;
use crate::errors::ExtractResult;
use crate::tika;
use crate::tika::JReaderInputStream;
Expand Down Expand Up @@ -159,6 +160,18 @@ impl Extractor {
&self.ocr_config,
)
}

/// Extracts the text and the metadata of the file at `file_path`.
///
/// Returns a `(text, metadata)` tuple where the text is a string of maximum length
/// of the extractor's `extract_string_max_length` and the metadata is a `HashMap`
/// of string keys to string values.
pub fn extract_file_to_string_with_metadata(&self, file_path: &str) -> ExtractResult<(String, HashMap<String, String>)> {
    let max_length = self.extract_string_max_length;
    let (pdf, office, ocr) = (&self.pdf_config, &self.office_config, &self.ocr_config);
    tika::parse_file_to_string_with_metadata(file_path, max_length, pdf, office, ocr)
}
}

#[cfg(test)]
Expand Down Expand Up @@ -208,9 +221,13 @@ mod tests {

// Parse the files using extractous
let extractor = Extractor::new();
let result = extractor.extract_file_to_string(TEST_FILE);
let content = result.unwrap();
let result = extractor.extract_file_to_string_with_metadata(TEST_FILE);
let (content, metadata) = result.unwrap();
assert_eq!(content.trim(), expected_content.trim());
assert!(
metadata.len() > 0,
"Metadata should contain at least one entry"
);
}

fn read_file_as_bytes(path: &str) -> io::Result<Vec<u8>> {
Expand Down
20 changes: 19 additions & 1 deletion extractous-core/src/tika/jni_utils.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use std::os::raw::{c_char, c_void};

use jni::errors::jni_error_code_to_result;
use jni::objects::{JByteBuffer, JObject, JString, JValue, JValueOwned};
use jni::objects::{JMap, JByteBuffer, JObject, JString, JValue, JValueOwned};
use jni::{sys, JNIEnv, JavaVM};

use std::collections::HashMap;
use crate::errors::{Error, ExtractResult};

/// Calls a static method and prints any thrown exceptions to stderr
Expand Down Expand Up @@ -92,6 +93,23 @@ pub fn jni_jobject_to_string<'local>(
Ok(output_str.to_string())
}

/// Converts a java map object into a rust `HashMap<String, String>`.
///
/// Every key and every value is converted with [`jni_jobject_to_string`].
///
/// # Errors
/// Returns an error if `jobject` cannot be wrapped as a map, if the iterator
/// cannot be created or advanced, or if a key or value cannot be converted to
/// a string.
pub fn jni_jobject_hashmap_to_hashmap<'local>(
    env: &mut JNIEnv<'local>,
    jobject: JObject<'local>,
) -> ExtractResult<HashMap<String, String>> {
    let jmap = JMap::from_env(env, &jobject)?;
    let mut metadata = HashMap::new();
    let mut iter = jmap.iter(env)?;
    // `?` on `next` propagates iteration failures; matching on `Ok(Some(..))`
    // would stop the loop on an error and hand back a partial map as success.
    while let Some((key_object, value_object)) = iter.next(env)? {
        let key = jni_jobject_to_string(env, key_object)?;
        let value = jni_jobject_to_string(env, value_object)?;
        metadata.insert(key, value);
    }
    Ok(metadata)
}

/// Checks if there is an exception in the jni environment, describes it to
/// the stderr and finally clears it
pub fn jni_check_exception(env: &mut JNIEnv) -> ExtractResult<bool> {
Expand Down
31 changes: 27 additions & 4 deletions extractous-core/src/tika/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::sync::OnceLock;

use jni::objects::JValue;
use jni::{AttachGuard, JavaVM};

use std::collections::HashMap;
use crate::errors::ExtractResult;
use crate::tika::jni_utils::*;
use crate::tika::wrappers::*;
Expand Down Expand Up @@ -89,14 +89,14 @@ pub fn parse_file(
)
}

/// Parses a file to a string using the Apache Tika library.
pub fn parse_file_to_string(
/// Parses a file to a JStringResult using the Apache Tika library.
pub fn parse_file_to_j_string_result(
file_path: &str,
max_length: i32,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
) -> ExtractResult<String> {
) -> ExtractResult<JStringResult> {
let mut env = get_vm_attach_current_thread()?;

// Create a new Java string from the Rust string
Expand Down Expand Up @@ -124,10 +124,33 @@ pub fn parse_file_to_string(

// Create and process the JStringResult
let result = JStringResult::new(&mut env, call_result_obj)?;
Ok(result)
}

/// Parses a file to a string using the Apache Tika library.
pub fn parse_file_to_string(
file_path: &str,
max_length: i32,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
) -> ExtractResult<String> {
let result = parse_file_to_j_string_result(file_path, max_length, pdf_conf, office_conf, ocr_conf)?;
Ok(result.content)
}

/// Parses a file using the Apache Tika library and returns a `(content, metadata)` tuple.
///
/// Delegates to [`parse_file_to_j_string_result`] and unpacks its `content`
/// and `metadata` fields.
pub fn parse_file_to_string_with_metadata(
    file_path: &str,
    max_length: i32,
    pdf_conf: &PdfParserConfig,
    office_conf: &OfficeParserConfig,
    ocr_conf: &TesseractOcrConfig,
) -> ExtractResult<(String, HashMap<String, String>)> {
    parse_file_to_j_string_result(file_path, max_length, pdf_conf, office_conf, ocr_conf)
        .map(|parsed| (parsed.content, parsed.metadata))
}

pub fn parse_bytes(
buffer: &[u8],
char_set: &CharSet,
Expand Down
16 changes: 12 additions & 4 deletions extractous-core/src/tika/wrappers.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
use crate::errors::{Error, ExtractResult};
use crate::tika::jni_utils::{jni_call_method, jni_jobject_to_string, jni_new_string_as_jvalue};
use crate::tika::jni_utils::{jni_call_method, jni_jobject_hashmap_to_hashmap, jni_jobject_to_string, jni_new_string_as_jvalue};
use crate::tika::vm;
use crate::{OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, DEFAULT_BUF_SIZE};
use bytemuck::cast_slice_mut;
use jni::objects::{GlobalRef, JByteArray, JObject, JValue};
use jni::sys::jsize;
use jni::JNIEnv;
use std::collections::HashMap;

/// Document metadata stored as string keys mapped to string values
type Metadata = HashMap<String, String>;

/// Wrapper for [`JObject`]s that contain `org.apache.commons.io.input.ReaderInputStream`
/// It saves a GlobalRef to the java object, which is cleared when the last GlobalRef is dropped
Expand Down Expand Up @@ -104,8 +108,9 @@ impl Drop for JReaderInputStream {

/// Wrapper for the Java class `ai.yobix.StringResult`
/// Upon creation it parses the java StringResult object and saves the converted Rust string
pub(crate) struct JStringResult {
pub(crate) content: String,
pub struct JStringResult {
/// Text content converted from the java result object into a rust string
pub content: String,
/// Metadata converted from the map returned by the java object's `getMetadata` method
pub metadata: Metadata,
}

impl<'local> JStringResult {
Expand All @@ -130,7 +135,10 @@ impl<'local> JStringResult {

let content = jni_jobject_to_string(env, call_result_obj)?;

Ok(Self { content })
let metadata_obj_hashmap: JObject = env.call_method(&obj, "getMetadata", "()Ljava/util/HashMap;", &[])?
.l()?;
let metadata = jni_jobject_hashmap_to_hashmap(env, metadata_obj_hashmap)?;
Ok(Self { content, metadata })
}
}
}
Expand Down
40 changes: 39 additions & 1 deletion extractous-core/tests/extract_to_string_tests.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
extern crate test_case;
extern crate textdistance;

use std::collections::HashMap;
use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig};
use std::fs;
use test_case::test_case;
Expand Down Expand Up @@ -39,6 +39,44 @@ fn test_extract_file_to_string(file_name: &str, target_dist: f64) {
println!("{}: {}", file_name, dist);
}

#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")]
#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")]
#[test_case("simple.odt", 0.8; "Test ODT file")]
#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")]
#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")]
#[test_case("category-level.docx", 0.9; "Test DOCX file")]
#[test_case("simple.doc", 0.9; "Test DOC file")]
#[test_case("simple.pptx", 0.9; "Test another PPTX file")]
#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
#[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
fn test_extract_file_to_string_with_metadata(file_name: &str, target_dist: f64) {
    // Locations of the input document and of its stored expectations
    let document_path = format!("../test_files/documents/{}", file_name);
    let content_path = format!("../test_files/expected_result/{}.txt", file_name);
    let metadata_path = format!("../test_files/expected_result/{}.metadata.json", file_name);

    // Run the extraction with a limit large enough for the whole text
    let (extracted_content, extracted_metadata) = Extractor::new()
        .set_extract_string_max_length(1000000)
        .extract_file_to_string_with_metadata(&document_path)
        .unwrap();

    // Load the expected text and the expected metadata
    let expected_content = fs::read_to_string(content_path).unwrap();
    let expected_metadata_string = fs::read_to_string(metadata_path).unwrap();
    let expected_metadata: HashMap<String, String> =
        serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted");

    // The text only has to be similar enough, the metadata has to match exactly
    let dist = cosine(&expected_content, &extracted_content);
    assert!(
        dist > target_dist,
        "Cosine similarity is less than {} for file: {}, dist: {}",
        target_dist,
        file_name,
        dist
    );
    assert_eq!(extracted_metadata, expected_metadata, "The HashMaps do not contain the same data!");
}

#[test]
fn test_extract_file_to_string_ara_ocr_png() {
let extractor = Extractor::new()
Expand Down
Loading

0 comments on commit 1b8039c

Please sign in to comment.