Added function to return tika metadata for basic string result

yobix-ai · Nov 13, 2024 · 1b8039c · 1b8039c
1 parent a08e218
commit 1b8039c
Show file tree

Hide file tree

Showing 29 changed files with 672 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -114,6 +114,23 @@ result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pd
 print(result)
 ```
 
+* Extracting with metadata:
+
+Use the `_with_metadata` variant of an extraction method, e.g. `extract_file_to_string_with_metadata`, to get the file's metadata returned together with the extracted text.
+
+```python
+from extractous import Extractor
+
+# Create a new extractor
+extractor = Extractor()
+extractor.set_extract_string_max_length(1000)
+
+# Extract text and metadata from a file
+result, metadata = extractor.extract_file_to_string_with_metadata("README.md")
+print(result)
+print(metadata)
+```
+
 #### Rust
 * Extract a file content to a string:
 ```rust
@@ -129,6 +146,22 @@ fn main() {
 }
 ```
 
+* Extracting with metadata:
+
+```rust
+use extractous::Extractor;
+
+fn main() {
+    // Create a new extractor. Note it uses a consuming builder pattern.
+    // No `mut` is needed: the extraction method only borrows the extractor.
+    let extractor = Extractor::new().set_extract_string_max_length(1000);
+
+    // Extract the text and the metadata of a file
+    let (text, metadata) = extractor.extract_file_to_string_with_metadata("README.md").unwrap();
+    println!("{}", text);
+    println!("{:?}", metadata);
+}
+```
+
 * Extract the content of a file (or URL / bytes) to a `StreamReader` and perform buffered reading
 ```rust
 use std::io::{BufReader, Read};

diff --git a/bindings/extractous-python/Cargo.toml b/bindings/extractous-python/Cargo.toml
@@ -18,5 +18,5 @@ doc = false
 
 [dependencies]
 # "abi3-py38" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.8
-pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38"] }
+pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38", "gil-refs"] }
 extractous = { path = "../../extractous-core" }
diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs
@@ -1,7 +1,9 @@
+use std::collections::HashMap;
 use crate::{ecore, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig};
 use pyo3::exceptions::PyTypeError;
 use pyo3::prelude::*;
 use pyo3::types::PyByteArray;
+use pyo3::types::PyDict;
 use std::io::Read;
 
 // PyO3 supports unit-only enums (which contain only unit variants)
@@ -159,6 +161,19 @@ impl Extractor {
             .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
     }
 
+    /// Extracts the text and the metadata of the file at `filename`.
+    ///
+    /// Returns a `(text, metadata)` tuple: the text is limited to the extractor's
+    /// `extract_string_max_length`, and the metadata is returned as a Python dict.
+    /// Extraction failures are raised as `TypeError`.
+    pub fn extract_file_to_string_with_metadata(&self, filename: &str) -> PyResult<(String, PyObject)> {
+        match self.0.extract_file_to_string_with_metadata(filename) {
+            Ok((text, metadata_map)) => Python::with_gil(|py| {
+                // Hand the metadata over to Python as a dict.
+                let metadata_dict = hashmap_to_pydict(py, &metadata_map);
+                Ok((text, metadata_dict.into()))
+            }),
+            Err(err) => Err(PyErr::new::<PyTypeError, _>(format!("{:?}", err))),
+        }
+    }
+
     /// Extracts text from a bytearray. Returns a stream of the extracted text
     /// the stream is decoded using the extractor's `encoding`
     pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
@@ -196,3 +211,11 @@ impl Extractor {
         format!("{:?}", self.0)
     }
 }
+
+/// Builds a new Python dict holding every key/value pair of `hashmap`.
+///
+/// # Panics
+/// Panics if an entry cannot be inserted into the dict.
+fn hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, String>) -> &'py PyDict {
+    let dict = PyDict::new(py);
+    hashmap
+        .iter()
+        .for_each(|(name, text)| dict.set_item(name, text).unwrap());
+    dict
+}
diff --git a/bindings/extractous-python/tests/test_extract_file_to_string_with_metadata.py b/bindings/extractous-python/tests/test_extract_file_to_string_with_metadata.py
@@ -0,0 +1,35 @@
+import pytest
+import json
+from extractous import Extractor
+from utils import cosine_similarity
+
+# Each entry is (document file name, similarity threshold the extracted text must exceed).
+# NOTE(review): the -1.0 threshold on the PNG case presumably makes the text check
+# always pass — confirm against utils.cosine_similarity.
+TEST_CASES = [
+    ("2022_Q3_AAPL.pdf", 0.9),
+    ("science-exploration-1p.pptx", 0.9),
+    ("simple.odt", 0.9),
+    ("table-multi-row-column-cells-actual.csv", 0.9),
+    ("vodafone.xlsx", 0.4),
+    ("category-level.docx", 0.9),
+    ("simple.doc", 0.9),
+    ("simple.pptx", 0.9),
+    ("table-multi-row-column-cells.png", -1.0),
+    ("winter-sports.epub", 0.9),
+    ("bug_16.docx", 0.9),
+    ("deu-ocr.pdf", 0.9),
+]
+
+@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
+def test_extract_file_to_string_with_metadata(file_name, target_dist):
+    """Extract text and metadata from a test document and compare both with the stored expectations.
+
+    The extracted text must exceed ``target_dist`` in cosine similarity against the
+    expected text file; the metadata must equal the expected JSON metadata exactly.
+    """
+    # Relative paths: these resolve against the current working directory.
+    original_filepath = f"../../test_files/documents/{file_name}"
+    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
+    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
+    extractor = Extractor()
+    # Run the extraction under test; it returns the text and the metadata together.
+    result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)
+    # Load the stored expected text and the stored expected metadata.
+    with open(expected_result_filepath, "r",  encoding="utf8") as file:
+        expected_content = file.read()
+    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
+        expected_metadata = json.load(file)
+    # Text is compared by similarity, metadata by exact equality.
+    assert cosine_similarity(result, expected_content) > target_dist, \
+        f"Cosine similarity is less than {target_dist} for file: {file_name}"
+    assert metadata == expected_metadata, "The metadata are not equal!"
diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml
@@ -34,6 +34,8 @@ strum_macros = { version = "0.26.2" }
 textdistance = "1.1.0"
 test-case = "3.0"
 criterion = "0.5.1"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
 
 [build-dependencies]
 fs_extra = { version = "1.3.0" }

diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use crate::errors::ExtractResult;
 use crate::tika;
 use crate::tika::JReaderInputStream;
@@ -159,6 +160,18 @@ impl Extractor {
             &self.ocr_config,
         )
     }
+
+    /// Extracts the text and the metadata of the file at `file_path`.
+    ///
+    /// Returns a `(text, metadata)` tuple: the text is limited to the extractor's
+    /// `extract_string_max_length`, and the metadata is a `HashMap<String, String>`.
+    pub fn extract_file_to_string_with_metadata(&self, file_path: &str) -> ExtractResult<(String, HashMap<String, String>)> {
+        tika::parse_file_to_string_with_metadata(
+            file_path,
+            self.extract_string_max_length,
+            &self.pdf_config,
+            &self.office_config,
+            &self.ocr_config,
+        )
+    }
 }
 
 #[cfg(test)]
@@ -208,9 +221,13 @@ mod tests {
 
         // Parse the files using extractous
         let extractor = Extractor::new();
-        let result = extractor.extract_file_to_string(TEST_FILE);
-        let content = result.unwrap();
+        let result = extractor.extract_file_to_string_with_metadata(TEST_FILE);
+        let (content, metadata) = result.unwrap();
         assert_eq!(content.trim(), expected_content.trim());
+        assert!(
+            metadata.len() > 0,
+            "Metadata should contain at least one entry"
+        );
     }
 
     fn read_file_as_bytes(path: &str) -> io::Result<Vec<u8>> {

diff --git a/extractous-core/src/tika/jni_utils.rs b/extractous-core/src/tika/jni_utils.rs
@@ -1,9 +1,10 @@
 use std::os::raw::{c_char, c_void};
 
 use jni::errors::jni_error_code_to_result;
-use jni::objects::{JByteBuffer, JObject, JString, JValue, JValueOwned};
+use jni::objects::{JMap, JByteBuffer, JObject, JString, JValue, JValueOwned};
 use jni::{sys, JNIEnv, JavaVM};
 
+use std::collections::HashMap;
 use crate::errors::{Error, ExtractResult};
 
 /// Calls a static method and prints any thrown exceptions to stderr
@@ -92,6 +93,23 @@ pub fn jni_jobject_to_string<'local>(
     Ok(output_str.to_string())
 }
 
+/// Converts a Java map object into a Rust `HashMap<String, String>`.
+///
+/// Every key and every value is converted with [`jni_jobject_to_string`].
+///
+/// # Errors
+/// Returns an error if the `JMap` wrapper or its iterator cannot be created, if
+/// advancing the iterator fails, or if a key or value cannot be converted to a string.
+pub fn jni_jobject_hashmap_to_hashmap<'local>(
+    env: &mut JNIEnv<'local>,
+    jobject: JObject<'local>,
+) -> ExtractResult<HashMap<String, String>> {
+    let jmap = JMap::from_env(env, &jobject)?;
+    let mut iter = jmap.iter(env)?;
+    let mut metadata = HashMap::new();
+    // Propagate iteration failures with `?` rather than matching only `Ok(Some(..))`,
+    // which would end the loop on an error and silently return a truncated map.
+    while let Some((key_object, value_object)) = iter.next(env)? {
+        let key = jni_jobject_to_string(env, key_object)?;
+        let value = jni_jobject_to_string(env, value_object)?;
+        metadata.insert(key, value);
+    }
+    Ok(metadata)
+}
+
 /// Checks if there is an exception in the jni environment, describes it to
 /// the stderr and finally clears it
 pub fn jni_check_exception(env: &mut JNIEnv) -> ExtractResult<bool> {

diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs
@@ -2,7 +2,7 @@ use std::sync::OnceLock;
 
 use jni::objects::JValue;
 use jni::{AttachGuard, JavaVM};
-
+use std::collections::HashMap;
 use crate::errors::ExtractResult;
 use crate::tika::jni_utils::*;
 use crate::tika::wrappers::*;
@@ -89,14 +89,14 @@ pub fn parse_file(
     )
 }
 
-/// Parses a file to a string using the Apache Tika library.
-pub fn parse_file_to_string(
+/// Parses a file to a JStringResult using the Apache Tika library.
+pub fn parse_file_to_j_string_result(
     file_path: &str,
     max_length: i32,
     pdf_conf: &PdfParserConfig,
     office_conf: &OfficeParserConfig,
     ocr_conf: &TesseractOcrConfig,
-) -> ExtractResult<String> {
+) -> ExtractResult<JStringResult> {
     let mut env = get_vm_attach_current_thread()?;
 
     // Create a new Java string from the Rust string
@@ -124,10 +124,33 @@ pub fn parse_file_to_string(
 
     // Create and process the JStringResult
     let result = JStringResult::new(&mut env, call_result_obj)?;
+    Ok(result)
+}
 
+/// Parses a file to a string using the Apache Tika library.
+///
+/// Only the extracted text is returned; the metadata held by the parsed
+/// `JStringResult` is dropped.
+pub fn parse_file_to_string(
+    file_path: &str,
+    max_length: i32,
+    pdf_conf: &PdfParserConfig,
+    office_conf: &OfficeParserConfig,
+    ocr_conf: &TesseractOcrConfig,
+) -> ExtractResult<String> {
+    let result = parse_file_to_j_string_result(file_path, max_length, pdf_conf, office_conf, ocr_conf)?;
     Ok(result.content)
 }
 
+/// Parses a file with the Apache Tika library and returns its text together with its metadata.
+///
+/// The result is a `(content, metadata)` tuple taken from the parsed `JStringResult`.
+pub fn parse_file_to_string_with_metadata(
+    file_path: &str,
+    max_length: i32,
+    pdf_conf: &PdfParserConfig,
+    office_conf: &OfficeParserConfig,
+    ocr_conf: &TesseractOcrConfig,
+) -> ExtractResult<(String, HashMap<String, String>)> {
+    parse_file_to_j_string_result(file_path, max_length, pdf_conf, office_conf, ocr_conf)
+        .map(|parsed| (parsed.content, parsed.metadata))
+}
+
 pub fn parse_bytes(
     buffer: &[u8],
     char_set: &CharSet,

diff --git a/extractous-core/src/tika/wrappers.rs b/extractous-core/src/tika/wrappers.rs
@@ -1,11 +1,15 @@
 use crate::errors::{Error, ExtractResult};
-use crate::tika::jni_utils::{jni_call_method, jni_jobject_to_string, jni_new_string_as_jvalue};
+use crate::tika::jni_utils::{jni_call_method, jni_jobject_hashmap_to_hashmap, jni_jobject_to_string, jni_new_string_as_jvalue};
 use crate::tika::vm;
 use crate::{OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, DEFAULT_BUF_SIZE};
 use bytemuck::cast_slice_mut;
 use jni::objects::{GlobalRef, JByteArray, JObject, JValue};
 use jni::sys::jsize;
 use jni::JNIEnv;
+use std::collections::HashMap;
+
+/// Document metadata: a map from metadata key to its string value.
+type Metadata = HashMap<String, String>;
 
 /// Wrapper for [`JObject`]s that contain `org.apache.commons.io.input.ReaderInputStream`
 /// It saves a GlobalRef to the java object, which is cleared when the last GlobalRef is dropped
@@ -104,8 +108,9 @@ impl Drop for JReaderInputStream {
 
 /// Wrapper for the Java class  `ai.yobix.StringResult`
 /// Upon creation it parses the java StringResult object and saves the converted Rust string and metadata map
-pub(crate) struct JStringResult {
-    pub(crate) content: String,
+pub struct JStringResult {
+    pub content: String,
+    pub metadata: Metadata,
 }
 
 impl<'local> JStringResult {
@@ -130,7 +135,10 @@ impl<'local> JStringResult {
 
             let content = jni_jobject_to_string(env, call_result_obj)?;
 
-            Ok(Self { content })
+            let metadata_obj_hashmap: JObject = env.call_method(&obj, "getMetadata", "()Ljava/util/HashMap;", &[])?
+                .l()?;
+            let metadata = jni_jobject_hashmap_to_hashmap(env, metadata_obj_hashmap)?;
+            Ok(Self { content, metadata })
         }
     }
 }

diff --git a/extractous-core/tests/extract_to_string_tests.rs b/extractous-core/tests/extract_to_string_tests.rs
@@ -1,6 +1,6 @@
 extern crate test_case;
 extern crate textdistance;
-
+use std::collections::HashMap;
 use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig};
 use std::fs;
 use test_case::test_case;
@@ -39,6 +39,44 @@ fn test_extract_file_to_string(file_name: &str, target_dist: f64) {
     println!("{}: {}", file_name, dist);
 }
 
+// For each listed document, checks that the extracted text is similar enough to the
+// stored expected text and that the extracted metadata equals the stored expected metadata.
+#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")]
+#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")]
+#[test_case("simple.odt", 0.8; "Test ODT file")]
+#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")]
+#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")]
+#[test_case("category-level.docx", 0.9; "Test DOCX file")]
+#[test_case("simple.doc", 0.9; "Test DOC file")]
+#[test_case("simple.pptx", 0.9; "Test another PPTX file")]
+#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
+#[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
+#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
+//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
+fn test_extract_file_to_string_with_metadata(file_name: &str, target_dist: f64) {
+    let document_path = format!("../test_files/documents/{}", file_name);
+    let expected_text_path = format!("../test_files/expected_result/{}.txt", file_name);
+    let expected_metadata_path =
+        format!("../test_files/expected_result/{}.metadata.json", file_name);
+
+    // Extract text and metadata from the document under test.
+    let extractor = Extractor::new().set_extract_string_max_length(1000000);
+    let (actual_text, actual_metadata) = extractor
+        .extract_file_to_string_with_metadata(&document_path)
+        .unwrap();
+
+    // Load the stored expectations: plain text plus a JSON object of string pairs.
+    let expected_text = fs::read_to_string(expected_text_path).unwrap();
+    let expected_metadata_json = fs::read_to_string(expected_metadata_path).unwrap();
+    let expected_metadata: HashMap<String, String> =
+        serde_json::from_str(&expected_metadata_json).expect("JSON was not well-formatted");
+
+    // Text is compared by similarity, metadata by exact equality.
+    let similarity = cosine(&expected_text, &actual_text);
+    assert!(
+        similarity > target_dist,
+        "Cosine similarity is less than {} for file: {}, dist: {}",
+        target_dist,
+        file_name,
+        similarity
+    );
+    assert_eq!(actual_metadata, expected_metadata, "The HashMaps do not contain the same data!");
+}
+
 #[test]
 fn test_extract_file_to_string_ara_ocr_png() {
     let extractor = Extractor::new()