Merge pull request #1 from s4zuk3/3-return-tika-metadata-extend

Changed HashMap to Tika Metadata. Extended functionality for bytes, f…
yobix-ai · Nov 14, 2024 · 9d8971e · 9d8971e
2 parents 1b8039c + f2901cf
commit 9d8971e
Show file tree

Hide file tree

Showing 33 changed files with 1,256 additions and 2,853 deletions.
diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs
@@ -153,6 +153,25 @@ impl Extractor {
         })
     }
 
+
+    /// Extracts text from a file path. Returns a tuple of a stream of the extracted text,
+    /// decoded using the extractor's `encoding`, and the tika metadata as a Python dict.
+    pub fn extract_file_with_metadata(&self, filename: &str) -> PyResult<(StreamReader, PyObject)> {
+        let (reader, metadata) = self.0
+            .extract_file_with_metadata(filename)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        // Under the GIL: convert the metadata to a dict and build a `StreamReader` whose buffer has an initial capacity of ecore::DEFAULT_BUF_SIZE bytes
+        Python::with_gil(|py| {
+            let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
+            Ok((StreamReader {
+                reader,
+                buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+                py_bytes: None,
+            }, py_metadata.into()))
+        })
+    }
+
     /// Extracts text from a file path. Returns a string that is of maximum length
     /// of the extractor's `extract_string_max_length`
     pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
@@ -169,7 +188,7 @@ impl Extractor {
             .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
 
         Python::with_gil(|py| {
-            let py_metadata = hashmap_to_pydict(py, &metadata);
+            let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
             Ok((content, py_metadata.into()))
         })
     }
@@ -191,6 +210,25 @@ impl Extractor {
         })
     }
 
+    /// Extracts text from a bytearray. Returns a tuple of a stream of the extracted text,
+    /// decoded using the extractor's `encoding`, and the tika metadata as a Python dict.
+    pub fn extract_bytes_with_metadata(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<(StreamReader, PyObject)> {
+        let slice = buffer.to_vec(); // copies the bytearray contents into an owned Vec
+        let (reader, metadata) = self.0
+            .extract_bytes_with_metadata(&slice)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        // Under the GIL: convert the metadata to a dict and build a `StreamReader` whose buffer has an initial capacity of ecore::DEFAULT_BUF_SIZE bytes
+        Python::with_gil(|py| {
+            let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
+            Ok((StreamReader {
+                reader,
+                buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+                py_bytes: None,
+            }, py_metadata.into()))
+        })
+    }
+
     /// Extracts text from a url. Returns a string that is of maximum length
     /// of the extractor's `extract_string_max_length`
     pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
@@ -207,15 +245,34 @@ impl Extractor {
         })
     }
 
+    /// Extracts text from a url. Returns a tuple of a stream of the extracted text and the
+    /// tika metadata as a Python dict; the text is streamed, not returned as a string.
+    pub fn extract_url_with_metadata(&self, url: &str) -> PyResult<(StreamReader, PyObject)> {
+        let (reader, metadata) = self.0
+            .extract_url_with_metadata(&url)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        // Under the GIL: convert the metadata to a dict and build a `StreamReader` whose buffer has an initial capacity of ecore::DEFAULT_BUF_SIZE bytes
+        Python::with_gil(|py| {
+            let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
+            Ok((StreamReader {
+                reader,
+                buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+                py_bytes: None,
+            }, py_metadata.into()))
+        })
+    }
+
     fn __repr__(&self) -> String {
         format!("{:?}", self.0)
     }
 }
 
-fn hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, String>) -> &'py PyDict {
+fn metadata_hashmap_to_pydict<'py>(py: Python<'py>, hashmap: &HashMap<String, Vec<String>>) -> Result<&'py PyDict, PyErr> {
     let pydict = PyDict::new(py);
     for (key, value) in hashmap {
-        pydict.set_item(key, value).unwrap();
+        pydict.set_item(key, value)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
     }
-    pydict
+    Ok(pydict)
 }
diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
@@ -1,7 +1,8 @@
+import json
 import pytest
 
 from extractous import Extractor
-from utils import cosine_similarity, read_to_string, read_file_to_bytearray
+from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray
 
 TEST_CASES = [
     ("2022_Q3_AAPL.pdf", 0.9),
@@ -38,3 +39,32 @@ def test_extract_bytes_to_stream(file_name, target_dist):
     assert cosine_similarity(result, expected) > target_dist, \
         f"Cosine similarity is less than {target_dist} for file: {file_name}"
 
+
+TEST_CASES_METADATA = [
+    ("2022_Q3_AAPL.pdf", 0.9),
+    ("science-exploration-1p.pptx", 0.9),
+    ("simple.odt", 0.9),
+    ("table-multi-row-column-cells-actual.csv", 0.6),
+    ("vodafone.xlsx", 0.8),
+    ("category-level.docx", 0.9),
+    ("simple.doc", 0.9),
+    ("simple.pptx", 0.9),
+    ("table-multi-row-column-cells.png", 0.9),
+    ("winter-sports.epub", 0.8),
+    ("bug_16.docx", 0.9),
+]
+
+
+@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
+def test_extract_bytes_to_stream_with_metadata(file_name, similarity_percent):
+    """Test the metadata returned when extracting from bytes of various file types."""
+    original_filepath = f"../../test_files/documents/{file_name}"
+    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
+    file_bytes = read_file_to_bytearray(original_filepath)
+    extractor = Extractor()
+    _reader, metadata = extractor.extract_bytes_with_metadata(file_bytes)
+    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
+        expected_metadata = json.load(file)
+    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
+    assert percent_similarity > similarity_percent, \
+        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file_to_string.py
@@ -1,7 +1,8 @@
+import json
 import pytest
 
 from extractous import Extractor
-from utils import cosine_similarity
+from utils import calculate_similarity_percent, cosine_similarity
 
 TEST_CASES = [
     ("2022_Q3_AAPL.pdf", 0.9),
@@ -32,3 +33,30 @@ def test_extract_file_to_string(file_name, target_dist):
     assert cosine_similarity(result, expected) > target_dist, \
         f"Cosine similarity is less than {target_dist} for file: {file_name}"
 
+
+TEST_CASES_METADATA = [
+    ("2022_Q3_AAPL.pdf", 0.9),
+    ("science-exploration-1p.pptx", 0.9),
+    ("simple.odt", 0.9),
+    ("table-multi-row-column-cells-actual.csv", 0.6),
+    ("vodafone.xlsx", 0.8),
+    ("category-level.docx", 0.9),
+    ("simple.doc", 0.9),
+    ("simple.pptx", 0.9),
+    ("table-multi-row-column-cells.png", 0.9),
+    ("winter-sports.epub", 0.8),
+    ("bug_16.docx", 0.9),
+]
+
+@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
+def test_extract_file_to_string_with_metadata(file_name, similarity_percent):
+    """Test that metadata extracted alongside the string matches the stored expected metadata JSON."""
+    original_filepath = f"../../test_files/documents/{file_name}"
+    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
+    extractor = Extractor()
+    _result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)
+    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
+        expected_metadata = json.load(file)
+    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
+    assert percent_similarity > similarity_percent, \
+        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
diff --git a/bindings/extractous-python/tests/test_extract_file_to_string_with_metadata.py b/bindings/extractous-python/tests/test_extract_file_to_string_with_metadata.py
diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py
@@ -8,3 +8,8 @@ def test_extract_url():
     result = read_to_string(reader)
 
     assert "Google" in result
+
+def test_extract_url_with_metadata():
+    extractor = Extractor()
+    _reader, metadata = extractor.extract_url_with_metadata("https://www.google.com")  # NOTE(review): presumably fetches this live URL, so the test needs network access - confirm
+    assert len(metadata.keys()) > 0  # only checks that some metadata came back, not its content
diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py
@@ -45,3 +45,21 @@ def read_file_to_bytearray(file_path: str):
     with open(file_path, 'rb') as file:
         file_content = bytearray(file.read())
     return file_content
+
+
+def calculate_similarity_percent(expected, current):
+    matches = 0
+    total = 0
+
+    # Compare only keys present in both dicts; keys of 'expected' missing from 'current' are skipped
+    for key, value1 in expected.items():
+        if key in current:
+            total += 1
+            if value1 == current[key]:
+                matches += 1
+
+    if total == 0:
+        return 0.0  # no shared keys: report zero similarity instead of dividing by zero
+
+    # Despite the name, the result is a fraction in [0.0, 1.0] of shared keys with equal values
+    return matches / total