Merge pull request #38 from yobix-ai/parse-as-xml
feat: implemented extracting output as xml
nmammeri authored Dec 20, 2024
2 parents 9881d0b + be5d280 commit 012ef11
Showing 24 changed files with 626 additions and 111 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/release_python.yml
@@ -1,6 +1,6 @@
# This file was autogenerated by maturin v1.6.0 using:
# maturin generate-ci github -o ../../.github/workflows/release_pyton_pytest.yml --pytest
#
#
# Then adapted to the project
#
name: CI
@@ -38,7 +38,7 @@ jobs:
python-version: '3.8'

# On linux we don't use graalvm/[email protected] action to install graalvm because it will install it
# on the runner machine and on linux the build will happen inside a manylinux docker.
# on the runner machine and on linux the build will happen inside a manylinux docker.
# Instead, we use a script to install graalvm inside the docker container
# the script is launched by setting the before-script-linux config option of the maturin action
- name: Build wheels
@@ -60,7 +60,7 @@ jobs:
with:
name: wheels-linux-${{ matrix.platform.target }}
path: bindings/extractous-python/dist

- name: pytest
if: ${{ startsWith(matrix.platform.target, 'x86_64') }}
shell: bash
@@ -70,7 +70,7 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings/extractous-python
pytest -s
@@ -85,7 +85,7 @@ jobs:
apt-get update
apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara
apt-get install -y --no-install-recommends python3 python3-pip
pip3 install -U pip pytest scikit-learn
pip3 install -U pip pytest scikit-learn lxml
run: |
set -e
pip3 install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
@@ -129,7 +129,7 @@ jobs:
python -m venv .venv
.venv\Scripts\activate.bat
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings\extractous-python
pytest -s .
@@ -186,7 +186,7 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings/extractous-python
pytest -s
@@ -206,7 +206,7 @@ jobs:
name: wheels-sdist
path: bindings/extractous-python/dist

# Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
# Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
# We use 2 actions: one to publish on PyPI on tag pushes to the main branch and the other to publish on TestPyPI on any push
publish-to-testpypi:
name: Publish to TestPyPI
4 changes: 4 additions & 0 deletions README.md
@@ -72,6 +72,8 @@ from extractous import Extractor
# Create a new extractor
extractor = Extractor()
extractor = extractor.set_extract_string_max_length(1000)
# if you need xml output
# extractor = extractor.set_xml_output(True)

# Extract text from a file
result, metadata = extractor.extract_file_to_string("README.md")
@@ -125,6 +127,8 @@ use extractous::Extractor;
fn main() {
// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);
// if you need xml output
// extractor = extractor.set_xml_output(true);

// Extract text from a file
let (text, metadata) = extractor.extract_file_to_string("README.md").unwrap();
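For orientation, here is a minimal end-to-end sketch of the new XML mode from Python, tying the two README snippets above together. It assumes the extractor emits Tika-style XHTML with a body element (the exact markup is not shown in this diff) and uses lxml, the dependency this PR adds to the CI test environments:

from extractous import Extractor
from lxml import etree

# set_xml_output returns a new Extractor, so rebind the result
extractor = Extractor().set_xml_output(True)
xml_string, metadata = extractor.extract_file_to_string("README.md")

# Assumed XHTML shape: collect the readable text under <body>,
# matching by local name so a declared namespace does not matter
root = etree.fromstring(xml_string.encode("utf-8"), parser=etree.XMLParser(recover=True))
bodies = root.xpath("//*[local-name()='body']") if root is not None else []
text = "".join(bodies[0].itertext()) if bodies else ""
print(text[:200])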
4 changes: 3 additions & 1 deletion bindings/extractous-python/README.md
@@ -20,7 +20,9 @@ from extractous import Extractor

# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)
extractor = extractor.set_extract_string_max_length(1000)
# if you need xml output
# extractor = extractor.set_xml_output(True)

# Extract text from a file
result, metadata = extractor.extract_file_to_string("README.md")
8 changes: 4 additions & 4 deletions bindings/extractous-python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "extractous"
version = '0.2.0'
version = '0.2.1'
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
@@ -50,19 +50,19 @@ module-name = "extractous._extractous"
python-source = "python"

# Setting skip-auditwheel=true is very important to instruct maturin to not run its auditwheel flow
# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs
# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs
# By skipping the wheel, we just get a plain _extracts_rs* lib, and we have to:
# * bundle our graalvm libs using the below include [] directive
# * change the RPATH of _extracts_rs* lib to be able to properly find the bundled graalvm libs
skip-auditwheel=true
skip-auditwheel=true

# This tells cargo to set the RPATH for the private module built lib _extractous.abi3.so
# Set the RPATH to $ORIGIN because the graalvm libs will be bundled in the same dir as the _extractous.abi3.so
rustc-args = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"]

# Maturin include command will start looking from the python/extractous folder
# so to include the graalvm libs the rust build script must copy them to python/extractous folder
include = [
include = [
{path = "**/*.so", format = ["wheel"]},
{path = "**/*.dylib", format = ["wheel"]},
{path = "**/*.dll", format = ["wheel"]}
6 changes: 6 additions & 0 deletions bindings/extractous-python/src/extractor.rs
@@ -136,6 +136,12 @@ impl Extractor {
Ok(Self(inner))
}

/// Set whether the extracted output should be XML instead of plain text
pub fn set_xml_output(&self, xml_output: bool) -> PyResult<Self> {
let inner = self.0.clone().set_xml_output(xml_output);
Ok(Self(inner))
}

/// Extracts text from a file path. Returns a tuple with stream of the extracted text
/// the stream is decoded using the extractor's `encoding` and tika metadata.
pub fn extract_file<'py>(
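Note that the binding above takes &self, clones the inner extractor, and returns a new wrapper, so the Python-side setter is non-mutating; callers must rebind the result, which is exactly what the updated README and the new tests do. A small illustrative sketch:

from extractous import Extractor

extractor = Extractor()
extractor.set_xml_output(True)               # returns a new Extractor; `extractor` itself is unchanged
extractor = extractor.set_xml_output(True)   # correct usage: keep the returned, configured instance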
66 changes: 64 additions & 2 deletions bindings/extractous-python/tests/test_extract_bytes.py
@@ -3,7 +3,7 @@

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
is_expected_metadata_contained
is_expected_metadata_contained, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
@@ -49,6 +49,36 @@ def test_extract_bytes_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_string_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_bytes(file_bytes)
result_xml = read_to_string(reader)
result_text = extract_body_text(result_xml)

# Check Expected
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
@@ -76,4 +106,36 @@ def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"


@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes to stream as xml of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_bytes(file_bytes)
result_xml = read_to_string(reader)
result_text = extract_body_text(result_xml)

# Check Expected
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
67 changes: 63 additions & 4 deletions bindings/extractous-python/tests/test_extract_file.py
@@ -2,7 +2,7 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
@@ -21,7 +21,7 @@

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_string(file_name, target_dist, metadata_dist):
"""Test the extraction and comparison of various file types."""
"""Test the extraction to string as plain text of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
@@ -49,10 +49,39 @@ def test_extract_file_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_string_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction to string as XML of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
extractor = Extractor()
extractor = extractor.set_xml_output(True)
result_xml, metadata = extractor.extract_file_to_string(original_filepath)
result_text = extract_body_text(result_xml)

# Check extracted
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"



@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
"""Test the extraction from bytes to stream of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
@@ -75,4 +104,34 @@ def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"


@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_file(original_filepath)
result_xml = read_to_string(reader)
result_text = extract_body_text(result_xml)

# Check extracted
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
11 changes: 10 additions & 1 deletion bindings/extractous-python/tests/test_extract_url.py
@@ -16,4 +16,13 @@ def test_extract_url_to_string():
content, metadata = extractor.extract_url_to_string("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
assert len(metadata.keys()) > 0

def test_extract_url_to_string_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)

content, metadata = extractor.extract_url_to_string("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
27 changes: 26 additions & 1 deletion bindings/extractous-python/tests/test_pdf.py
@@ -1,5 +1,5 @@
from extractous import Extractor
from utils import read_to_string
from utils import read_to_string, extract_body_text


def expected_result():
@@ -22,6 +22,17 @@ def test_extract_file():
print(f"test_pdf:test_extract_file result = {result}")
assert result == expected_result()

def test_extract_file_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_file("tests/quarkus.pdf")

result_xml = read_to_string(reader)

print(f"test_pdf:test_extract_file_as_xml result = {result_xml}")
result_text = extract_body_text(result_xml)
assert result_text.strip() == expected_result().strip()

def test_extract_bytes():
extractor = Extractor()

@@ -33,3 +44,17 @@ def test_extract_bytes():

print(f"test_pdf:test_extract_bytes result = {result}")
assert result == expected_result()

def test_extract_bytes_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)

with open("tests/quarkus.pdf", "rb") as file:
buffer = bytearray(file.read())
reader, metadata = extractor.extract_bytes(buffer)

result_xml = read_to_string(reader)

print(f"test_pdf:test_extract_bytes_as_xml result = {result_xml}")
result_text = extract_body_text(result_xml)
assert result_text.strip() == expected_result().strip()