Merge pull request #38 from yobix-ai/parse-as-xml
feat: implemented extracting output as xml
nmammeri authored Dec 20, 2024
2 parents 9881d0b + be5d280 commit 012ef11
Showing 24 changed files with 626 additions and 111 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/release_python.yml
@@ -1,6 +1,6 @@
# This file was autogenerated by maturin v1.6.0 using:
# maturin generate-ci github -o ../../.github/workflows/release_pyton_pytest.yml --pytest
#
#
# Then adapted to the project
#
name: CI
@@ -38,7 +38,7 @@ jobs:
python-version: '3.8'

# On linux we don't use graalvm/[email protected] action to install graalvm because it will install it
# on the runner machine and on linux the build will happen inside a manylinux docker.
# on the runner machine and on linux the build will happen inside a manylinux docker.
# Instead, we use a script to install graalvm inside the docker container
# the script is launched by setting the before-script-linux config option of the maturin action
- name: Build wheels
@@ -60,7 +60,7 @@ jobs:
with:
name: wheels-linux-${{ matrix.platform.target }}
path: bindings/extractous-python/dist

- name: pytest
if: ${{ startsWith(matrix.platform.target, 'x86_64') }}
shell: bash
@@ -70,7 +70,7 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings/extractous-python
pytest -s
@@ -85,7 +85,7 @@ jobs:
apt-get update
apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara
apt-get install -y --no-install-recommends python3 python3-pip
pip3 install -U pip pytest scikit-learn
pip3 install -U pip pytest scikit-learn lxml
run: |
set -e
pip3 install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
@@ -129,7 +129,7 @@ jobs:
python -m venv .venv
.venv\Scripts\activate.bat
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings\extractous-python
pytest -s .
@@ -186,7 +186,7 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings/extractous-python
pytest -s
@@ -206,7 +206,7 @@ jobs:
name: wheels-sdist
path: bindings/extractous-python/dist

# Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
# Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
# We use 2 actions: one to publish on PyPI on tag pushes to the main branch and the other to publish on TestPyPI on any push
publish-to-testpypi:
name: Publish to TestPyPI
4 changes: 4 additions & 0 deletions README.md
@@ -72,6 +72,8 @@ from extractous import Extractor
# Create a new extractor
extractor = Extractor()
extractor = extractor.set_extract_string_max_length(1000)
# if you need xml output
# extractor = extractor.set_xml_output(True)

# Extract text from a file
result, metadata = extractor.extract_file_to_string("README.md")
@@ -125,6 +127,8 @@ use extractous::Extractor;
fn main() {
// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);
// if you need xml output
// extractor = extractor.set_xml_output(true);

// Extract text from a file
let (text, metadata) = extractor.extract_file_to_string("README.md").unwrap();
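For orientation, here is a minimal end-to-end sketch of the new XML mode from Python, tying the two README snippets above together. It assumes the extractor emits Tika-style XHTML with a body element (the exact markup is not shown in this diff) and uses lxml, the dependency this PR adds to the CI test environments:

from extractous import Extractor
from lxml import etree

# set_xml_output returns a new Extractor, so rebind the result
extractor = Extractor().set_xml_output(True)
xml_string, metadata = extractor.extract_file_to_string("README.md")

# Assumed XHTML shape: collect the readable text under <body>,
# matching by local name so a declared namespace does not matter
root = etree.fromstring(xml_string.encode("utf-8"), parser=etree.XMLParser(recover=True))
bodies = root.xpath("//*[local-name()='body']") if root is not None else []
text = "".join(bodies[0].itertext()) if bodies else ""
print(text[:200])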
4 changes: 3 additions & 1 deletion bindings/extractous-python/README.md
@@ -20,7 +20,9 @@ from extractous import Extractor

# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)
extractor = extractor.set_extract_string_max_length(1000)
# if you need xml output
# extractor = extractor.set_xml_output(True)

# Extract text from a file
result, metadata = extractor.extract_file_to_string("README.md")
8 changes: 4 additions & 4 deletions bindings/extractous-python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "extractous"
version = '0.2.0'
version = '0.2.1'
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
@@ -50,19 +50,19 @@ module-name = "extractous._extractous"
python-source = "python"

# Setting skip-auditwheel=true is very important to instruct maturin to not run its auditwheel flow
# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs
# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs
# By skipping the wheel, we just get a plain _extracts_rs* lib, and we have to:
# * bundle our graalvm libs using the below include [] directive
# * change the RPATH of _extracts_rs* lib to be able to properly find the bundled graalvm libs
skip-auditwheel=true
skip-auditwheel=true

# This tells cargo to set the RPATH for the private module built lib _extractous.abi3.so
# Set the RPATH to $ORIGIN because the graalvm libs will be bundled in the same dir as the _extractous.abi3.so
rustc-args = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"]

# Maturin include command will start looking from the python/extractous folder
# so to include the graalvm libs the rust build script must copy them to python/extractous folder
include = [
include = [
{path = "**/*.so", format = ["wheel"]},
{path = "**/*.dylib", format = ["wheel"]},
{path = "**/*.dll", format = ["wheel"]}
6 changes: 6 additions & 0 deletions bindings/extractous-python/src/extractor.rs
@@ -136,6 +136,12 @@ impl Extractor {
Ok(Self(inner))
}

/// Set whether the extracted output should be XML instead of plain text
pub fn set_xml_output(&self, xml_output: bool) -> PyResult<Self> {
let inner = self.0.clone().set_xml_output(xml_output);
Ok(Self(inner))
}

/// Extracts text from a file path. Returns a tuple with stream of the extracted text
/// the stream is decoded using the extractor's `encoding` and tika metadata.
pub fn extract_file<'py>(
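Note that the binding above takes &self, clones the inner extractor, and returns a new wrapper, so the Python-side setter is non-mutating; callers must rebind the result, which is exactly what the updated README and the new tests do. A small illustrative sketch:

from extractous import Extractor

extractor = Extractor()
extractor.set_xml_output(True)               # returns a new Extractor; `extractor` itself is unchanged
extractor = extractor.set_xml_output(True)   # correct usage: keep the returned, configured instance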
66 changes: 64 additions & 2 deletions bindings/extractous-python/tests/test_extract_bytes.py
@@ -3,7 +3,7 @@

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
is_expected_metadata_contained
is_expected_metadata_contained, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
@@ -49,6 +49,36 @@ def test_extract_bytes_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_string_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_bytes(file_bytes)
result_xml = read_to_string(reader)
result_text = extract_body_text(result_xml)

# Check Expected
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
@@ -76,4 +106,36 @@ def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"


@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes to stream as xml of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_bytes(file_bytes)
result_xml = read_to_string(reader)
result_text = extract_body_text(result_xml)

# Check Expected
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
67 changes: 63 additions & 4 deletions bindings/extractous-python/tests/test_extract_file.py
@@ -2,7 +2,7 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
@@ -21,7 +21,7 @@

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_string(file_name, target_dist, metadata_dist):
"""Test the extraction and comparison of various file types."""
"""Test the extraction to string as plain text of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
@@ -49,10 +49,39 @@ def test_extract_file_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_string_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction to string as XML of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
extractor = Extractor()
extractor = extractor.set_xml_output(True)
result_xml, metadata = extractor.extract_file_to_string(original_filepath)
result_text = extract_body_text(result_xml)

# Check extracted
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"



@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
"""Test the extraction from bytes to stream of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
@@ -75,4 +104,34 @@ def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"


@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_file(original_filepath)
result_xml = read_to_string(reader)
result_text = extract_body_text(result_xml)

# Check extracted
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
11 changes: 10 additions & 1 deletion bindings/extractous-python/tests/test_extract_url.py
@@ -16,4 +16,13 @@ def test_extract_url_to_string():
content, metadata = extractor.extract_url_to_string("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
assert len(metadata.keys()) > 0

def test_extract_url_to_string_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)

content, metadata = extractor.extract_url_to_string("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
27 changes: 26 additions & 1 deletion bindings/extractous-python/tests/test_pdf.py
@@ -1,5 +1,5 @@
from extractous import Extractor
from utils import read_to_string
from utils import read_to_string, extract_body_text


def expected_result():
@@ -22,6 +22,17 @@ def test_extract_file():
print(f"test_pdf:test_extract_file result = {result}")
assert result == expected_result()

def test_extract_file_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_file("tests/quarkus.pdf")

result_xml = read_to_string(reader)

print(f"test_pdf:test_extract_file_as_xml result = {result_xml}")
result_text = extract_body_text(result_xml)
assert result_text.strip() == expected_result().strip()

def test_extract_bytes():
extractor = Extractor()

@@ -33,3 +44,17 @@ def test_extract_bytes():

print(f"test_pdf:test_extract_bytes result = {result}")
assert result == expected_result()

def test_extract_bytes_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)

with open("tests/quarkus.pdf", "rb") as file:
buffer = bytearray(file.read())
reader, metadata = extractor.extract_bytes(buffer)

result_xml = read_to_string(reader)

print(f"test_pdf:test_extract_bytes_as_xml result = {result_xml}")
result_text = extract_body_text(result_xml)
assert result_text.strip() == expected_result().strip()