Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

35: Parse as xml #38

Merged
merged 7 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions .github/workflows/release_python.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file was autogenerated by maturin v1.6.0 using:
# maturin generate-ci github -o ../../.github/workflows/release_pyton_pytest.yml --pytest
#
#
# Then adapted to the project
#
name: CI
Expand Down Expand Up @@ -38,7 +38,7 @@ jobs:
python-version: '3.8'

# On linux we don't use graalvm/[email protected] action to install graalvm because it will install it
# on the runner machine and on linux the build will happen inside a manylinux docker.
# on the runner machine and on linux the build will happen inside a manylinux docker.
# Instead, we use a script to install graalvm inside the docker container
# the script is launched by setting the before-script-linux config option of the maturin action
- name: Build wheels
Expand All @@ -60,7 +60,7 @@ jobs:
with:
name: wheels-linux-${{ matrix.platform.target }}
path: bindings/extractous-python/dist

- name: pytest
if: ${{ startsWith(matrix.platform.target, 'x86_64') }}
shell: bash
Expand All @@ -70,7 +70,7 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings/extractous-python
pytest -s

Expand All @@ -85,7 +85,7 @@ jobs:
apt-get update
apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara
apt-get install -y --no-install-recommends python3 python3-pip
pip3 install -U pip pytest scikit-learn
pip3 install -U pip pytest scikit-learn lxml
run: |
set -e
pip3 install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
Expand Down Expand Up @@ -129,7 +129,7 @@ jobs:
python -m venv .venv
.venv\Scripts\activate.bat
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings\extractous-python
pytest -s .

Expand Down Expand Up @@ -186,7 +186,7 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings/extractous-python
pytest -s

Expand All @@ -206,7 +206,7 @@ jobs:
name: wheels-sdist
path: bindings/extractous-python/dist

# Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
# Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
# We use 2 actions one to publish on PyPi on tag pushes to main brnach and the other to publish on TestPyPi on any push
publish-to-testpypi:
name: Publish to TestPyPI
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ from extractous import Extractor
# Create a new extractor
extractor = Extractor()
extractor = extractor.set_extract_string_max_length(1000)
# if you need an xml
# extractor = extractor.set_xml_output(True)

# Extract text from a file
result, metadata = extractor.extract_file_to_string("README.md")
Expand Down Expand Up @@ -125,6 +127,8 @@ use extractous::Extractor;
fn main() {
// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);
// if you need an xml
// extractor = extractor.set_parse_string_as_xml(false);

// Extract text from a file
let (text, metadata) = extractor.extract_file_to_string("README.md").unwrap();
Expand Down
4 changes: 3 additions & 1 deletion bindings/extractous-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ from extractous import Extractor

# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)
extractor = extractor.set_extract_string_max_length(1000)
# if you need an xml
# extractor = extractor.set_xml_output(True)

# Extract text from a file
result, metadata = extractor.extract_file_to_string("README.md")
Expand Down
8 changes: 4 additions & 4 deletions bindings/extractous-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "extractous"
version = '0.2.0'
version = '0.2.1'
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
Expand Down Expand Up @@ -50,19 +50,19 @@ module-name = "extractous._extractous"
python-source = "python"

# Setting skip-auditwheel=true is very important to instruct maturin to not run its auditwheel flow
# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs
# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs
# By skipping the wheel, we just get a plain _extracts_rs* lib, and we have to:
# * bundle our graalvm libs using the below include [] directive
# * change the RPATH of _extracts_rs* lib to be able to properly find the bundled graalvm libs
skip-auditwheel=true
skip-auditwheel=true

# This tells cargo to set the RPATH for the private module built lib _extractous.abi3.so
# Set the RPATH to $ORIGIN because the graalvm libs will be bundled in the same dir as the _extractous.abi3.so
rustc-args = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"]

# Maturin include command will start looking from the python/extractous folder
# so to include the graalvm libs the rust build script must copy them to python/extractous folder
include = [
include = [
{path = "**/*.so", format = ["wheel"]},
{path = "**/*.dylib", format = ["wheel"]},
{path = "**/*.dll", format = ["wheel"]}
Expand Down
6 changes: 6 additions & 0 deletions bindings/extractous-python/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ impl Extractor {
Ok(Self(inner))
}

/// Set the configuration for the parse as xml
pub fn set_xml_output(&self, xml_output: bool) -> PyResult<Self> {
let inner = self.0.clone().set_xml_output(xml_output);
Ok(Self(inner))
}

/// Extracts text from a file path. Returns a tuple with stream of the extracted text
/// the stream is decoded using the extractor's `encoding` and tika metadata.
pub fn extract_file<'py>(
Expand Down
66 changes: 64 additions & 2 deletions bindings/extractous-python/tests/test_extract_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
is_expected_metadata_contained
is_expected_metadata_contained, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
Expand Down Expand Up @@ -49,6 +49,36 @@ def test_extract_bytes_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_string_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
extractor = extractor.set_xml_output(True)
result_xml, metadata = extractor.extract_file_to_string(original_filepath)
result_text = extract_body_text(result_xml)

# Check Expected
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
Expand Down Expand Up @@ -76,4 +106,36 @@ def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"


@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes to stream as xml of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_bytes(file_bytes)
result_xml = read_to_string(reader)
result_text = extract_body_text(result_xml)

# Check Expected
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
67 changes: 63 additions & 4 deletions bindings/extractous-python/tests/test_extract_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
Expand All @@ -21,7 +21,7 @@

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_string(file_name, target_dist, metadata_dist):
"""Test the extraction and comparison of various file types."""
"""Test the extraction to string as plain text of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
Expand Down Expand Up @@ -49,10 +49,39 @@ def test_extract_file_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_string_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction to string as XML of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
extractor = Extractor()
extractor = extractor.set_xml_output(True)
result_xml, metadata = extractor.extract_file_to_string(original_filepath)
result_text = extract_body_text(result_xml)

# Check extracted
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"



@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
"""Test the extraction from bytes to stream of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
Expand All @@ -75,4 +104,34 @@ def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"


@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream_as_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_file(original_filepath)
result_xml = read_to_string(reader)
result_text = extract_body_text(result_xml)

# Check extracted
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
11 changes: 10 additions & 1 deletion bindings/extractous-python/tests/test_extract_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,13 @@ def test_extract_url_to_string():
content, metadata = extractor.extract_url_to_string("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
assert len(metadata.keys()) > 0

def test_extract_url_to_string_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)

content, metadata = extractor.extract_url_to_string("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
27 changes: 26 additions & 1 deletion bindings/extractous-python/tests/test_pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from extractous import Extractor
from utils import read_to_string
from utils import read_to_string, extract_body_text


def expected_result():
Expand All @@ -22,6 +22,17 @@ def test_extract_file():
print(f"test_pdf:test_extract_file result = {result}")
assert result == expected_result()

def test_extract_file_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)
reader, metadata = extractor.extract_file("tests/quarkus.pdf")

result_xml = read_to_string(reader)

print(f"test_pdf:test_extract_file_as_xml result = {result_xml}")
result_text = extract_body_text(result_xml)
assert result_text.strip() == expected_result().strip()

def test_extract_bytes():
extractor = Extractor()

Expand All @@ -33,3 +44,17 @@ def test_extract_bytes():

print(f"test_pdf:test_extract_bytes result = {result}")
assert result == expected_result()

def test_extract_bytes_as_xml():
extractor = Extractor()
extractor = extractor.set_xml_output(True)

with open("tests/quarkus.pdf", "rb") as file:
buffer = bytearray(file.read())
reader, metadata = extractor.extract_bytes(buffer)

result_xml = read_to_string(reader)

print(f"test_pdf:test_extract_bytes_as_xml result = {result_xml}")
result_text = extract_body_text(result_xml)
assert result_text.strip() == expected_result().strip()
Loading
Loading