Skip to content

Commit

Permalink
Merge pull request #27 from yobix-ai/25-make-reflection-data-platform…
Browse files Browse the repository at this point in the history
…-specific

25 make reflection data platform specific
  • Loading branch information
nmammeri authored Nov 17, 2024
2 parents 6ee0cd6 + 8169f81 commit 6831a6d
Show file tree
Hide file tree
Showing 24 changed files with 18,259 additions and 6,810 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/install-graalvm-sdkman.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -e -x
# Check for correct number of arguments
if [ "$#" -ne 1 ]; then
echo "Usage: $0 <graalvm-sdkman-version>"
echo " Installs graalvm using sdkman, example: $0 22.0.1-graalce"
echo " Installs graalvm using sdkman, example: $0 23.0.1-graalce"
exit 1
fi

Expand Down
65 changes: 0 additions & 65 deletions .github/workflows/install-graalvm.sh

This file was deleted.

5 changes: 0 additions & 5 deletions .github/workflows/install-openssl.sh

This file was deleted.

32 changes: 18 additions & 14 deletions .github/workflows/release_python.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This file was autogenerated by maturin v1.6.0 using:
# maturin generate-ci github -o ../../.github/workflows/release_pyton_pytest.yml --pytest
#
# Then adapted to to the project
# Then adapted to the project
#
name: CI

Expand Down Expand Up @@ -37,7 +37,7 @@ jobs:
with:
python-version: '3.8'

# On linux we dont use graalvm/setup-graalvm@v1.2.2 action to install graalvm because it will install it
# On linux we don't use graalvm/setup-graalvm@v1.2.5 action to install graalvm because it will install it
# on the runner machine and on linux the build will happen inside a manylinux docker.
# Instead, we use a script to install graalvm inside the docker container
# the script is launched by setting the before-script-linux config option of the maturin action
Expand All @@ -50,9 +50,8 @@ jobs:
sccache: 'false'
target: ${{ matrix.platform.target }}
container: quay.io/pypa/manylinux_2_28_${{ matrix.platform.target }}:latest
#before-script-linux: .github/workflows/install-openssl.sh
before-script-linux: .github/workflows/install-graalvm-sdkman.sh 22.0.1-graalce
docker-options: "-e JAVA_HOME=/root/.sdkman/candidates/java/22.0.1-graalce -e GRAALVM_HOME=/root/.sdkman/candidates/java/22.0.1-graalce"
before-script-linux: .github/workflows/install-graalvm-sdkman.sh 23.0.1-graalce
docker-options: "-e JAVA_HOME=/root/.sdkman/candidates/java/23.0.1-graalce -e GRAALVM_HOME=/root/.sdkman/candidates/java/23.0.1-graalce"
#docker-options: "--mount type=bind,source=/opt/hostedtoolcache,target=/opt/hostedtoolcache -e JAVA_HOME -e GRAALVM_HOME"

# On linux we don't need to patch the wheel as the RPATH is set by the rustc compiler
Expand Down Expand Up @@ -107,9 +106,9 @@ jobs:
with:
python-version: '3.8'
architecture: ${{ matrix.platform.target }}
- uses: graalvm/setup-graalvm@v1.2.2
- uses: graalvm/setup-graalvm@v1.2.5
with:
java-version: '22'
java-version: '23'
distribution: 'graalvm-community'
set-java-home: 'true'
- name: Build wheels
Expand All @@ -126,14 +125,13 @@ jobs:
path: bindings/extractous-python/dist
- name: pytest
if: ${{ !startsWith(matrix.platform.target, 'aarch64') }}
shell: cmd
run: |
python -m venv .venv
.venv\Scripts\activate.bat
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
cd bindings\extractous-python
pytest -s
pytest -s .
macos:
runs-on: ${{ matrix.platform.runner }}
Expand All @@ -149,11 +147,17 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: '3.8'
- uses: graalvm/[email protected]
with:
java-version: '22'
distribution: 'liberica'
set-java-home: 'true'

# - uses: graalvm/[email protected]
# with:
# java-version: '23'
# distribution: 'liberica'
# set-java-home: 'true'
# - name: Install graalvm
# run: |
# set -e
# bash .github/workflows/install-graalvm-sdkman-macos.sh 24.1.1.r23-nik

- name: Build wheels
uses: PyO3/maturin-action@v1
with:
Expand Down
4 changes: 2 additions & 2 deletions bindings/extractous-python/build-wheels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ current_dir=$(pwd)
# curl -s "https://get.sdkman.io" | sh -s -- -y
# source "$HOME/.sdkman/bin/sdkman-init.sh"

# sdk install java 22.0.1-graalce
# sdk use java 22.0.1-graalce
# sdk install java 23.0.1-graalce
# sdk use java 23.0.1-graalce
# echo "JAVA_HOME: $JAVA_HOME"

# # install rust
Expand Down
40 changes: 20 additions & 20 deletions bindings/extractous-python/tests/test_extract_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,23 @@
is_expected_metadata_contained

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.9),
("vodafone.xlsx", 0.4),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
#("eng-ocr.pdf", 0.9),
("2022_Q3_AAPL.pdf", 0.9, 0.8),
("science-exploration-1p.pptx", 0.9, 0.8),
("simple.odt", 0.9, 0.8),
("table-multi-row-column-cells-actual.csv", 0.9, 0.6),
("vodafone.xlsx", 0.4, 0.8),
("category-level.docx", 0.9, 0.8),
("simple.doc", 0.9, 0.8),
("simple.pptx", 0.9, 0.8),
#("table-multi-row-column-cells.png", -1.0, 0.8),
("winter-sports.epub", 0.9, 0.8),
("bug_16.docx", 0.9, 0.8),
#("eng-ocr.pdf", 0.9, 0.8),
]


@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_bytes_to_string(file_name, target_dist):
@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_string(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
Expand All @@ -41,16 +41,16 @@ def test_extract_bytes_to_string(file_name, target_dist):
result, metadata = extractor.extract_bytes_to_string(file_bytes)

# Check Expected
assert cosine_similarity(result, expected) > target_dist, \
assert cosine_similarity(result, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity > target_dist, \
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist):
@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
Expand All @@ -70,10 +70,10 @@ def test_extract_bytes_to_stream(file_name, target_dist):
result = read_to_string(reader)

# Check Expected
assert cosine_similarity(result, expected) > target_dist, \
assert cosine_similarity(result, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity > target_dist, \
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
45 changes: 25 additions & 20 deletions bindings/extractous-python/tests/test_extract_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,22 @@
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.9),
("vodafone.xlsx", 0.4),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
#("eng-ocr.pdf", 0.9),
("2022_Q3_AAPL.pdf", 0.9, 0.8),
("science-exploration-1p.pptx", 0.9, 0.8),
("simple.odt", 0.9, 0.8),
("table-multi-row-column-cells-actual.csv", 0.9, 0.6),
("vodafone.xlsx", 0.4, 0.8),
("category-level.docx", 0.9, 0.8),
("simple.doc", 0.9, 0.8),
("simple.pptx", 0.9, 0.8),
("table-multi-row-column-cells.png", -1.0, 0.8),
("winter-sports.epub", 0.9, 0.8),
("bug_16.docx", 0.9, 0.8),
#("eng-ocr.pdf", 0.9, 0.8),
]

@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_string(file_name, target_dist):
@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_string(file_name, target_dist, metadata_dist):
"""Test the extraction and comparison of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
Expand All @@ -37,16 +37,21 @@ def test_extract_file_to_string(file_name, target_dist):
result, metadata = extractor.extract_file_to_string(original_filepath)

# Check extracted
assert cosine_similarity(result, expected) > target_dist, \
assert cosine_similarity(result, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
#metadata.pop("dc:format")
assert is_expected_metadata_contained(expected_metadata, metadata)
#assert is_expected_metadata_contained(expected_metadata, metadata)

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"


@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist):
@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
Expand All @@ -64,10 +69,10 @@ def test_extract_file_to_stream(file_name, target_dist):
result = read_to_string(reader)

# Check Expected
assert cosine_similarity(result, expected) > target_dist, \
assert cosine_similarity(result, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity > target_dist, \
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
8 changes: 7 additions & 1 deletion bindings/extractous-python/tests/test_ocr.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import sys

import pytest

from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig
from utils import cosine_similarity


@pytest.mark.skipif(sys.platform.startswith("win"), reason="Test not supported on Windows")
def test_ara_ocr_png():
ocr_config = TesseractOcrConfig().set_language("ara")
extractor = Extractor().set_ocr_config(ocr_config)
Expand All @@ -13,6 +18,7 @@ def test_ara_ocr_png():
assert cosine_similarity(result, expected) > 0.9


@pytest.mark.skipif(sys.platform.startswith("win"), reason="Test not supported on Windows")
def test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf():
test_file = "../../test_files/documents/deu-ocr.pdf"
expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt"
Expand All @@ -32,7 +38,7 @@ def test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf():

assert cosine_similarity(result, expected) > 0.9


@pytest.mark.skipif(sys.platform.startswith("win"), reason="Test not supported on Windows")
def test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf():
test_file = "../../test_files/documents/deu-ocr.pdf"

Expand Down
6 changes: 3 additions & 3 deletions bindings/extractous-python/tests/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def test_extract_file_to_string():
extractor = Extractor()
result, metadata = extractor.extract_file_to_string("tests/quarkus.pdf")

#print(result)
print(f"test_pdf:test_extract_file_to_string result = {result}")
assert result == expected_result()

def test_extract_file():
Expand All @@ -19,7 +19,7 @@ def test_extract_file():

result = read_to_string(reader)

#print(result)
print(f"test_pdf:test_extract_file result = {result}")
assert result == expected_result()

def test_extract_bytes():
Expand All @@ -31,5 +31,5 @@ def test_extract_bytes():

result = read_to_string(reader)

#print(result)
print(f"test_pdf:test_extract_bytes result = {result}")
assert result == expected_result()
Loading

0 comments on commit 6831a6d

Please sign in to comment.