Skip to content

Commit

Permalink
feat: parse to xml
Browse files Browse the repository at this point in the history
  • Loading branch information
KapiWow committed Dec 3, 2024
1 parent 5c20414 commit e756f18
Show file tree
Hide file tree
Showing 19 changed files with 309 additions and 47 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/release_python.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file was autogenerated by maturin v1.6.0 using:
# maturin generate-ci github -o ../../.github/workflows/release_pyton_pytest.yml --pytest
#
#
# Then adapted to the project
#
name: CI
Expand Down Expand Up @@ -38,7 +38,7 @@ jobs:
python-version: '3.8'

# On linux we don't use the graalvm/setup-graalvm@v1 action to install graalvm because it will install it
# on the runner machine and on linux the build will happen inside a manylinux docker.
# on the runner machine and on linux the build will happen inside a manylinux docker.
# Instead, we use a script to install graalvm inside the docker container
# the script is launched by setting the before-script-linux config option of the maturin action
- name: Build wheels
Expand All @@ -60,7 +60,7 @@ jobs:
with:
name: wheels-linux-${{ matrix.platform.target }}
path: bindings/extractous-python/dist

- name: pytest
if: ${{ startsWith(matrix.platform.target, 'x86_64') }}
shell: bash
Expand All @@ -70,7 +70,7 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings/extractous-python
pytest -s
Expand All @@ -85,7 +85,7 @@ jobs:
apt-get update
apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara
apt-get install -y --no-install-recommends python3 python3-pip
pip3 install -U pip pytest scikit-learn
pip3 install -U pip pytest scikit-learn lxml
run: |
set -e
pip3 install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
Expand Down Expand Up @@ -129,7 +129,7 @@ jobs:
python -m venv .venv
.venv\Scripts\activate.bat
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings\extractous-python
pytest -s .
Expand Down Expand Up @@ -186,7 +186,7 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
pip install pytest scikit-learn lxml
cd bindings/extractous-python
pytest -s
Expand All @@ -206,7 +206,7 @@ jobs:
name: wheels-sdist
path: bindings/extractous-python/dist

# Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
# Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
# We use two actions: one publishes to PyPI on tag pushes to the main branch, and the other publishes to TestPyPI on any push
publish-to-testpypi:
name: Publish to TestPyPI
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ from extractous import Extractor
# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)
# To get the extracted content as XML instead of plain text, uncomment:
# extractor = extractor.set_parse_string_as_xml(True)

# Extract text from a file
result, metadata = extractor.extract_file_to_string("README.md")
Expand Down Expand Up @@ -125,6 +127,8 @@ use extractous::Extractor;
fn main() {
// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);
// To get the extracted content as XML instead of plain text, uncomment:
// extractor = extractor.set_parse_string_as_xml(true);

// Extract text from a file
let (text, metadata) = extractor.extract_file_to_string("README.md").unwrap();
Expand Down
2 changes: 2 additions & 0 deletions bindings/extractous-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ from extractous import Extractor
# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)
# To get the extracted content as XML instead of plain text, uncomment:
# extractor = extractor.set_parse_string_as_xml(True)

# Extract text from a file
result, metadata = extractor.extract_file_to_string("README.md")
Expand Down
8 changes: 4 additions & 4 deletions bindings/extractous-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "extractous"
version = '0.2.0'
version = '0.2.1'
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
Expand Down Expand Up @@ -50,19 +50,19 @@ module-name = "extractous._extractous"
python-source = "python"

# Setting skip-auditwheel=true is very important to instruct maturin to not run its auditwheel flow
# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs
# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs
# By skipping the wheel, we just get a plain _extracts_rs* lib, and we have to:
# * bundle our graalvm libs using the below include [] directive
# * change the RPATH of _extracts_rs* lib to be able to properly find the bundled graalvm libs
skip-auditwheel=true
skip-auditwheel=true

# This tells cargo to set the RPATH for the private module built lib _extractous.abi3.so
# Set the RPATH to $ORIGIN because the graalvm libs will be bundled in the same dir as the _extractous.abi3.so
rustc-args = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"]

# Maturin include command will start looking from the python/extractous folder
# so to include the graalvm libs the rust build script must copy them to python/extractous folder
include = [
include = [
{path = "**/*.so", format = ["wheel"]},
{path = "**/*.dylib", format = ["wheel"]},
{path = "**/*.dll", format = ["wheel"]}
Expand Down
6 changes: 6 additions & 0 deletions bindings/extractous-python/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ impl Extractor {
Ok(Self(inner))
}

/// Returns a new extractor with the "parse string as XML" option set.
///
/// The wrapped core extractor is cloned, so `self` is left untouched and the
/// caller must use the returned instance for the setting to take effect.
pub fn set_parse_string_as_xml(&self, parse_string_as_xml: bool) -> PyResult<Self> {
    let configured = self.0.clone();
    Ok(Self(configured.set_parse_string_as_xml(parse_string_as_xml)))
}

/// Extracts text from a file path. Returns a tuple with stream of the extracted text
/// the stream is decoded using the extractor's `encoding` and tika metadata.
pub fn extract_file<'py>(
Expand Down
34 changes: 32 additions & 2 deletions bindings/extractous-python/tests/test_extract_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
is_expected_metadata_contained
is_expected_metadata_contained, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
Expand Down Expand Up @@ -49,6 +49,36 @@ def test_extract_bytes_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_xml(file_name, target_dist, metadata_dist):
    """Test XML extraction from in-memory bytes for various file types.

    The document is loaded into a bytearray, extracted with the XML option
    enabled, and the text of the resulting <body> is compared against the
    expected plain-text fixture. Metadata is compared against the expected
    JSON fixture.
    """
    original_filepath = f"../../test_files/documents/{file_name}"
    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

    # Read expected
    with open(expected_result_filepath, "r", encoding="utf8") as file:
        expected = file.read()
    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
        expected_metadata = json.load(file)

    # Extract
    file_bytes = read_file_to_bytearray(original_filepath)

    extractor = Extractor()
    extractor = extractor.set_parse_string_as_xml(True)
    # Pass the bytes (not the file path) so this test actually covers the
    # bytes extraction path rather than duplicating the file-based test.
    result_xml, metadata = extractor.extract_bytes_to_string(file_bytes)
    result_text = extract_body_text(result_xml)

    # Check Expected
    assert cosine_similarity(result_text, expected) >= target_dist, \
        f"Cosine similarity is less than {target_dist} for file: {file_name}"

    # Check metadata
    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
    assert percent_similarity >= metadata_dist, \
        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
Expand Down Expand Up @@ -76,4 +106,4 @@ def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
33 changes: 31 additions & 2 deletions bindings/extractous-python/tests/test_extract_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
Expand Down Expand Up @@ -49,6 +49,35 @@ def test_extract_file_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_xml(file_name, target_dist, metadata_dist):
    """Check XML extraction from a file path against the stored fixtures.

    The extractor runs with the XML option enabled; the text of the returned
    document's <body> must be similar enough to the expected plain text, and
    the returned metadata must be similar enough to the expected metadata.
    """
    source_path = f"../../test_files/documents/{file_name}"
    text_fixture_path = f"../../test_files/expected_result/{file_name}.txt"
    metadata_fixture_path = f"../../test_files/expected_result/{file_name}.metadata.json"

    # Load the expected text and metadata fixtures
    with open(text_fixture_path, "r", encoding="utf8") as fixture:
        expected_text = fixture.read()
    with open(metadata_fixture_path, 'r', encoding="utf8") as fixture:
        expected_metadata = json.load(fixture)

    # Extract as XML, then reduce the XML to the text of its body
    xml_extractor = Extractor().set_parse_string_as_xml(True)
    result_xml, metadata = xml_extractor.extract_file_to_string(source_path)
    body_text = extract_body_text(result_xml)

    # Compare extracted text
    text_similarity = cosine_similarity(body_text, expected_text)
    assert text_similarity >= target_dist, \
        f"Cosine similarity is less than {target_dist} for file: {file_name}"

    # Compare metadata
    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
    assert percent_similarity >= metadata_dist, \
        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"



@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
Expand All @@ -75,4 +104,4 @@ def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
11 changes: 10 additions & 1 deletion bindings/extractous-python/tests/test_extract_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,13 @@ def test_extract_url_to_string():
content, metadata = extractor.extract_url_to_string("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
assert len(metadata.keys()) > 0

def test_extract_url_to_xml():
    """Extract a URL with the XML option enabled and sanity-check the result.

    Requires network access: the page at https://www.google.com is fetched.
    """
    xml_extractor = Extractor().set_parse_string_as_xml(True)

    content, metadata = xml_extractor.extract_url_to_string("https://www.google.com")

    # The page text should survive in the XML output, and some metadata
    # should have been returned alongside it.
    assert "Google" in content
    assert len(metadata.keys()) > 0
19 changes: 18 additions & 1 deletion bindings/extractous-python/tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosine_sim

from lxml import etree

def cosine_similarity(text1, text2):
"""Calculate the cosine similarity between two texts."""
Expand Down Expand Up @@ -78,3 +78,20 @@ def calculate_similarity_percent(expected, current):

# Return the similarity percentage
return matches / total


def extract_body_text(xml: str) -> str:
    """
    Extracts and returns plain text content from the <body> section of an XML
    string.

    The input is parsed with a recovering parser and the first element named
    ``body`` in the XHTML namespace is looked up. All text nodes under it are
    joined with newlines and the result is stripped.

    :param xml: XML/XHTML document as a string.
    :return: The body text, or an empty string when no <body> element (or no
        document at all) could be found.
    :raises ValueError: If lxml reports a parse error for the input.
    """
    try:
        parser = etree.XMLParser(recover=True)
        root = etree.fromstring(xml.encode(), parser=parser)
    except etree.ParseError as e:
        # The module imports `etree` from lxml; there is no `ET` name in scope,
        # so the error class must be reached through `etree`.
        raise ValueError(f"Invalid XML input: {e}") from e

    # NOTE(review): the recovering parser appears to hand back None when it
    # cannot salvage any element (e.g. empty input) - treat that as "no body".
    if root is None:
        return ""

    ns = {"ns": "http://www.w3.org/1999/xhtml"}
    body = root.find(".//ns:body", namespaces=ns)
    if body is None:
        return ""
    return "\n".join(body.itertext()).strip()
12 changes: 11 additions & 1 deletion extractous-core/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion extractous-core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "extractous"
version = "0.2.0"
version = "0.2.1"
edition = "2021"

description = """
Expand Down Expand Up @@ -36,6 +36,7 @@ test-case = "3.0"
criterion = "0.5.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
quick-xml = "0.37.1"

[build-dependencies]
fs_extra = { version = "1.3.0" }
Expand Down
7 changes: 5 additions & 2 deletions extractous-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ fn main() {
let file_path = &args[1];

// Extract the provided file content to a string
let extractor = Extractor::new();
let mut extractor = Extractor::new();
// To get the extracted content as XML instead of plain text, uncomment:
// extractor = extractor.set_parse_string_as_xml(true);
// Extract text from a file
let (content, metadata) = extractor.extract_file_to_string(file_path).unwrap();
println!("{}", content);
println!("{:?}", metadata);
Expand Down Expand Up @@ -128,7 +131,7 @@ installed on your system because some of the OCR tests will fail if no tesseract
* `sudo apt install tesseract-ocr`
* Install any language extensions you want. for example to install German and Arabic:
* `sudo apt install tesseract-ocr-deu tesseract-ocr-ara`
* On Mac
* On Mac
* `brew install tesseract tesseract-lang`

### Building Extractous
Expand Down
Loading

0 comments on commit e756f18

Please sign in to comment.