feat: parse to xml
KapiWow committed Nov 27, 2024
1 parent 5c20414 commit b0ced9a
Showing 14 changed files with 365 additions and 31 deletions.
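
For orientation, the new Python-facing methods introduced by this commit can be exercised roughly as follows (a minimal sketch based on the signatures in this diff; the file path is a placeholder):

from extractous import Extractor

extractor = Extractor()

# Each *_to_xml method returns a tuple of (xml_string, metadata_dict);
# the XML string's length is capped at the extractor's extract_string_max_length.
xml, metadata = extractor.extract_file_to_xml("path/to/document.pdf")

print(xml[:200])      # XHTML content produced by the Tika parser
print(len(metadata))  # number of metadata entries
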
49 changes: 49 additions & 0 deletions bindings/extractous-python/src/extractor.rs
@@ -258,6 +258,55 @@ impl Extractor {
Ok((content, py_metadata.into()))
}

/// Extracts a file's content as XML. Returns a tuple with the XML string, whose length is
/// capped at the extractor's `extract_string_max_length`, and the metadata as a dict.
pub fn extract_file_to_xml<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(String, PyObject)> {
let (content, metadata) = self
.0
.extract_file_to_xml(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((content, py_metadata.into()))
}

/// Extracts a bytearray's content as XML. Returns a tuple with the XML string, whose length is
/// capped at the extractor's `extract_string_max_length`, and the metadata as a dict.
pub fn extract_bytes_to_xml<'py>(
&self,
buffer: &Bound<'_, PyByteArray>,
py: Python<'py>,
) -> PyResult<(String, PyObject)> {
let (content, metadata) = self
.0
.extract_bytes_to_xml(&buffer.to_vec())
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Convert the metadata HashMap into a Python dict
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((content, py_metadata.into()))
}

/// Extracts a URL's content as XML. Returns a tuple with the XML string, whose length is
/// capped at the extractor's `extract_string_max_length`, and the metadata as a dict.
pub fn extract_url_to_xml<'py>(
&self,
url: &str,
py: Python<'py>,
) -> PyResult<(String, PyObject)> {
let (content, metadata) = self
.0
.extract_url_to_xml(url)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((content, py_metadata.into()))
}

fn __repr__(&self) -> String {
format!("{:?}", self.0)
}
33 changes: 31 additions & 2 deletions bindings/extractous-python/tests/test_extract_bytes.py
@@ -3,7 +3,7 @@

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
is_expected_metadata_contained
is_expected_metadata_contained, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
@@ -49,6 +49,35 @@ def test_extract_bytes_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_xml(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
result_xml, metadata = extractor.extract_bytes_to_xml(file_bytes)
result_text = extract_body_text(result_xml)

# Check extracted
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
"""Test the extraction from bytes of various file types."""
@@ -76,4 +105,4 @@ def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
32 changes: 30 additions & 2 deletions bindings/extractous-python/tests/test_extract_file.py
@@ -2,7 +2,7 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string, extract_body_text

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9, 0.8),
@@ -49,6 +49,34 @@ def test_extract_file_to_string(file_name, target_dist, metadata_dist):
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_xml(file_name, target_dist, metadata_dist):
"""Test the extraction and comparison of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
extractor = Extractor()
result_xml, metadata = extractor.extract_file_to_xml(original_filepath)
result_text = extract_body_text(result_xml)

# Check extracted
assert cosine_similarity(result_text, expected) >= target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
@@ -75,4 +103,4 @@ def test_extract_file_to_stream(file_name, target_dist, metadata_dist):
# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity >= metadata_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
10 changes: 9 additions & 1 deletion bindings/extractous-python/tests/test_extract_url.py
@@ -16,4 +16,12 @@ def test_extract_url_to_string():
content, metadata = extractor.extract_url_to_string("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
assert len(metadata.keys()) > 0

def test_extract_url_to_xml():
extractor = Extractor()

content, metadata = extractor.extract_url_to_xml("https://www.google.com")

assert "Google" in content
assert len(metadata.keys()) > 0
19 changes: 18 additions & 1 deletion bindings/extractous-python/tests/utils.py
@@ -1,6 +1,6 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosine_sim

from lxml import etree

def cosine_similarity(text1, text2):
"""Calculate the cosine similarity between two texts."""
@@ -78,3 +78,20 @@ def calculate_similarity_percent(expected, current):

# Return the similarity percentage
return matches / total


def extract_body_text(xml: str) -> str:
"""
Extracts and returns plain text content from the <body> section of an XML
string.
"""
try:
parser = etree.XMLParser(recover=True)
root = etree.fromstring(xml.encode(), parser=parser)
ns = {"ns": "http://www.w3.org/1999/xhtml"}
body = root.find(".//ns:body", namespaces=ns)
if body is None:
return ""
return "\n".join(body.itertext()).strip()
except etree.XMLSyntaxError as e:
raise ValueError(f"Invalid XML input: {e}")
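
For illustration, the helper can be exercised like this (a sketch; the XHTML fragment is invented for demonstration):

sample = (
    '<html xmlns="http://www.w3.org/1999/xhtml">'
    '<head><title>ignored</title></head>'
    '<body><p>Hello</p><p>world</p></body>'
    '</html>'
)
print(extract_body_text(sample))  # prints "Hello" and "world" on separate lines
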
10 changes: 10 additions & 0 deletions extractous-core/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions extractous-core/Cargo.toml
@@ -36,6 +36,7 @@ test-case = "3.0"
criterion = "0.5.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
quick-xml = "0.37.1"

[build-dependencies]
fs_extra = { version = "1.3.0" }
60 changes: 60 additions & 0 deletions extractous-core/src/extractor.rs
@@ -162,6 +162,7 @@ impl Extractor {
&self.pdf_config,
&self.office_config,
&self.ocr_config,
false,
)
}

@@ -174,6 +175,7 @@
&self.pdf_config,
&self.office_config,
&self.ocr_config,
false,
)
}

@@ -186,8 +188,49 @@
&self.pdf_config,
&self.office_config,
&self.ocr_config,
false,
)
}

/// Extracts a file's content as XML. Returns a tuple with the XML string, whose length is
/// capped at the extractor's `extract_string_max_length`, and the metadata.
pub fn extract_file_to_xml(&self, file_path: &str) -> ExtractResult<(String, Metadata)> {
tika::parse_file_to_string(
file_path,
self.extract_string_max_length,
&self.pdf_config,
&self.office_config,
&self.ocr_config,
true,
)
}

/// Extracts a byte buffer's content as XML. Returns a tuple with the XML string, whose length is
/// capped at the extractor's `extract_string_max_length`, and the metadata.
pub fn extract_bytes_to_xml(&self, buffer: &[u8]) -> ExtractResult<(String, Metadata)> {
tika::parse_bytes_to_string(
buffer,
self.extract_string_max_length,
&self.pdf_config,
&self.office_config,
&self.ocr_config,
true,
)
}

/// Extracts a URL's content as XML. Returns a tuple with the XML string, whose length is
/// capped at the extractor's `extract_string_max_length`, and the metadata.
pub fn extract_url_to_xml(&self, url: &str) -> ExtractResult<(String, Metadata)> {
tika::parse_url_to_string(
url,
self.extract_string_max_length,
&self.pdf_config,
&self.office_config,
&self.ocr_config,
true,
)
}
}

#[cfg(test)]
Expand All @@ -197,6 +240,7 @@ mod tests {
use std::fs::File;
use std::io::BufReader;
use std::io::{self, Read};
use std::str;

const TEST_FILE: &str = "README.md";

@@ -292,4 +336,20 @@
"Metadata should contain at least one entry"
);
}

#[test]
fn extract_file_to_xml_test() {
// Parse the test file using extractous
let extractor = Extractor::new();
let result = extractor.extract_file_to_xml(TEST_FILE);
let (content, metadata) = result.unwrap();
assert!(
!content.is_empty(),
"Content should not be empty"
);
assert!(
!metadata.is_empty(),
"Metadata should contain at least one entry"
);
}
}
