Skip to content

Commit

Permalink
tests: add extract_url_to_string and extract_bytes_to_string tests
Browse files Browse the repository at this point in the history
  • Loading branch information
nmammeri committed Nov 16, 2024
1 parent 355bfbd commit 5e32273
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
is_expected_metadata_contained

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand All @@ -20,6 +21,34 @@
]


@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_bytes_to_string(file_name, target_dist):
    """Check string extraction from the raw bytes of each test document.

    The document is loaded with ``read_file_to_bytearray`` and passed to
    ``Extractor.extract_bytes_to_string``. The returned text is scored against
    the stored expected text with ``cosine_similarity`` and the returned
    metadata against the stored expected metadata with
    ``calculate_similarity_percent``; both scores must exceed ``target_dist``.
    """
    document_path = f"../../test_files/documents/{file_name}"
    expected_text_path = f"../../test_files/expected_result/{file_name}.txt"
    expected_metadata_path = f"../../test_files/expected_result/{file_name}.metadata.json"

    # Load the reference text and the reference metadata.
    with open(expected_text_path, "r", encoding="utf8") as text_file:
        expected = text_file.read()
    with open(expected_metadata_path, 'r', encoding="utf8") as metadata_file:
        expected_metadata = json.load(metadata_file)

    # Run the extraction on the document's bytes.
    file_bytes = read_file_to_bytearray(document_path)
    result, metadata = Extractor().extract_bytes_to_string(file_bytes)

    # The extracted text must be similar enough to the reference text.
    text_similarity = cosine_similarity(result, expected)
    assert text_similarity > target_dist, \
        f"Cosine similarity is less than {target_dist} for file: {file_name}"

    # The extracted metadata must be similar enough to the reference metadata.
    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
    assert percent_similarity > target_dist, \
        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist):
"""Test the extraction from bytes of various file types."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand Down Expand Up @@ -42,4 +42,32 @@ def test_extract_file_to_string(file_name, target_dist):

# Check metadata
#metadata.pop("dc:format")
assert is_expected_metadata_contained(expected_metadata, metadata)
assert is_expected_metadata_contained(expected_metadata, metadata)


@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist):
    """Test the streaming extraction from a file path for various file types.

    ``Extractor.extract_file`` is called with the document's path and returns
    a reader plus metadata; the reader is drained with ``read_to_string``.
    The resulting text is scored against the stored expected text with
    ``cosine_similarity`` and the metadata against the stored expected
    metadata with ``calculate_similarity_percent``; both scores must exceed
    ``target_dist``.
    """
    original_filepath = f"../../test_files/documents/{file_name}"
    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

    # Read expected
    with open(expected_result_filepath, "r", encoding="utf8") as file:
        expected = file.read()
    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
        expected_metadata = json.load(file)

    # Extract: the file is passed by path and the content comes back as a reader
    extractor = Extractor()
    reader, metadata = extractor.extract_file(original_filepath)
    result = read_to_string(reader)

    # Check Expected
    assert cosine_similarity(result, expected) > target_dist, \
        f"Cosine similarity is less than {target_dist} for file: {file_name}"

    # Check metadata
    # NOTE(review): the score is compared against target_dist (e.g. 0.9 in
    # TEST_CASES) yet reported with a '%' sign -- confirm the unit returned by
    # calculate_similarity_percent.
    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
    assert percent_similarity > target_dist, \
        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
10 changes: 9 additions & 1 deletion bindings/extractous-python/tests/test_extract_url.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
from extractous import Extractor
from utils import read_to_string

def test_extract_url():
def test_extract_url_to_stream():
    """Extract a live web page as a reader and sanity-check the result.

    The reader returned by ``Extractor.extract_url`` is drained with
    ``read_to_string``; the text must mention "Google" and the metadata must
    have at least one key.
    """
    url = "https://www.google.com"

    reader, metadata = Extractor().extract_url(url)
    page_text = read_to_string(reader)

    assert "Google" in page_text
    assert len(metadata.keys()) > 0

def test_extract_url_to_string():
    """Extract a live web page directly to a string and sanity-check it.

    The text returned by ``Extractor.extract_url_to_string`` must mention
    "Google" and the returned metadata must have at least one key.
    """
    url = "https://www.google.com"

    page_text, metadata = Extractor().extract_url_to_string(url)

    assert "Google" in page_text
    assert len(metadata.keys()) > 0

0 comments on commit 5e32273

Please sign in to comment.