tests: add extract_url_to_string and extract_bytes_to_string tests

yobix-ai · Nov 16, 2024 · 5e32273 · 5e32273
1 parent 355bfbd
commit 5e32273
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 4 deletions.
diff --git a/...hon/tests/test_extract_bytes_to_stream.py → ...actous-python/tests/test_extract_bytes.py b/...hon/tests/test_extract_bytes_to_stream.py → ...actous-python/tests/test_extract_bytes.py
@@ -2,7 +2,8 @@
 import pytest
 
 from extractous import Extractor
-from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray
+from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
+    is_expected_metadata_contained
 
 TEST_CASES = [
     ("2022_Q3_AAPL.pdf", 0.9),
@@ -20,6 +21,34 @@
 ]
 
 
+@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
+def test_extract_bytes_to_string(file_name, target_dist):
+    """Check extract_bytes_to_string text and metadata against the expected-result fixtures for each test document."""
+    original_filepath = f"../../test_files/documents/{file_name}"
+    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
+    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
+
+    # Load the expected text and the expected metadata (JSON) fixtures for this document
+    with open(expected_result_filepath, "r",  encoding="utf8") as file:
+        expected = file.read()
+    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
+        expected_metadata = json.load(file)
+
+    # Load the document with read_file_to_bytearray and pass the result to extract_bytes_to_string
+    file_bytes = read_file_to_bytearray(original_filepath)
+
+    extractor = Extractor()
+    result, metadata = extractor.extract_bytes_to_string(file_bytes)
+
+    # Extracted text must exceed the per-file cosine-similarity threshold (target_dist)
+    assert cosine_similarity(result, expected) > target_dist, \
+        f"Cosine similarity is less than {target_dist} for file: {file_name}"
+
+    # Metadata is held to the same threshold. NOTE(review): the message prints the value with "%" but thresholds look like fractions (e.g. 0.9) - confirm the helper's scale
+    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
+    assert percent_similarity > target_dist, \
+        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
+
 @pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
 def test_extract_bytes_to_stream(file_name, target_dist):
     """Test the extraction from bytes of various file types."""

diff --git a/...thon/tests/test_extract_file_to_string.py → ...ractous-python/tests/test_extract_file.py b/...thon/tests/test_extract_file_to_string.py → ...ractous-python/tests/test_extract_file.py
@@ -2,7 +2,7 @@
 import pytest
 
 from extractous import Extractor
-from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained
+from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string
 
 TEST_CASES = [
     ("2022_Q3_AAPL.pdf", 0.9),
@@ -42,4 +42,32 @@ def test_extract_file_to_string(file_name, target_dist):
 
     # Check metadata
     #metadata.pop("dc:format")
-    assert is_expected_metadata_contained(expected_metadata, metadata)
+    assert is_expected_metadata_contained(expected_metadata, metadata)
+
+
+@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
+def test_extract_file_to_stream(file_name, target_dist):
+    """Test the extraction from a file path to a reader (extract_file + read_to_string) for various file types."""
+    original_filepath = f"../../test_files/documents/{file_name}"
+    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
+    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
+
+    # Load the expected text and the expected metadata (JSON) fixtures for this document
+    with open(expected_result_filepath, "r",  encoding="utf8") as file:
+        expected = file.read()
+    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
+        expected_metadata = json.load(file)
+
+    # extract_file returns a reader plus metadata; read_to_string turns the reader into the text to compare
+    extractor = Extractor()
+    reader, metadata = extractor.extract_file(original_filepath)
+    result = read_to_string(reader)
+
+    # Extracted text must exceed the per-file cosine-similarity threshold (target_dist)
+    assert cosine_similarity(result, expected) > target_dist, \
+        f"Cosine similarity is less than {target_dist} for file: {file_name}"
+
+    # Metadata is held to the same threshold. NOTE(review): the message prints the value with "%" but thresholds look like fractions (e.g. 0.9) - confirm the helper's scale
+    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
+    assert percent_similarity > target_dist, \
+        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py
@@ -1,11 +1,19 @@
 from extractous import Extractor
 from utils import read_to_string
 
-def test_extract_url():
+def test_extract_url_to_stream():
     extractor = Extractor()
 
     reader, metadata  = extractor.extract_url("https://www.google.com")
     result = read_to_string(reader)
 
     assert "Google" in result
     assert len(metadata.keys()) > 0
+
+def test_extract_url_to_string():
+    extractor = Extractor()
+    # Extract a fixed URL straight to a string. NOTE(review): targets a live site, so presumably needs network access - confirm
+    content, metadata  = extractor.extract_url_to_string("https://www.google.com")
+    # The returned text should mention "Google" and the metadata should have at least one key
+    assert "Google" in content
+    assert len(metadata.keys()) > 0