Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions packages/paper-qa-pymupdf/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ name = "paper-qa-pymupdf"
readme = "README.md"
requires-python = ">=3.11"

[project.optional-dependencies]
dev = [
"fhlmi>=0.39", # Pin for bytes_to_string
"paper-qa>=5.23", # Pin for PDFParserFn
"pytest-asyncio",
"pytest>=8", # Pin to keep recent
]

[tool.ruff]
extend = "../../pyproject.toml"

Expand Down
3 changes: 2 additions & 1 deletion packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

import pymupdf
import pytest
from lmi.utils import bytes_to_string
from paperqa import Doc, Docs, Settings
from paperqa.readers import PDFParserFn, chunk_pdf
from paperqa.utils import ImpossibleParsingError, bytes_to_string
from paperqa.utils import ImpossibleParsingError

from paperqa_pymupdf import parse_pdf_to_pages

Expand Down
7 changes: 7 additions & 0 deletions packages/paper-qa-pypdf/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ readme = "README.md"
requires-python = ">=3.11"

[project.optional-dependencies]
dev = [
"fhlmi>=0.39", # Pin for bytes_to_string
"paper-qa-pypdf[media]",
"paper-qa>=5.23", # Pin for PDFParserFn
"pytest-asyncio",
"pytest>=8", # Pin to keep recent
]
media = [
"pypdfium2>=4.22.0", # Pin for PYPDFIUM_INFO addition
]
Expand Down
3 changes: 2 additions & 1 deletion packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

import pypdf
import pytest
from lmi.utils import bytes_to_string
from paperqa import Doc, Docs
from paperqa.readers import PDFParserFn, chunk_pdf
from paperqa.utils import ImpossibleParsingError, bytes_to_string
from paperqa.utils import ImpossibleParsingError

from paperqa_pypdf import parse_pdf_to_pages

Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]

[dependency-groups]
dev = [
"paper-qa-pymupdf[dev]",
"paper-qa-pypdf[dev]",
"paper-qa[dev]",
]

Expand Down Expand Up @@ -78,9 +80,7 @@ dev = [
"typeguard",
"vcrpy>=6", # Pin for https://github.com/kevin1024/vcrpy/issues/884
]
image = [
"pillow>=10.3.0", # Pin for py.typed
]
image = ["fhlmi[image]"]
ldp = [
"ldp>=0.25.0,<1", # Lower pin for new LLM client interface, upper pin if v1 introduces breaks
]
Expand Down
5 changes: 2 additions & 3 deletions src/paperqa/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import tiktoken
from aviary.core import Message
from lmi import Embeddable, LLMResult
from lmi.utils import bytes_to_string, encode_image_as_url, string_to_bytes
from pybtex.database import BibliographyData, Entry, InvalidNameString, Person
from pybtex.database.input.bibtex import Parser
from pybtex.scanner import PybtexSyntaxError
Expand All @@ -37,14 +38,12 @@
)

from paperqa.utils import (
bytes_to_string,
create_bibtex_key,
encode_id,
format_bibtex,
get_citation_ids,
get_parenthetical_substrings,
maybe_get_date,
string_to_bytes,
)
from paperqa.version import __version__ as pqa_version

Expand Down Expand Up @@ -552,7 +551,7 @@ def __eq__(self, other) -> bool:

def to_image_url(self, image_type: str = "png") -> str:
"""Convert the image data to an RFC 2397 data URL format."""
# NOTE(review): two return statements appear back to back here, so only the
# first can ever execute as written. This looks like the removed/added pair
# of a rendered diff whose +/- markers were lost -- confirm which one the
# real file keeps.
# Inline form: base64 text of self.data embedded in a data:image/<type> URL.
return f"data:image/{image_type};base64,{bytes_to_string(self.data)}"
# Helper form: the same two inputs delegated to encode_image_as_url (imported
# from lmi.utils above); presumably yields the same URL -- TODO confirm.
return encode_image_as_url(image_type, self.data)

def save(self, path: str | os.PathLike) -> None:
"""Save the image to the input file path."""
Expand Down
45 changes: 1 addition & 44 deletions src/paperqa/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from __future__ import annotations

import asyncio
import base64
import contextlib
import hashlib
import logging
Expand All @@ -16,7 +13,7 @@
from functools import reduce
from http import HTTPStatus
from pathlib import Path
from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, TypeVar
from typing import Any, BinaryIO, ClassVar, TypeVar
from uuid import UUID

import httpx
Expand All @@ -33,12 +30,6 @@
wait_incrementing,
)

if TYPE_CHECKING:
from typing import IO

from PIL._typing import StrOrBytesPath


logger = logging.getLogger(__name__)

T = TypeVar("T")
Expand Down Expand Up @@ -645,37 +636,3 @@ def clean_possessives(text: str) -> str:
# Remove standalone apostrophes
text = re.sub(r"\s+'\s+", " ", text)
return re.sub(r"(?<!\w)'\s*", "", text)


def bytes_to_string(value: bytes) -> str:
    """
    Convert bytes to a base64-encoded string.

    Args:
        value: Raw bytes to encode; may be empty.

    Returns:
        The base64 text for the input bytes (empty string for empty input).
    """
    # 1. Convert bytes to base64 bytes
    # 2. Convert base64 bytes to base64 string,
    #    using UTF-8 since base64 produces ASCII characters
    return base64.b64encode(value).decode("utf-8")


def string_to_bytes(value: str) -> bytes:
    """
    Convert a base64-encoded string to bytes.

    Args:
        value: Base64 text, such as the output of a base64 encoder.

    Returns:
        The decoded bytes (empty bytes for an empty string).
    """
    # 1. Convert base64 string to base64 bytes
    # 2. Convert base64 bytes to original bytes
    return base64.b64decode(value.encode("utf-8"))  # noqa: FURB120


def validate_image(path: StrOrBytesPath | IO[bytes]) -> None:
    """
    Validate that the file at the given path is a valid image.

    Args:
        path: Filesystem path or binary file object passed to `Image.open`.

    Raises:
        ImportError: If Pillow cannot be imported.
        OSError: If the image file is truncated.
    """  # noqa: DOC502
    try:
        # Imported lazily: per the message below, Pillow comes from the
        # optional 'image' extra rather than being a hard requirement.
        from PIL import Image
    except ImportError as exc:
        raise ImportError(
            "Image validation requires the 'image' extra for 'pillow'. Please:"
            " `pip install paper-qa[image]`."
        ) from exc

    # Opening alone is not treated as sufficient here; load() is invoked so
    # that a problem with the image contents surfaces as an exception.
    with Image.open(path) as img:
        img.load()
26 changes: 1 addition & 25 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import asyncio
import base64
import contextlib
import csv
import io
Expand Down Expand Up @@ -35,7 +34,7 @@
SparseEmbeddingModel,
)
from lmi.llms import rate_limited
from lmi.utils import VCR_DEFAULT_MATCH_ON
from lmi.utils import VCR_DEFAULT_MATCH_ON, validate_image
from paperqa_pymupdf import parse_pdf_to_pages as pymupdf_parse_pdf_to_pages
from paperqa_pypdf import parse_pdf_to_pages as pypdf_parse_pdf_to_pages
from pytest_subtests import SubTests
Expand Down Expand Up @@ -72,18 +71,15 @@
ParsedText,
)
from paperqa.utils import (
bytes_to_string,
clean_possessives,
encode_id,
extract_score,
maybe_get_date,
maybe_is_html,
maybe_is_text,
name_in_text,
string_to_bytes,
strings_similarity,
strip_citations,
validate_image,
)

THIS_MODULE = pathlib.Path(__file__)
Expand Down Expand Up @@ -2557,26 +2553,6 @@ def test_clean_possessives(raw_text: str, cleaned_text: str) -> None:
assert clean_possessives(raw_text) == cleaned_text


@pytest.mark.parametrize(
    "value",
    [
        pytest.param(b"Hello, World!", id="simple-text"),
        pytest.param(b"", id="empty-bytes"),
        pytest.param(bytes([0, 1, 2, 255, 128, 64]), id="binary-data"),
        pytest.param(b"Test data for base64 encoding", id="base64-validation"),
        pytest.param("Hello 世界 🌍".encode(), id="utf8-text"),
    ],
)
def test_str_bytes_conversions(value: bytes) -> None:
    """Check that bytes survive an encode/decode round trip via base64 text."""
    # Test round-trip conversion
    encoded_string = bytes_to_string(value)
    decoded_bytes = string_to_bytes(encoded_string)
    assert decoded_bytes == value

    # Validate that encoded string is valid base64 by decoding it with the
    # standard library directly instead of the helper under test
    assert base64.b64decode(encoded_string) == value


tricky_test = (
"simple (pqac-a020507f) quote"
"TEST AND (easy OR mistaken OR not_context)"
Expand Down
47 changes: 41 additions & 6 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.