From 335054265fd37dadbc5d969e545dcd090aa88274 Mon Sep 17 00:00:00 2001 From: James Braza Date: Thu, 9 Oct 2025 15:56:27 -0700 Subject: [PATCH 1/3] Pulled in upstreamed utils from https://github.com/Future-House/ldp/pull/349 --- pyproject.toml | 4 +--- src/paperqa/types.py | 5 ++--- src/paperqa/utils.py | 43 +------------------------------------------ tests/test_paperqa.py | 26 +------------------------- uv.lock | 12 ++++++++---- 5 files changed, 13 insertions(+), 77 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d88fae422..b939534b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,9 +78,7 @@ dev = [ "typeguard", "vcrpy>=6", # Pin for https://github.com/kevin1024/vcrpy/issues/884 ] -image = [ - "pillow>=10.3.0", # Pin for py.typed -] +image = ["fhlmi[image]"] ldp = [ "ldp>=0.25.0,<1", # Lower pin for new LLM client interface, upper pin if v1 introduces breaks ] diff --git a/src/paperqa/types.py b/src/paperqa/types.py index 0d410d69d..a2de8df4a 100644 --- a/src/paperqa/types.py +++ b/src/paperqa/types.py @@ -20,6 +20,7 @@ import tiktoken from aviary.core import Message from lmi import Embeddable, LLMResult +from lmi.utils import bytes_to_string, encode_image_as_url, string_to_bytes from pybtex.database import BibliographyData, Entry, InvalidNameString, Person from pybtex.database.input.bibtex import Parser from pybtex.scanner import PybtexSyntaxError @@ -37,14 +38,12 @@ ) from paperqa.utils import ( - bytes_to_string, create_bibtex_key, encode_id, format_bibtex, get_citation_ids, get_parenthetical_substrings, maybe_get_date, - string_to_bytes, ) from paperqa.version import __version__ as pqa_version @@ -552,7 +551,7 @@ def __eq__(self, other) -> bool: def to_image_url(self, image_type: str = "png") -> str: """Convert the image data to an RFC 2397 data URL format.""" - return f"data:image/{image_type};base64,{bytes_to_string(self.data)}" + return encode_image_as_url(image_type, self.data) def save(self, path: str | os.PathLike) -> None: """Save the image to the input file path.""" diff --git a/src/paperqa/utils.py b/src/paperqa/utils.py index 426336992..a54256904 100644 --- a/src/paperqa/utils.py +++ b/src/paperqa/utils.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import base64 import contextlib import hashlib import logging @@ -16,7 +15,7 @@ from functools import reduce from http import HTTPStatus from pathlib import Path -from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, TypeVar +from typing import Any, BinaryIO, ClassVar, TypeVar from uuid import UUID import httpx @@ -33,12 +32,6 @@ wait_incrementing, ) -if TYPE_CHECKING: - from typing import IO - - from PIL._typing import StrOrBytesPath - - logger = logging.getLogger(__name__) T = TypeVar("T") @@ -645,37 +638,3 @@ def clean_possessives(text: str) -> str: # Remove standalone apostrophes text = re.sub(r"\s+'\s+", " ", text) return re.sub(r"(? str: - """Convert bytes to a base64-encoded string.""" - # 1. Convert bytes to base64 bytes - # 2. Convert base64 bytes to base64 string, - # using UTF-8 since base64 produces ASCII characters - return base64.b64encode(value).decode("utf-8") - - -def string_to_bytes(value: str) -> bytes: - """Convert a base64-encoded string to bytes.""" - # 1. Convert base64 string to base64 bytes - # 2. Convert base64 bytes to original bytes - return base64.b64decode(value.encode("utf-8")) # noqa: FURB120 - - -def validate_image(path: StrOrBytesPath | IO[bytes]) -> None: - """ - Validate that the file at the given path is a valid image. - - Raises: - OSError: If the image file is truncated. - """ # noqa: DOC502 - try: - from PIL import Image - except ImportError as exc: - raise ImportError( - "Image validation requires the 'image' extra for 'pillow'. Please:" - " `pip install paper-qa[image]`." - ) from exc - - with Image.open(path) as img: - img.load() diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 4a3263c9e..1156ac4c8 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1,5 +1,4 @@ import asyncio -import base64 import contextlib import csv import io @@ -35,7 +34,7 @@ SparseEmbeddingModel, ) from lmi.llms import rate_limited -from lmi.utils import VCR_DEFAULT_MATCH_ON +from lmi.utils import VCR_DEFAULT_MATCH_ON, validate_image from paperqa_pymupdf import parse_pdf_to_pages as pymupdf_parse_pdf_to_pages from paperqa_pypdf import parse_pdf_to_pages as pypdf_parse_pdf_to_pages from pytest_subtests import SubTests @@ -72,7 +71,6 @@ ParsedText, ) from paperqa.utils import ( - bytes_to_string, clean_possessives, encode_id, extract_score, @@ -80,10 +78,8 @@ maybe_is_html, maybe_is_text, name_in_text, - string_to_bytes, strings_similarity, strip_citations, - validate_image, ) THIS_MODULE = pathlib.Path(__file__) @@ -2557,26 +2553,6 @@ def test_clean_possessives(raw_text: str, cleaned_text: str) -> None: assert clean_possessives(raw_text) == cleaned_text -@pytest.mark.parametrize( - "value", - [ - pytest.param(b"Hello, World!", id="simple-text"), - pytest.param(b"", id="empty-bytes"), - pytest.param(bytes([0, 1, 2, 255, 128, 64]), id="binary-data"), - pytest.param(b"Test data for base64 encoding", id="base64-validation"), - pytest.param("Hello δΈ–η•Œ 🌍".encode(), id="utf8-text"), - ], -) -def test_str_bytes_conversions(value: bytes) -> None: - # Test round-trip conversion - encoded_string = bytes_to_string(value) - decoded_bytes = string_to_bytes(encoded_string) - assert decoded_bytes == value - - # Validate that encoded string is valid base64 - assert base64.b64decode(encoded_string) == value - - tricky_test = ( "simple (pqac-a020507f) quote" "TEST AND (easy OR mistaken OR not_context)" diff --git a/uv.lock b/uv.lock index 4cd9efde9..279835cc1 100644 --- a/uv.lock +++ b/uv.lock @@ -597,6 +597,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9f/11/57e34bf6acfc459cceebfad419d79777f20d7a8301609fdb51a53012f505/fhlmi-0.39.0-py3-none-any.whl", hash = "sha256:50e815c71d1cf582cadfa64d5488d92f934366e1987bc676a6434f92a980eb3f", size = 41962, upload-time = "2025-10-14T20:54:20.995Z" }, ] +[package.optional-dependencies] +image = [ + { name = "pillow" }, +] + [[package]] name = "filelock" version = "3.20.0" @@ -1806,7 +1811,7 @@ dependencies = [ [package.optional-dependencies] dev = [ - { name = "fhlmi" }, + { name = "fhlmi", extra = ["image"] }, { name = "ipykernel" }, { name = "ipython" }, { name = "ldp" }, @@ -1814,7 +1819,6 @@ dev = [ { name = "mypy" }, { name = "paper-qa-pymupdf" }, { name = "paper-qa-pypdf", extra = ["media"] }, - { name = "pillow" }, { name = "prek" }, { name = "pydantic" }, { name = "pylint-pydantic" }, @@ -1839,7 +1843,7 @@ dev = [ { name = "vcrpy" }, ] image = [ - { name = "pillow" }, + { name = "fhlmi", extra = ["image"] }, ] ldp = [ { name = "ldp" }, @@ -1887,6 +1891,7 @@ requires-dist = [ { name = "fhaviary", extras = ["llm"], specifier = ">=0.20" }, { name = "fhlmi", specifier = ">=0.25.4" }, { name = "fhlmi", marker = "extra == 'dev'", specifier = ">=0.28" }, + { name = "fhlmi", extras = ["image"], marker = "extra == 'image'" }, { name = "html2text" }, { name = "httpx" }, { name = "httpx-aiohttp" }, @@ -1904,7 +1909,6 @@ requires-dist = [ { name = "paper-qa-pypdf", editable = "packages/paper-qa-pypdf" }, { name = "paper-qa-pypdf", marker = "extra == 'pypdf'", editable = "packages/paper-qa-pypdf" }, { name = "paper-qa-pypdf", extras = ["media"], marker = "extra == 'pypdf-media'", editable = "packages/paper-qa-pypdf" }, - { name = "pillow", marker = "extra == 'image'", specifier = ">=10.3.0" }, { name = "prek", marker = "extra == 'dev'" }, { name = "pybtex" }, { name = "pydantic", specifier = "~=2.0,>=2.10.1" }, From 6e0a6246c08c537662ed8085726c68383e358e64 Mon Sep 17 00:00:00 2001 From: James Braza Date: Thu, 9 Oct 2025 15:57:17 -0700 Subject: [PATCH 2/3] Dropped the '__future__.annotations' import in utils.py --- src/paperqa/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/paperqa/utils.py b/src/paperqa/utils.py index a54256904..4f79fb994 100644 --- a/src/paperqa/utils.py +++ b/src/paperqa/utils.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import asyncio import contextlib import hashlib From 42ce0678288fe68ae7de1d39062884e08952e863 Mon Sep 17 00:00:00 2001 From: James Braza Date: Tue, 14 Oct 2025 16:08:49 -0700 Subject: [PATCH 3/3] Pulled in latest lmi to reader packages as well --- packages/paper-qa-pymupdf/pyproject.toml | 8 +++++ .../tests/test_paperqa_pymupdf.py | 3 +- packages/paper-qa-pypdf/pyproject.toml | 7 ++++ .../tests/test_paperqa_pypdf.py | 3 +- pyproject.toml | 2 ++ uv.lock | 35 +++++++++++++++++-- 6 files changed, 54 insertions(+), 4 deletions(-) diff --git a/packages/paper-qa-pymupdf/pyproject.toml b/packages/paper-qa-pymupdf/pyproject.toml index 45b264d89..c087d9c8a 100644 --- a/packages/paper-qa-pymupdf/pyproject.toml +++ b/packages/paper-qa-pymupdf/pyproject.toml @@ -33,6 +33,14 @@ name = "paper-qa-pymupdf" readme = "README.md" requires-python = ">=3.11" +[project.optional-dependencies] +dev = [ + "fhlmi>=0.39", # Pin for bytes_to_string + "paper-qa>=5.23", # Pin for PDFParserFn + "pytest-asyncio", + "pytest>=8", # Pin to keep recent +] + [tool.ruff] extend = "../../pyproject.toml" diff --git a/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py b/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py index 98b6967f1..67c79fe24 100644 --- a/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py +++ b/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py @@ -6,9 +6,10 @@ import pymupdf import pytest +from lmi.utils import bytes_to_string from paperqa import Doc, Docs, Settings from paperqa.readers import PDFParserFn, chunk_pdf -from paperqa.utils import ImpossibleParsingError, bytes_to_string +from paperqa.utils import ImpossibleParsingError from paperqa_pymupdf import parse_pdf_to_pages diff --git a/packages/paper-qa-pypdf/pyproject.toml b/packages/paper-qa-pypdf/pyproject.toml index 40369a809..147711acd 100644 --- a/packages/paper-qa-pypdf/pyproject.toml +++ b/packages/paper-qa-pypdf/pyproject.toml @@ -34,6 +34,13 @@ readme = "README.md" requires-python = ">=3.11" [project.optional-dependencies] +dev = [ + "fhlmi>=0.39", # Pin for bytes_to_string + "paper-qa-pypdf[media]", + "paper-qa>=5.23", # Pin for PDFParserFn + "pytest-asyncio", + "pytest>=8", # Pin to keep recent +] media = [ "pypdfium2>=4.22.0", # Pin for PYPDFIUM_INFO addition ] diff --git a/packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py b/packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py index 277b55cb1..6f9da584e 100644 --- a/packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py +++ b/packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py @@ -6,9 +6,10 @@ import pypdf import pytest +from lmi.utils import bytes_to_string from paperqa import Doc, Docs from paperqa.readers import PDFParserFn, chunk_pdf -from paperqa.utils import ImpossibleParsingError, bytes_to_string +from paperqa.utils import ImpossibleParsingError from paperqa_pypdf import parse_pdf_to_pages diff --git a/pyproject.toml b/pyproject.toml index b939534b5..c50419248 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,8 @@ requires = ["setuptools>=64", "setuptools_scm>=8"] [dependency-groups] dev = [ + "paper-qa-pymupdf[dev]", + "paper-qa-pypdf[dev]", "paper-qa[dev]", ] diff --git a/uv.lock b/uv.lock index 279835cc1..60943a540 100644 --- a/uv.lock +++ b/uv.lock @@ -1883,6 +1883,8 @@ zotero = [ [package.dev-dependencies] dev = [ { name = "paper-qa", extra = ["dev"] }, + { name = "paper-qa-pymupdf", extra = ["dev"] }, + { name = "paper-qa-pypdf", extra = ["dev"] }, ] [package.metadata] @@ -1943,7 +1945,11 @@ requires-dist = [ provides-extras = ["dev", "image", "ldp", "local", "memory", "openreview", "pymupdf", "pypdf", "pypdf-media", "qdrant", "typing", "zotero"] [package.metadata.requires-dev] -dev = [{ name = "paper-qa", extras = ["dev"], editable = "." }] +dev = [ + { name = "paper-qa", extras = ["dev"], editable = "." }, + { name = "paper-qa-pymupdf", extras = ["dev"], editable = "packages/paper-qa-pymupdf" }, + { name = "paper-qa-pypdf", extras = ["dev"], editable = "packages/paper-qa-pypdf" }, +] [[package]] name = "paper-qa-pymupdf" @@ -1953,11 +1959,24 @@ dependencies = [ { name = "pymupdf" }, ] +[package.optional-dependencies] +dev = [ + { name = "fhlmi" }, + { name = "paper-qa" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, +] + [package.metadata] requires-dist = [ + { name = "fhlmi", marker = "extra == 'dev'", specifier = ">=0.39" }, { name = "paper-qa", editable = "." }, + { name = "paper-qa", marker = "extra == 'dev'", editable = "." }, { name = "pymupdf", specifier = ">=1.24.12" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=8" }, + { name = "pytest-asyncio", marker = "extra == 'dev'" }, ] +provides-extras = ["dev"] [[package]] name = "paper-qa-pypdf" @@ -1968,17 +1987,29 @@ dependencies = [ ] [package.optional-dependencies] +dev = [ + { name = "fhlmi" }, + { name = "paper-qa" }, + { name = "pypdfium2" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, +] media = [ { name = "pypdfium2" }, ] [package.metadata] requires-dist = [ + { name = "fhlmi", marker = "extra == 'dev'", specifier = ">=0.39" }, { name = "paper-qa", editable = "." }, + { name = "paper-qa", marker = "extra == 'dev'", editable = "." }, + { name = "paper-qa-pypdf", extras = ["media"], marker = "extra == 'dev'", editable = "packages/paper-qa-pypdf" }, { name = "pypdf", specifier = ">=3" }, { name = "pypdfium2", marker = "extra == 'media'", specifier = ">=4.22.0" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=8" }, + { name = "pytest-asyncio", marker = "extra == 'dev'" }, ] -provides-extras = ["media"] +provides-extras = ["dev", "media"] [[package]] name = "parso"