Skip to content

Commit ddc4a06

Browse files
committed
Renamed the `summary` field to `name` (on `ParsedMetadata` and `ChunkMetadata`), for understandability
1 parent 9aa61b4 commit ddc4a06

File tree

8 files changed

36 additions and 36 deletions

8 files changed

36 additions and 36 deletions

packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,6 @@ def parse_pdf_to_pages(
177177
parsing_libraries=[f"{pymupdf.__name__} ({pymupdf.__version__})"],
178178
total_parsed_text_length=total_length,
179179
count_parsed_media=count_media,
180-
summary=f"pdf|block={use_block_parsing}{multimodal_string if parse_media else ''}",
180+
name=f"pdf|block={use_block_parsing}{multimodal_string if parse_media else ''}",
181181
)
182182
return ParsedText(content=content, metadata=metadata)

packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,8 @@ async def test_parse_pdf_to_pages() -> None:
124124
for pt in (parsed_text, parsed_text_full_page, parsed_text_no_media):
125125
(parsing_library,) = pt.metadata.parsing_libraries
126126
assert pymupdf.__name__ in parsing_library
127-
assert pt.metadata.summary
128-
assert "pdf" in pt.metadata.summary
127+
assert pt.metadata.name
128+
assert "pdf" in pt.metadata.name
129129

130130
# Check commonalities across all modes
131131
assert (

packages/paper-qa-pypdf/src/paperqa_pypdf/reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,6 @@ def parse_pdf_to_pages(
105105
],
106106
total_parsed_text_length=total_length,
107107
count_parsed_media=count_media,
108-
summary=f"pdf|{multimodal_string if parse_media else ''}",
108+
name=f"pdf|{multimodal_string if parse_media else ''}",
109109
)
110110
return ParsedText(content=pages, metadata=metadata)

packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,8 @@ async def test_parse_pdf_to_pages() -> None:
106106
for pt in (parsed_text_full_page, parsed_text_no_media):
107107
(parsing_library,) = pt.metadata.parsing_libraries
108108
assert pypdf.__name__ in parsing_library
109-
assert pt.metadata.summary
110-
assert "pdf" in pt.metadata.summary
109+
assert pt.metadata.name
110+
assert "pdf" in pt.metadata.name
111111

112112
# Check commonalities across all modes
113113
assert len(parsed_text_full_page.content) == len(

src/paperqa/docs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ async def aadd( # noqa: PLR0912
393393
include_metadata=True,
394394
)
395395
# loose check to see if document was loaded
396-
if metadata.summary != "image" and (
396+
if metadata.name != "image" and (
397397
not texts
398398
or len(texts[0].text) < 10 # noqa: PLR2004
399399
or (

src/paperqa/readers.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ async def parse_image(
5151
paperqa_version=pqa_version,
5252
total_parsed_text_length=0, # No text, just an image
5353
count_parsed_media=1,
54-
summary="image",
54+
name="image",
5555
)
5656
return ParsedText(content={"1": ("", [parsed_media])}, metadata=metadata)
5757

@@ -162,7 +162,7 @@ def parse_text(
162162
parsing_libraries=parsing_libraries,
163163
paperqa_version=pqa_version,
164164
total_parsed_text_length=total_length,
165-
summary=f"{parse_summary}|split-lines={split_lines}",
165+
name=f"{parse_summary}|split-lines={split_lines}",
166166
),
167167
)
168168

@@ -385,7 +385,7 @@ async def read_doc( # noqa: PLR0912
385385
chunk_metadata = ChunkMetadata(
386386
size=0,
387387
overlap=0,
388-
summary=f"paper-qa={pqa_version}|algorithm=none|reduction=cl100k_base",
388+
name=f"paper-qa={pqa_version}|algorithm=none|reduction=cl100k_base",
389389
)
390390
elif str_path.endswith(".pdf"):
391391
chunked_text = chunk_pdf(
@@ -394,7 +394,7 @@ async def read_doc( # noqa: PLR0912
394394
chunk_metadata = ChunkMetadata(
395395
size=chunk_chars,
396396
overlap=overlap,
397-
summary=(
397+
name=(
398398
f"paper-qa={pqa_version}|algorithm=overlap-pdf"
399399
f"|size={chunk_chars}|overlap={overlap}"
400400
),
@@ -406,7 +406,7 @@ async def read_doc( # noqa: PLR0912
406406
chunk_metadata = ChunkMetadata(
407407
size=0,
408408
overlap=0,
409-
summary=f"paper-qa={pqa_version}|algorithm=none",
409+
name=f"paper-qa={pqa_version}|algorithm=none",
410410
)
411411
elif str_path.endswith((".txt", ".html")):
412412
chunked_text = chunk_text(
@@ -415,7 +415,7 @@ async def read_doc( # noqa: PLR0912
415415
chunk_metadata = ChunkMetadata(
416416
size=chunk_chars,
417417
overlap=overlap,
418-
summary=(
418+
name=(
419419
f"paper-qa={pqa_version}|algorithm=overlap-text|reduction=cl100k_base"
420420
f"|size={chunk_chars}|overlap={overlap}"
421421
),
@@ -427,7 +427,7 @@ async def read_doc( # noqa: PLR0912
427427
chunk_metadata = ChunkMetadata(
428428
size=chunk_chars,
429429
overlap=overlap,
430-
summary=(
430+
name=(
431431
f"paper-qa={pqa_version}|algorithm=overlap-code|reduction=cl100k_base"
432432
f"|size={chunk_chars}|overlap={overlap}"
433433
),

src/paperqa/types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ class ChunkMetadata(BaseModel):
461461

462462
size: int = Field(description="Chunk size (chars), or 0 for no chunking.")
463463
overlap: int = Field(description="Chunk overlap (chars), or 0 for no overlap.")
464-
summary: str | None = Field(
464+
name: str | None = Field(
465465
default=None,
466466
description=(
467467
"Optional string summarizing the chunking parameters, embodying a hash."
@@ -481,7 +481,7 @@ class ParsedMetadata(BaseModel):
481481
)
482482
total_parsed_text_length: int
483483
count_parsed_media: int = Field(default=0, ge=0)
484-
summary: str | None = Field(
484+
name: str | None = Field(
485485
default=None,
486486
description=(
487487
"Optional string summarizing the parsing parameters, embodying a hash."

tests/test_paperqa.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1307,8 +1307,8 @@ async def test_parser_only_reader(pdf_parser: PDFParserFn, stub_data_dir: Path)
13071307
parse_pdf=pdf_parser,
13081308
full_page=True, # Simple to support across many parsers
13091309
)
1310-
assert parsed_text.metadata.summary
1311-
assert "pdf" in parsed_text.metadata.summary
1310+
assert parsed_text.metadata.name
1311+
assert "pdf" in parsed_text.metadata.name
13121312
assert parsed_text.metadata.chunk_metadata is None
13131313
assert isinstance(parsed_text.content, dict)
13141314
num_chars = 0
@@ -1339,11 +1339,11 @@ async def test_chunk_metadata_reader(
13391339
include_metadata=True,
13401340
parse_pdf=pdf_parser,
13411341
)
1342-
assert metadata.summary
1343-
assert "pdf" in metadata.summary
1342+
assert metadata.name
1343+
assert "pdf" in metadata.name
13441344
assert isinstance(metadata.chunk_metadata, ChunkMetadata)
1345-
assert metadata.chunk_metadata.summary
1346-
assert "overlap-pdf" in metadata.chunk_metadata.summary
1345+
assert metadata.chunk_metadata.name
1346+
assert "overlap-pdf" in metadata.chunk_metadata.name
13471347
assert metadata.chunk_metadata.overlap == 100
13481348
assert metadata.chunk_metadata.size == 3000
13491349
assert len(chunk_text) > 2, "Expected multiple chunks, for meaningful assertions"
@@ -1380,11 +1380,11 @@ async def test_chunk_metadata_reader(
13801380
include_metadata=True,
13811381
)
13821382
# NOTE the use of tiktoken changes the actual char and overlap counts
1383-
assert metadata.summary
1384-
assert "html" in metadata.summary
1383+
assert metadata.name
1384+
assert "html" in metadata.name
13851385
assert isinstance(metadata.chunk_metadata, ChunkMetadata)
1386-
assert metadata.chunk_metadata.summary
1387-
assert "overlap-text" in metadata.chunk_metadata.summary
1386+
assert metadata.chunk_metadata.name
1387+
assert "overlap-text" in metadata.chunk_metadata.name
13881388
assert metadata.chunk_metadata.overlap == 100
13891389
assert metadata.chunk_metadata.size == 3000
13901390
assert all(
@@ -1404,11 +1404,11 @@ async def test_chunk_metadata_reader(
14041404
doc=Doc(docname="foo", citation="Foo et al, 2002", dockey="1"),
14051405
include_metadata=True,
14061406
)
1407-
assert metadata.summary
1408-
assert "txt" in metadata.summary
1407+
assert metadata.name
1408+
assert "txt" in metadata.name
14091409
assert isinstance(metadata.chunk_metadata, ChunkMetadata)
1410-
assert metadata.chunk_metadata.summary
1411-
assert "overlap-code" in metadata.chunk_metadata.summary
1410+
assert metadata.chunk_metadata.name
1411+
assert "overlap-code" in metadata.chunk_metadata.name
14121412
assert metadata.chunk_metadata.overlap == 100
14131413
assert metadata.chunk_metadata.size == 3000
14141414
assert all(
@@ -1483,8 +1483,8 @@ async def test_read_doc_images_metadata(stub_data_dir: Path) -> None:
14831483
image_id = parsed_image.to_id()
14841484
assert image_id.version == 4, "Expected a uuid4-compatible ID"
14851485
assert image_id == UUID("f6426bc3-382a-45a4-8677-08744044864f")
1486-
assert parsed_text.metadata.summary
1487-
assert "image" in parsed_text.metadata.summary
1486+
assert parsed_text.metadata.name
1487+
assert "image" in parsed_text.metadata.name
14881488
assert parsed_text.metadata.count_parsed_media == 1
14891489
assert parsed_text.metadata.total_parsed_text_length == 0
14901490
assert parsed_text.metadata.chunk_metadata is None
@@ -1502,15 +1502,15 @@ async def test_read_doc_images_metadata(stub_data_dir: Path) -> None:
15021502
texts, metadata = texts_with_metadata
15031503
assert len(texts) == 1
15041504
assert texts[0] == text
1505-
assert metadata.summary
1506-
assert "image" in metadata.summary
1505+
assert metadata.name
1506+
assert "image" in metadata.name
15071507
assert metadata.count_parsed_media == 1
15081508
assert metadata.total_parsed_text_length == 0
15091509
assert metadata.chunk_metadata is not None
15101510
assert not metadata.chunk_metadata.size
15111511
assert not metadata.chunk_metadata.overlap
1512-
assert metadata.chunk_metadata.summary
1513-
assert "algorithm=none" in metadata.chunk_metadata.summary
1512+
assert metadata.chunk_metadata.name
1513+
assert "algorithm=none" in metadata.chunk_metadata.name
15141514

15151515

15161516
@pytest.mark.asyncio

0 commit comments

Comments
 (0)