
Commit 91689b6

Integrated ParsedMedia into evidence gathering, with tests
1 parent 8a409a8 commit 91689b6

File tree

8 files changed: +8413 -8246 lines


src/paperqa/core.py

Lines changed: 19 additions & 7 deletions
@@ -9,6 +9,7 @@
 from aviary.core import Message
 from lmi import LLMModel
 
+from paperqa.prompts import text_with_tables_prompt_template
 from paperqa.types import Context, LLMResult, Text
 from paperqa.utils import extract_score, strip_citations
 
@@ -164,18 +165,31 @@ async def map_fxn_summary(
     citation = text.name + ": " + text.doc.formatted_citation
     success = False
 
+    # Strip newlines in case chunking led to blank lines,
+    # but not spaces, to preserve text alignment
+    cleaned_text = text.text.strip("\n")
     if summary_llm_model and prompt_templates:
+        media_text: list[str] = [m.text for m in text.media if m.text]
         data = {
             "question": question,
             "citation": citation,
-            # Strip newlines in case chunking led to blank lines,
-            # but not spaces, to preserve text alignment
-            "text": text.text.strip("\n"),
+            "text": (
+                text_with_tables_prompt_template.format(
+                    text=cleaned_text,
+                    citation=citation,
+                    tables="\n\n----\n\n".join(media_text),
+                )
+                if media_text
+                else cleaned_text
+            ),
         } | (extra_prompt_data or {})
         message_prompt, system_prompt = prompt_templates
         messages = [
             Message(role="system", content=system_prompt.format(**data)),
-            Message(role="user", content=message_prompt.format(**data)),
+            Message.create_message(
+                text=message_prompt.format(**data),
+                images=[i.to_image_url() for i in text.media] if text.media else None,
+            ),
         ]
         llm_result = await summary_llm_model.call_single(
            messages=messages,
@@ -199,9 +213,7 @@ async def map_fxn_summary(
         except KeyError:
             success = False
     else:
-        # Strip newlines in case chunking led to blank lines,
-        # but not spaces, to preserve text alignment
-        context = text.text.strip("\n")
+        context = cleaned_text
     # If we don't assign scores, just default to 5.
     # why 5? Because we filter out 0s in another place
     # and 5/10 is the other default I could come up with
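
Note on the change above: when a chunk carries parsed media, its markdown tables are appended to the excerpt via the new text_with_tables_prompt_template, and the images themselves ride along in a multimodal user message. Below is a rough standalone sketch of the text-assembly half only (not paperqa's actual code; build_excerpt, chunk_text, table_markdown, and citation are hypothetical stand-ins for the Text.text, ParsedMedia.text, and formatted-citation values used in the diff). The template string is the one added in prompts.py further down.

# Sketch only: stand-in for the text-assembly logic in map_fxn_summary above.
text_with_tables_prompt_template = (
    "{text}\n\n------------\n\nMarkdown tables from {citation}."
    " If the markdown is garbled, refer to the images"
    "\n\n------------\n\n{tables}"
)


def build_excerpt(chunk_text: str, table_markdown: list[str], citation: str) -> str:
    # Strip newlines left over from chunking, but keep spaces for alignment
    cleaned_text = chunk_text.strip("\n")
    if not table_markdown:
        # No parsed media on this chunk: behave exactly as before
        return cleaned_text
    # Append the chunk's markdown tables after a separator
    return text_with_tables_prompt_template.format(
        text=cleaned_text,
        citation=citation,
        tables="\n\n----\n\n".join(table_markdown),
    )


print(build_excerpt("Results...\n", ["| a | b |\n|---|---|\n| 1 | 2 |"], "Doe 2024"))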

src/paperqa/docs.py

Lines changed: 3 additions & 2 deletions
@@ -380,17 +380,18 @@ async def aadd(  # noqa: PLR0912
                 doc, **(query_kwargs | kwargs)
             )
 
-        texts = await read_doc(
+        texts, metadata = await read_doc(
             path,
             doc,
             chunk_chars=parse_config.chunk_size,
             overlap=parse_config.overlap,
             page_size_limit=parse_config.page_size_limit,
             use_block_parsing=parse_config.pdfs_use_block_parsing,
             parse_pdf=parse_config.parse_pdf,
+            include_metadata=True,
         )
         # loose check to see if document was loaded
-        if (
+        if metadata.parse_type != "image" and (
             not texts
             or len(texts[0].text) < 10  # noqa: PLR2004
             or (
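
The reason for the loosened check: an image-only parse legitimately yields little or no extractable text, so the "was the document actually loaded?" heuristic should not reject it. A simplified sketch of that predicate (hypothetical helper, not the real Docs.aadd):

def looks_unparsed(texts: list[str], parse_type: str) -> bool:
    # Image parses are allowed to have no text; everything else must
    # produce at least a minimally sized first chunk.
    return parse_type != "image" and (not texts or len(texts[0]) < 10)


print(looks_unparsed([], parse_type="image"))    # False: image-only docs pass
print(looks_unparsed([], parse_type="pdf"))      # True: likely a failed parse
print(looks_unparsed(["ok"], parse_type="pdf"))  # True: suspiciously little text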

src/paperqa/prompts.py

Lines changed: 24 additions & 14 deletions
@@ -1,20 +1,26 @@
 from datetime import datetime
 
-# ruff: noqa: E501
-
 summary_prompt = (
     "Summarize the excerpt below to help answer a question.\n\nExcerpt from"
-    " {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\nDo not directly"
+    " {citation}\n\n------------\n\n{text}\n\n------------"
+    "\n\nQuestion: {question}\n\nDo not directly"
     " answer the question, instead summarize to give evidence to help answer the"
     " question. Stay detailed; report specific numbers, equations, or direct quotes"
     ' (marked with quotation marks). Reply "Not applicable" if the excerpt is'
     " irrelevant. At the end of your response, provide an integer score from 1-10 on a"
     " newline indicating relevance to question. Do not explain your score.\n\nRelevant"
     " Information Summary ({summary_length}):"
 )
+# This prompt template integrates with `text` variable of the above `summary_prompt`
+text_with_tables_prompt_template = (
+    "{text}\n\n------------\n\nMarkdown tables from {citation}."
+    " If the markdown is garbled, refer to the images"
+    "\n\n------------\n\n{tables}"
+)
 
 summary_json_prompt = (
-    "Excerpt from {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\n"
+    "Excerpt from {citation}\n\n------------\n\n{text}\n\n------------"
+    "\n\nQuestion: {question}\n\n"
 )
 
 # The below "cannot answer" sentinel phrase should:
@@ -45,7 +51,7 @@
 
 qa_prompt = (
     "Answer the question below with the context.\n\n"
-    "Context:\n\n{context}\n\n----\n\n"
+    "Context:\n\n{context}\n\n------------\n\n"
     "Question: {question}\n\n"
     "Write an answer based on the context. "
     "If the context provides insufficient information reply "
@@ -99,15 +105,19 @@
 )
 
 # NOTE: we use double curly braces here so it's not considered an f-string template
-summary_json_system_prompt = """\
-Provide a summary of the relevant information that could help answer the question based on the excerpt. Respond with the following JSON format:
-
-{{
-  "summary": "...",
-  "relevance_score": "..."
-}}
-
-where `summary` is relevant information from the text - {summary_length} words. `relevance_score` is an integer 1-10 for the relevance of `summary` to the question."""
+summary_json_system_prompt = (
+    "Provide a summary of the relevant information"
+    " that could help answer the question based on the excerpt."
+    " Your summary, combined with many others,"
+    " will be given to the model to generate an answer."
+    " Respond with the following JSON format:"
+    '\n\n{{\n  "summary": "...",\n  "relevance_score": "..."\n  "used_images"\n}}'
+    "\n\nwhere `summary` is relevant information from the text - {summary_length} words."
+    " `relevance_score` is an integer 1-10 for the relevance of `summary` to the question."
+    " `used_images` is a boolean flag indicating"
+    " if any images present in a multimodal message were used,"
+    " and if no images were present it should be false."
+)
 
 env_system_prompt = (
     # Matching https://github.com/langchain-ai/langchain/blob/langchain%3D%3D0.2.3/libs/langchain/langchain/agents/openai_functions_agent/base.py#L213-L215
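
As the NOTE above says, the double curly braces keep the JSON skeleton intact when the prompt is run through str.format, so only {summary_length} is substituted. A quick illustration with an abbreviated template and a made-up word count:

# Abbreviated template: shows only the brace/placeholder behavior, not the full prompt
template = (
    "Respond with the following JSON format:"
    '\n\n{{\n  "summary": "...",\n  "relevance_score": "..."\n}}'
    "\n\nwhere `summary` is {summary_length} words."
)
print(template.format(summary_length="about 100"))
# Respond with the following JSON format:
#
# {
#   "summary": "...",
#   "relevance_score": "..."
# }
#
# where `summary` is about 100 words.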

src/paperqa/settings.py

Lines changed: 5 additions & 1 deletion
@@ -523,7 +523,11 @@ class IndexSettings(BaseModel):
         ),
     )
     files_filter: Callable[[anyio.Path | pathlib.Path], bool] = Field(
-        default=lambda f: f.suffix in {".txt", ".pdf", ".html", ".md"},
+        default=lambda f: (
+            f.suffix
+            # TODO: add images after embeddings are supported
+            in {".txt", ".pdf", ".html", ".md"}
+        ),
         exclude=True,
         description=(
             "Filter function to apply to files in the paper directory."
