
Commit f15abdb

fix: improve inline citation parsing bump:patch
1 parent f3a2a29 commit f15abdb

File tree

2 files changed: +90 -19 lines


libs/kotaemon/kotaemon/indices/qa/citation_qa.py

Lines changed: 2 additions & 2 deletions
@@ -342,11 +342,11 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document
 
         span_idx = span.get("idx", None)
         if span_idx is not None:
-            to_highlight = f"【{span_idx + 1}】" + to_highlight
+            to_highlight = f"【{span_idx}】" + to_highlight
 
         text += Render.highlight(
             to_highlight,
-            elem_id=str(span_idx + 1) if span_idx is not None else None,
+            elem_id=str(span_idx) if span_idx is not None else None,
         )
         if idx < len(ss) - 1:
             text += cur_doc.text[span["end"] : ss[idx + 1]["start"]]
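
The off-by-one shift moves out of the renderer: the span's "idx" now arrives as the displayed citation number itself (produced by the parser changes in citation_qa_inline.py below), so prepare_citations uses it directly. A minimal sketch with a hypothetical span dict:

span = {"start": 0, "end": 42, "idx": 2}  # hypothetical span from match_evidence_with_context

span_idx = span.get("idx", None)
to_highlight = "fixed-size chunking is computationally efficient"
if span_idx is not None:
    to_highlight = f"【{span_idx}】" + to_highlight

print(to_highlight)  # 【2】fixed-size chunking is computationally efficient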

libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py

Lines changed: 88 additions & 17 deletions
@@ -1,14 +1,14 @@
 import re
 import threading
 from collections import defaultdict
+from dataclasses import dataclass
 from typing import Generator
 
 import numpy as np
 
 from kotaemon.base import AIMessage, Document, HumanMessage, SystemMessage
 from kotaemon.llms import PromptTemplate
 
-from .citation import CiteEvidence
 from .citation_qa import CITATION_TIMEOUT, MAX_IMAGES, AnswerWithContextPipeline
 from .format_context import EVIDENCE_MODE_FIGURE
 from .utils import find_start_end_phrase
@@ -61,12 +61,27 @@
 END_PHRASE: this shows good retrieval quality.
 
 FINAL ANSWER
-An alternative to semantic chunking is fixed-size chunking. This traditional method involves splitting documents into chunks of a predetermined or user-specified size, regardless of semantic content, which is computationally efficient【1】. However, it may result in the fragmentation of semantically related content, thereby potentially degrading retrieval performance【2】.
+An alternative to semantic chunking is fixed-size chunking. This traditional method involves splitting documents into chunks of a predetermined or user-specified size, regardless of semantic content, which is computationally efficient【1】. However, it may result in the fragmentation of semantically related content, thereby potentially degrading retrieval performance【1】【2】.
 
 QUESTION: {question}\n
 ANSWER:
 """  # noqa
 
+START_ANSWER = "FINAL ANSWER"
+START_CITATION = "CITATION LIST"
+CITATION_PATTERN = r"citation【(\d+)】"
+START_ANSWER_PATTERN = "start_phrase:"
+END_ANSWER_PATTERN = "end_phrase:"
+
+
+@dataclass
+class InlineEvidence:
+    """A single piece of evidence supporting the answer."""
+
+    start_phrase: str | None = None
+    end_phrase: str | None = None
+    idx: int | None = None
+
 
 class AnswerWithInlineCitation(AnswerWithContextPipeline):
     """Answer the question based on the evidence with inline citation"""
@@ -85,15 +100,54 @@ def get_prompt(self, question, evidence, evidence_mode: int):
 
         return prompt, evidence
 
-    def answer_to_citations(self, answer):
-        evidences = []
+    def answer_to_citations(self, answer) -> list[InlineEvidence]:
+        citations: list[InlineEvidence] = []
         lines = answer.split("\n")
-        for line in lines:
-            for keyword in ["START_PHRASE:", "END_PHRASE:"]:
-                if line.startswith(keyword):
-                    evidences.append(line[len(keyword) :].strip())
 
-        return CiteEvidence(evidences=evidences)
+        current_evidence = None
+
+        for line in lines:
+            # check for a citation index line, e.g. "citation【1】"
+            match = re.match(CITATION_PATTERN, line.lower())
+
+            if match:
+                try:
+                    parsed_citation_idx = int(match.group(1))
+                except ValueError:
+                    parsed_citation_idx = None
+
+                # close out the current evidence if one exists
+                if current_evidence:
+                    citations.append(current_evidence)
+                    current_evidence = None
+
+                current_evidence = InlineEvidence(idx=parsed_citation_idx)
+            else:
+                for keyword in [START_ANSWER_PATTERN, END_ANSWER_PATTERN]:
+                    if line.lower().startswith(keyword):
+                        matched_phrase = line[len(keyword) :].strip()
+                        if not current_evidence:
+                            current_evidence = InlineEvidence(idx=None)
+
+                        if keyword == START_ANSWER_PATTERN:
+                            current_evidence.start_phrase = matched_phrase
+                        else:
+                            current_evidence.end_phrase = matched_phrase
+
+                        break
+
+            if (
+                current_evidence
+                and current_evidence.end_phrase
+                and current_evidence.start_phrase
+            ):
+                citations.append(current_evidence)
+                current_evidence = None
+
+        if current_evidence:
+            citations.append(current_evidence)
+
+        return citations
 
     def replace_citation_with_link(self, answer: str):
         # Define the regex pattern to match 【number】
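
answer_to_citations now walks the citation block line by line, opening a new InlineEvidence at each citation【n】 marker and closing it once both phrases are present. A hedged usage sketch, assuming the pipeline constructs with defaults and the block format implied by the patterns above:

sample = """citation【1】
start_phrase: This traditional method involves
end_phrase: computationally efficient
citation【2】
start_phrase: it may result in the fragmentation
end_phrase: degrading retrieval performance"""

pipeline = AnswerWithInlineCitation()  # assumes default construction is valid
for ev in pipeline.answer_to_citations(sample):
    print(ev.idx, repr(ev.start_phrase), repr(ev.end_phrase))
# 1 'This traditional method involves' 'computationally efficient'
# 2 'it may result in the fragmentation' 'degrading retrieval performance'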
@@ -114,6 +168,8 @@ def replace_citation_with_link(self, answer: str):
             ),
         )
 
+        answer = answer.replace(START_CITATION, "")
+
         return answer
 
     def stream(  # type: ignore
@@ -178,21 +234,31 @@ def mindmap_call():
         # append main prompt
         messages.append(HumanMessage(content=prompt))
 
-        START_ANSWER = "FINAL ANSWER"
-        start_of_answer = True
         final_answer = ""
 
         try:
             # try streaming first
             print("Trying LLM streaming")
             for out_msg in self.llm.stream(messages):
                 if START_ANSWER in output:
+                    if not final_answer:
+                        try:
+                            left_over_answer = output.split(START_ANSWER)[1].lstrip()
+                        except IndexError:
+                            left_over_answer = ""
+                        if left_over_answer:
+                            out_msg.text = left_over_answer + out_msg.text
+
                     final_answer += (
-                        out_msg.text.lstrip() if start_of_answer else out_msg.text
+                        out_msg.text.lstrip() if not final_answer else out_msg.text
                     )
-                    start_of_answer = False
                     yield Document(channel="chat", content=out_msg.text)
 
+                    # edge case: smaller LLMs may repeat the citation list
+                    if START_CITATION in out_msg.text:
+                        break
+
                 output += out_msg.text
                 logprobs += out_msg.logprobs
         except NotImplementedError:
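
The streaming fix addresses a chunk-boundary case: the FINAL ANSWER marker and the first words of the answer can arrive in the same chunk, and the old start_of_answer flag silently dropped that tail. A standalone simulation of the new buffering rule (chunk contents are made up):

START_ANSWER = "FINAL ANSWER"
START_CITATION = "CITATION LIST"

output, final_answer = "", ""
chunks = ["thinking... FINAL ANSWER Fixed-size chunking", " is efficient【1】. CITATION LIST ..."]
for text in chunks:
    if START_ANSWER in output:
        if not final_answer:
            # recover the answer tail that arrived alongside the marker
            left_over = output.split(START_ANSWER)[1].lstrip()
            if left_over:
                text = left_over + text
        final_answer += text.lstrip() if not final_answer else text
        if START_CITATION in text:
            break
    output += text

print(final_answer)  # Fixed-size chunking is efficient【1】. CITATION LIST ...

Note that the "CITATION LIST ..." tail still lands in final_answer here; the literal marker is stripped later by replace_citation_with_link, as shown in the hunk above.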
@@ -235,10 +301,15 @@ def match_evidence_with_context(self, answer, docs) -> dict[str, list[dict]]:
         if not answer.metadata["citation"]:
             return spans
 
-        evidences = answer.metadata["citation"].evidences
+        evidences = answer.metadata["citation"]
+
+        for e_id, evidence in enumerate(evidences):
+            start_phrase, end_phrase = evidence.start_phrase, evidence.end_phrase
+            evidence_idx = evidence.idx
+
+            if evidence_idx is None:
+                evidence_idx = e_id + 1
 
-        for start_idx in range(0, len(evidences), 2):
-            start_phrase, end_phrase = evidences[start_idx : start_idx + 2]
             best_match = None
             best_match_length = 0
             best_match_doc_idx = None
@@ -259,7 +330,7 @@ def match_evidence_with_context(self, answer, docs) -> dict[str, list[dict]]:
                     {
                         "start": best_match[0],
                         "end": best_match[1],
-                        "idx": start_idx // 2,  # implicitly set from the start_idx
+                        "idx": evidence_idx,
                     }
                 )
         return spans