1
1
import re
2
2
import threading
3
3
from collections import defaultdict
4
+ from dataclasses import dataclass
4
5
from typing import Generator
5
6
6
7
import numpy as np
7
8
8
9
from kotaemon .base import AIMessage , Document , HumanMessage , SystemMessage
9
10
from kotaemon .llms import PromptTemplate
10
11
11
- from .citation import CiteEvidence
12
12
from .citation_qa import CITATION_TIMEOUT , MAX_IMAGES , AnswerWithContextPipeline
13
13
from .format_context import EVIDENCE_MODE_FIGURE
14
14
from .utils import find_start_end_phrase
61
61
END_PHRASE: this shows good retrieval quality.
62
62
63
63
FINAL ANSWER
64
- An alternative to semantic chunking is fixed-size chunking. This traditional method involves splitting documents into chunks of a predetermined or user-specified size, regardless of semantic content, which is computationally efficient【1】. However, it may result in the fragmentation of semantically related content, thereby potentially degrading retrieval performance【2】.
64
+ An alternative to semantic chunking is fixed-size chunking. This traditional method involves splitting documents into chunks of a predetermined or user-specified size, regardless of semantic content, which is computationally efficient【1】. However, it may result in the fragmentation of semantically related content, thereby potentially degrading retrieval performance【1】【 2】.
65
65
66
66
QUESTION: {question}\n
67
67
ANSWER:
68
68
""" # noqa
69
69
70
# Sentinel strings expected in the LLM output; the streaming/parsing code
# below keys off these markers.
START_ANSWER = "FINAL ANSWER"  # marks where the final answer text begins
START_CITATION = "CITATION LIST"  # marks the start of the citation section
# Matches a citation header line such as "citation【3】"; the line is
# lower-cased before matching, and group 1 captures the context index.
CITATION_PATTERN = r"citation【(\d+)】"
# Line prefixes (compared case-insensitively) carrying the phrases that
# delimit a quoted evidence span in the answer.
START_ANSWER_PATTERN = "start_phrase:"
END_ANSWER_PATTERN = "end_phrase:"
75
+
76
+
77
+ @dataclass
78
+ class InlineEvidence :
79
+ """List of evidences to support the answer."""
80
+
81
+ start_phrase : str | None = None
82
+ end_phrase : str | None = None
83
+ idx : int | None = None
84
+
70
85
71
86
class AnswerWithInlineCitation (AnswerWithContextPipeline ):
72
87
"""Answer the question based on the evidence with inline citation"""
@@ -85,15 +100,54 @@ def get_prompt(self, question, evidence, evidence_mode: int):
85
100
86
101
return prompt , evidence
87
102
88
- def answer_to_citations (self , answer ):
89
- evidences = []
103
+ def answer_to_citations (self , answer ) -> list [ InlineEvidence ] :
104
+ citations : list [ InlineEvidence ] = []
90
105
lines = answer .split ("\n " )
91
- for line in lines :
92
- for keyword in ["START_PHRASE:" , "END_PHRASE:" ]:
93
- if line .startswith (keyword ):
94
- evidences .append (line [len (keyword ) :].strip ())
95
106
96
- return CiteEvidence (evidences = evidences )
107
+ current_evidence = None
108
+
109
+ for line in lines :
110
+ # check citation idx using regex
111
+ match = re .match (CITATION_PATTERN , line .lower ())
112
+
113
+ if match :
114
+ try :
115
+ parsed_citation_idx = int (match .group (1 ))
116
+ except ValueError :
117
+ parsed_citation_idx = None
118
+
119
+ # conclude the current evidence if exists
120
+ if current_evidence :
121
+ citations .append (current_evidence )
122
+ current_evidence = None
123
+
124
+ current_evidence = InlineEvidence (idx = parsed_citation_idx )
125
+ else :
126
+ for keyword in [START_ANSWER_PATTERN , END_ANSWER_PATTERN ]:
127
+ if line .lower ().startswith (keyword ):
128
+ matched_phrase = line [len (keyword ) :].strip ()
129
+ if not current_evidence :
130
+ current_evidence = InlineEvidence (idx = None )
131
+
132
+ if keyword == START_ANSWER_PATTERN :
133
+ current_evidence .start_phrase = matched_phrase
134
+ else :
135
+ current_evidence .end_phrase = matched_phrase
136
+
137
+ break
138
+
139
+ if (
140
+ current_evidence
141
+ and current_evidence .end_phrase
142
+ and current_evidence .start_phrase
143
+ ):
144
+ citations .append (current_evidence )
145
+ current_evidence = None
146
+
147
+ if current_evidence :
148
+ citations .append (current_evidence )
149
+
150
+ return citations
97
151
98
152
def replace_citation_with_link (self , answer : str ):
99
153
# Define the regex pattern to match 【number】
@@ -114,6 +168,8 @@ def replace_citation_with_link(self, answer: str):
114
168
),
115
169
)
116
170
171
+ answer = answer .replace (START_CITATION , "" )
172
+
117
173
return answer
118
174
119
175
def stream ( # type: ignore
@@ -178,21 +234,31 @@ def mindmap_call():
178
234
# append main prompt
179
235
messages .append (HumanMessage (content = prompt ))
180
236
181
- START_ANSWER = "FINAL ANSWER"
182
- start_of_answer = True
183
237
final_answer = ""
184
238
185
239
try :
186
240
# try streaming first
187
241
print ("Trying LLM streaming" )
188
242
for out_msg in self .llm .stream (messages ):
189
243
if START_ANSWER in output :
244
+ if not final_answer :
245
+ try :
246
+ left_over_answer = output .split (START_ANSWER )[1 ].lstrip ()
247
+ except IndexError :
248
+ left_over_answer = ""
249
+ if left_over_answer :
250
+ out_msg .text = left_over_answer + out_msg .text
251
+
190
252
final_answer += (
191
- out_msg .text .lstrip () if start_of_answer else out_msg .text
253
+ out_msg .text .lstrip () if not final_answer else out_msg .text
192
254
)
193
- start_of_answer = False
194
255
yield Document (channel = "chat" , content = out_msg .text )
195
256
257
+ # check for the edge case of citation list is repeated
258
+ # with smaller LLMs
259
+ if START_CITATION in out_msg .text :
260
+ break
261
+
196
262
output += out_msg .text
197
263
logprobs += out_msg .logprobs
198
264
except NotImplementedError :
@@ -235,10 +301,15 @@ def match_evidence_with_context(self, answer, docs) -> dict[str, list[dict]]:
235
301
if not answer .metadata ["citation" ]:
236
302
return spans
237
303
238
- evidences = answer .metadata ["citation" ].evidences
304
+ evidences = answer .metadata ["citation" ]
305
+
306
+ for e_id , evidence in enumerate (evidences ):
307
+ start_phrase , end_phrase = evidence .start_phrase , evidence .end_phrase
308
+ evidence_idx = evidence .idx
309
+
310
+ if evidence_idx is None :
311
+ evidence_idx = e_id + 1
239
312
240
- for start_idx in range (0 , len (evidences ), 2 ):
241
- start_phrase , end_phrase = evidences [start_idx : start_idx + 2 ]
242
313
best_match = None
243
314
best_match_length = 0
244
315
best_match_doc_idx = None
@@ -259,7 +330,7 @@ def match_evidence_with_context(self, answer, docs) -> dict[str, list[dict]]:
259
330
{
260
331
"start" : best_match [0 ],
261
332
"end" : best_match [1 ],
262
- "idx" : start_idx // 2 , # implicitly set from the start_idx
333
+ "idx" : evidence_idx ,
263
334
}
264
335
)
265
336
return spans
0 commit comments