Skip to content

Commit 1889ddd

Browse files
UN-3001 [FIX] Extract text in Prompt Studio when extractor metadata is updated (#1661)
* UN-3001 [FEAT] Track X2Text config hash for extraction re-use optimization
  - Refactored extraction status tracking to use x2text_config_hash instead of doc_id
  - X2Text config hash isolates extraction from indexing-related parameters
  - Prevents unnecessary re-extractions when only vector DB/embeddings change
  - Single atomic update_or_create operation for extraction_status
  - Added extraction failure tracking with error messages for debugging
  - Unified mark_extraction_status() method handles both success and failure
  - Added USE_SDK_V2 feature flag support for sdk imports
  - Simplified signature of dynamic_extractor() and summarize() methods
* UN-3001 [FIX] Add error handling for None metadata and improve exception logging
  - Guard against None metadata when adapter_metadata_b is None to prevent TypeError
  - Changed from logger.error() to logger.exception() to capture full stack traces
  - Added return value checks on mark_extraction_status() calls with warning logs
  - Improves debugging visibility when status updates fail

  Fixes extraction crashes and provides better error tracking for monitoring.

---------

Co-authored-by: Rahul Johny <[email protected]>
1 parent a748c93 commit 1889ddd

File tree

2 files changed

+110
-77
lines changed

2 files changed

+110
-77
lines changed

backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py

Lines changed: 31 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -74,12 +74,14 @@
7474
from unstract.sdk1.file_storage.env_helper import EnvHelper
7575
from unstract.sdk1.prompt import PromptTool
7676
from unstract.sdk1.utils.indexing import IndexingUtils
77+
from unstract.sdk1.utils.tool import ToolUtils
7778
else:
7879
from unstract.sdk.constants import LogLevel
7980
from unstract.sdk.exceptions import IndexingError, SdkError
8081
from unstract.sdk.file_storage.constants import StorageType
8182
from unstract.sdk.file_storage.env_helper import EnvHelper
8283
from unstract.sdk.prompt import PromptTool
84+
from unstract.sdk.utils import ToolUtils
8385
from unstract.sdk.utils.indexing_utils import IndexingUtils
8486

8587
logger = logging.getLogger(__name__)
@@ -421,11 +423,10 @@ def index_document(
421423
document_id=document_id,
422424
run_id=run_id,
423425
enable_highlight=tool.enable_highlight,
424-
doc_id=doc_id,
425426
)
426427
if tool.summarize_context:
427428
summarize_file_path = PromptStudioHelper.summarize(
428-
file_name, org_id, document_id, run_id, tool, doc_id
429+
file_name, org_id, run_id, tool
429430
)
430431
summarize_doc_id = IndexingUtils.generate_index_key(
431432
vector_db=str(summary_profile.vector_store.id),
@@ -481,7 +482,7 @@ def index_document(
481482
return doc_id
482483

483484
@staticmethod
484-
def summarize(file_name, org_id, document_id, run_id, tool, doc_id) -> str:
485+
def summarize(file_name, org_id, run_id, tool) -> str:
485486
summarizer_plugin = get_plugin("summarizer")
486487
usage_kwargs: dict[Any, Any] = dict()
487488
usage_kwargs[ToolStudioPromptKeys.RUN_ID] = run_id
@@ -880,7 +881,6 @@ def _fetch_response(
880881
document_id=document_id,
881882
run_id=run_id,
882883
enable_highlight=tool.enable_highlight,
883-
doc_id=doc_id,
884884
)
885885
logger.info(f"Extracted text from {file_path} for {doc_id}")
886886
if is_summary:
@@ -1232,25 +1232,13 @@ def _fetch_single_pass_response(
12321232
file_path = os.path.join(
12331233
directory, "extract", os.path.splitext(filename)[0] + ".txt"
12341234
)
1235-
doc_id = IndexingUtils.generate_index_key(
1236-
vector_db=str(default_profile.vector_store.id),
1237-
embedding=str(default_profile.embedding_model.id),
1238-
x2text=str(default_profile.x2text.id),
1239-
chunk_size=str(default_profile.chunk_size),
1240-
chunk_overlap=str(default_profile.chunk_overlap),
1241-
file_path=input_file_path,
1242-
file_hash=None,
1243-
fs=fs_instance,
1244-
tool=util,
1245-
)
12461235
PromptStudioHelper.dynamic_extractor(
12471236
profile_manager=default_profile,
12481237
file_path=input_file_path,
12491238
org_id=org_id,
12501239
document_id=document_id,
12511240
run_id=run_id,
12521241
enable_highlight=tool.enable_highlight,
1253-
doc_id=doc_id,
12541242
)
12551243
# Indexing is not needed as Single pass is always non chunked.
12561244
vector_db = str(default_profile.vector_store.id)
@@ -1326,8 +1314,11 @@ def dynamic_extractor(
13261314
org_id: str,
13271315
profile_manager: ProfileManager,
13281316
document_id: str,
1329-
doc_id: str,
13301317
) -> str:
1318+
# Guard against None metadata (when adapter_metadata_b is None)
1319+
metadata = profile_manager.x2text.metadata or {}
1320+
x2text_config_hash = ToolUtils.hash_str(json.dumps(metadata, sort_keys=True))
1321+
13311322
x2text = str(profile_manager.x2text.id)
13321323
is_extracted: bool = False
13331324
extract_file_path: str | None = None
@@ -1342,7 +1333,7 @@ def dynamic_extractor(
13421333
is_extracted = PromptStudioIndexHelper.check_extraction_status(
13431334
document_id=document_id,
13441335
profile_manager=profile_manager,
1345-
doc_id=doc_id,
1336+
x2text_config_hash=x2text_config_hash,
13461337
enable_highlight=enable_highlight,
13471338
)
13481339
if is_extracted:
@@ -1381,16 +1372,36 @@ def dynamic_extractor(
13811372
request_id=StateStore.get(Common.REQUEST_ID),
13821373
)
13831374
extracted_text = responder.extract(payload=payload)
1384-
PromptStudioIndexHelper.mark_extraction_status(
1375+
success = PromptStudioIndexHelper.mark_extraction_status(
13851376
document_id=document_id,
13861377
profile_manager=profile_manager,
1387-
doc_id=doc_id,
1378+
x2text_config_hash=x2text_config_hash,
13881379
enable_highlight=enable_highlight,
13891380
)
1381+
if not success:
1382+
logger.warning(
1383+
f"Failed to mark extraction success for document {document_id}. "
1384+
f"Extraction completed but status not saved."
1385+
)
13901386
except SdkError as e:
13911387
msg = str(e)
13921388
if e.actual_err and hasattr(e.actual_err, "response"):
13931389
msg = e.actual_err.response.json().get("error", str(e))
1390+
1391+
success = PromptStudioIndexHelper.mark_extraction_status(
1392+
document_id=document_id,
1393+
profile_manager=profile_manager,
1394+
x2text_config_hash=x2text_config_hash,
1395+
enable_highlight=enable_highlight,
1396+
extracted=False,
1397+
error_message=msg,
1398+
)
1399+
if not success:
1400+
logger.warning(
1401+
f"Failed to mark extraction failure for document {document_id}. "
1402+
f"Extraction failed but status not saved."
1403+
)
1404+
13941405
raise ExtractionAPIError(
13951406
f"Failed to extract '{filename}'. {msg}",
13961407
status_code=int(e.status_code or 500),

backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py

Lines changed: 79 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -71,16 +71,23 @@ def handle_index_manager(
7171
def mark_extraction_status(
7272
document_id: str,
7373
profile_manager: ProfileManager,
74-
doc_id: str,
75-
enable_highlight: bool = False,
74+
x2text_config_hash: str,
75+
enable_highlight: bool,
76+
extracted: bool = True,
77+
error_message: str | None = None,
7678
) -> bool:
77-
"""Marks the extraction status for a given document with highlight metadata.
79+
"""Marks the extraction status for a given document.
80+
81+
Uses x2text_config_hash (hash of X2Text config metadata) as the key.
82+
Handles both successful and failed extractions.
7883
7984
Args:
8085
document_id (str): ID of the document in DocumentManager.
8186
profile_manager (ProfileManager): ProfileManager instance for context.
82-
doc_id (str): Unique identifier for the document within extraction status.
83-
enable_highlight (bool): Whether highlight metadata was used during extraction.
87+
x2text_config_hash (str): Hash of X2Text config metadata.
88+
enable_highlight (bool): Whether highlight metadata was used/attempted.
89+
extracted (bool): True for success, False for failure. Defaults to True.
90+
error_message (str | None): Error message if extraction failed.
8491
8592
Returns:
8693
bool: True if the status is successfully updated, False otherwise.
@@ -95,62 +102,78 @@ def mark_extraction_status(
95102
"profile_manager": profile_manager,
96103
}
97104

98-
index_manager, created = IndexManager.objects.get_or_create(**args)
99-
100-
index_manager.extraction_status = index_manager.extraction_status or {}
101-
102-
index_manager.extraction_status[doc_id] = {
103-
"extracted": True,
105+
# Build extraction status data
106+
status_data = {
107+
"extracted": extracted,
104108
"enable_highlight": enable_highlight,
105109
}
110+
111+
# Add error message if extraction failed
112+
if not extracted and error_message:
113+
status_data["error"] = error_message
114+
115+
defaults = {"extraction_status": {x2text_config_hash: status_data}}
116+
117+
index_manager, created = IndexManager.objects.update_or_create(
118+
**args,
119+
defaults=defaults,
120+
)
121+
106122
logger.info(
107123
f"Index manager {index_manager} {index_manager.index_ids_history}"
108124
)
109-
index_manager.save(update_fields=["extraction_status"])
110125

111-
if created:
112-
logger.info(
113-
f"IndexManager entry created "
114-
f"for document: {document_id} with {doc_id} "
115-
f"(highlight={enable_highlight})"
116-
)
126+
if extracted:
127+
if created:
128+
logger.info(
129+
f"IndexManager entry created with SUCCESS "
130+
f"for document: {document_id} "
131+
f"with x2text_config_hash: {x2text_config_hash}"
132+
)
133+
else:
134+
logger.info(
135+
f"Extraction SUCCESS for document: {document_id} "
136+
f"with x2text_config_hash: {x2text_config_hash}"
137+
)
117138
else:
118-
logger.info(
119-
f"Updated extraction status "
120-
f"for document: {document_id} with {doc_id} "
121-
f"(highlight={enable_highlight})"
139+
logger.error(
140+
f"Extraction FAILED for document: {document_id} "
141+
f"with x2text_config_hash: {x2text_config_hash}. "
142+
f"Error: {error_message}"
122143
)
144+
123145
return True
124146

125147
except DocumentManager.DoesNotExist:
126148
logger.error(f"Document with ID {document_id} does not exist.")
127-
raise IndexingAPIError(
128-
"Error occured while extracting. Please contact admin."
129-
)
149+
return False
130150

131151
except Exception as e:
132-
logger.error(f"Unexpected error updating extraction status: {e}")
133-
raise IndexingAPIError(f"Error updating indexing status {str(e)}") from e
152+
logger.exception(
153+
f"Unexpected error marking extraction status for document {document_id}: {e}"
154+
)
155+
return False
134156

135157
@staticmethod
136158
def check_extraction_status(
137159
document_id: str,
138160
profile_manager: ProfileManager,
139-
doc_id: str,
140-
enable_highlight: bool = False,
161+
x2text_config_hash: str,
162+
enable_highlight: bool,
141163
) -> bool:
142-
"""Checks if the extraction status is already marked as complete
143-
for the given document and doc_id with matching highlight setting.
164+
"""Checks if the extraction status is already marked as complete.
165+
166+
Uses x2text_config_hash (hash of X2Text config metadata) as the key.
167+
Also validates that enable_highlight setting matches.
144168
145169
Args:
146170
document_id (str): ID of the document in DocumentManager.
147171
profile_manager (ProfileManager): ProfileManager instance for context.
148-
doc_id (str): Unique identifier for the document within extraction status.
172+
x2text_config_hash (str): Hash of X2Text config metadata.
149173
enable_highlight (bool): Whether highlight metadata is required.
150174
151175
Returns:
152-
bool: True if extraction is complete with matching highlight setting,
153-
False otherwise.
176+
bool: True if extraction is complete with matching settings, False otherwise.
154177
"""
155178
try:
156179
index_manager = IndexManager.objects.filter(
@@ -162,45 +185,44 @@ def check_extraction_status(
162185
return False
163186

164187
extraction_status = index_manager.extraction_status or {}
165-
status_entry = extraction_status.get(doc_id)
188+
status_entry = extraction_status.get(x2text_config_hash)
166189

167190
if not status_entry:
168191
logger.info(
169-
f"Extraction is NOT yet marked as complete "
170-
f"for document: {document_id} with {doc_id}"
192+
f"Extraction NOT complete for document: {document_id} "
193+
f"with x2text_config_hash: {x2text_config_hash}"
171194
)
172195
return False
173196

174-
# Backward compatibility: treat boolean True as non-highlighted
175-
if isinstance(status_entry, bool):
176-
is_extracted = status_entry
177-
is_highlight_handled = False
178-
else:
179-
# New format: {"extracted": True, "enable_highlight": <bool>}
180-
is_extracted = status_entry.get("extracted", False)
181-
is_highlight_handled = status_entry.get("enable_highlight", False)
197+
# {"extracted": True/False, "enable_highlight": bool, "error": str (optional)}
198+
is_extracted = status_entry.get("extracted", False)
199+
stored_highlight = status_entry.get("enable_highlight", False)
182200

183-
# Check if extraction exists AND highlight setting matches
184-
if is_extracted and is_highlight_handled == enable_highlight:
201+
# Check if previous extraction failed
202+
if not is_extracted:
203+
error_msg = status_entry.get("error", "Unknown error")
185204
logger.info(
186-
f"Extraction is already marked as complete "
187-
f"for document: {document_id} with {doc_id} "
205+
f"Previous extraction FAILED for {x2text_config_hash}. "
206+
f"Error: {error_msg}. Will retry extraction."
207+
)
208+
return False # Allow retry
209+
210+
if is_extracted and stored_highlight == enable_highlight:
211+
logger.info(
212+
f"Extraction already complete for document: {document_id} "
213+
f"with x2text_config_hash: {x2text_config_hash} "
188214
f"(highlight={enable_highlight})"
189215
)
190216
return True
191-
elif is_extracted and is_highlight_handled != enable_highlight:
217+
elif is_extracted and stored_highlight != enable_highlight:
192218
logger.info(
193-
f"Extraction exists but highlight mismatch "
194-
f"for document: {document_id} with {doc_id}. "
195-
f"Stored: {is_highlight_handled}, Requested: {enable_highlight}. "
219+
f"Extraction exists but highlight mismatch for {x2text_config_hash}. "
220+
f"Stored: {stored_highlight}, Requested: {enable_highlight}. "
196221
f"Re-extraction needed."
197222
)
198223
return False
199224
else:
200-
logger.info(
201-
f"Extraction is NOT yet marked as complete "
202-
f"for document: {document_id} with {doc_id}"
203-
)
225+
logger.info(f"Extraction NOT complete for document: {document_id}")
204226
return False
205227

206228
except Exception as e:

0 commit comments

Comments
 (0)