88
99from constants import SettingsKeys # type: ignore [attr-defined]
1010from helpers import StructureToolHelper as STHelper
11- from utils import json_to_markdown
11+ from utils import json_to_markdown , repair_json_with_best_structure
1212
1313from unstract .flags .feature_flag import check_feature_flag_status
1414
@@ -116,6 +116,42 @@ def _override_section(
116116 self .stream_log (f"Overrode { change_desc } " )
117117 return changes
118118
def _should_skip_extraction_for_smart_table(
    self, input_file: str, outputs: list[dict[str, Any]]
) -> bool:
    """Check if extraction and indexing should be skipped for smart table extraction.

    Skipping is requested only when both conditions hold:

    * ``input_file`` has an ``.xlsx`` or ``.xls`` suffix (compared
      case-insensitively), and
    * at least one entry of ``outputs`` contains the table-settings key and
      a non-empty string prompt for which
      ``repair_json_with_best_structure`` returns a non-empty ``dict``.

    The first qualifying output decides the result for the whole call; the
    remaining outputs are not inspected.
    NOTE(review): confirm that skipping extraction is also safe for any
    other outputs in the same run that have no table settings.

    Args:
        input_file: Path to the input file
        outputs: List of output configurations

    Returns:
        True if extraction/indexing should be skipped, False otherwise
    """
    # Only Excel workbooks are candidates; everything else is extracted as usual.
    excel_suffixes = (".xlsx", ".xls")
    if Path(input_file).suffix.lower() not in excel_suffixes:
        return False

    for output in outputs:
        if SettingsKeys.TABLE_SETTINGS not in output:
            continue

        prompt = output.get(SettingsKeys.PROMPTX, "")
        if not prompt or not isinstance(prompt, str):
            continue

        # Keep the try body down to the single call that is expected to fail.
        try:
            schema_data = repair_json_with_best_structure(prompt)
        except Exception as e:
            # Deliberate best-effort: an unparsable prompt only means this
            # output does not qualify, so log it and look at the next one.
            logger.warning(
                "Failed to parse prompt as JSON for smart table extraction: %s",
                e,
            )
            continue

        # A non-empty dict is treated as a usable schema object.
        if schema_data and isinstance(schema_data, dict):
            return True

    return False
154+
119155 def validate (self , input_file : str , settings : dict [str , Any ]) -> None :
120156 enable_challenge : bool = settings .get (SettingsKeys .ENABLE_CHALLENGE , False )
121157 challenge_llm : str = settings .get (SettingsKeys .CHALLENGE_LLM_ADAPTER_ID , "" )
@@ -231,22 +267,34 @@ def run(
231267
232268 custom_data = self .get_exec_metadata .get (SettingsKeys .CUSTOM_DATA , {})
233269 payload ["custom_data" ] = custom_data
234- self .stream_log (f"Extracting document '{ self .source_file_name } '" )
235- usage_kwargs : dict [Any , Any ] = dict ()
236- usage_kwargs [UsageKwargs .RUN_ID ] = self .file_execution_id
237- usage_kwargs [UsageKwargs .FILE_NAME ] = self .source_file_name
238- usage_kwargs [UsageKwargs .EXECUTION_ID ] = self .execution_id
239- extracted_text = STHelper .dynamic_extraction (
240- file_path = input_file ,
241- enable_highlight = is_highlight_enabled ,
242- usage_kwargs = usage_kwargs ,
243- run_id = self .file_execution_id ,
244- tool_settings = tool_settings ,
245- extract_file_path = tool_data_dir / SettingsKeys .EXTRACT ,
246- tool = self ,
247- execution_run_data_folder = str (execution_run_data_folder ),
270+
271+ # Check if we should skip extraction and indexing for Excel table extraction with valid JSON
272+ skip_extraction_and_indexing = self ._should_skip_extraction_for_smart_table (
273+ input_file , outputs
248274 )
249275
276+ extracted_text = ""
277+ usage_kwargs : dict [Any , Any ] = dict ()
278+ if skip_extraction_and_indexing :
279+ self .stream_log (
280+ "Skipping extraction and indexing for Excel table with valid JSON schema"
281+ )
282+ else :
283+ self .stream_log (f"Extracting document '{ self .source_file_name } '" )
284+ usage_kwargs [UsageKwargs .RUN_ID ] = self .file_execution_id
285+ usage_kwargs [UsageKwargs .FILE_NAME ] = self .source_file_name
286+ usage_kwargs [UsageKwargs .EXECUTION_ID ] = self .execution_id
287+ extracted_text = STHelper .dynamic_extraction (
288+ file_path = input_file ,
289+ enable_highlight = is_highlight_enabled ,
290+ usage_kwargs = usage_kwargs ,
291+ run_id = self .file_execution_id ,
292+ tool_settings = tool_settings ,
293+ extract_file_path = tool_data_dir / SettingsKeys .EXTRACT ,
294+ tool = self ,
295+ execution_run_data_folder = str (execution_run_data_folder ),
296+ )
297+
250298 index_metrics = {}
251299 if is_summarization_enabled :
252300 summarize_file_path , summarize_file_hash = self ._summarize (
@@ -258,6 +306,10 @@ def run(
258306 )
259307 payload [SettingsKeys .FILE_HASH ] = summarize_file_hash
260308 payload [SettingsKeys .FILE_PATH ] = summarize_file_path
309+ elif skip_extraction_and_indexing :
310+ # Use source file directly for Excel with valid JSON
311+ payload [SettingsKeys .FILE_PATH ] = input_file
312+ pass
261313 elif not is_single_pass_enabled :
262314 # Track seen parameter combinations to avoid duplicate indexing
263315 seen_params = set ()
@@ -326,7 +378,11 @@ def run(
326378 is_directory_mode : bool = table_settings .get (
327379 SettingsKeys .IS_DIRECTORY_MODE , False
328380 )
329- table_settings [SettingsKeys .INPUT_FILE ] = extracted_input_file
381+ # Use source file directly for Excel with valid JSON, otherwise use extracted file
382+ if skip_extraction_and_indexing :
383+ table_settings [SettingsKeys .INPUT_FILE ] = input_file
384+ else :
385+ table_settings [SettingsKeys .INPUT_FILE ] = extracted_input_file
330386 table_settings [SettingsKeys .IS_DIRECTORY_MODE ] = is_directory_mode
331387 self .stream_log (f"Performing table extraction with: { table_settings } " )
332388 output .update ({SettingsKeys .TABLE_SETTINGS : table_settings })
0 commit comments