Zipstack
diff --git a/‎backend/workflow_manager/internal_views.py‎
Lines changed: 7 additions & 2 deletions b/‎backend/workflow_manager/internal_views.py‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎prompt-service/src/unstract/prompt_service/services/answer_prompt.py‎
Lines changed: 73 additions & 11 deletions b/‎prompt-service/src/unstract/prompt_service/services/answer_prompt.py‎
Lines changed: 73 additions & 11 deletions
diff --git a/‎tools/structure/requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎tools/structure/requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tools/structure/src/main.py‎
Lines changed: 72 additions & 16 deletions b/‎tools/structure/src/main.py‎
Lines changed: 72 additions & 16 deletions
diff --git a/‎tools/structure/src/utils.py‎
Lines changed: 69 additions & 0 deletions b/‎tools/structure/src/utils.py‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎workers/sample.env‎
Lines changed: 2 additions & 2 deletions b/‎workers/sample.env‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎workers/shared/constants/env_vars.py‎
Lines changed: 1 addition & 1 deletion b/‎workers/shared/constants/env_vars.py‎
Lines changed: 1 addition & 1 deletion
@@ -440,14 +440,18 @@ def _get_organization_context(self, execution: WorkflowExecution) -> dict:
             if execution.workflow and hasattr(execution.workflow, "organization"):
                 org = execution.workflow.organization
                 return {
-                    "organization_id": str(org.id),
-                    "organization_name": org.display_name,
+                    "organization_id": str(
+                        org.organization_id
+                    ),  # organization identifier
+                    "organization_uuid": str(org.id),  # organization uuid
+                    "organization_name": org.display_name,  # organization name
                     "settings": {},  # Add organization-specific settings if needed
                 }
             else:
                 logger.warning(f"No organization found for execution {execution.id}")
                 return {
                     "organization_id": None,
+                    "organization_uuid": None,
                     "organization_name": "Unknown",
                     "settings": {},
                 }
@@ -457,6 +461,7 @@ def _get_organization_context(self, execution: WorkflowExecution) -> dict:
             )
             return {
                 "organization_id": None,
+                "organization_uuid": None,
                 "organization_name": "Unknown",
                 "settings": {},
             }
 
@@ -264,23 +264,69 @@ def extract_table(
         prompt: str,
     ) -> dict[str, Any]:
         table_settings = output[PSKeys.TABLE_SETTINGS]
+
+        # Check if prompt has valid schema data using json_repair
+        has_valid_schema = False
+        schema_data = None
+
+        if prompt and isinstance(prompt, str):
+            try:
+                # Try to repair and parse the prompt as JSON
+                schema_data = repair_json_with_best_structure(prompt)
+                # Check if the result is a valid dict (schema object)
+                if isinstance(schema_data, dict) and schema_data:
+                    has_valid_schema = True
+                    app.logger.info(
+                        "Valid schema detected in prompt, using Smart Table Extractor"
+                    )
+            except Exception as e:
+                app.logger.debug(f"Prompt does not contain valid schema: {e}")
+
+        # If we have a valid schema, use the smart table extractor
+        if has_valid_schema:
+            smart_table_plugin: dict[str, Any] = PluginManager().get_plugin(
+                "smart-table-extractor"
+            )
+
+            if smart_table_plugin:
+                fs_instance = AnswerPromptService._get_file_storage_instance(
+                    execution_source
+                )
+
+                try:
+                    # Get the input file from table settings
+                    input_file = table_settings.get("input_file")
+
+                    # Run the smart table extractor
+                    result = smart_table_plugin["entrypoint_cls"].run(
+                        llm=llm,
+                        table_settings=table_settings,
+                        fs_instance=fs_instance,
+                        prompt=prompt,
+                        input_file=input_file,
+                    )
+
+                    # Extract the data from the result
+                    answer = result.get("data", [])
+                    structured_output[output[PSKeys.NAME]] = answer
+
+                    # We do not support summary and eval for table.
+                    # Hence returning the result
+                    return structured_output
+                except Exception as e:
+                    app.logger.error(f"Smart Table Extractor failed: {e}")
+                    # Fall back to regular table extractor
+                    app.logger.info("Falling back to regular table extractor")
+
+        # Use regular table extractor (original code)
         table_extractor: dict[str, Any] = PluginManager().get_plugin("table-extractor")
         if not table_extractor:
             raise APIError(
                 "Unable to extract table details. "
                 "Please contact admin to resolve this issue."
             )
-        fs_instance: FileStorage = FileStorage(FileStorageProvider.LOCAL)
-        if execution_source == ExecutionSource.IDE.value:
-            fs_instance = EnvHelper.get_storage(
-                storage_type=StorageType.PERMANENT,
-                env_name=FileStorageKeys.PERMANENT_REMOTE_STORAGE,
-            )
-        if execution_source == ExecutionSource.TOOL.value:
-            fs_instance = EnvHelper.get_storage(
-                storage_type=StorageType.SHARED_TEMPORARY,
-                env_name=FileStorageKeys.TEMPORARY_REMOTE_STORAGE,
-            )
+        fs_instance = AnswerPromptService._get_file_storage_instance(execution_source)
+
         try:
             answer = table_extractor["entrypoint_cls"].run_table_extraction(
                 llm=llm,
@@ -296,6 +342,22 @@ def extract_table(
             msg = f"Couldn't extract table. {e}"
             raise APIError(message=msg)
 
+    @staticmethod
+    def _get_file_storage_instance(execution_source) -> FileStorage:
+        """Pick the file storage backend that matches the execution source.
+
+        Args:
+            execution_source: Value compared against
+                ``ExecutionSource.IDE.value`` and ``ExecutionSource.TOOL.value``.
+
+        Returns:
+            The storage from ``EnvHelper.get_storage`` configured with
+            ``StorageType.PERMANENT`` for the IDE source or
+            ``StorageType.SHARED_TEMPORARY`` for the tool source; a
+            ``FileStorage`` on the local provider when neither matches.
+        """
+        # The local instance is always built first and only replaced on a match.
+        storage: FileStorage = FileStorage(FileStorageProvider.LOCAL)
+        remote_targets = (
+            (
+                ExecutionSource.IDE.value,
+                StorageType.PERMANENT,
+                FileStorageKeys.PERMANENT_REMOTE_STORAGE,
+            ),
+            (
+                ExecutionSource.TOOL.value,
+                StorageType.SHARED_TEMPORARY,
+                FileStorageKeys.TEMPORARY_REMOTE_STORAGE,
+            ),
+        )
+        for source_value, storage_type, env_name in remote_targets:
+            if execution_source == source_value:
+                storage = EnvHelper.get_storage(
+                    storage_type=storage_type,
+                    env_name=env_name,
+                )
+        return storage
+
     @staticmethod
     def handle_json(
         answer: str,
 
@@ -6,3 +6,4 @@
 unstract-sdk[aws]~=0.78.0
 -e file:/unstract/sdk1
 -e file:/unstract/flags
+json-repair>=0.25.0
@@ -8,7 +8,7 @@
 
 from constants import SettingsKeys  # type: ignore [attr-defined]
 from helpers import StructureToolHelper as STHelper
-from utils import json_to_markdown
+from utils import json_to_markdown, repair_json_with_best_structure
 
 from unstract.flags.feature_flag import check_feature_flag_status
 
@@ -116,6 +116,42 @@ def _override_section(
                     self.stream_log(f"Overrode {change_desc}")
         return changes
 
+    def _should_skip_extraction_for_smart_table(
+        self, input_file: str, outputs: list[dict[str, Any]]
+    ) -> bool:
+        """Decide whether text extraction and indexing can be bypassed.
+
+        The answer is True only when the input file has an ``.xlsx`` or
+        ``.xls`` suffix (case-insensitive) and at least one output that
+        contains table settings has a string prompt which
+        ``repair_json_with_best_structure`` turns into a non-empty dict.
+
+        NOTE(review): a single matching output makes this return True for the
+        whole call; confirm that is intended when other outputs are not
+        schema-driven tables.
+
+        Args:
+            input_file: Path to the input file.
+            outputs: List of output configurations.
+
+        Returns:
+            True if extraction/indexing should be skipped, False otherwise.
+        """
+        # Only Excel workbooks qualify.
+        is_excel = Path(input_file).suffix.lower() in (".xlsx", ".xls")
+        if not is_excel:
+            return False
+
+        for output_config in outputs:
+            if SettingsKeys.TABLE_SETTINGS not in output_config:
+                continue
+
+            schema_prompt = output_config.get(SettingsKeys.PROMPTX, "")
+            if not (schema_prompt and isinstance(schema_prompt, str)):
+                continue
+
+            try:
+                parsed_schema = repair_json_with_best_structure(schema_prompt)
+            except Exception as e:
+                # A prompt that cannot be repaired is logged and ignored so
+                # the remaining outputs are still inspected.
+                logger.warning(
+                    "Failed to parse prompt as JSON for smart table extraction: %s",
+                    str(e),
+                )
+                continue
+
+            # Only a non-empty dict counts as a schema object.
+            if parsed_schema and isinstance(parsed_schema, dict):
+                return True
+
+        return False
+
     def validate(self, input_file: str, settings: dict[str, Any]) -> None:
         enable_challenge: bool = settings.get(SettingsKeys.ENABLE_CHALLENGE, False)
         challenge_llm: str = settings.get(SettingsKeys.CHALLENGE_LLM_ADAPTER_ID, "")
@@ -231,22 +267,34 @@ def run(
 
         custom_data = self.get_exec_metadata.get(SettingsKeys.CUSTOM_DATA, {})
         payload["custom_data"] = custom_data
-        self.stream_log(f"Extracting document '{self.source_file_name}'")
-        usage_kwargs: dict[Any, Any] = dict()
-        usage_kwargs[UsageKwargs.RUN_ID] = self.file_execution_id
-        usage_kwargs[UsageKwargs.FILE_NAME] = self.source_file_name
-        usage_kwargs[UsageKwargs.EXECUTION_ID] = self.execution_id
-        extracted_text = STHelper.dynamic_extraction(
-            file_path=input_file,
-            enable_highlight=is_highlight_enabled,
-            usage_kwargs=usage_kwargs,
-            run_id=self.file_execution_id,
-            tool_settings=tool_settings,
-            extract_file_path=tool_data_dir / SettingsKeys.EXTRACT,
-            tool=self,
-            execution_run_data_folder=str(execution_run_data_folder),
+
+        # Check if we should skip extraction and indexing for Excel table extraction with valid JSON
+        skip_extraction_and_indexing = self._should_skip_extraction_for_smart_table(
+            input_file, outputs
         )
 
+        extracted_text = ""
+        usage_kwargs: dict[Any, Any] = dict()
+        if skip_extraction_and_indexing:
+            self.stream_log(
+                "Skipping extraction and indexing for Excel table with valid JSON schema"
+            )
+        else:
+            self.stream_log(f"Extracting document '{self.source_file_name}'")
+            usage_kwargs[UsageKwargs.RUN_ID] = self.file_execution_id
+            usage_kwargs[UsageKwargs.FILE_NAME] = self.source_file_name
+            usage_kwargs[UsageKwargs.EXECUTION_ID] = self.execution_id
+            extracted_text = STHelper.dynamic_extraction(
+                file_path=input_file,
+                enable_highlight=is_highlight_enabled,
+                usage_kwargs=usage_kwargs,
+                run_id=self.file_execution_id,
+                tool_settings=tool_settings,
+                extract_file_path=tool_data_dir / SettingsKeys.EXTRACT,
+                tool=self,
+                execution_run_data_folder=str(execution_run_data_folder),
+            )
+
         index_metrics = {}
         if is_summarization_enabled:
             summarize_file_path, summarize_file_hash = self._summarize(
@@ -258,6 +306,10 @@ def run(
             )
             payload[SettingsKeys.FILE_HASH] = summarize_file_hash
             payload[SettingsKeys.FILE_PATH] = summarize_file_path
+        elif skip_extraction_and_indexing:
+            # Use source file directly for Excel with valid JSON
+            payload[SettingsKeys.FILE_PATH] = input_file
+            pass
         elif not is_single_pass_enabled:
             # Track seen parameter combinations to avoid duplicate indexing
             seen_params = set()
@@ -326,7 +378,11 @@ def run(
                     is_directory_mode: bool = table_settings.get(
                         SettingsKeys.IS_DIRECTORY_MODE, False
                     )
-                    table_settings[SettingsKeys.INPUT_FILE] = extracted_input_file
+                    # Use source file directly for Excel with valid JSON, otherwise use extracted file
+                    if skip_extraction_and_indexing:
+                        table_settings[SettingsKeys.INPUT_FILE] = input_file
+                    else:
+                        table_settings[SettingsKeys.INPUT_FILE] = extracted_input_file
                     table_settings[SettingsKeys.IS_DIRECTORY_MODE] = is_directory_mode
                     self.stream_log(f"Performing table extraction with: {table_settings}")
                     output.update({SettingsKeys.TABLE_SETTINGS: table_settings})
 
@@ -1,5 +1,7 @@
 from typing import Any
 
+from json_repair import repair_json
+
 
 def json_to_markdown(data: Any, level: int = 0, parent_key: str = "") -> str:
     markdown = ""
@@ -29,3 +31,70 @@ def json_to_markdown(data: Any, level: int = 0, parent_key: str = "") -> str:
         markdown += f"{indent}- {data}\n"
 
     return markdown
+
+
+def repair_json_with_best_structure(json_str: str) -> Any:
+    """Repair a possibly malformed JSON string and pick the more useful parse.
+
+    The input is passed to ``repair_json`` twice: once unchanged and once
+    wrapped in ``[`` ... ``]``. Both calls use ``return_objects=True`` and
+    ``ensure_ascii=False``. One of the two results is returned, chosen by the
+    first rule that applies:
+
+    1. As-is result is a string: return the wrapped result, unless it is a
+       string too, in which case return the as-is result.
+    2. Wrapped result is a string: return the as-is result.
+    3. Wrapped result is a one-element list whose element equals the as-is
+       result: return the as-is result.
+    4. As-is result is neither a dict nor a list: return the wrapped result.
+    5. Wrapped result is a list with more than one element: return it.
+    6. Otherwise: return the as-is result.
+
+    Args:
+        json_str: Text that may hold complete, partial or malformed JSON.
+
+    Returns:
+        Whichever of the two repaired parses the rules above select; the
+        concrete type depends on what ``repair_json`` produces for the input.
+
+    Example:
+        Assuming ``repair_json`` parses already-valid JSON unchanged:
+
+        >>> repair_json_with_best_structure('{"name": "John", "age": 30}')
+        {'name': 'John', 'age': 30}
+    """
+    as_is = repair_json(json_str=json_str, return_objects=True, ensure_ascii=False)
+    wrapped = repair_json(
+        "[" + json_str + "]", return_objects=True, ensure_ascii=False
+    )
+
+    # A string result means that parse did not yield a structured value.
+    as_is_is_text = isinstance(as_is, str)
+    wrapped_is_text = isinstance(wrapped, str)
+    if as_is_is_text:
+        return as_is if wrapped_is_text else wrapped
+    if wrapped_is_text:
+        return as_is
+
+    wrapped_is_list = isinstance(wrapped, list)
+
+    # Wrapping added nothing beyond the brackets themselves.
+    if wrapped_is_list and len(wrapped) == 1 and wrapped[0] == as_is:
+        return as_is
+
+    if not isinstance(as_is, (dict, list)):
+        return wrapped
+
+    # Several items in the wrapped parse win over the structured as-is parse.
+    if wrapped_is_list and len(wrapped) > 1:
+        return wrapped
+    return as_is
@@ -190,8 +190,8 @@ LOG_CONSUMER_AUTOSCALE=2,1
 # =============================================================================
 
 LOG_LEVEL=INFO
-# structured or django
-LOG_FORMAT=django
+# Note: LOG_FORMAT removed - format is now hardcoded (not configurable)
+# All workers use a single standardized format matching Django backend
 DEFAULT_LOG_LEVEL=INFO
 WORKER_VERSION=1.0.0
 WORKER_INSTANCE_ID=dev-01
 
@@ -51,5 +51,5 @@ class EnvVars:
 
     # Logging configuration
     LOG_LEVEL = "LOG_LEVEL"
-    LOG_FORMAT = "LOG_FORMAT"
     LOG_FILE = "LOG_FILE"
+    # Note: LOG_FORMAT removed - format is now hardcoded (not configurable)