Skip to content

Commit 58ce1a6

Browse files
Merge branch 'fix/UN-2882-bigquery-float-precision' of github.com:Zipstack/unstract into fix/UN-2882-bigquery-float-precision
2 parents 4b1dceb + 2bf8057 commit 58ce1a6

File tree

12 files changed

+296
-314
lines changed

12 files changed

+296
-314
lines changed

backend/workflow_manager/internal_views.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -440,14 +440,18 @@ def _get_organization_context(self, execution: WorkflowExecution) -> dict:
440440
if execution.workflow and hasattr(execution.workflow, "organization"):
441441
org = execution.workflow.organization
442442
return {
443-
"organization_id": str(org.id),
444-
"organization_name": org.display_name,
443+
"organization_id": str(
444+
org.organization_id
445+
), # organization identifier
446+
"organization_uuid": str(org.id), # organization uuid
447+
"organization_name": org.display_name, # organization name
445448
"settings": {}, # Add organization-specific settings if needed
446449
}
447450
else:
448451
logger.warning(f"No organization found for execution {execution.id}")
449452
return {
450453
"organization_id": None,
454+
"organization_uuid": None,
451455
"organization_name": "Unknown",
452456
"settings": {},
453457
}
@@ -457,6 +461,7 @@ def _get_organization_context(self, execution: WorkflowExecution) -> dict:
457461
)
458462
return {
459463
"organization_id": None,
464+
"organization_uuid": None,
460465
"organization_name": "Unknown",
461466
"settings": {},
462467
}

prompt-service/src/unstract/prompt_service/services/answer_prompt.py

Lines changed: 73 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -264,23 +264,69 @@ def extract_table(
264264
prompt: str,
265265
) -> dict[str, Any]:
266266
table_settings = output[PSKeys.TABLE_SETTINGS]
267+
268+
# Check if prompt has valid schema data using json_repair
269+
has_valid_schema = False
270+
schema_data = None
271+
272+
if prompt and isinstance(prompt, str):
273+
try:
274+
# Try to repair and parse the prompt as JSON
275+
schema_data = repair_json_with_best_structure(prompt)
276+
# Check if the result is a valid dict (schema object)
277+
if isinstance(schema_data, dict) and schema_data:
278+
has_valid_schema = True
279+
app.logger.info(
280+
"Valid schema detected in prompt, using Smart Table Extractor"
281+
)
282+
except Exception as e:
283+
app.logger.debug(f"Prompt does not contain valid schema: {e}")
284+
285+
# If we have a valid schema, use the smart table extractor
286+
if has_valid_schema:
287+
smart_table_plugin: dict[str, Any] = PluginManager().get_plugin(
288+
"smart-table-extractor"
289+
)
290+
291+
if smart_table_plugin:
292+
fs_instance = AnswerPromptService._get_file_storage_instance(
293+
execution_source
294+
)
295+
296+
try:
297+
# Get the input file from table settings
298+
input_file = table_settings.get("input_file")
299+
300+
# Run the smart table extractor
301+
result = smart_table_plugin["entrypoint_cls"].run(
302+
llm=llm,
303+
table_settings=table_settings,
304+
fs_instance=fs_instance,
305+
prompt=prompt,
306+
input_file=input_file,
307+
)
308+
309+
# Extract the data from the result
310+
answer = result.get("data", [])
311+
structured_output[output[PSKeys.NAME]] = answer
312+
313+
# We do not support summary and eval for table.
314+
# Hence returning the result
315+
return structured_output
316+
except Exception as e:
317+
app.logger.error(f"Smart Table Extractor failed: {e}")
318+
# Fall back to regular table extractor
319+
app.logger.info("Falling back to regular table extractor")
320+
321+
# Use regular table extractor (original code)
267322
table_extractor: dict[str, Any] = PluginManager().get_plugin("table-extractor")
268323
if not table_extractor:
269324
raise APIError(
270325
"Unable to extract table details. "
271326
"Please contact admin to resolve this issue."
272327
)
273-
fs_instance: FileStorage = FileStorage(FileStorageProvider.LOCAL)
274-
if execution_source == ExecutionSource.IDE.value:
275-
fs_instance = EnvHelper.get_storage(
276-
storage_type=StorageType.PERMANENT,
277-
env_name=FileStorageKeys.PERMANENT_REMOTE_STORAGE,
278-
)
279-
if execution_source == ExecutionSource.TOOL.value:
280-
fs_instance = EnvHelper.get_storage(
281-
storage_type=StorageType.SHARED_TEMPORARY,
282-
env_name=FileStorageKeys.TEMPORARY_REMOTE_STORAGE,
283-
)
328+
fs_instance = AnswerPromptService._get_file_storage_instance(execution_source)
329+
284330
try:
285331
answer = table_extractor["entrypoint_cls"].run_table_extraction(
286332
llm=llm,
@@ -296,6 +342,22 @@ def extract_table(
296342
msg = f"Couldn't extract table. {e}"
297343
raise APIError(message=msg)
298344

345+
@staticmethod
346+
def _get_file_storage_instance(execution_source) -> FileStorage:
347+
fs_instance: FileStorage = FileStorage(FileStorageProvider.LOCAL)
348+
if execution_source == ExecutionSource.IDE.value:
349+
fs_instance = EnvHelper.get_storage(
350+
storage_type=StorageType.PERMANENT,
351+
env_name=FileStorageKeys.PERMANENT_REMOTE_STORAGE,
352+
)
353+
if execution_source == ExecutionSource.TOOL.value:
354+
fs_instance = EnvHelper.get_storage(
355+
storage_type=StorageType.SHARED_TEMPORARY,
356+
env_name=FileStorageKeys.TEMPORARY_REMOTE_STORAGE,
357+
)
358+
359+
return fs_instance
360+
299361
@staticmethod
300362
def handle_json(
301363
answer: str,

tools/structure/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@
66
unstract-sdk[aws]~=0.78.0
77
-e file:/unstract/sdk1
88
-e file:/unstract/flags
9+
json-repair>=0.25.0

tools/structure/src/main.py

Lines changed: 72 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from constants import SettingsKeys # type: ignore [attr-defined]
1010
from helpers import StructureToolHelper as STHelper
11-
from utils import json_to_markdown
11+
from utils import json_to_markdown, repair_json_with_best_structure
1212

1313
from unstract.flags.feature_flag import check_feature_flag_status
1414

@@ -116,6 +116,42 @@ def _override_section(
116116
self.stream_log(f"Overrode {change_desc}")
117117
return changes
118118

119+
def _should_skip_extraction_for_smart_table(
120+
self, input_file: str, outputs: list[dict[str, Any]]
121+
) -> bool:
122+
"""Check if extraction and indexing should be skipped for smart table extraction.
123+
124+
Args:
125+
input_file: Path to the input file
126+
outputs: List of output configurations
127+
128+
Returns:
129+
True if extraction/indexing should be skipped, False otherwise
130+
"""
131+
# Check if input file is an Excel file
132+
file_ext = Path(input_file).suffix.lower()
133+
if file_ext not in [".xlsx", ".xls"]:
134+
return False
135+
136+
# Check if any output has table_settings with valid JSON prompt
137+
for output in outputs:
138+
if SettingsKeys.TABLE_SETTINGS in output:
139+
prompt = output.get(SettingsKeys.PROMPTX, "")
140+
if prompt and isinstance(prompt, str):
141+
try:
142+
# Try to parse the prompt as JSON
143+
schema_data = repair_json_with_best_structure(prompt)
144+
# If it's a valid dict (schema object), skip extraction
145+
if schema_data and isinstance(schema_data, dict):
146+
return True
147+
except Exception as e:
148+
logger.warning(
149+
"Failed to parse prompt as JSON for smart table extraction: %s",
150+
str(e),
151+
)
152+
continue
153+
return False
154+
119155
def validate(self, input_file: str, settings: dict[str, Any]) -> None:
120156
enable_challenge: bool = settings.get(SettingsKeys.ENABLE_CHALLENGE, False)
121157
challenge_llm: str = settings.get(SettingsKeys.CHALLENGE_LLM_ADAPTER_ID, "")
@@ -231,22 +267,34 @@ def run(
231267

232268
custom_data = self.get_exec_metadata.get(SettingsKeys.CUSTOM_DATA, {})
233269
payload["custom_data"] = custom_data
234-
self.stream_log(f"Extracting document '{self.source_file_name}'")
235-
usage_kwargs: dict[Any, Any] = dict()
236-
usage_kwargs[UsageKwargs.RUN_ID] = self.file_execution_id
237-
usage_kwargs[UsageKwargs.FILE_NAME] = self.source_file_name
238-
usage_kwargs[UsageKwargs.EXECUTION_ID] = self.execution_id
239-
extracted_text = STHelper.dynamic_extraction(
240-
file_path=input_file,
241-
enable_highlight=is_highlight_enabled,
242-
usage_kwargs=usage_kwargs,
243-
run_id=self.file_execution_id,
244-
tool_settings=tool_settings,
245-
extract_file_path=tool_data_dir / SettingsKeys.EXTRACT,
246-
tool=self,
247-
execution_run_data_folder=str(execution_run_data_folder),
270+
271+
# Check if we should skip extraction and indexing for Excel table extraction with valid JSON
272+
skip_extraction_and_indexing = self._should_skip_extraction_for_smart_table(
273+
input_file, outputs
248274
)
249275

276+
extracted_text = ""
277+
usage_kwargs: dict[Any, Any] = dict()
278+
if skip_extraction_and_indexing:
279+
self.stream_log(
280+
"Skipping extraction and indexing for Excel table with valid JSON schema"
281+
)
282+
else:
283+
self.stream_log(f"Extracting document '{self.source_file_name}'")
284+
usage_kwargs[UsageKwargs.RUN_ID] = self.file_execution_id
285+
usage_kwargs[UsageKwargs.FILE_NAME] = self.source_file_name
286+
usage_kwargs[UsageKwargs.EXECUTION_ID] = self.execution_id
287+
extracted_text = STHelper.dynamic_extraction(
288+
file_path=input_file,
289+
enable_highlight=is_highlight_enabled,
290+
usage_kwargs=usage_kwargs,
291+
run_id=self.file_execution_id,
292+
tool_settings=tool_settings,
293+
extract_file_path=tool_data_dir / SettingsKeys.EXTRACT,
294+
tool=self,
295+
execution_run_data_folder=str(execution_run_data_folder),
296+
)
297+
250298
index_metrics = {}
251299
if is_summarization_enabled:
252300
summarize_file_path, summarize_file_hash = self._summarize(
@@ -258,6 +306,10 @@ def run(
258306
)
259307
payload[SettingsKeys.FILE_HASH] = summarize_file_hash
260308
payload[SettingsKeys.FILE_PATH] = summarize_file_path
309+
elif skip_extraction_and_indexing:
310+
# Use source file directly for Excel with valid JSON
311+
payload[SettingsKeys.FILE_PATH] = input_file
312+
pass
261313
elif not is_single_pass_enabled:
262314
# Track seen parameter combinations to avoid duplicate indexing
263315
seen_params = set()
@@ -326,7 +378,11 @@ def run(
326378
is_directory_mode: bool = table_settings.get(
327379
SettingsKeys.IS_DIRECTORY_MODE, False
328380
)
329-
table_settings[SettingsKeys.INPUT_FILE] = extracted_input_file
381+
# Use source file directly for Excel with valid JSON, otherwise use extracted file
382+
if skip_extraction_and_indexing:
383+
table_settings[SettingsKeys.INPUT_FILE] = input_file
384+
else:
385+
table_settings[SettingsKeys.INPUT_FILE] = extracted_input_file
330386
table_settings[SettingsKeys.IS_DIRECTORY_MODE] = is_directory_mode
331387
self.stream_log(f"Performing table extraction with: {table_settings}")
332388
output.update({SettingsKeys.TABLE_SETTINGS: table_settings})

tools/structure/src/utils.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from typing import Any
22

3+
from json_repair import repair_json
4+
35

46
def json_to_markdown(data: Any, level: int = 0, parent_key: str = "") -> str:
57
markdown = ""
@@ -29,3 +31,70 @@ def json_to_markdown(data: Any, level: int = 0, parent_key: str = "") -> str:
2931
markdown += f"{indent}- {data}\n"
3032

3133
return markdown
34+
35+
36+
def repair_json_with_best_structure(json_str: str) -> Any:
    """Repair and parse possibly malformed JSON, choosing the most useful shape.

    Two parses are attempted via ``json_repair.repair_json``: the raw string,
    and the string wrapped in ``[...]``. The candidate with the richer
    structure wins:

    - Both parses degrade to strings -> return the raw parse.
    - Exactly one parse is a string -> return the other (the container).
    - Wrapping merely produced ``[raw]`` -> return the raw parse.
    - Raw parse is a container but wrapping yielded multiple elements
      (concatenated JSON values such as ``'{"a":1}{"b":2}'``) -> return the
      wrapped list.
    - Otherwise prefer the raw parse; fall back to the wrapped one.

    Args:
        json_str: Potentially malformed JSON text — a complete object/array
            or partial JSON needing repair.

    Returns:
        The parsed structure (dict, list, str, or other JSON-compatible
        value) representing the most meaningful interpretation of the input.

    Example:
        >>> repair_json_with_best_structure('{"name": "John", "age": 30}')
        {'name': 'John', 'age': 30}

        >>> repair_json_with_best_structure('{"incomplete": "object"')
        {'incomplete': 'object'}

        >>> repair_json_with_best_structure('{"a": 1}{"b": 2}')
        [{'a': 1}, {'b': 2}]

    Note:
        Designed for the structure-tool; both parses use
        ``return_objects=True`` and ``ensure_ascii=False``.
    """
    raw = repair_json(json_str=json_str, return_objects=True, ensure_ascii=False)
    wrapped = repair_json(
        "[" + json_str + "]", return_objects=True, ensure_ascii=False
    )

    raw_is_str = isinstance(raw, str)
    wrapped_is_str = isinstance(wrapped, str)

    if raw_is_str and wrapped_is_str:
        return raw
    if raw_is_str:
        return wrapped
    if wrapped_is_str:
        return raw

    # A single-element wrapper that merely re-wraps the raw parse adds nothing.
    if isinstance(wrapped, list) and len(wrapped) == 1 and wrapped[0] == raw:
        return raw

    if isinstance(raw, (dict, list)):
        # Multiple elements after wrapping means the input held concatenated
        # JSON values; only the wrapped list captures them all.
        if isinstance(wrapped, list) and len(wrapped) > 1:
            return wrapped
        return raw

    return wrapped

workers/sample.env

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,8 @@ LOG_CONSUMER_AUTOSCALE=2,1
190190
# =============================================================================
191191

192192
LOG_LEVEL=INFO
193-
# structured or django
194-
LOG_FORMAT=django
193+
# Note: LOG_FORMAT removed - format is now hardcoded (not configurable)
194+
# All workers use a single standardized format matching Django backend
195195
DEFAULT_LOG_LEVEL=INFO
196196
WORKER_VERSION=1.0.0
197197
WORKER_INSTANCE_ID=dev-01

workers/shared/constants/env_vars.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,5 @@ class EnvVars:
5151

5252
# Logging configuration
5353
LOG_LEVEL = "LOG_LEVEL"
54-
LOG_FORMAT = "LOG_FORMAT"
5554
LOG_FILE = "LOG_FILE"
55+
# Note: LOG_FORMAT removed - format is now hardcoded (not configurable)

0 commit comments

Comments
 (0)