Commit b87a9db

Merge branch 'main' into feat/UN-1722-export-reminder-prompt-studio
2 parents: d151e6d + 9ba210b

697 files changed: +103536 additions, −7770 deletions

.github/scripts/combine-test-reports.sh

Lines changed: 171 additions & 0 deletions (new file)

#!/bin/bash
set -euo pipefail

# Script to combine multiple test reports into a single markdown file
# Usage: ./combine-test-reports.sh

OUTPUT_FILE="combined-test-report.md"
REPORTS=()

# Find all test report files
for report in runner-report.md sdk1-report.md; do
    if [ -f "$report" ]; then
        REPORTS+=("$report")
    fi
done

# Exit if no reports found
if [ ${#REPORTS[@]} -eq 0 ]; then
    echo "No test reports found. Skipping report generation."
    exit 0
fi

# Function to strip LaTeX formatting from pytest-md-report output
# Converts $$\textcolor{...}{\tt{VALUE}}$$ to just VALUE
strip_latex() {
    local text="$1"
    # Extract content between \tt{ and }}
    if [[ "$text" =~ \\tt\{([^}]+)\} ]]; then
        echo "${BASH_REMATCH[1]}"
    else
        echo "$text"
    fi
}

# Function to extract test counts from pytest-md-report markdown table
extract_test_counts() {
    local report_file=$1
    local passed=0
    local failed=0
    local total=0

    # Find the header row to determine column positions
    local header_line=$(grep -E '^\|.*filepath' "$report_file" | head -1)

    if [ -z "$header_line" ]; then
        echo "0:0:0"
        return
    fi

    # Extract column names and find positions (strip LaTeX from headers)
    IFS='|' read -ra headers <<< "$header_line"
    local passed_col=-1
    local failed_col=-1
    local subtotal_col=-1

    for i in "${!headers[@]}"; do
        local col=$(strip_latex "${headers[$i]}" | tr -d ' ' | tr '[:upper:]' '[:lower:]')
        case "$col" in
            passed) passed_col=$i ;;
            failed) failed_col=$i ;;
            subtotal|sub) subtotal_col=$i ;;
        esac
    done

    # Find the TOTAL row (TOTAL appears in first column, not as SUBTOTAL in header)
    local total_line=$(grep -E '^\|.*\\tt\{TOTAL\}' "$report_file" | head -1)

    if [ -z "$total_line" ]; then
        echo "0:0:0"
        return
    fi

    # Parse the TOTAL row values
    IFS='|' read -ra values <<< "$total_line"

    # Extract passed count (strip LaTeX and get number)
    if [ "$passed_col" -ge 0 ] && [ "$passed_col" -lt "${#values[@]}" ]; then
        local clean_value=$(strip_latex "${values[$passed_col]}")
        passed=$(echo "$clean_value" | tr -d ' ' | grep -oE '[0-9]+' | head -1 || echo "0")
    fi

    # Extract failed count (strip LaTeX and get number)
    if [ "$failed_col" -ge 0 ] && [ "$failed_col" -lt "${#values[@]}" ]; then
        local clean_value=$(strip_latex "${values[$failed_col]}")
        failed=$(echo "$clean_value" | tr -d ' ' | grep -oE '[0-9]+' | head -1 || echo "0")
    fi

    # Extract total from SUBTOTAL column (strip LaTeX and get number)
    if [ "$subtotal_col" -ge 0 ] && [ "$subtotal_col" -lt "${#values[@]}" ]; then
        local clean_value=$(strip_latex "${values[$subtotal_col]}")
        total=$(echo "$clean_value" | tr -d ' ' | grep -oE '[0-9]+' | head -1 || echo "0")
    fi

    # If total is still 0, calculate from passed + failed
    if [ "$total" -eq 0 ]; then
        total=$((passed + failed))
    fi

    echo "${total}:${passed}:${failed}"
}

# Initialize the combined report with collapsed summary
cat > "$OUTPUT_FILE" << 'EOF'
# Test Results

<details open>
<summary><b>Summary</b></summary>

EOF

# Extract and display summary for each report
for report in "${REPORTS[@]}"; do
    report_name=$(basename "$report" .md)

    # Convert report name to title case
    if [ "$report_name" = "runner-report" ]; then
        title="Runner Tests"
    elif [ "$report_name" = "sdk1-report" ]; then
        title="SDK1 Tests"
    else
        title="${report_name}"
    fi

    # Extract counts
    counts=$(extract_test_counts "$report")
    IFS=':' read -r total passed failed <<< "$counts"

    # Determine status icon
    if [ "$failed" -gt 0 ]; then
        status="❌"
    elif [ "$passed" -gt 0 ]; then
        status="✅"
    else
        status="⚠️"
    fi

    echo "- ${status} **${title}**: ${passed} passed, ${failed} failed (${total} total)" >> "$OUTPUT_FILE"
done

cat >> "$OUTPUT_FILE" << 'EOF'

</details>

---

EOF

# Combine all reports with collapsible sections
for report in "${REPORTS[@]}"; do
    report_name=$(basename "$report" .md)

    # Convert report name to title case
    if [ "$report_name" = "runner-report" ]; then
        title="Runner Tests"
    elif [ "$report_name" = "sdk1-report" ]; then
        title="SDK1 Tests"
    else
        title="${report_name}"
    fi

    echo "<details>" >> "$OUTPUT_FILE"
    echo "<summary><b>${title} - Full Report</b></summary>" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"
    cat "$report" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"
    echo "</details>" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"
done

echo "Combined test report created: $OUTPUT_FILE"
echo "Included reports: ${REPORTS[*]}"

.github/workflows/ci-test.yaml

Lines changed: 13 additions & 12 deletions

@@ -48,20 +48,21 @@ jobs:
         run: |
           tox

-      - name: Render the report to the PR
-        uses: marocchino/sticky-pull-request-comment@v2
+      - name: Combine test reports
+        if: always() && (hashFiles('runner-report.md') != '' || hashFiles('sdk1-report.md') != '')
+        run: |
+          bash .github/scripts/combine-test-reports.sh
+
+      - name: Render combined test report to PR
+        uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # v2.9.4
+        if: always() && hashFiles('combined-test-report.md') != ''
         with:
-          header: runner-test-report
+          header: test-results
           recreate: true
-          path: runner-report.md
+          path: combined-test-report.md

-      - name: Output reports to the job summary when tests fail
+      - name: Output combined report to job summary
+        if: always() && hashFiles('combined-test-report.md') != ''
         shell: bash
         run: |
-          if [ -f "runner-report.md" ]; then
-            echo "<details><summary>Runner Test Report</summary>" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            cat "runner-report.md" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            echo "</details>" >> $GITHUB_STEP_SUMMARY
-          fi
+          cat combined-test-report.md >> $GITHUB_STEP_SUMMARY
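
The hashFiles() guards only ensure that at least one report exists before the combine step runs, and that the combined file exists before it is posted or summarized. A rough local equivalent of those guards (illustrative only, not part of the workflow) is:

# Local approximation of the workflow's hashFiles() conditions:
if [ -f runner-report.md ] || [ -f sdk1-report.md ]; then
    bash .github/scripts/combine-test-reports.sh
fi
if [ -f combined-test-report.md ]; then
    cat combined-test-report.md   # CI appends this to $GITHUB_STEP_SUMMARY instead
fi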

.github/workflows/docker-tools-build-push.yaml

Lines changed: 6 additions & 6 deletions

@@ -59,14 +59,14 @@ jobs:
         id: build-config
         run: |
           if [ "${{ github.event.inputs.service_name }}" == "tool-classifier" ]; then
-            echo "context=./tools/classifier" >> $GITHUB_OUTPUT
-            echo "dockerfile=" >> $GITHUB_OUTPUT
+            echo "context=." >> $GITHUB_OUTPUT
+            echo "dockerfile=./tools/classifier/Dockerfile" >> $GITHUB_OUTPUT
           elif [ "${{ github.event.inputs.service_name }}" == "tool-structure" ]; then
-            echo "context=./tools/structure" >> $GITHUB_OUTPUT
-            echo "dockerfile=" >> $GITHUB_OUTPUT
+            echo "context=." >> $GITHUB_OUTPUT
+            echo "dockerfile=./tools/structure/Dockerfile" >> $GITHUB_OUTPUT
           elif [ "${{ github.event.inputs.service_name }}" == "tool-text-extractor" ]; then
-            echo "context=./tools/text_extractor" >> $GITHUB_OUTPUT
-            echo "dockerfile=" >> $GITHUB_OUTPUT
+            echo "context=." >> $GITHUB_OUTPUT
+            echo "dockerfile=./tools/text_extractor/Dockerfile" >> $GITHUB_OUTPUT
           elif [ "${{ github.event.inputs.service_name }}" == "tool-sidecar" ]; then
             echo "context=." >> $GITHUB_OUTPUT
             echo "dockerfile=docker/dockerfiles/tool-sidecar.Dockerfile" >> $GITHUB_OUTPUT

.gitignore

Lines changed: 9 additions & 13 deletions

@@ -1,6 +1,14 @@
 # Created by https://www.toptal.com/developers/gitignore/api/windows,macos,linux,pycharm,pycharm+all,pycharm+iml,python,visualstudiocode,react,django
 # Edit at https://www.toptal.com/developers/gitignore?templates=windows,macos,linux,pycharm,pycharm+all,pycharm+iml,python,visualstudiocode,react,django

+# Development helper scripts
+*.sh
+# list Exceptional files with ! like !fix-and-test.sh
+!run-platform.sh
+!workers/run-worker.sh
+!workers/run-worker-docker.sh
+!workers/log_consumer/scheduler.sh
+
 ### Django ###
 *.log
 *.pot
@@ -612,19 +620,6 @@ $RECYCLE.BIN/

 ### Unstract ###

-# Authentication Plugins
-backend/plugins/authentication/*
-!backend/plugins/authentication/auth_sample
-
-# Processor Plugins
-backend/plugins/processor/*
-
-# Subscription Plugins
-backend/plugins/subscription/*
-
-# API Deployment Plugins
-backend/plugins/api/**
-
 # BE pluggable-apps
 backend/pluggable_apps/*

@@ -676,6 +671,7 @@ backend/requirements.txt
 backend/backend/*_urls.py
 !backend/backend/base_urls.py
 !backend/backend/public_urls.py
+!backend/backend/internal_base_urls.py
 # TODO: Remove after v2 migration is completed
 backend/backend/*_urls_v2.py
 !backend/backend/public_urls_v2.py
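
The new blanket *.sh rule and its ! exceptions can be verified with git check-ignore; the first path below is a made-up example:

# Matched by the *.sh rule, so this prints the matching pattern:
git check-ignore -v scratch/local-helper.sh
# Exits non-zero because run-platform.sh is explicitly un-ignored:
git check-ignore -v run-platform.sh || echo "run-platform.sh is not ignored"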

README.md

Lines changed: 29 additions & 20 deletions

@@ -3,9 +3,8 @@

 # Unstract

-## No-code LLM Platform to launch APIs and ETL Pipelines to structure unstructured documents
+## The Data Layer for your Agentic Workflows—Automate Document-based workflows with close to 100% accuracy!

-##

 ![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2FZipstack%2Funstract%2Frefs%2Fheads%2Fmain%2Fpyproject.toml)
 [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
@@ -23,26 +22,44 @@

 ## 🤖 Prompt Studio

-Prompt Studio's primary reason for existence is so you can develop the necessary prompts for document data extraction super efficiently. It is a purpose-built environment that makes this not just easy for you—but, lot of fun! The document sample, its variants, the prompts you're developing, outputs from different LLMs, the schema you're developing, costing details of the extraction and various tools that let you measure the effectiveness of your prompts are just a click away and easily accessible. Prompt Studio is designed for effective and high speed development and iteration of prompts for document data extraction. Welcome to IDP 2.0!
-
+Prompt Studio is a purpose-built environment that supercharges your schema definition efforts. Compare outputs from different LLMs side-by-side, keep tab on costs while you develop generic prompts that work across wide-ranging document variations. And when you're ready, launch extraction APIs with a single click.

 ![img Prompt Studio](docs/assets/prompt_studio.png)

-## 🧘‍♀️ Three step nirvana with Workflow Studio
+## 🔌 Integrations that suit your environment
+
+Once you've used Prompt Studio to define your schema, Unstract makes it easy to integrate into your existing workflows. Simply choose the integration type that best fits your environment:
+
+| Integration Type | Description | Best For | Documentation |
+|------------------|-------------|----------|---------------|
+| 🖥️ **MCP Servers** | Run Unstract as an MCP Server to provide structured data extraction to Agents or LLMs in your ecosystem. | Developers building **Agentic/LLM apps/tools** that speak MCP. | [Unstract MCP Server Docs](https://docs.unstract.com/unstract/unstract_platform/mcp/unstract_platform_mcp_server/) |
+| 🌐 **API Deployments** | Turn any document into JSON with an API call. Deploy any Prompt Studio project as a REST API endpoint with a single click. | Teams needing **programmatic access** in apps, services, or custom tooling. | [API Deployment Docs](https://docs.unstract.com/unstract/unstract_platform/api_deployment/unstract_api_deployment_intro/) |
+| ⚙️ **ETL Pipelines** | Embed Unstract directly into your ETL jobs to transform unstructured data before loading it into your warehouse / database. | **Engineering and Data engineering teams** that need to batch process documents into clean JSON. | [ETL Pipelines Docs](https://docs.unstract.com/unstract/unstract_platform/etl_pipeline/unstract_etl_pipeline_intro/) |
+| 🧩 **n8n Nodes** | Use Unstract as ready-made nodes in n8n workflows for drag-and-drop automation. | **Low-code users** and **ops teams** automating workflows. | [Unstract n8n Nodes Docs](https://docs.unstract.com/unstract/unstract_platform/api_deployment/unstract_api_deployment_n8n_custom_node/) |
+
+## ☁️ Getting Started (Cloud / Enterprise)

-Automate critical business processes that involve complex documents with a human in the loop. Go beyond RPA with the power of Large Language Models.
+The easy-peasy way to try Unstract is to [sign up for a **14-day free trial**](https://unstract.com/start-for-free/). Give Unstract a spin now!

-🌟 **Step 1**: Add documents to no-code Prompt Studio and do prompt engineering to extract required fields <br>
-🌟 **Step 2**: Configure Prompt Studio project as API deployment or configure input source and output destination for ETL Pipeline<br>
-🌟 **Step 3**: Deploy Workflows as unstructured data APIs or unstructured data ETL Pipelines!
+Unstract Cloud also comes with some really awesome features that give serious accuracy boosts to agentic/LLM-powered document-centric workflows in the enterprise.

-![img Using Unstract](docs/assets/Using_Unstract.png)
+| Feature | Description | Documentation |
+|---------|-------------|---------------|
+| 🧪 **LLMChallenge** | Uses two Large Language Models to ensure trustworthy output. You either get the right response or no response at all. | [Docs](https://docs.unstract.com/unstract/unstract_platform/features/llm_challenge/llm_challenge_intro/) |
+|**SinglePass Extraction** | Reduces LLM token usage by up to **8x**, dramatically cutting costs. | [Docs](https://docs.unstract.com/unstract/editions/cloud_edition/#singlepass-extraction) |
+| 📉 **SummarizedExtraction** | Reduces LLM token usage by up to **6x**, saving costs while keeping accuracy. | [Docs](https://docs.unstract.com/unstract/unstract_platform/features/summarized_extraction/summarized_extraction_intro/) |
+| 👀 **Human-In-The-Loop** | Side-by-side comparison of extracted value and source document, with highlighting for human review and tweaking. | [Docs](https://docs.unstract.com/unstract/unstract_platform/human_quality_review/human_quality_review_intro/) |
+| 🔐 **SSO Support** | Enterprise-ready authentication options for seamless onboarding and off-boarding. | [Docs](https://docs.unstract.com/unstract/editions/cloud_edition/#enterprise-features) |
+
+## ⏩ Quick Start Guide
+
+Unstract comes well documented. You can get introduced to the [basics of Unstract](https://docs.unstract.com/unstract/), and [learn how to connect](https://docs.unstract.com/unstract/unstract_platform/setup_accounts/whats_needed) various systems like LLMs, Vector Databases, Embedding Models and Text Extractors to it. The easiest way to wet your feet is to go through our [Quick Start Guide](https://docs.unstract.com/unstract/unstract_platform/quick_start) where you actually get to do some prompt engineering in Prompt Studio and launch an API to structure varied credit card statements!

-## 🚀 Getting started
+## 🚀 Getting started (self-hosted)

 ### System Requirements

-- 8GB RAM (recommended)
+- 8GB RAM (minimum)

 ### Prerequisites

@@ -57,7 +74,6 @@ Next, either download a release or clone this repo and do the following:
 ✅ Now visit [http://frontend.unstract.localhost](http://frontend.unstract.localhost) in your browser <br>
 ✅ Use username and password `unstract` to login

-
 That's all there is to it!

 Follow [these steps](backend/README.md#authentication) to change the default username and password.
@@ -93,10 +109,6 @@ Unstract supports a wide range of file formats for document processing:
 | | TIFF | Tagged Image File Format |
 | | WEBP | Web Picture Format |

-## ⏩ Quick Start Guide
-
-Unstract comes well documented. You can get introduced to the [basics of Unstract](https://docs.unstract.com/unstract/), and [learn how to connect](https://docs.unstract.com/unstract/unstract_platform/setup_accounts/whats_needed) various systems like LLMs, Vector Databases, Embedding Models and Text Extractors to it. The easiest way to wet your feet is to go through our [Quick Start Guide](https://docs.unstract.com/unstract/unstract_platform/quick_start) where you actually get to do some prompt engineering in Prompt Studio and launch an API to structure varied credit card statements!
-
 ## 🤝 Ecosystem support

 ### LLM Providers
@@ -113,7 +125,6 @@ Unstract comes well documented. You can get introduced to the [basics of Unstrac
 | <img src="docs/assets/3rd_party/anyscale.png" width="32"/> | Anyscale | ✅ Working |
 | <img src="docs/assets/3rd_party/mistral_ai.png" width="32"/> | Mistral AI | ✅ Working |

-
 ### Vector Databases

 || Provider | Status |
@@ -124,8 +135,6 @@ Unstract comes well documented. You can get introduced to the [basics of Unstrac
 |<img src="docs/assets/3rd_party/postgres.png" width="32"/>| PostgreSQL | ✅ Working |
 |<img src="docs/assets/3rd_party/milvus.png" width="32"/>| Milvus | ✅ Working |

-
-
 ### Embeddings

 || Provider | Status |

backend/account_v2/custom_auth_middleware.py

Lines changed: 5 additions & 0 deletions

@@ -8,6 +8,7 @@
 from account_v2.authentication_service import AuthenticationService
 from account_v2.constants import Common
 from backend.constants import RequestHeader
+from backend.internal_api_constants import INTERNAL_API_PREFIX


 class CustomAuthMiddleware:
@@ -22,6 +23,10 @@ def __call__(self, request: HttpRequest) -> HttpResponse:
         if any(request.path.startswith(path) for path in settings.WHITELISTED_PATHS):
             return self.get_response(request)

+        # Skip internal API paths - they are handled by InternalAPIAuthMiddleware
+        if request.path.startswith(f"{INTERNAL_API_PREFIX}/"):
+            return self.get_response(request)
+
         # Authenticating With API_KEY
         x_api_key = request.headers.get(RequestHeader.X_API_KEY)
         if (
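
With this change, requests under the internal prefix bypass the API-key and session checks in this middleware and are left to InternalAPIAuthMiddleware. The request shapes below are hypothetical; the actual prefix value is defined in backend/internal_api_constants.py and is not shown in this diff.

# Hypothetical request shapes only; prefix and paths are placeholders:
curl -H "X-API-KEY: <key>" http://frontend.unstract.localhost/api/...         # still handled by CustomAuthMiddleware
curl http://frontend.unstract.localhost/<INTERNAL_API_PREFIX>/some-endpoint   # skipped here; authenticated by InternalAPIAuthMiddleware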
