Commit b87a9db

Merge branch 'main' into feat/UN-1722-export-reminder-prompt-studio
2 parents: d151e6d + 9ba210b

697 files changed: +103536 additions, −7770 deletions

.github/scripts/combine-test-reports.sh

Lines changed: 171 additions & 0 deletions (new file)

#!/bin/bash
set -euo pipefail

# Script to combine multiple test reports into a single markdown file
# Usage: ./combine-test-reports.sh

OUTPUT_FILE="combined-test-report.md"
REPORTS=()

# Find all test report files
for report in runner-report.md sdk1-report.md; do
    if [ -f "$report" ]; then
        REPORTS+=("$report")
    fi
done

# Exit if no reports found
if [ ${#REPORTS[@]} -eq 0 ]; then
    echo "No test reports found. Skipping report generation."
    exit 0
fi

# Function to strip LaTeX formatting from pytest-md-report output
# Converts $$\textcolor{...}{\tt{VALUE}}$$ to just VALUE
strip_latex() {
    local text="$1"
    # Extract content between \tt{ and }}
    if [[ "$text" =~ \\tt\{([^}]+)\} ]]; then
        echo "${BASH_REMATCH[1]}"
    else
        echo "$text"
    fi
}

# Function to extract test counts from pytest-md-report markdown table
extract_test_counts() {
    local report_file=$1
    local passed=0
    local failed=0
    local total=0

    # Find the header row to determine column positions
    local header_line=$(grep -E '^\|.*filepath' "$report_file" | head -1)

    if [ -z "$header_line" ]; then
        echo "0:0:0"
        return
    fi

    # Extract column names and find positions (strip LaTeX from headers)
    IFS='|' read -ra headers <<< "$header_line"
    local passed_col=-1
    local failed_col=-1
    local subtotal_col=-1

    for i in "${!headers[@]}"; do
        local col=$(strip_latex "${headers[$i]}" | tr -d ' ' | tr '[:upper:]' '[:lower:]')
        case "$col" in
            passed) passed_col=$i ;;
            failed) failed_col=$i ;;
            subtotal|sub) subtotal_col=$i ;;
        esac
    done

    # Find the TOTAL row (TOTAL appears in first column, not as SUBTOTAL in header)
    local total_line=$(grep -E '^\|.*\\tt\{TOTAL\}' "$report_file" | head -1)

    if [ -z "$total_line" ]; then
        echo "0:0:0"
        return
    fi

    # Parse the TOTAL row values
    IFS='|' read -ra values <<< "$total_line"

    # Extract passed count (strip LaTeX and get number)
    if [ "$passed_col" -ge 0 ] && [ "$passed_col" -lt "${#values[@]}" ]; then
        local clean_value=$(strip_latex "${values[$passed_col]}")
        passed=$(echo "$clean_value" | tr -d ' ' | grep -oE '[0-9]+' | head -1 || echo "0")
    fi

    # Extract failed count (strip LaTeX and get number)
    if [ "$failed_col" -ge 0 ] && [ "$failed_col" -lt "${#values[@]}" ]; then
        local clean_value=$(strip_latex "${values[$failed_col]}")
        failed=$(echo "$clean_value" | tr -d ' ' | grep -oE '[0-9]+' | head -1 || echo "0")
    fi

    # Extract total from SUBTOTAL column (strip LaTeX and get number)
    if [ "$subtotal_col" -ge 0 ] && [ "$subtotal_col" -lt "${#values[@]}" ]; then
        local clean_value=$(strip_latex "${values[$subtotal_col]}")
        total=$(echo "$clean_value" | tr -d ' ' | grep -oE '[0-9]+' | head -1 || echo "0")
    fi

    # If total is still 0, calculate from passed + failed
    if [ "$total" -eq 0 ]; then
        total=$((passed + failed))
    fi

    echo "${total}:${passed}:${failed}"
}

# Initialize the combined report with collapsed summary
cat > "$OUTPUT_FILE" << 'EOF'
# Test Results

<details open>
<summary><b>Summary</b></summary>

EOF

# Extract and display summary for each report
for report in "${REPORTS[@]}"; do
    report_name=$(basename "$report" .md)

    # Convert report name to title case
    if [ "$report_name" = "runner-report" ]; then
        title="Runner Tests"
    elif [ "$report_name" = "sdk1-report" ]; then
        title="SDK1 Tests"
    else
        title="${report_name}"
    fi

    # Extract counts
    counts=$(extract_test_counts "$report")
    IFS=':' read -r total passed failed <<< "$counts"

    # Determine status icon
    if [ "$failed" -gt 0 ]; then
        status="❌"
    elif [ "$passed" -gt 0 ]; then
        status="✅"
    else
        status="⚠️"
    fi

    echo "- ${status} **${title}**: ${passed} passed, ${failed} failed (${total} total)" >> "$OUTPUT_FILE"
done

cat >> "$OUTPUT_FILE" << 'EOF'

</details>

---

EOF

# Combine all reports with collapsible sections
for report in "${REPORTS[@]}"; do
    report_name=$(basename "$report" .md)

    # Convert report name to title case
    if [ "$report_name" = "runner-report" ]; then
        title="Runner Tests"
    elif [ "$report_name" = "sdk1-report" ]; then
        title="SDK1 Tests"
    else
        title="${report_name}"
    fi

    echo "<details>" >> "$OUTPUT_FILE"
    echo "<summary><b>${title} - Full Report</b></summary>" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"
    cat "$report" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"
    echo "</details>" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"
done

echo "Combined test report created: $OUTPUT_FILE"
echo "Included reports: ${REPORTS[*]}"

.github/workflows/ci-test.yaml

Lines changed: 13 additions & 12 deletions

@@ -48,20 +48,21 @@ jobs:
         run: |
           tox

-      - name: Render the report to the PR
-        uses: marocchino/sticky-pull-request-comment@v2
+      - name: Combine test reports
+        if: always() && (hashFiles('runner-report.md') != '' || hashFiles('sdk1-report.md') != '')
+        run: |
+          bash .github/scripts/combine-test-reports.sh
+
+      - name: Render combined test report to PR
+        uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # v2.9.4
+        if: always() && hashFiles('combined-test-report.md') != ''
         with:
-          header: runner-test-report
+          header: test-results
           recreate: true
-          path: runner-report.md
+          path: combined-test-report.md

-      - name: Output reports to the job summary when tests fail
+      - name: Output combined report to job summary
+        if: always() && hashFiles('combined-test-report.md') != ''
         shell: bash
         run: |
-          if [ -f "runner-report.md" ]; then
-            echo "<details><summary>Runner Test Report</summary>" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            cat "runner-report.md" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            echo "</details>" >> $GITHUB_STEP_SUMMARY
-          fi
+          cat combined-test-report.md >> $GITHUB_STEP_SUMMARY
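
The hashFiles() guards only ensure that at least one report exists before the combine step runs, and that the combined file exists before it is posted or summarized. A rough local equivalent of those guards (illustrative only, not part of the workflow) is:

# Local approximation of the workflow's hashFiles() conditions:
if [ -f runner-report.md ] || [ -f sdk1-report.md ]; then
    bash .github/scripts/combine-test-reports.sh
fi
if [ -f combined-test-report.md ]; then
    cat combined-test-report.md   # CI appends this to $GITHUB_STEP_SUMMARY instead
fi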

.github/workflows/docker-tools-build-push.yaml

Lines changed: 6 additions & 6 deletions

@@ -59,14 +59,14 @@ jobs:
         id: build-config
         run: |
           if [ "${{ github.event.inputs.service_name }}" == "tool-classifier" ]; then
-            echo "context=./tools/classifier" >> $GITHUB_OUTPUT
-            echo "dockerfile=" >> $GITHUB_OUTPUT
+            echo "context=." >> $GITHUB_OUTPUT
+            echo "dockerfile=./tools/classifier/Dockerfile" >> $GITHUB_OUTPUT
           elif [ "${{ github.event.inputs.service_name }}" == "tool-structure" ]; then
-            echo "context=./tools/structure" >> $GITHUB_OUTPUT
-            echo "dockerfile=" >> $GITHUB_OUTPUT
+            echo "context=." >> $GITHUB_OUTPUT
+            echo "dockerfile=./tools/structure/Dockerfile" >> $GITHUB_OUTPUT
           elif [ "${{ github.event.inputs.service_name }}" == "tool-text-extractor" ]; then
-            echo "context=./tools/text_extractor" >> $GITHUB_OUTPUT
-            echo "dockerfile=" >> $GITHUB_OUTPUT
+            echo "context=." >> $GITHUB_OUTPUT
+            echo "dockerfile=./tools/text_extractor/Dockerfile" >> $GITHUB_OUTPUT
           elif [ "${{ github.event.inputs.service_name }}" == "tool-sidecar" ]; then
             echo "context=." >> $GITHUB_OUTPUT
             echo "dockerfile=docker/dockerfiles/tool-sidecar.Dockerfile" >> $GITHUB_OUTPUT

.gitignore

Lines changed: 9 additions & 13 deletions

@@ -1,6 +1,14 @@
 # Created by https://www.toptal.com/developers/gitignore/api/windows,macos,linux,pycharm,pycharm+all,pycharm+iml,python,visualstudiocode,react,django
 # Edit at https://www.toptal.com/developers/gitignore?templates=windows,macos,linux,pycharm,pycharm+all,pycharm+iml,python,visualstudiocode,react,django

+# Development helper scripts
+*.sh
+# list Exceptional files with ! like !fix-and-test.sh
+!run-platform.sh
+!workers/run-worker.sh
+!workers/run-worker-docker.sh
+!workers/log_consumer/scheduler.sh
+
 ### Django ###
 *.log
 *.pot
@@ -612,19 +620,6 @@ $RECYCLE.BIN/

 ### Unstract ###

-# Authentication Plugins
-backend/plugins/authentication/*
-!backend/plugins/authentication/auth_sample
-
-# Processor Plugins
-backend/plugins/processor/*
-
-# Subscription Plugins
-backend/plugins/subscription/*
-
-# API Deployment Plugins
-backend/plugins/api/**
-
 # BE pluggable-apps
 backend/pluggable_apps/*

@@ -676,6 +671,7 @@ backend/requirements.txt
 backend/backend/*_urls.py
 !backend/backend/base_urls.py
 !backend/backend/public_urls.py
+!backend/backend/internal_base_urls.py
 # TODO: Remove after v2 migration is completed
 backend/backend/*_urls_v2.py
 !backend/backend/public_urls_v2.py
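
The new blanket *.sh rule and its ! exceptions can be verified with git check-ignore; the first path below is a made-up example:

# Matched by the *.sh rule, so this prints the matching pattern:
git check-ignore -v scratch/local-helper.sh
# Exits non-zero because run-platform.sh is explicitly un-ignored:
git check-ignore -v run-platform.sh || echo "run-platform.sh is not ignored"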

README.md

Lines changed: 29 additions & 20 deletions

@@ -3,9 +3,8 @@

 # Unstract

-## No-code LLM Platform to launch APIs and ETL Pipelines to structure unstructured documents
+## The Data Layer for your Agentic Workflows—Automate Document-based workflows with close to 100% accuracy!

-##

 ![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2FZipstack%2Funstract%2Frefs%2Fheads%2Fmain%2Fpyproject.toml)
 [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
@@ -23,26 +22,44 @@

 ## 🤖 Prompt Studio

-Prompt Studio's primary reason for existence is so you can develop the necessary prompts for document data extraction super efficiently. It is a purpose-built environment that makes this not just easy for you—but, lot of fun! The document sample, its variants, the prompts you're developing, outputs from different LLMs, the schema you're developing, costing details of the extraction and various tools that let you measure the effectiveness of your prompts are just a click away and easily accessible. Prompt Studio is designed for effective and high speed development and iteration of prompts for document data extraction. Welcome to IDP 2.0!
-
+Prompt Studio is a purpose-built environment that supercharges your schema definition efforts. Compare outputs from different LLMs side-by-side, keep tab on costs while you develop generic prompts that work across wide-ranging document variations. And when you're ready, launch extraction APIs with a single click.

 ![img Prompt Studio](docs/assets/prompt_studio.png)

-## 🧘‍♀️ Three step nirvana with Workflow Studio
+## 🔌 Integrations that suit your environment
+
+Once you've used Prompt Studio to define your schema, Unstract makes it easy to integrate into your existing workflows. Simply choose the integration type that best fits your environment:
+
+| Integration Type | Description | Best For | Documentation |
+|------------------|-------------|----------|---------------|
+| 🖥️ **MCP Servers** | Run Unstract as an MCP Server to provide structured data extraction to Agents or LLMs in your ecosystem. | Developers building **Agentic/LLM apps/tools** that speak MCP. | [Unstract MCP Server Docs](https://docs.unstract.com/unstract/unstract_platform/mcp/unstract_platform_mcp_server/) |
+| 🌐 **API Deployments** | Turn any document into JSON with an API call. Deploy any Prompt Studio project as a REST API endpoint with a single click. | Teams needing **programmatic access** in apps, services, or custom tooling. | [API Deployment Docs](https://docs.unstract.com/unstract/unstract_platform/api_deployment/unstract_api_deployment_intro/) |
+| ⚙️ **ETL Pipelines** | Embed Unstract directly into your ETL jobs to transform unstructured data before loading it into your warehouse / database. | **Engineering and Data engineering teams** that need to batch process documents into clean JSON. | [ETL Pipelines Docs](https://docs.unstract.com/unstract/unstract_platform/etl_pipeline/unstract_etl_pipeline_intro/) |
+| 🧩 **n8n Nodes** | Use Unstract as ready-made nodes in n8n workflows for drag-and-drop automation. | **Low-code users** and **ops teams** automating workflows. | [Unstract n8n Nodes Docs](https://docs.unstract.com/unstract/unstract_platform/api_deployment/unstract_api_deployment_n8n_custom_node/) |
+
+## ☁️ Getting Started (Cloud / Enterprise)

-Automate critical business processes that involve complex documents with a human in the loop. Go beyond RPA with the power of Large Language Models.
+The easy-peasy way to try Unstract is to [sign up for a **14-day free trial**](https://unstract.com/start-for-free/). Give Unstract a spin now!

-🌟 **Step 1**: Add documents to no-code Prompt Studio and do prompt engineering to extract required fields <br>
-🌟 **Step 2**: Configure Prompt Studio project as API deployment or configure input source and output destination for ETL Pipeline<br>
-🌟 **Step 3**: Deploy Workflows as unstructured data APIs or unstructured data ETL Pipelines!
+Unstract Cloud also comes with some really awesome features that give serious accuracy boosts to agentic/LLM-powered document-centric workflows in the enterprise.

-![img Using Unstract](docs/assets/Using_Unstract.png)
+| Feature | Description | Documentation |
+|---------|-------------|---------------|
+| 🧪 **LLMChallenge** | Uses two Large Language Models to ensure trustworthy output. You either get the right response or no response at all. | [Docs](https://docs.unstract.com/unstract/unstract_platform/features/llm_challenge/llm_challenge_intro/) |
+|**SinglePass Extraction** | Reduces LLM token usage by up to **8x**, dramatically cutting costs. | [Docs](https://docs.unstract.com/unstract/editions/cloud_edition/#singlepass-extraction) |
+| 📉 **SummarizedExtraction** | Reduces LLM token usage by up to **6x**, saving costs while keeping accuracy. | [Docs](https://docs.unstract.com/unstract/unstract_platform/features/summarized_extraction/summarized_extraction_intro/) |
+| 👀 **Human-In-The-Loop** | Side-by-side comparison of extracted value and source document, with highlighting for human review and tweaking. | [Docs](https://docs.unstract.com/unstract/unstract_platform/human_quality_review/human_quality_review_intro/) |
+| 🔐 **SSO Support** | Enterprise-ready authentication options for seamless onboarding and off-boarding. | [Docs](https://docs.unstract.com/unstract/editions/cloud_edition/#enterprise-features) |
+
+## ⏩ Quick Start Guide
+
+Unstract comes well documented. You can get introduced to the [basics of Unstract](https://docs.unstract.com/unstract/), and [learn how to connect](https://docs.unstract.com/unstract/unstract_platform/setup_accounts/whats_needed) various systems like LLMs, Vector Databases, Embedding Models and Text Extractors to it. The easiest way to wet your feet is to go through our [Quick Start Guide](https://docs.unstract.com/unstract/unstract_platform/quick_start) where you actually get to do some prompt engineering in Prompt Studio and launch an API to structure varied credit card statements!

-## 🚀 Getting started
+## 🚀 Getting started (self-hosted)

 ### System Requirements

-- 8GB RAM (recommended)
+- 8GB RAM (minimum)

 ### Prerequisites

@@ -57,7 +74,6 @@ Next, either download a release or clone this repo and do the following:
 ✅ Now visit [http://frontend.unstract.localhost](http://frontend.unstract.localhost) in your browser <br>
 ✅ Use username and password `unstract` to login

-
 That's all there is to it!

 Follow [these steps](backend/README.md#authentication) to change the default username and password.
@@ -93,10 +109,6 @@ Unstract supports a wide range of file formats for document processing:
 | | TIFF | Tagged Image File Format |
 | | WEBP | Web Picture Format |

-## ⏩ Quick Start Guide
-
-Unstract comes well documented. You can get introduced to the [basics of Unstract](https://docs.unstract.com/unstract/), and [learn how to connect](https://docs.unstract.com/unstract/unstract_platform/setup_accounts/whats_needed) various systems like LLMs, Vector Databases, Embedding Models and Text Extractors to it. The easiest way to wet your feet is to go through our [Quick Start Guide](https://docs.unstract.com/unstract/unstract_platform/quick_start) where you actually get to do some prompt engineering in Prompt Studio and launch an API to structure varied credit card statements!
-
 ## 🤝 Ecosystem support

 ### LLM Providers
@@ -113,7 +125,6 @@ Unstract comes well documented. You can get introduced to the [basics of Unstrac
 | <img src="docs/assets/3rd_party/anyscale.png" width="32"/> | Anyscale | ✅ Working |
 | <img src="docs/assets/3rd_party/mistral_ai.png" width="32"/> | Mistral AI | ✅ Working |

-
 ### Vector Databases

 || Provider | Status |
@@ -124,8 +135,6 @@ Unstract comes well documented. You can get introduced to the [basics of Unstrac
 |<img src="docs/assets/3rd_party/postgres.png" width="32"/>| PostgreSQL | ✅ Working |
 |<img src="docs/assets/3rd_party/milvus.png" width="32"/>| Milvus | ✅ Working |

-
-
 ### Embeddings

 || Provider | Status |

backend/account_v2/custom_auth_middleware.py

Lines changed: 5 additions & 0 deletions

@@ -8,6 +8,7 @@
 from account_v2.authentication_service import AuthenticationService
 from account_v2.constants import Common
 from backend.constants import RequestHeader
+from backend.internal_api_constants import INTERNAL_API_PREFIX


 class CustomAuthMiddleware:
@@ -22,6 +23,10 @@ def __call__(self, request: HttpRequest) -> HttpResponse:
         if any(request.path.startswith(path) for path in settings.WHITELISTED_PATHS):
             return self.get_response(request)

+        # Skip internal API paths - they are handled by InternalAPIAuthMiddleware
+        if request.path.startswith(f"{INTERNAL_API_PREFIX}/"):
+            return self.get_response(request)
+
         # Authenticating With API_KEY
         x_api_key = request.headers.get(RequestHeader.X_API_KEY)
         if (
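
With this change, requests under the internal prefix bypass the API-key and session checks in this middleware and are left to InternalAPIAuthMiddleware. The request shapes below are hypothetical; the actual prefix value is defined in backend/internal_api_constants.py and is not shown in this diff.

# Hypothetical request shapes only; prefix and paths are placeholders:
curl -H "X-API-KEY: <key>" http://frontend.unstract.localhost/api/...         # still handled by CustomAuthMiddleware
curl http://frontend.unstract.localhost/<INTERNAL_API_PREFIX>/some-endpoint   # skipped here; authenticated by InternalAPIAuthMiddleware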
