137 changes: 137 additions & 0 deletions bench/hallucination/README.md
# Hallucination Detection Benchmark

End-to-end evaluation of the hallucination detection pipeline through the semantic router.

## Quick Start

```bash
# 1. Start vLLM (if not running)
docker run -d --gpus all -p 8083:8000 vllm/vllm-openai:latest \
  --model Qwen/Qwen2.5-14B-Instruct-AWQ

# 2. Start semantic router with hallucination config
cd /path/to/semantic-router
export LD_LIBRARY_PATH=$PWD/candle-binding/target/release
./bin/router -config=bench/hallucination/config-7b.yaml

# 3. Start Envoy
make run-envoy

# 4. Run benchmark
python3 -m bench.hallucination.evaluate \
  --endpoint http://localhost:8801 \
  --dataset halueval \
  --max-samples 50

# Or use the Makefile target:
make bench-hallucination MAX_SAMPLES=50
```
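Once the stack is up, you can smoke-test the routed endpoint with a plain OpenAI-compatible chat request against the Envoy listener. A minimal standard-library sketch (the port and model name come from the steps above; the helper function is illustrative, not part of the benchmark):

```python
import json
import urllib.request

ROUTER_URL = "http://localhost:8801/v1/chat/completions"  # Envoy listener from step 3

def build_chat_request(prompt: str) -> urllib.request.Request:
    """Build an OpenAI-compatible chat completion request for the router."""
    payload = {
        "model": "Qwen/Qwen2.5-14B-Instruct-AWQ",
        "messages": [{"role": "user", "content": prompt}],
    }
    return urllib.request.Request(
        ROUTER_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

req = build_chat_request("When did Apollo 11 land on the Moon?")
print(req.full_url)
# To actually send it (requires the stack from steps 1-3 to be running):
#   with urllib.request.urlopen(req) as resp:
#       print(resp.status, resp.headers)  # in "warn" mode, detection results arrive as headers
```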

## Using the Large Model

The large model (`lettucedect-large-modernbert-en-v1`, 395M params) provides better detection accuracy than the base model.

### Step 1: Download the Large Model

```bash
cd /path/to/semantic-router

# Download from HuggingFace
hf download KRLabsOrg/lettucedect-large-modernbert-en-v1 \
  --local-dir models/lettucedect-large-modernbert-en-v1
```

### Step 2: Update Config

Edit `bench/hallucination/config-7b.yaml`:

```yaml
hallucination_mitigation:
  enabled: true

  hallucination_model:
    model_id: "models/lettucedect-large-modernbert-en-v1" # Use large model
    threshold: 0.5
    use_cpu: true # Set to false for GPU
```

### Step 3: Restart Router

```bash
# Kill existing router
pkill -f "router.*config"

# Start with updated config
export LD_LIBRARY_PATH=$PWD/candle-binding/target/release
./bin/router -config=bench/hallucination/config-7b.yaml
```

## Supported Models

| Model | Params | HuggingFace ID |
|-------|--------|----------------|
| Base | 149M | `KRLabsOrg/lettucedect-base-modernbert-en-v1` |
| Large | 395M | `KRLabsOrg/lettucedect-large-modernbert-en-v1` |

Both use the `ModernBertForTokenClassification` architecture, which candle-binding supports.

## Config Reference

Key settings in `config-7b.yaml`:

```yaml
# vLLM endpoint
vllm_endpoints:
  - name: "vllm-general"
    address: "127.0.0.1"
    port: 8083

# Hallucination detection
hallucination_mitigation:
  enabled: true
  hallucination_model:
    model_id: "models/lettucedect-large-modernbert-en-v1"
    threshold: 0.5
    use_cpu: true
  on_hallucination_detected: "warn" # or "block"
```

## Datasets

| Dataset | Command |
|---------|---------|
| HaluEval | `--dataset halueval` |
| Custom | `--dataset /path/to/data.jsonl` |
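For a custom dataset, each JSONL line is one sample. The field names below are an assumption for illustration only; check `CustomDataset` in `bench/hallucination/datasets.py` for the actual schema before relying on them:

```python
import json

# Hypothetical record layout for a custom dataset file. The field names
# ("question", "context", "answer", "is_hallucination") are assumptions;
# verify them against bench/hallucination/datasets.py.
samples = [
    {
        "question": "Who wrote The Old Man and the Sea?",
        "context": "The Old Man and the Sea is a novella by Ernest Hemingway.",
        "answer": "It was written by Mark Twain.",
        "is_hallucination": True,
    },
]

with open("custom_halu.jsonl", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")
```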

## Output

Results are saved to `bench/hallucination/results/` and include:

- Precision, Recall, F1 (when ground truth available)
- Latency metrics (avg, p50, p99)
- Per-sample detection results
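The classification and latency metrics above follow standard definitions; a minimal sketch of how they are typically computed from per-sample results (function names here are illustrative, not the benchmark's actual API):

```python
def precision_recall_f1(preds, labels):
    """Compute precision/recall/F1 from boolean prediction and label lists."""
    tp = sum(p and l for p, l in zip(preds, labels))
    fp = sum(p and not l for p, l in zip(preds, labels))
    fn = sum(not p and l for p, l in zip(preds, labels))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

def percentile(latencies_ms, pct):
    """Nearest-rank percentile: simple and adequate for benchmark reporting."""
    ordered = sorted(latencies_ms)
    idx = min(len(ordered) - 1, max(0, round(pct / 100 * len(ordered)) - 1))
    return ordered[idx]

preds = [True, True, False, True, False]
labels = [True, False, False, True, True]
p, r, f1 = precision_recall_f1(preds, labels)
print(f"precision={p:.2f} recall={r:.2f} f1={f1:.2f}")
# → precision=0.67 recall=0.67 f1=0.67
```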

### Two-Stage Pipeline Efficiency Metrics

The benchmark tracks the computational savings from the two-stage detection pipeline:

```
⚡ Two-Stage Pipeline Efficiency:
----------------------------------------
Fact-check needed: 65/100 queries
Detection skipped: 35/100 queries
Avg context length: 4500 chars
Estimated detect time: 6500.00 ms (if all ran)
Actual detect time: 4225.00 ms
Time saved: 2275.00 ms
Efficiency gain: 35.0%

💡 Pre-filtering skipped 35.0% of requests,
saving 2275ms of detection compute.
```

This demonstrates the value of the HaluGate Sentinel pre-classifier:

- **O(1) filtering** before **O(n) detection** (n = context length)
- Non-factual queries (creative, opinion, brainstorming) skip expensive token classification
- Critical for RAG applications with large contexts (8K+ tokens)
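The efficiency figures in the sample output above are simple per-query accounting. A sketch of the arithmetic, assuming a uniform per-query detection cost (the 65 ms figure is implied by the sample: 6500 ms estimated across 100 queries):

```python
total_queries = 100
fact_check_needed = 65   # queries the sentinel routed to the detector
avg_detect_ms = 65.0     # assumed uniform per-query detection cost

skipped = total_queries - fact_check_needed
estimated_ms = total_queries * avg_detect_ms    # if every query ran detection
actual_ms = fact_check_needed * avg_detect_ms   # only flagged queries ran it
saved_ms = estimated_ms - actual_ms
efficiency_gain = skipped / total_queries

print(f"Time saved: {saved_ms:.2f} ms, gain: {efficiency_gain:.1%}")
# → Time saved: 2275.00 ms, gain: 35.0%
```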
10 changes: 10 additions & 0 deletions bench/hallucination/__init__.py
"""Hallucination Detection Benchmark for Semantic Router.

This package provides end-to-end evaluation of the hallucination detection pipeline
through the router + Envoy stack.
"""

from .evaluate import HallucinationBenchmark
from .datasets import HaluEvalDataset, CustomDataset, get_dataset

__all__ = ["HallucinationBenchmark", "HaluEvalDataset", "CustomDataset", "get_dataset"]
182 changes: 182 additions & 0 deletions bench/hallucination/config-7b.yaml
# Configuration for hallucination detection benchmark
# Connects to real vLLM server at port 8083

bert_model:
  model_id: models/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: false # Disable cache for benchmarking

# Classifier configuration
classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
  pii_model:
    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
    use_modernbert: true
    threshold: 0.7
    use_cpu: true
    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

# Hallucination mitigation configuration
hallucination_mitigation:
  enabled: true

  # Fact-check classifier: determines if a prompt needs fact verification
  fact_check_model:
    model_id: "models/halugate-sentinel"
    threshold: 0.6
    use_cpu: true
    mapping_path: "models/halugate-sentinel/fact_check_mapping.json"

  # Hallucination detector: verifies if LLM response is grounded in context
  # Using large model (395M params) for better accuracy
  hallucination_model:
    model_id: "models/lettucedect-large-modernbert-en-v1"
    threshold: 0.5
    use_cpu: true

  # NLI model: provides explanations for hallucinated spans
  nli_model:
    model_id: "models/ModernBERT-base-nli"
    threshold: 0.7
    use_cpu: true

  # Action when hallucination detected: "warn" adds headers, "block" returns error
  on_hallucination_detected: "warn"

  # Fact-check rules for signal classification
  # The classifier outputs one of these signals that can be referenced in decision conditions
  fact_check_rules:
    - name: needs_fact_check
      description: "Query contains factual claims that should be verified against context"
    - name: no_fact_check_needed
      description: "Query is creative, code-related, or opinion-based - no fact verification needed"

# Prompt guard
prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# vLLM endpoint - real vLLM server
vllm_endpoints:
  - name: "vllm-general"
    address: "127.0.0.1"
    port: 8083
    weight: 1
    health_check_path: "/health"

# Model configuration - use the actual model from vLLM
model_config:
  "Qwen/Qwen2.5-14B-Instruct-AWQ":
    reasoning_family: "qwen3"
    preferred_endpoints: ["vllm-general"]

# Categories for routing
categories:
  - name: general
    description: "General questions"
    mmlu_categories: ["other"]
  - name: math
    description: "Mathematics and quantitative reasoning"
    mmlu_categories: ["math"]
  - name: science
    description: "Science questions"
    mmlu_categories: ["physics", "chemistry", "biology"]

strategy: "priority"

decisions:
  - name: "math_decision"
    description: "Mathematics and quantitative reasoning"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "math"
    modelRefs:
      - model: "Qwen/Qwen2.5-14B-Instruct-AWQ"
        use_reasoning: true
    plugins:
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []
      - type: "hallucination"
        configuration:
          enabled: true
          use_nli: true
          hallucination_action: "header"
          unverified_factual_action: "header"
          include_hallucination_details: false

  - name: "science_decision"
    description: "Science questions"
    priority: 100
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "science"
    modelRefs:
      - model: "Qwen/Qwen2.5-14B-Instruct-AWQ"
        use_reasoning: true
    plugins:
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []
      - type: "hallucination"
        configuration:
          enabled: true
          use_nli: true
          hallucination_action: "header"
          unverified_factual_action: "header"
          include_hallucination_details: false

  - name: "general_decision"
    description: "General questions"
    priority: 50
    rules:
      operator: "AND"
      conditions:
        - type: "domain"
          name: "general"
    modelRefs:
      - model: "Qwen/Qwen2.5-14B-Instruct-AWQ"
        use_reasoning: false
    plugins:
      - type: "pii"
        configuration:
          enabled: true
          pii_types_allowed: []
      - type: "hallucination"
        configuration:
          enabled: true
          use_nli: true
          hallucination_action: "header"
          unverified_factual_action: "header"
          include_hallucination_details: false

default_model: "Qwen/Qwen2.5-14B-Instruct-AWQ"

# API Configuration
api:
  batch_classification:
    metrics:
      enabled: true
      detailed_goroutine_tracking: true
      high_resolution_timing: false
      sample_rate: 1.0
