vllm-project · JaredforReal · Dec 15, 2025 · Dec 14, 2025 · Dec 14, 2025 · Dec 14, 2025
@@ -6,14 +6,14 @@ on:
     branches:
       - main
     paths-ignore:
-      - 'website/**'
-      - '**/*.md'
+      - "website/**"
+      - "**/*.md"
   push:
     branches:
       - main
     paths-ignore:
-      - 'website/**'
-      - '**/*.md'
+      - "website/**"
+      - "**/*.md"
   workflow_dispatch: # Allow manual triggering
 
 concurrency:
@@ -24,7 +24,7 @@ jobs:
   test-ci-compose:
     if: github.repository == 'vllm-project/semantic-router' && !github.event.pull_request.draft
     runs-on: ubuntu-latest
-    timeout-minutes: 20  # Reduced from 30 - CI compose is faster
+    timeout-minutes: 20 # Reduced from 30 - CI compose is faster
 
     steps:
       - name: Check out the repo
@@ -36,11 +36,11 @@ jobs:
           # This helps prevent "no space left on device" errors
           echo "Disk space before setup:"
           df -h / && df -h /mnt
-          
+
           # Create /mnt/models directory if it doesn't exist
           sudo mkdir -p /mnt/models
           sudo chown -R $USER:$USER /mnt/models
-          
+
           # If models directory already exists in workspace, move it to /mnt
           if [ -d "models" ] && [ ! -L "models" ]; then
             echo "Moving existing models directory to /mnt/models..."
@@ -53,7 +53,7 @@ jobs:
               sudo mv models /mnt/models
             fi
           fi
-          
+
           # Create symlink from models/ to /mnt/models/ so existing code continues to work
           if [ ! -e "models" ]; then
             ln -s /mnt/models models
@@ -63,21 +63,21 @@ jobs:
           else
             echo "Warning: models exists but is not a symlink"
           fi
-          
+
           echo "Disk space after setup:"
           df -h / && df -h /mnt
           echo "Models directory setup complete. Models will be stored in /mnt/models"
 
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.11'
+          python-version: "3.11"
 
       - name: Install dependencies
         run: |
           sudo apt-get update
           sudo apt-get install -y make curl
-          pip install huggingface_hub[cli]
+          pip install -r src/model_manager/requirements.txt
 
       - name: Download models
         run: |
@@ -86,6 +86,7 @@ jobs:
         env:
           CI: true
           CI_MINIMAL_MODELS: true
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
 
@@ -101,7 +102,7 @@ jobs:
           echo "Waiting for services to be healthy..."
           max_attempts=60
           attempt=1
-          
+
           while [ $attempt -le $max_attempts ]; do
             echo "Attempt $attempt/$max_attempts: Checking service health..."
 
@@ -128,7 +129,7 @@ jobs:
             sleep 5
             ((attempt++))
           done
-          
+
           echo "❌ Timeout waiting for services to be healthy"
           docker ps -a
           exit 1
@@ -173,7 +174,7 @@ jobs:
             }')
 
           echo "Response: $response"
-          
+
           # Verify we got a response
           if echo "$response" | grep -q "choices"; then
             echo "✅ Chat completions test passed"

@@ -63,13 +63,14 @@ jobs:
       - name: Build Rust library (CPU-only)
         run: make rust-ci
 
-      - name: Install HuggingFace CLI
+      - name: Install Model Manager dependencies
         run: |
-          pip install -U "huggingface_hub[cli]" hf_transfer
+          pip install -r src/model_manager/requirements.txt
 
       - name: Download models (minimal set for nightly)
         env:
-          CI_MINIMAL_MODELS: true
+          CI_MINIMAL_MODELS: false
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
         run: make download-models

@@ -5,16 +5,16 @@ on:
     branches:
       - main
     paths:
-      - 'src/semantic-router/**'
-      - 'candle-binding/**'
-      - 'perf/**'
-      - '.github/workflows/performance-test.yml'
+      - "src/semantic-router/**"
+      - "candle-binding/**"
+      - "perf/**"
+      - ".github/workflows/performance-test.yml"
   workflow_dispatch:
 
 permissions:
   contents: read
-  pull-requests: write  # Required to comment on PRs
-  issues: write         # Required to comment on PRs (PRs are issues)
+  pull-requests: write # Required to comment on PRs
+  issues: write # Required to comment on PRs (PRs are issues)
 
 jobs:
   component-benchmarks:
@@ -24,6 +24,8 @@ jobs:
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Need full history for baseline comparison
 
       - name: Set up Go
         uses: actions/setup-go@v5
@@ -70,13 +72,14 @@ jobs:
       - name: Build Rust library (CPU-only)
         run: make rust-ci
 
-      - name: Install HuggingFace CLI
+      - name: Install Model Manager dependencies
         run: |
-          pip install -U "huggingface_hub[cli]" hf_transfer
+          pip install -r src/model_manager/requirements.txt
 
       - name: Download models (minimal)
         env:
           CI_MINIMAL_MODELS: true
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
         run: make download-models
@@ -117,7 +120,7 @@ jobs:
 
       - name: Comment PR with results
         if: github.event_name == 'pull_request'
-        continue-on-error: true  # May fail for PRs from forks due to GitHub security restrictions
+        continue-on-error: true # May fail for PRs from forks due to GitHub security restrictions
         uses: actions/github-script@v7
         with:
           script: |

@@ -34,7 +34,6 @@ jobs:
           || needs.changes.outputs.make == 'true'
           || needs.changes.outputs.ci   == 'true') }}
     runs-on: ubuntu-latest
-
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
@@ -86,11 +85,11 @@ jobs:
           # This helps prevent "no space left on device" errors
           echo "Disk space before setup:"
           df -h / && df -h /mnt
-          
+
           # Create /mnt/models directory if it doesn't exist
           sudo mkdir -p /mnt/models
           sudo chown -R $USER:$USER /mnt/models
-          
+
           # If models directory already exists in workspace, move it to /mnt
           if [ -d "models" ] && [ ! -L "models" ]; then
             echo "Moving existing models directory to /mnt/models..."
@@ -103,7 +102,7 @@ jobs:
               sudo mv models /mnt/models
             fi
           fi
-          
+
           # Create symlink from models/ to /mnt/models/ so existing code continues to work
           if [ ! -e "models" ]; then
             ln -s /mnt/models models
@@ -113,7 +112,7 @@ jobs:
           else
             echo "Warning: models exists but is not a symlink"
           fi
-          
+
           echo "Disk space after setup:"
           df -h / && df -h /mnt
           echo "Models directory setup complete. Models will be stored in /mnt/models"
@@ -134,13 +133,14 @@ jobs:
       - name: Build Rust library (CPU-only, no CUDA)
         run: make rust-ci
 
-      - name: Install HuggingFace CLI
+      - name: Install Model Manager dependencies
         run: |
-          pip install -U "huggingface_hub[cli]" hf_transfer
+          pip install -r src/model_manager/requirements.txt
 
       - name: Download models (minimal on PRs)
         env:
           CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
         run: make download-models
@@ -169,7 +169,7 @@ jobs:
 
           echo "Milvus is ready at localhost:19530"
           docker ps --filter "name=milvus-semantic-cache"
-      
+
       - name: Start Redis service
         run: |
           echo "Starting Redis Stack..."

@@ -2,7 +2,7 @@
 # This file contains only LoRA adapter models for incremental CI downloads.
 #
 # Usage:
-#   python -m model_manager --config config/models.lora.yaml
+#   python -m model_manager --config config/model_manager/models.lora.yaml
 #
 # Equivalent to: make download-models-lora
 

@@ -5,12 +5,13 @@
 # Usage:
 #   CI_MINIMAL_MODELS=true python -m model_manager
 #   # or explicitly:
-#   python -m model_manager --config config/models.minimal.yaml
+#   python -m model_manager --config config/model_manager/models.minimal.yaml
 #
 # Equivalent to: make download-models-minimal
 #             or CI_MINIMAL_MODELS=true make download-models
 #
-# Note: embeddinggemma-300m is gated and requires HF_TOKEN, so it's excluded.
+# Note: This is the minimal set for fast CI runs. Larger or gated models such as
+# embeddinggemma-300m (requires HF_TOKEN) are in models.yaml (full set).
 
 cache_dir: "models"
 verify: "size" # Use size for faster CI runs

@@ -3,7 +3,7 @@
 #
 # Usage:
 #   python -m model_manager
-#   python -m model_manager --config config/models.yaml
+#   python -m model_manager --config config/model_manager/models.yaml
 #
 # Includes additional LoRA variants (roberta, modernbert) and gated models not in minimal set.
 #