diff --git a/.github/workflows/integration-test-docker.yml b/.github/workflows/integration-test-docker.yml
index 3045dd1037..84d1ae8146 100644
--- a/.github/workflows/integration-test-docker.yml
+++ b/.github/workflows/integration-test-docker.yml
@@ -6,14 +6,14 @@ on:
     branches:
       - main
     paths-ignore:
-      - 'website/**'
-      - '**/*.md'
+      - "website/**"
+      - "**/*.md"
   push:
     branches:
       - main
     paths-ignore:
-      - 'website/**'
-      - '**/*.md'
+      - "website/**"
+      - "**/*.md"
   workflow_dispatch: # Allow manual triggering
 
 concurrency:
@@ -24,7 +24,7 @@ jobs:
   test-ci-compose:
     if: github.repository == 'vllm-project/semantic-router' && !github.event.pull_request.draft
     runs-on: ubuntu-latest
-    timeout-minutes: 20  # Reduced from 30 - CI compose is faster
+    timeout-minutes: 20 # Reduced from 30 - CI compose is faster
 
     steps:
       - name: Check out the repo
@@ -36,11 +36,11 @@ jobs:
           # This helps prevent "no space left on device" errors
           echo "Disk space before setup:"
           df -h / && df -h /mnt
-          
+
           # Create /mnt/models directory if it doesn't exist
           sudo mkdir -p /mnt/models
           sudo chown -R $USER:$USER /mnt/models
-          
+
           # If models directory already exists in workspace, move it to /mnt
           if [ -d "models" ] && [ ! -L "models" ]; then
             echo "Moving existing models directory to /mnt/models..."
@@ -53,7 +53,7 @@ jobs:
               sudo mv models /mnt/models
             fi
           fi
-          
+
           # Create symlink from models/ to /mnt/models/ so existing code continues to work
           if [ ! -e "models" ]; then
             ln -s /mnt/models models
@@ -63,7 +63,7 @@ jobs:
           else
             echo "Warning: models exists but is not a symlink"
           fi
-          
+
           echo "Disk space after setup:"
           df -h / && df -h /mnt
           echo "Models directory setup complete. Models will be stored in /mnt/models"
@@ -71,13 +71,13 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.11'
+          python-version: "3.11"
 
       - name: Install dependencies
         run: |
           sudo apt-get update
           sudo apt-get install -y make curl
-          pip install huggingface_hub[cli]
+          pip install -r src/model_manager/requirements.txt
 
       - name: Download models
         run: |
@@ -86,6 +86,7 @@ jobs:
         env:
           CI: true
           CI_MINIMAL_MODELS: true
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
 
@@ -101,7 +102,7 @@ jobs:
           echo "Waiting for services to be healthy..."
           max_attempts=60
           attempt=1
-          
+
           while [ $attempt -le $max_attempts ]; do
             echo "Attempt $attempt/$max_attempts: Checking service health..."
             
@@ -128,7 +129,7 @@ jobs:
             sleep 5
             ((attempt++))
           done
-          
+
           echo "❌ Timeout waiting for services to be healthy"
           docker ps -a
           exit 1
@@ -173,7 +174,7 @@ jobs:
             }')
 
           echo "Response: $response"
-          
+
           # Verify we got a response
           if echo "$response" | grep -q "choices"; then
             echo "✅ Chat completions test passed"
diff --git a/.github/workflows/performance-nightly.yml b/.github/workflows/performance-nightly.yml
index 4a3979d6da..145deff849 100644
--- a/.github/workflows/performance-nightly.yml
+++ b/.github/workflows/performance-nightly.yml
@@ -63,13 +63,14 @@ jobs:
       - name: Build Rust library (CPU-only)
         run: make rust-ci
 
-      - name: Install HuggingFace CLI
+      - name: Install Model Manager dependencies
         run: |
-          pip install -U "huggingface_hub[cli]" hf_transfer
+          pip install -r src/model_manager/requirements.txt
 
       - name: Download models (minimal set for nightly)
         env:
-          CI_MINIMAL_MODELS: true
+          CI_MINIMAL_MODELS: false # Selects the full model config; NOTE(review): step name still says "minimal set" - confirm and rename
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
         run: make download-models
diff --git a/.github/workflows/performance-test.yml b/.github/workflows/performance-test.yml
index 2e303a772b..c47b0ab2c7 100644
--- a/.github/workflows/performance-test.yml
+++ b/.github/workflows/performance-test.yml
@@ -5,16 +5,16 @@ on:
     branches:
       - main
     paths:
-      - 'src/semantic-router/**'
-      - 'candle-binding/**'
-      - 'perf/**'
-      - '.github/workflows/performance-test.yml'
+      - "src/semantic-router/**"
+      - "candle-binding/**"
+      - "perf/**"
+      - ".github/workflows/performance-test.yml"
   workflow_dispatch:
 
 permissions:
   contents: read
-  pull-requests: write  # Required to comment on PRs
-  issues: write         # Required to comment on PRs (PRs are issues)
+  pull-requests: write # Required to comment on PRs
+  issues: write # Required to comment on PRs (PRs are issues)
 
 jobs:
   component-benchmarks:
@@ -24,6 +24,8 @@ jobs:
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Need full history for baseline comparison
 
       - name: Set up Go
         uses: actions/setup-go@v5
@@ -70,13 +72,14 @@ jobs:
       - name: Build Rust library (CPU-only)
         run: make rust-ci
 
-      - name: Install HuggingFace CLI
+      - name: Install Model Manager dependencies
         run: |
-          pip install -U "huggingface_hub[cli]" hf_transfer
+          pip install -r src/model_manager/requirements.txt
 
       - name: Download models (minimal)
         env:
           CI_MINIMAL_MODELS: true
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
         run: make download-models
@@ -117,7 +120,7 @@ jobs:
 
       - name: Comment PR with results
         if: github.event_name == 'pull_request'
-        continue-on-error: true  # May fail for PRs from forks due to GitHub security restrictions
+        continue-on-error: true # May fail for PRs from forks due to GitHub security restrictions
         uses: actions/github-script@v7
         with:
           script: |
diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml
index db3a337a52..fd74ab5932 100644
--- a/.github/workflows/test-and-build.yml
+++ b/.github/workflows/test-and-build.yml
@@ -34,7 +34,6 @@ jobs:
           || needs.changes.outputs.make == 'true'
           || needs.changes.outputs.ci   == 'true') }}
     runs-on: ubuntu-latest
-
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
@@ -86,11 +85,11 @@ jobs:
           # This helps prevent "no space left on device" errors
           echo "Disk space before setup:"
           df -h / && df -h /mnt
-          
+
           # Create /mnt/models directory if it doesn't exist
           sudo mkdir -p /mnt/models
           sudo chown -R $USER:$USER /mnt/models
-          
+
           # If models directory already exists in workspace, move it to /mnt
           if [ -d "models" ] && [ ! -L "models" ]; then
             echo "Moving existing models directory to /mnt/models..."
@@ -103,7 +102,7 @@ jobs:
               sudo mv models /mnt/models
             fi
           fi
-          
+
           # Create symlink from models/ to /mnt/models/ so existing code continues to work
           if [ ! -e "models" ]; then
             ln -s /mnt/models models
@@ -113,7 +112,7 @@ jobs:
           else
             echo "Warning: models exists but is not a symlink"
           fi
-          
+
           echo "Disk space after setup:"
           df -h / && df -h /mnt
           echo "Models directory setup complete. Models will be stored in /mnt/models"
@@ -134,13 +133,14 @@ jobs:
       - name: Build Rust library (CPU-only, no CUDA)
         run: make rust-ci
 
-      - name: Install HuggingFace CLI
+      - name: Install Model Manager dependencies
         run: |
-          pip install -U "huggingface_hub[cli]" hf_transfer
+          pip install -r src/model_manager/requirements.txt
 
       - name: Download models (minimal on PRs)
         env:
           CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
         run: make download-models
@@ -169,7 +169,7 @@ jobs:
 
           echo "Milvus is ready at localhost:19530"
           docker ps --filter "name=milvus-semantic-cache"
-      
+
       - name: Start Redis service
         run: |
           echo "Starting Redis Stack..."
diff --git a/config/models.lora.yaml b/config/model_manager/models.lora.yaml
similarity index 93%
rename from config/models.lora.yaml
rename to config/model_manager/models.lora.yaml
index 9fdd23f01a..22375f144f 100644
--- a/config/models.lora.yaml
+++ b/config/model_manager/models.lora.yaml
@@ -2,7 +2,7 @@
 # This file contains only LoRA adapter models for incremental CI downloads.
 #
 # Usage:
-#   python -m model_manager --config config/models.lora.yaml
+#   python -m model_manager --config config/model_manager/models.lora.yaml
 #
 # Equivalent to: make download-models-lora
 
diff --git a/config/models.minimal.yaml b/config/model_manager/models.minimal.yaml
similarity index 92%
rename from config/models.minimal.yaml
rename to config/model_manager/models.minimal.yaml
index c14c017e3b..cff4b5d06f 100644
--- a/config/models.minimal.yaml
+++ b/config/model_manager/models.minimal.yaml
@@ -5,12 +5,13 @@
 # Usage:
 #   CI_MINIMAL_MODELS=true python -m model_manager
 #   # or explicitly:
-#   python -m model_manager --config config/models.minimal.yaml
+#   python -m model_manager --config config/model_manager/models.minimal.yaml
 #
 # Equivalent to: make download-models-minimal
 #             or CI_MINIMAL_MODELS=true make download-models
 #
-# Note: embeddinggemma-300m is gated and requires HF_TOKEN, so it's excluded.
+# Note: This is the minimal set for fast CI runs. Gated or larger models like
+# embeddinggemma-300m (needs HF_TOKEN) are in models.yaml (full set) for local development.
 
 cache_dir: "models"
 verify: "size" # Use size for faster CI runs
diff --git a/config/models.yaml b/config/model_manager/models.yaml
similarity index 98%
rename from config/models.yaml
rename to config/model_manager/models.yaml
index 6ad03a0eac..fd32b268bf 100644
--- a/config/models.yaml
+++ b/config/model_manager/models.yaml
@@ -3,7 +3,7 @@
 #
 # Usage:
 #   python -m model_manager
-#   python -m model_manager --config config/models.yaml
+#   python -m model_manager --config config/model_manager/models.yaml
 #
 # Includes additional LoRA variants (roberta, modernbert) and gated models not in minimal set.
 #
diff --git a/src/model_manager/README.md b/src/model_manager/README.md
new file mode 100644
index 0000000000..5cfe6232bf
--- /dev/null
+++ b/src/model_manager/README.md
@@ -0,0 +1,224 @@
+# Model Manager
+
+A Python module for automated ML model download, verification, and caching from HuggingFace.
+
+## Features
+
+- **Automated Download**: Download models from HuggingFace Hub with support for specific revisions and file filtering
+- **Integrity Verification**: Verify model integrity using size or SHA256 checksums
+- **Smart Caching**: Skip downloads for already-cached models
+- **CI-Friendly**: Environment variable-based config selection for minimal CI downloads
+- **CLI & Programmatic API**: Use from command line or import as a Python module
+
+## Quick Start
+
+### CLI Usage
+
+```bash
+# Download all models (uses config/model_manager/models.yaml by default)
+python -m model_manager
+
+# Use a specific config file
+python -m model_manager --config config/model_manager/models.yaml
+
+# Download a specific model only
+python -m model_manager --model category_classifier_modernbert-base_model
+
+# List all configured models and their cache status
+python -m model_manager --list
+
+# Verify existing models without downloading
+python -m model_manager --verify-only
+
+# Clean all cached models (from default config)
+python -m model_manager --clean
+
+# Clean all models from a specific config
+python -m model_manager --config config/model_manager/models.yaml --clean
+
+# Clean a specific model
+python -m model_manager --clean-model category_classifier_modernbert-base_model
+
+# Verbose output for debugging
+python -m model_manager -v
+
+# Show help message
+python -m model_manager --help
+```
+
+### CI Mode
+
+For CI environments, use the minimal model set to speed up builds and avoid rate limits:
+
+```bash
+# Auto-select minimal config
+CI_MINIMAL_MODELS=true python -m model_manager
+
+# Or explicitly specify
+python -m model_manager --config config/model_manager/models.minimal.yaml
+```
+
+### Programmatic Usage
+
+```python
+from model_manager import ensure_models, ModelManager
+
+# Simple: ensure all models are downloaded
+ensure_models("config/model_manager/models.yaml")
+
+# Advanced: use ModelManager for more control
+manager = ModelManager.from_config("config/model_manager/models.yaml")
+
+# Ensure all models
+paths = manager.ensure_all()
+print(paths)  # {'model_id': '/path/to/model', ...}
+
+# Ensure a specific model
+path = manager.ensure_model("category_classifier_modernbert-base_model")
+
+# Check if a model is cached
+from model_manager import is_cached, get_model_path
+spec = manager.get_model_spec("category_classifier_modernbert-base_model")
+if is_cached(spec, manager.config.cache_dir):
+    print(f"Model cached at: {get_model_path(spec, manager.config.cache_dir)}")
+```
+
+## Configuration
+
+Models are defined in YAML configuration files. See `config/model_manager/` for examples.
+
+### Configuration Schema
+
+```yaml
+# Cache directory for downloaded models
+cache_dir: "models"
+
+# Verification level: "none", "size", or "sha256"
+verify: "sha256"
+
+# List of models to manage
+models:
+  - id: my-model # Unique identifier (required)
+    repo_id: org/model-name # HuggingFace repo ID (required)
+    revision: main # Git revision (optional, default: main)
+    local_dir: custom-name # Override local directory name (optional)
+    files: # Specific files to download (optional)
+      - config.json
+      - model.safetensors
+```
+
+### Verification Levels
+
+| Level    | Description          | Use Case                  |
+| -------- | -------------------- | ------------------------- |
+| `none`   | No verification      | Fast, trust HuggingFace   |
+| `size`   | Verify file sizes    | Quick verification for CI |
+| `sha256` | Full SHA256 checksum | Production environments   |
+
+### Available Configs
+
+| Config                | Description                                  |
+| --------------------- | -------------------------------------------- |
+| `models.yaml`         | Full model set for local development         |
+| `models.minimal.yaml` | Minimal set for CI (faster, no gated models) |
+| `models.lora.yaml`    | LoRA adapters only                           |
+
+## Environment Variables
+
+| Variable                    | Description                                                                             |
+| --------------------------- | --------------------------------------------------------------------------------------- |
+| `CI_MINIMAL_MODELS`         | Set to `true`, `1`, or `yes` to auto-select minimal config                              |
+| `HF_TOKEN`                  | HuggingFace token for gated models (e.g., `embeddinggemma-300m`)                        |
+| `HF_ENDPOINT`               | A HuggingFace mirror endpoint for accelerated downloads (e.g., `https://hf-mirror.com`) |
+| `HF_HUB_ENABLE_HF_TRANSFER` | Set to `1` to enable faster downloads using `hf_transfer` (enabled by default in CI)    |
+
+## API Reference
+
+### Functions
+
+#### `ensure_models(config_path, cache_dir=None)`
+
+Main entry point. Downloads and verifies all models from config.
+
+#### `is_cached(spec, cache_dir)`
+
+Check if a model is already cached.
+
+#### `get_model_path(spec, cache_dir)`
+
+Get the local path for a model.
+
+#### `download_model(spec, cache_dir)`
+
+Download a model from HuggingFace.
+
+#### `verify_model(path, verify_level)`
+
+Verify model integrity.
+
+### Classes
+
+#### `ModelManager`
+
+Central manager for model operations.
+
+- `from_config(config_path)` - Create from config file
+- `ensure_all()` - Ensure all models are ready
+- `ensure_model(model_id)` - Ensure a specific model
+- `get_model_spec(model_id)` - Get model specification
+
+#### `ModelSpec`
+
+Specification for a single model.
+
+- `id` - Unique identifier
+- `repo_id` - HuggingFace repository ID
+- `revision` - Git revision (default: "main")
+- `local_dir` - Override local directory name
+- `files` - Specific files to download
+
+#### `ModelsConfig`
+
+Configuration container.
+
+- `models` - List of ModelSpec
+- `cache_dir` - Cache directory path
+- `verify` - Verification level
+
+### Exceptions
+
+| Exception            | Description               |
+| -------------------- | ------------------------- |
+| `ModelManagerError`  | Base exception            |
+| `MissingModelError`  | Model not found in config |
+| `BadChecksumError`   | Verification failed       |
+| `DownloadError`      | Download failed           |
+| `ConfigurationError` | Invalid config            |
+
+## Development
+
+### Running Tests
+
+```bash
+# Run all model_manager tests
+pytest src/model_manager/tests/ -v
+
+# Run with coverage
+pytest src/model_manager/tests/ --cov=model_manager
+```
+
+### Project Structure
+
+```text
+src/model_manager/
+├── __init__.py      # Public API and ModelManager class
+├── __main__.py      # CLI entrypoint
+├── cli.py           # CLI implementation
+├── config.py        # Configuration dataclasses
+├── registry.py      # YAML config parsing
+├── downloader.py    # HuggingFace download logic
+├── verifier.py      # Integrity verification
+├── cache.py         # Cache management
+├── errors.py        # Custom exceptions
+└── tests/           # Unit tests
+```
diff --git a/src/model_manager/cli.py b/src/model_manager/cli.py
index 2df78b843b..e79274e893 100644
--- a/src/model_manager/cli.py
+++ b/src/model_manager/cli.py
@@ -24,8 +24,8 @@ def get_default_config() -> str:
     """Get default config path based on CI_MINIMAL_MODELS environment variable."""
     ci_minimal = os.environ.get("CI_MINIMAL_MODELS", "").lower()
     if ci_minimal in ("true", "1", "yes"):
-        return "config/models.minimal.yaml"
-    return "config/models.yaml"
+        return "config/model_manager/models.minimal.yaml"
+    return "config/model_manager/models.yaml"
 
 
 def setup_logging(verbose: bool = False) -> None:
diff --git a/src/model_manager/requirements.txt b/src/model_manager/requirements.txt
index 679eb802a6..5c76aa24f3 100644
--- a/src/model_manager/requirements.txt
+++ b/src/model_manager/requirements.txt
@@ -2,5 +2,9 @@
 huggingface_hub>=0.20.0
 pyyaml>=6.0
 
+# Performance enhancement for faster downloads (always installed; opt-in at runtime)
+# Automatically used by huggingface_hub when HF_HUB_ENABLE_HF_TRANSFER=1
+hf_transfer>=0.1.0
+
 # Development/test dependencies
 pytest>=7.0.0
diff --git a/src/model_manager/tests/test_cli.py b/src/model_manager/tests/test_cli.py
index 5aa7d01725..c69dce93d8 100644
--- a/src/model_manager/tests/test_cli.py
+++ b/src/model_manager/tests/test_cli.py
@@ -18,37 +18,37 @@ def test_default_config_without_env(self):
         with patch("os.environ.get") as mock_get:
             mock_get.return_value = ""
             result = get_default_config()
-            assert result == "config/models.yaml"
+            assert result == "config/model_manager/models.yaml"
 
     def test_minimal_config_with_true(self):
         """Test that minimal config is returned when CI_MINIMAL_MODELS=true."""
         with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "true"}):
             result = get_default_config()
-            assert result == "config/models.minimal.yaml"
+            assert result == "config/model_manager/models.minimal.yaml"
 
     def test_minimal_config_with_1(self):
         """Test that minimal config is returned when CI_MINIMAL_MODELS=1."""
         with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "1"}):
             result = get_default_config()
-            assert result == "config/models.minimal.yaml"
+            assert result == "config/model_manager/models.minimal.yaml"
 
     def test_minimal_config_with_yes(self):
         """Test that minimal config is returned when CI_MINIMAL_MODELS=yes."""
         with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "yes"}):
             result = get_default_config()
-            assert result == "config/models.minimal.yaml"
+            assert result == "config/model_manager/models.minimal.yaml"
 
     def test_minimal_config_case_insensitive(self):
         """Test that CI_MINIMAL_MODELS check is case insensitive."""
         with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "TRUE"}):
             result = get_default_config()
-            assert result == "config/models.minimal.yaml"
+            assert result == "config/model_manager/models.minimal.yaml"
 
     def test_default_config_with_false(self):
         """Test that default config is returned when CI_MINIMAL_MODELS=false."""
         with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "false"}):
             result = get_default_config()
-            assert result == "config/models.yaml"
+            assert result == "config/model_manager/models.yaml"
 
 
 class TestMainCLI:
diff --git a/tools/make/models.mk b/tools/make/models.mk
index f4dd3d0dd8..91192a2cc4 100644
--- a/tools/make/models.mk
+++ b/tools/make/models.mk
@@ -6,10 +6,10 @@
 
 # CI_MINIMAL_MODELS=true will download only the minimal set of models required for tests.
 # Default behavior downloads the full set used for local development.
-# Model configurations are moved to config directory
-#  full set: config/models.yaml
-#  minimal set: config/models.minimal.yaml
-#  LoRA and advanced embedding models: config/models.lora.yaml
+# Model configurations are in the config/model_manager directory
+#  full set: config/model_manager/models.yaml
+#  minimal set: config/model_manager/models.minimal.yaml
+#  LoRA and advanced embedding models: config/model_manager/models.lora.yaml
 
 download-models:
 download-models: ## Download models (full or minimal set depending on CI_MINIMAL_MODELS)
@@ -22,24 +22,24 @@ download-models: ## Download models (full or minimal set depending on CI_MINIMAL
 download-models-minimal:
 download-models-minimal: ## Pre-download minimal set of models for CI tests
 	@mkdir -p models
-	PYTHONPATH=src python -m model_manager --config config/models.minimal.yaml
+	PYTHONPATH=src python -m model_manager --config config/model_manager/models.minimal.yaml
 
 # Explicit full download
 download-models-full:
 download-models-full: ## Download all models used in local development and docs
 	@mkdir -p models
-	PYTHONPATH=src python -m model_manager --config config/models.yaml
+	PYTHONPATH=src python -m model_manager --config config/model_manager/models.yaml
 
 # Download only LoRA and advanced embedding models (for CI after minimal tests)
 download-models-lora:
 download-models-lora: ## Download LoRA adapters and advanced embedding models only
 	@mkdir -p models
-	PYTHONPATH=src python -m model_manager --config config/models.lora.yaml
+	PYTHONPATH=src python -m model_manager --config config/model_manager/models.lora.yaml
 
 # Clean up minimal models to save disk space (for CI)
 clean-minimal-models: ## Remove minimal models to save disk space
 	@echo "Cleaning up minimal models to save disk space..."
-	PYTHONPATH=src python -m model_manager --config config/models.minimal.yaml --clean
+	PYTHONPATH=src python -m model_manager --config config/model_manager/models.minimal.yaml --clean
 
 # List configured models
 list-models: ## List models configured in the default or specified environment