diff --git a/.github/workflows/integration-test-docker.yml b/.github/workflows/integration-test-docker.yml index 3045dd1037..84d1ae8146 100644 --- a/.github/workflows/integration-test-docker.yml +++ b/.github/workflows/integration-test-docker.yml @@ -6,14 +6,14 @@ on: branches: - main paths-ignore: - - 'website/**' - - '**/*.md' + - "website/**" + - "**/*.md" push: branches: - main paths-ignore: - - 'website/**' - - '**/*.md' + - "website/**" + - "**/*.md" workflow_dispatch: # Allow manual triggering concurrency: @@ -24,7 +24,7 @@ jobs: test-ci-compose: if: github.repository == 'vllm-project/semantic-router' && !github.event.pull_request.draft runs-on: ubuntu-latest - timeout-minutes: 20 # Reduced from 30 - CI compose is faster + timeout-minutes: 20 # Reduced from 30 - CI compose is faster steps: - name: Check out the repo @@ -36,11 +36,11 @@ jobs: # This helps prevent "no space left on device" errors echo "Disk space before setup:" df -h / && df -h /mnt - + # Create /mnt/models directory if it doesn't exist sudo mkdir -p /mnt/models sudo chown -R $USER:$USER /mnt/models - + # If models directory already exists in workspace, move it to /mnt if [ -d "models" ] && [ ! -L "models" ]; then echo "Moving existing models directory to /mnt/models..." @@ -53,7 +53,7 @@ jobs: sudo mv models /mnt/models fi fi - + # Create symlink from models/ to /mnt/models/ so existing code continues to work if [ ! -e "models" ]; then ln -s /mnt/models models @@ -63,7 +63,7 @@ jobs: else echo "Warning: models exists but is not a symlink" fi - + echo "Disk space after setup:" df -h / && df -h /mnt echo "Models directory setup complete. Models will be stored in /mnt/models" @@ -71,13 +71,13 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: "3.11" - name: Install dependencies run: | sudo apt-get update sudo apt-get install -y make curl - pip install huggingface_hub[cli] + pip install -r src/model_manager/requirements.txt - name: Download models run: | @@ -86,6 +86,7 @@ jobs: env: CI: true CI_MINIMAL_MODELS: true + HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 @@ -101,7 +102,7 @@ jobs: echo "Waiting for services to be healthy..." max_attempts=60 attempt=1 - + while [ $attempt -le $max_attempts ]; do echo "Attempt $attempt/$max_attempts: Checking service health..." @@ -128,7 +129,7 @@ jobs: sleep 5 ((attempt++)) done - + echo "❌ Timeout waiting for services to be healthy" docker ps -a exit 1 @@ -173,7 +174,7 @@ jobs: }') echo "Response: $response" - + # Verify we got a response if echo "$response" | grep -q "choices"; then echo "✅ Chat completions test passed" diff --git a/.github/workflows/performance-nightly.yml b/.github/workflows/performance-nightly.yml index 4a3979d6da..145deff849 100644 --- a/.github/workflows/performance-nightly.yml +++ b/.github/workflows/performance-nightly.yml @@ -63,13 +63,14 @@ jobs: - name: Build Rust library (CPU-only) run: make rust-ci - - name: Install HuggingFace CLI + - name: Install Model Manager dependencies run: | - pip install -U "huggingface_hub[cli]" hf_transfer + pip install -r src/model_manager/requirements.txt - name: Download models (minimal set for nightly) env: - CI_MINIMAL_MODELS: true + CI_MINIMAL_MODELS: false + HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 run: make download-models diff --git a/.github/workflows/performance-test.yml b/.github/workflows/performance-test.yml index 2e303a772b..c47b0ab2c7 100644 --- a/.github/workflows/performance-test.yml +++ b/.github/workflows/performance-test.yml @@ -5,16 +5,16 @@ on: branches: - main paths: - - 'src/semantic-router/**' - - 'candle-binding/**' - - 'perf/**' - - '.github/workflows/performance-test.yml' + - "src/semantic-router/**" + - "candle-binding/**" + - "perf/**" + - ".github/workflows/performance-test.yml" workflow_dispatch: permissions: contents: read - pull-requests: write # Required to comment on PRs - issues: write # Required to comment on PRs (PRs are issues) + pull-requests: write # Required to comment on PRs + issues: write # Required to comment on PRs (PRs are issues) jobs: component-benchmarks: @@ -24,6 +24,8 @@ jobs: steps: - name: Check out the repo uses: actions/checkout@v4 + with: + fetch-depth: 0 # Need full history for baseline comparison - name: Set up Go uses: actions/setup-go@v5 @@ -70,13 +72,14 @@ jobs: - name: Build Rust library (CPU-only) run: make rust-ci - - name: Install HuggingFace CLI + - name: Install Model Manager dependencies run: | - pip install -U "huggingface_hub[cli]" hf_transfer + pip install -r src/model_manager/requirements.txt - name: Download models (minimal) env: CI_MINIMAL_MODELS: true + HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 run: make download-models @@ -117,7 +120,7 @@ jobs: - name: Comment PR with results if: github.event_name == 'pull_request' - continue-on-error: true # May fail for PRs from forks due to GitHub security restrictions + continue-on-error: true # May fail for PRs from forks due to GitHub security restrictions uses: actions/github-script@v7 with: script: | diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml index db3a337a52..fd74ab5932 100644 --- a/.github/workflows/test-and-build.yml +++ b/.github/workflows/test-and-build.yml @@ -34,7 +34,6 @@ jobs: || needs.changes.outputs.make == 'true' || needs.changes.outputs.ci == 'true') }} runs-on: ubuntu-latest - steps: - name: Check out the repo uses: actions/checkout@v4 @@ -86,11 +85,11 @@ jobs: # This helps prevent "no space left on device" errors echo "Disk space before setup:" df -h / && df -h /mnt - + # Create /mnt/models directory if it doesn't exist sudo mkdir -p /mnt/models sudo chown -R $USER:$USER /mnt/models - + # If models directory already exists in workspace, move it to /mnt if [ -d "models" ] && [ ! -L "models" ]; then echo "Moving existing models directory to /mnt/models..." @@ -103,7 +102,7 @@ jobs: sudo mv models /mnt/models fi fi - + # Create symlink from models/ to /mnt/models/ so existing code continues to work if [ ! -e "models" ]; then ln -s /mnt/models models @@ -113,7 +112,7 @@ jobs: else echo "Warning: models exists but is not a symlink" fi - + echo "Disk space after setup:" df -h / && df -h /mnt echo "Models directory setup complete. Models will be stored in /mnt/models" @@ -134,13 +133,14 @@ jobs: - name: Build Rust library (CPU-only, no CUDA) run: make rust-ci - - name: Install HuggingFace CLI + - name: Install Model Manager dependencies run: | - pip install -U "huggingface_hub[cli]" hf_transfer + pip install -r src/model_manager/requirements.txt - name: Download models (minimal on PRs) env: CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 run: make download-models @@ -169,7 +169,7 @@ jobs: echo "Milvus is ready at localhost:19530" docker ps --filter "name=milvus-semantic-cache" - + - name: Start Redis service run: | echo "Starting Redis Stack..." diff --git a/config/models.lora.yaml b/config/model_manager/models.lora.yaml similarity index 93% rename from config/models.lora.yaml rename to config/model_manager/models.lora.yaml index 9fdd23f01a..22375f144f 100644 --- a/config/models.lora.yaml +++ b/config/model_manager/models.lora.yaml @@ -2,7 +2,7 @@ # This file contains only LoRA adapter models for incremental CI downloads. # # Usage: -# python -m model_manager --config config/models.lora.yaml +# python -m model_manager --config config/model_manager/models.lora.yaml # # Equivalent to: make download-models-lora diff --git a/config/models.minimal.yaml b/config/model_manager/models.minimal.yaml similarity index 92% rename from config/models.minimal.yaml rename to config/model_manager/models.minimal.yaml index c14c017e3b..cff4b5d06f 100644 --- a/config/models.minimal.yaml +++ b/config/model_manager/models.minimal.yaml @@ -5,12 +5,13 @@ # Usage: # CI_MINIMAL_MODELS=true python -m model_manager # # or explicitly: -# python -m model_manager --config config/models.minimal.yaml +# python -m model_manager --config config/model_manager/models.minimal.yaml # # Equivalent to: make download-models-minimal # or CI_MINIMAL_MODELS=true make download-models # -# Note: embeddinggemma-300m is gated and requires HF_TOKEN, so it's excluded. +# Note: This is the minimal set for fast CI runs. Larger models like +# embeddinggemma-300m are in models.yaml (full set) for local development. cache_dir: "models" verify: "size" # Use size for faster CI runs diff --git a/config/models.yaml b/config/model_manager/models.yaml similarity index 98% rename from config/models.yaml rename to config/model_manager/models.yaml index 6ad03a0eac..fd32b268bf 100644 --- a/config/models.yaml +++ b/config/model_manager/models.yaml @@ -3,7 +3,7 @@ # # Usage: # python -m model_manager -# python -m model_manager --config config/models.yaml +# python -m model_manager --config config/model_manager/models.yaml # # Includes additional LoRA variants (roberta, modernbert) and gated models not in minimal set. # diff --git a/src/model_manager/README.md b/src/model_manager/README.md new file mode 100644 index 0000000000..5cfe6232bf --- /dev/null +++ b/src/model_manager/README.md @@ -0,0 +1,224 @@ +# Model Manager + +A Python module for automated ML model download, verification, and caching from HuggingFace. + +## Features + +- **Automated Download**: Download models from HuggingFace Hub with support for specific revisions and file filtering +- **Integrity Verification**: Verify model integrity using size or SHA256 checksums +- **Smart Caching**: Skip downloads for already-cached models +- **CI-Friendly**: Environment variable-based config selection for minimal CI downloads +- **CLI & Programmatic API**: Use from command line or import as a Python module + +## Quick Start + +### CLI Usage + +```bash +# Download all models (uses config/model_manager/models.yaml by default) +python -m model_manager + +# Use a specific config file +python -m model_manager --config config/model_manager/models.yaml + +# Download a specific model only +python -m model_manager --model category_classifier_modernbert-base_model + +# List all configured models and their cache status +python -m model_manager --list + +# Verify existing models without downloading +python -m model_manager --verify-only + +# Clean all cached models (from default config) +python -m model_manager --clean + +# Clean all models from a specific config +python -m model_manager --config config/model_manager/models.yaml --clean + +# Clean a specific model +python -m model_manager --clean-model category_classifier_modernbert-base_model + +# Verbose output for debugging +python -m model_manager -v + +# Show help message +python -m model_manager --help +``` + +### CI Mode + +For CI environments, use the minimal model set to speed up builds and avoid rate limits: + +```bash +# Auto-select minimal config +CI_MINIMAL_MODELS=true python -m model_manager + +# Or explicitly specify +python -m model_manager --config config/model_manager/models.minimal.yaml +``` + +### Programmatic Usage + +```python +from model_manager import ensure_models, ModelManager + +# Simple: ensure all models are downloaded +ensure_models("config/model_manager/models.yaml") + +# Advanced: use ModelManager for more control +manager = ModelManager.from_config("config/model_manager/models.yaml") + +# Ensure all models +paths = manager.ensure_all() +print(paths) # {'model_id': '/path/to/model', ...} + +# Ensure a specific model +path = manager.ensure_model("category_classifier_modernbert-base_model") + +# Check if a model is cached +from model_manager import is_cached, get_model_path +spec = manager.get_model_spec("category_classifier_modernbert-base_model") +if is_cached(spec, manager.config.cache_dir): + print(f"Model cached at: {get_model_path(spec, manager.config.cache_dir)}") +``` + +## Configuration + +Models are defined in YAML configuration files. See `config/model_manager/` for examples. + +### Configuration Schema + +```yaml +# Cache directory for downloaded models +cache_dir: "models" + +# Verification level: "none", "size", or "sha256" +verify: "sha256" + +# List of models to manage +models: + - id: my-model # Unique identifier (required) + repo_id: org/model-name # HuggingFace repo ID (required) + revision: main # Git revision (optional, default: main) + local_dir: custom-name # Override local directory name (optional) + files: # Specific files to download (optional) + - config.json + - model.safetensors +``` + +### Verification Levels + +| Level | Description | Use Case | +| -------- | -------------------- | ------------------------- | +| `none` | No verification | Fast, trust HuggingFace | +| `size` | Verify file sizes | Quick verification for CI | +| `sha256` | Full SHA256 checksum | Production environments | + +### Available Configs + +| Config | Description | +| --------------------- | -------------------------------------------- | +| `models.yaml` | Full model set for local development | +| `models.minimal.yaml` | Minimal set for CI (faster, no gated models) | +| `models.lora.yaml` | LoRA adapters only | + +## Environment Variables + +| Variable | Description | +| --------------------------- | --------------------------------------------------------------------------------------- | +| `CI_MINIMAL_MODELS` | Set to `true`, `1`, or `yes` to auto-select minimal config | +| `HF_TOKEN` | HuggingFace token for gated models (e.g., `embeddinggemma-300m`) | +| `HF_ENDPOINT` | A HuggingFace mirror endpoint for accelerated downloads (e.g., `https://hf-mirror.com`) | +| `HF_HUB_ENABLE_HF_TRANSFER` | Set to `1` to enable faster downloads using `hf_transfer` (enabled by default in CI) | + +## API Reference + +### Functions + +#### `ensure_models(config_path, cache_dir=None)` + +Main entry point. Downloads and verifies all models from config. + +#### `is_cached(spec, cache_dir)` + +Check if a model is already cached. + +#### `get_model_path(spec, cache_dir)` + +Get the local path for a model. + +#### `download_model(spec, cache_dir)` + +Download a model from HuggingFace. + +#### `verify_model(path, verify_level)` + +Verify model integrity. + +### Classes + +#### `ModelManager` + +Central manager for model operations. + +- `from_config(config_path)` - Create from config file +- `ensure_all()` - Ensure all models are ready +- `ensure_model(model_id)` - Ensure a specific model +- `get_model_spec(model_id)` - Get model specification + +#### `ModelSpec` + +Specification for a single model. + +- `id` - Unique identifier +- `repo_id` - HuggingFace repository ID +- `revision` - Git revision (default: "main") +- `local_dir` - Override local directory name +- `files` - Specific files to download + +#### `ModelsConfig` + +Configuration container. + +- `models` - List of ModelSpec +- `cache_dir` - Cache directory path +- `verify` - Verification level + +### Exceptions + +| Exception | Description | +| -------------------- | ------------------------- | +| `ModelManagerError` | Base exception | +| `MissingModelError` | Model not found in config | +| `BadChecksumError` | Verification failed | +| `DownloadError` | Download failed | +| `ConfigurationError` | Invalid config | + +## Development + +### Running Tests + +```bash +# Run all model_manager tests +pytest src/model_manager/tests/ -v + +# Run with coverage +pytest src/model_manager/tests/ --cov=model_manager +``` + +### Project Structure + +``` +src/model_manager/ +├── __init__.py # Public API and ModelManager class +├── __main__.py # CLI entrypoint +├── cli.py # CLI implementation +├── config.py # Configuration dataclasses +├── registry.py # YAML config parsing +├── downloader.py # HuggingFace download logic +├── verifier.py # Integrity verification +├── cache.py # Cache management +├── errors.py # Custom exceptions +└── tests/ # Unit tests +``` diff --git a/src/model_manager/cli.py b/src/model_manager/cli.py index 2df78b843b..e79274e893 100644 --- a/src/model_manager/cli.py +++ b/src/model_manager/cli.py @@ -24,8 +24,8 @@ def get_default_config() -> str: """Get default config path based on CI_MINIMAL_MODELS environment variable.""" ci_minimal = os.environ.get("CI_MINIMAL_MODELS", "").lower() if ci_minimal in ("true", "1", "yes"): - return "config/models.minimal.yaml" - return "config/models.yaml" + return "config/model_manager/models.minimal.yaml" + return "config/model_manager/models.yaml" def setup_logging(verbose: bool = False) -> None: diff --git a/src/model_manager/requirements.txt b/src/model_manager/requirements.txt index 679eb802a6..5c76aa24f3 100644 --- a/src/model_manager/requirements.txt +++ b/src/model_manager/requirements.txt @@ -2,5 +2,9 @@ huggingface_hub>=0.20.0 pyyaml>=6.0 +# Optional performance enhancement for faster downloads +# Automatically used by huggingface_hub when HF_HUB_ENABLE_HF_TRANSFER=1 +hf_transfer>=0.1.0 + # Development/test dependencies pytest>=7.0.0 diff --git a/src/model_manager/tests/test_cli.py b/src/model_manager/tests/test_cli.py index 5aa7d01725..c69dce93d8 100644 --- a/src/model_manager/tests/test_cli.py +++ b/src/model_manager/tests/test_cli.py @@ -18,37 +18,37 @@ def test_default_config_without_env(self): with patch("os.environ.get") as mock_get: mock_get.return_value = "" result = get_default_config() - assert result == "config/models.yaml" + assert result == "config/model_manager/models.yaml" def test_minimal_config_with_true(self): """Test that minimal config is returned when CI_MINIMAL_MODELS=true.""" with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "true"}): result = get_default_config() - assert result == "config/models.minimal.yaml" + assert result == "config/model_manager/models.minimal.yaml" def test_minimal_config_with_1(self): """Test that minimal config is returned when CI_MINIMAL_MODELS=1.""" with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "1"}): result = get_default_config() - assert result == "config/models.minimal.yaml" + assert result == "config/model_manager/models.minimal.yaml" def test_minimal_config_with_yes(self): """Test that minimal config is returned when CI_MINIMAL_MODELS=yes.""" with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "yes"}): result = get_default_config() - assert result == "config/models.minimal.yaml" + assert result == "config/model_manager/models.minimal.yaml" def test_minimal_config_case_insensitive(self): """Test that CI_MINIMAL_MODELS check is case insensitive.""" with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "TRUE"}): result = get_default_config() - assert result == "config/models.minimal.yaml" + assert result == "config/model_manager/models.minimal.yaml" def test_default_config_with_false(self): """Test that default config is returned when CI_MINIMAL_MODELS=false.""" with patch.dict(os.environ, {"CI_MINIMAL_MODELS": "false"}): result = get_default_config() - assert result == "config/models.yaml" + assert result == "config/model_manager/models.yaml" class TestMainCLI: diff --git a/tools/make/models.mk b/tools/make/models.mk index f4dd3d0dd8..91192a2cc4 100644 --- a/tools/make/models.mk +++ b/tools/make/models.mk @@ -6,10 +6,10 @@ # CI_MINIMAL_MODELS=true will download only the minimal set of models required for tests. # Default behavior downloads the full set used for local development. -# Model configurations are moved to config directory -# full set: config/models.yaml -# minimal set: config/models.minimal.yaml -# LoRA and advanced embedding models: config/models.lora.yaml +# Model configurations are in config/model_manager directory +# full set: config/model_manager/models.yaml +# minimal set: config/model_manager/models.minimal.yaml +# LoRA and advanced embedding models: config/model_manager/models.lora.yaml download-models: download-models: ## Download models (full or minimal set depending on CI_MINIMAL_MODELS) @@ -22,24 +22,24 @@ download-models: ## Download models (full or minimal set depending on CI_MINIMAL download-models-minimal: download-models-minimal: ## Pre-download minimal set of models for CI tests @mkdir -p models - PYTHONPATH=src python -m model_manager --config config/models.minimal.yaml + PYTHONPATH=src python -m model_manager --config config/model_manager/models.minimal.yaml # Explicit full download download-models-full: download-models-full: ## Download all models used in local development and docs @mkdir -p models - PYTHONPATH=src python -m model_manager --config config/models.yaml + PYTHONPATH=src python -m model_manager --config config/model_manager/models.yaml # Download only LoRA and advanced embedding models (for CI after minimal tests) download-models-lora: download-models-lora: ## Download LoRA adapters and advanced embedding models only @mkdir -p models - PYTHONPATH=src python -m model_manager --config config/models.lora.yaml + PYTHONPATH=src python -m model_manager --config config/model_manager/models.lora.yaml # Clean up minimal models to save disk space (for CI) clean-minimal-models: ## Remove minimal models to save disk space @echo "Cleaning up minimal models to save disk space..." - PYTHONPATH=src python -m model_manager --config config/models.minimal.yaml --clean + PYTHONPATH=src python -m model_manager --config config/model_manager/models.minimal.yaml --clean # List configured models list-models: ## List models configured in the default or specified environment