From 56c6192828974b58d93909e024ee15a9c949cd5f Mon Sep 17 00:00:00 2001 From: alisonshao Date: Mon, 1 Dec 2025 15:32:07 -0800 Subject: [PATCH 1/2] Fix validation to detect missing model files before loading ## Problem The current validation logic only checks files that are found by glob pattern matching. If a model's snapshot directory exists with an index file but actual weight files are missing (due to incomplete downloads or cache corruption), the validation passes and claims "Found local HF snapshot", then crashes with FileNotFoundError when trying to load the missing files. Example from CI: ``` [TP0] Found local HF snapshot for openai/gpt-oss-120b at /hf_home/hub/models--openai--gpt-oss-120b/snapshots/... FileNotFoundError: No such file or directory: .../model-00000-of-00014.safetensors ``` The issue is that glob only finds files that exist on disk. If files are missing entirely, they're never validated, so the system doesn't know they should exist. ## Solution Added `_check_index_files_exist()` function that: 1. Reads the safetensors index file (model.safetensors.index.json) 2. Extracts the complete list of required files from the weight_map 3. Verifies that ALL files in the weight_map actually exist on disk 4. Returns validation failure with specific missing filenames if any are absent This function is integrated into `_validate_sharded_model()` and runs before other validation checks. When files are missing, validation fails and triggers a re-download instead of crashing during load. 
def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]:
    """
    Check that every shard named in a safetensors index file exists on disk.

    Globbing the snapshot directory only finds files that are present, so a
    shard that is missing entirely (e.g., due to an incomplete download or a
    corrupted cache) is never noticed. The ``weight_map`` of each
    ``*.safetensors.index.json`` file names the shards it needs, so it is used
    here as the list of files that must exist.

    Index files and shard names are processed in sorted order so the returned
    message does not depend on directory listing order or set iteration order.

    An index file that cannot be read or parsed, or whose ``weight_map`` is not
    a mapping to file-name strings, is logged and skipped rather than reported
    as a validation failure.

    Args:
        snapshot_dir: Path to the model snapshot directory

    Returns:
        Tuple of (all_exist, error_message). ``error_message`` is ``None`` when
        nothing is missing (including when there is no index file at all);
        otherwise it names the first index with missing shards and up to three
        of the missing file names.

    Raises:
        OSError: If ``snapshot_dir`` cannot be listed.
    """
    index_names = sorted(
        name
        for name in os.listdir(snapshot_dir)
        if name.endswith(".safetensors.index.json")
    )

    # No index files means it's not a sharded model: the loop body never runs
    # and the check passes.
    for index_name in index_names:
        index_path = os.path.join(snapshot_dir, index_name)

        # Keep the try body to the read/parse step so that only I/O and
        # decoding problems are reported as an unreadable index.
        try:
            with open(index_path, encoding="utf-8") as f:
                index_data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
            logger.warning("Failed to read index file %s: %s", index_name, e)
            continue

        if not isinstance(index_data, dict):
            logger.warning(
                "Failed to read index file %s: %s",
                index_name,
                "top-level JSON value is not an object",
            )
            continue

        weight_map = index_data.get("weight_map", {})
        if not weight_map:
            continue

        if not isinstance(weight_map, dict) or not all(
            isinstance(shard, str) for shard in weight_map.values()
        ):
            logger.warning(
                "Failed to read index file %s: %s",
                index_name,
                "weight_map is not a mapping to file names",
            )
            continue

        # Many tensors map to the same shard, so de-duplicate before touching
        # the filesystem. os.path.exists() is False for a broken symlink, so
        # dangling cache links are reported as missing too.
        missing_files = sorted(
            shard
            for shard in set(weight_map.values())
            if not os.path.exists(os.path.join(snapshot_dir, shard))
        )

        if missing_files:
            ellipsis = "..." if len(missing_files) > 3 else ""
            return (
                False,
                f"Missing {len(missing_files)} file(s) from index {index_name}: "
                f"{missing_files[:3]}{ellipsis}",
            )

    return True, None