From b0187a3cf7e4f22d0273ae443621a20e94d59e70 Mon Sep 17 00:00:00 2001 From: alisonshao Date: Wed, 10 Dec 2025 16:06:48 -0800 Subject: [PATCH 1/2] Fix edge case for large sharded models in weight validation (#14761 follow-up) Enhances PR #14761 to handle an intermittent edge case with large sharded models (e.g., MiniMax-M2 with 130 shards, MoE models with EP parallelism). When validating sharded models, if shards are detected as missing (but not corrupted), we previously deleted the entire model cache. However, with large models running multiple TP/EP processes, this can cause issues where one process deletes files while another is still loading. The fix: for missing shards, don't delete the cache. Instead, return None to trigger snapshot_download() which will download only the missing files without disturbing existing files. This is safer for multi-process scenarios common with large model deployments. --- python/sglang/srt/model_loader/weight_utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index 82e69d88c725..b474780e113b 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -393,14 +393,17 @@ def _find_local_hf_snapshot_dir_unlocked( ) return None else: - # Cannot selectively clean (e.g., missing shards) - remove entire cache + # Missing shards (not corruption) - let snapshot_download handle it. + # IMPORTANT: Do NOT delete the entire cache here, as other processes + # (TP/EP ranks) may already be loading weights from these files. + # Deleting the cache while other processes are using it causes + # FileNotFoundError race conditions. Instead, just return None + # to trigger a download - snapshot_download will only fetch + # missing files without disturbing existing ones. log_info_on_rank0( logger, f"Validation failed for {model_name_or_path}: {error_msg}. " - "Will remove entire cache and re-download.", - ) - _cleanup_corrupted_model_cache( - model_name_or_path, found_local_snapshot_dir, error_msg + "Will attempt to download missing files.", ) return None From 604d20621b7e3749bbedff26c99814305cbadda1 Mon Sep 17 00:00:00 2001 From: alisonshao Date: Thu, 11 Dec 2025 00:03:08 -0800 Subject: [PATCH 2/2] Fix broken index symlink detection and move MiniMax-M2 test first (#14754) Changes: 1. Detect and clean up broken index symlinks (symlink exists but blob missing) - Previously this caused server timeout as validation silently continued - Now detects, cleans up broken symlink, and triggers re-download 2. Handle FileNotFoundError for index files properly - Return False instead of swallowing error and continuing 3. Add unit tests for weight validation logic - Test missing shards detection - Test corrupted shard detection - Test broken index symlink detection and cleanup 4. Move MiniMax-M2 nightly test to run first in 8-gpu-h200 job - Allows faster detection of weight download issues --- .github/workflows/nightly-test-nvidia.yml | 38 ++-- .../srt/model_loader/weight_validation.py | 27 +++ test/manual/test_weight_validation.py | 186 ++++++++++++++++++ 3 files changed, 232 insertions(+), 19 deletions(-) create mode 100644 test/manual/test_weight_validation.py diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml index f9dd81de9957..4203f3763857 100644 --- a/.github/workflows/nightly-test-nvidia.yml +++ b/.github/workflows/nightly-test-nvidia.yml @@ -115,6 +115,25 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-8-gpu-h200 --nightly --continue-on-error + - name: Run MiniMax-M2 nightly performance test + timeout-minutes: 180 + env: + TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }} + PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }} + GPU_CONFIG: "8-gpu-h200" + run: | + rm -rf test/performance_profiles_minimax_m2/ + cd test + python3 nightly/test_minimax_m2_perf.py + + - name: Publish MiniMax-M2 traces to storage repo + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + run: | + python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2 + - name: Run Qwen3-235B nightly performance test timeout-minutes: 180 env: @@ -172,25 +191,6 @@ jobs: run: | python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_glm_4_6 - - name: Run MiniMax-M2 nightly performance test - timeout-minutes: 180 - env: - TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }} - PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }} - GPU_CONFIG: "8-gpu-h200" - run: | - rm -rf test/performance_profiles_minimax_m2/ - cd test - python3 nightly/test_minimax_m2_perf.py - - - name: Publish MiniMax-M2 traces to storage repo - env: - GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} - GITHUB_RUN_ID: ${{ github.run_id }} - GITHUB_RUN_NUMBER: ${{ github.run_number }} - run: | - python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2 - # General tests - 8 GPU H20 nightly-test-general-8-gpu-h20: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-h20') diff --git a/python/sglang/srt/model_loader/weight_validation.py b/python/sglang/srt/model_loader/weight_validation.py index 3c145360d592..39e969dd9cd1 100644 --- a/python/sglang/srt/model_loader/weight_validation.py +++ b/python/sglang/srt/model_loader/weight_validation.py @@ -61,6 +61,26 @@ def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]: for index_file in index_files: index_path = os.path.join(snapshot_dir, index_file) + + # Check if index file is a broken symlink (exists in listing but blob missing) + if os.path.islink(index_path) and not os.path.exists(index_path): + # Broken symlink - clean it up so download can proceed + try: + blob_path = os.path.realpath(index_path) + os.remove(index_path) + logger.warning( + "Removed broken index symlink: %s (blob missing)", index_file + ) + # Also try to remove dangling blob reference if it somehow exists + if os.path.exists(blob_path): + os.remove(blob_path) + except Exception as e: + logger.error("Failed to remove broken symlink %s: %s", index_file, e) + return ( + False, + f"Broken index file symlink: {index_file} (cleaned up, will re-download)", + ) + try: with open(index_path) as f: index_data = json.load(f) @@ -85,6 +105,13 @@ def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]: f"Missing {len(missing_files)} file(s) from index {index_file}: {missing_files[:3]}{'...' if len(missing_files) > 3 else ''}", ) + except FileNotFoundError as e: + # Index file was listed but can't be read - could be race condition or broken state + logger.warning("Failed to read index file %s: %s", index_file, e) + return ( + False, + f"Index file {index_file} unreadable (will re-download)", + ) except Exception as e: logger.warning("Failed to read index file %s: %s", index_file, e) continue diff --git a/test/manual/test_weight_validation.py b/test/manual/test_weight_validation.py new file mode 100644 index 000000000000..59af6107eb51 --- /dev/null +++ b/test/manual/test_weight_validation.py @@ -0,0 +1,186 @@ +""" +Unit tests for weight validation and cache cleanup logic. + +Tests the fix for issue #14754 - ensuring that missing shards do not trigger +entire cache deletion, which can cause race conditions in multi-process scenarios. +""" + +import json +import os +import struct +import tempfile +import unittest + +from sglang.srt.model_loader.weight_validation import ( + _check_index_files_exist, + _validate_sharded_model, +) + + +class TestWeightValidation(unittest.TestCase): + """Tests for weight validation functions.""" + + def test_validate_sharded_model_missing_shard(self): + """ + Test that missing shards are detected correctly. + + This is the core test for issue #14754 fix: when a shard is missing, + the validation should return is_valid=False with an error message + containing "Missing", but corrupted_files should be empty (indicating + this is a missing shard issue, not a corruption issue). + + This distinction is critical because: + - Missing shards: should NOT delete cache (other processes may be using it) + - Corrupted files: should delete only the corrupted files selectively + """ + with tempfile.TemporaryDirectory() as tmpdir: + # Create partial shards (missing shard 3) + for i in [1, 2]: # Missing shard 3 + open( + os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors"), "w" + ).close() + + # Create index file + index_data = { + "weight_map": { + "layer1": "model-00001-of-00003.safetensors", + "layer2": "model-00002-of-00003.safetensors", + "layer3": "model-00003-of-00003.safetensors", + } + } + with open(os.path.join(tmpdir, "model.safetensors.index.json"), "w") as f: + json.dump(index_data, f) + + weight_files = [ + os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors") + for i in [1, 2] + ] + + is_valid, error_msg, corrupted_files = _validate_sharded_model( + tmpdir, weight_files + ) + + self.assertFalse(is_valid) + self.assertIn("Missing", error_msg) + # CRITICAL: corrupted_files should be empty for missing shards + # This is what prevents entire cache deletion + self.assertEqual(corrupted_files, []) + + def test_validate_sharded_model_all_present(self): + """Test that complete shards pass validation.""" + with tempfile.TemporaryDirectory() as tmpdir: + # Create all shards with valid safetensors header + for i in [1, 2, 3]: + filepath = os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors") + # Create a minimal valid safetensors file + # Header: 8 bytes for header size + JSON header + header = b'{"__metadata__":{}}' + header_size = len(header) + with open(filepath, "wb") as f: + f.write(struct.pack("