From b0187a3cf7e4f22d0273ae443621a20e94d59e70 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Wed, 10 Dec 2025 16:06:48 -0800
Subject: [PATCH 1/2] Fix edge case for large sharded models in weight
 validation (#14761 follow-up)

Enhances PR #14761 to handle an intermittent edge case with large sharded
models (e.g., MiniMax-M2 with 130 shards, MoE models with EP parallelism).

When validating sharded models, if shards are detected as missing (but not
corrupted), we previously deleted the entire model cache. However, with large
models running multiple TP/EP processes, this can cause issues where one
process deletes files while another is still loading.

The fix: for missing shards, don't delete the cache. Instead, return None to
trigger snapshot_download() which will download only the missing files without
disturbing existing files. This is safer for multi-process scenarios common
with large model deployments.
---
 python/sglang/srt/model_loader/weight_utils.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py
index 82e69d88c725..b474780e113b 100644
--- a/python/sglang/srt/model_loader/weight_utils.py
+++ b/python/sglang/srt/model_loader/weight_utils.py
@@ -393,14 +393,17 @@ def _find_local_hf_snapshot_dir_unlocked(
                     )
                     return None
                 else:
-                    # Cannot selectively clean (e.g., missing shards) - remove entire cache
+                    # Missing shards (not corruption) - let snapshot_download handle it.
+                    # IMPORTANT: Do NOT delete the entire cache here, as other processes
+                    # (TP/EP ranks) may already be loading weights from these files.
+                    # Deleting the cache while other processes are using it causes
+                    # FileNotFoundError race conditions. Instead, just return None
+                    # to trigger a download - snapshot_download will only fetch
+                    # missing files without disturbing existing ones.
                     log_info_on_rank0(
                         logger,
                         f"Validation failed for {model_name_or_path}: {error_msg}. "
-                        "Will remove entire cache and re-download.",
-                    )
-                    _cleanup_corrupted_model_cache(
-                        model_name_or_path, found_local_snapshot_dir, error_msg
+                        "Will attempt to download missing files.",
                     )
                     return None
 

From 604d20621b7e3749bbedff26c99814305cbadda1 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Thu, 11 Dec 2025 00:03:08 -0800
Subject: [PATCH 2/2] Fix broken index symlink detection and move MiniMax-M2
 test first (#14754)

Changes:
1. Detect and clean up broken index symlinks (symlink exists but blob missing)
   - Previously this caused server timeout as validation silently continued
   - Now detects, cleans up broken symlink, and triggers re-download

2. Handle FileNotFoundError for index files properly
   - Return False instead of swallowing error and continuing

3. Add unit tests for weight validation logic
   - Test missing shards detection
   - Test corrupted shard detection
   - Test broken index symlink detection and cleanup

4. Move MiniMax-M2 nightly test to run first in 8-gpu-h200 job
   - Allows faster detection of weight download issues
---
 .github/workflows/nightly-test-nvidia.yml     |  38 ++--
 .../srt/model_loader/weight_validation.py     |  27 +++
 test/manual/test_weight_validation.py         | 186 ++++++++++++++++++
 3 files changed, 232 insertions(+), 19 deletions(-)
 create mode 100644 test/manual/test_weight_validation.py

diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml
index f9dd81de9957..4203f3763857 100644
--- a/.github/workflows/nightly-test-nvidia.yml
+++ b/.github/workflows/nightly-test-nvidia.yml
@@ -115,6 +115,25 @@ jobs:
           cd test
           python3 run_suite.py --hw cuda --suite nightly-8-gpu-h200 --nightly --continue-on-error
 
+      - name: Run MiniMax-M2 nightly performance test
+        timeout-minutes: 180
+        env:
+          TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
+          PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
+          GPU_CONFIG: "8-gpu-h200"
+        run: |
+          rm -rf test/performance_profiles_minimax_m2/
+          cd test
+          python3 nightly/test_minimax_m2_perf.py
+
+      - name: Publish MiniMax-M2 traces to storage repo
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_RUN_NUMBER: ${{ github.run_number }}
+        run: |
+          python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2
+
       - name: Run Qwen3-235B nightly performance test
         timeout-minutes: 180
         env:
@@ -172,25 +191,6 @@ jobs:
         run: |
           python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_glm_4_6
 
-      - name: Run MiniMax-M2 nightly performance test
-        timeout-minutes: 180
-        env:
-          TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
-          PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
-          GPU_CONFIG: "8-gpu-h200"
-        run: |
-          rm -rf test/performance_profiles_minimax_m2/
-          cd test
-          python3 nightly/test_minimax_m2_perf.py
-
-      - name: Publish MiniMax-M2 traces to storage repo
-        env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-          GITHUB_RUN_ID: ${{ github.run_id }}
-          GITHUB_RUN_NUMBER: ${{ github.run_number }}
-        run: |
-          python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2
-
   # General tests - 8 GPU H20
   nightly-test-general-8-gpu-h20:
     if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-h20')
diff --git a/python/sglang/srt/model_loader/weight_validation.py b/python/sglang/srt/model_loader/weight_validation.py
index 3c145360d592..39e969dd9cd1 100644
--- a/python/sglang/srt/model_loader/weight_validation.py
+++ b/python/sglang/srt/model_loader/weight_validation.py
@@ -61,6 +61,26 @@ def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]:
 
     for index_file in index_files:
         index_path = os.path.join(snapshot_dir, index_file)
+
+        # Check if index file is a broken symlink (exists in listing but blob missing)
+        if os.path.islink(index_path) and not os.path.exists(index_path):
+            # Broken symlink - clean it up so download can proceed
+            try:
+                blob_path = os.path.realpath(index_path)
+                os.remove(index_path)
+                logger.warning(
+                    "Removed broken index symlink: %s (blob missing)", index_file
+                )
+                # Also try to remove dangling blob reference if it somehow exists
+                if os.path.exists(blob_path):
+                    os.remove(blob_path)
+            except Exception as e:
+                logger.error("Failed to remove broken symlink %s: %s", index_file, e)
+            return (
+                False,
+                f"Broken index file symlink: {index_file} (cleaned up, will re-download)",
+            )
+
         try:
             with open(index_path) as f:
                 index_data = json.load(f)
@@ -85,6 +105,13 @@ def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]:
                     f"Missing {len(missing_files)} file(s) from index {index_file}: {missing_files[:3]}{'...' if len(missing_files) > 3 else ''}",
                 )
 
+        except FileNotFoundError as e:
+            # Index file was listed but can't be read - could be race condition or broken state
+            logger.warning("Failed to read index file %s: %s", index_file, e)
+            return (
+                False,
+                f"Index file {index_file} unreadable (will re-download)",
+            )
         except Exception as e:
             logger.warning("Failed to read index file %s: %s", index_file, e)
             continue
diff --git a/test/manual/test_weight_validation.py b/test/manual/test_weight_validation.py
new file mode 100644
index 000000000000..59af6107eb51
--- /dev/null
+++ b/test/manual/test_weight_validation.py
@@ -0,0 +1,186 @@
+"""
+Unit tests for weight validation and cache cleanup logic.
+
+Tests the fix for issue #14754 - ensuring that missing shards do not trigger
+entire cache deletion, which can cause race conditions in multi-process scenarios.
+"""
+
+import json
+import os
+import struct
+import tempfile
+import unittest
+
+from sglang.srt.model_loader.weight_validation import (
+    _check_index_files_exist,
+    _validate_sharded_model,
+)
+
+
+class TestWeightValidation(unittest.TestCase):
+    """Tests for weight validation functions."""
+
+    def test_validate_sharded_model_missing_shard(self):
+        """
+        Test that missing shards are detected correctly.
+
+        This is the core test for issue #14754 fix: when a shard is missing,
+        the validation should return is_valid=False with an error message
+        containing "Missing", but corrupted_files should be empty (indicating
+        this is a missing shard issue, not a corruption issue).
+
+        This distinction is critical because:
+        - Missing shards: should NOT delete cache (other processes may be using it)
+        - Corrupted files: should delete only the corrupted files selectively
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create partial shards (missing shard 3)
+            for i in [1, 2]:  # Missing shard 3
+                open(
+                    os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors"), "w"
+                ).close()
+
+            # Create index file
+            index_data = {
+                "weight_map": {
+                    "layer1": "model-00001-of-00003.safetensors",
+                    "layer2": "model-00002-of-00003.safetensors",
+                    "layer3": "model-00003-of-00003.safetensors",
+                }
+            }
+            with open(os.path.join(tmpdir, "model.safetensors.index.json"), "w") as f:
+                json.dump(index_data, f)
+
+            weight_files = [
+                os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors")
+                for i in [1, 2]
+            ]
+
+            is_valid, error_msg, corrupted_files = _validate_sharded_model(
+                tmpdir, weight_files
+            )
+
+            self.assertFalse(is_valid)
+            self.assertIn("Missing", error_msg)
+            # CRITICAL: corrupted_files should be empty for missing shards
+            # This is what prevents entire cache deletion
+            self.assertEqual(corrupted_files, [])
+
+    def test_validate_sharded_model_all_present(self):
+        """Test that complete shards pass validation."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create all shards with valid safetensors header
+            for i in [1, 2, 3]:
+                filepath = os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors")
+                # Create a minimal valid safetensors file
+                # Header: 8 bytes for header size + JSON header
+                header = b'{"__metadata__":{}}'
+                header_size = len(header)
+                with open(filepath, "wb") as f:
+                    f.write(struct.pack("<Q", header_size))
+                    f.write(header)
+
+            # Create index file
+            index_data = {
+                "weight_map": {
+                    "layer1": "model-00001-of-00003.safetensors",
+                    "layer2": "model-00002-of-00003.safetensors",
+                    "layer3": "model-00003-of-00003.safetensors",
+                }
+            }
+            with open(os.path.join(tmpdir, "model.safetensors.index.json"), "w") as f:
+                json.dump(index_data, f)
+
+            weight_files = [
+                os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors")
+                for i in [1, 2, 3]
+            ]
+
+            is_valid, error_msg, corrupted_files = _validate_sharded_model(
+                tmpdir, weight_files
+            )
+
+            self.assertTrue(is_valid)
+            self.assertIsNone(error_msg)
+            self.assertEqual(corrupted_files, [])
+
+    def test_validate_sharded_model_corrupted_shard(self):
+        """
+        Test that corrupted shards are detected and returned in corrupted_files.
+
+        This tests the other branch: when a file exists but is corrupted
+        (invalid safetensors format), it should be added to corrupted_files
+        so that selective cleanup can remove just that file.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create shard 1 as valid
+            filepath1 = os.path.join(tmpdir, "model-00001-of-00003.safetensors")
+            header = b'{"__metadata__":{}}'
+            with open(filepath1, "wb") as f:
+                f.write(struct.pack("<Q", len(header)))
+                f.write(header)
+
+            # Create shard 2 as corrupted (invalid header)
+            filepath2 = os.path.join(tmpdir, "model-00002-of-00003.safetensors")
+            with open(filepath2, "wb") as f:
+                f.write(b"invalid data that is not a valid safetensors file")
+
+            # Create shard 3 as valid
+            filepath3 = os.path.join(tmpdir, "model-00003-of-00003.safetensors")
+            with open(filepath3, "wb") as f:
+                f.write(struct.pack("<Q", len(header)))
+                f.write(header)
+
+            # Create index file
+            index_data = {
+                "weight_map": {
+                    "layer1": "model-00001-of-00003.safetensors",
+                    "layer2": "model-00002-of-00003.safetensors",
+                    "layer3": "model-00003-of-00003.safetensors",
+                }
+            }
+            with open(os.path.join(tmpdir, "model.safetensors.index.json"), "w") as f:
+                json.dump(index_data, f)
+
+            weight_files = [filepath1, filepath2, filepath3]
+
+            is_valid, error_msg, corrupted_files = _validate_sharded_model(
+                tmpdir, weight_files
+            )
+
+            self.assertFalse(is_valid)
+            self.assertIn("Corrupt", error_msg)
+            # The corrupted file should be identified
+            self.assertEqual(len(corrupted_files), 1)
+            self.assertIn("model-00002-of-00003.safetensors", corrupted_files[0])
+
+    def test_broken_index_symlink_detected(self):
+        """
+        Test that broken index symlinks are detected and cause validation to fail.
+
+        When an index file is a symlink pointing to a non-existent blob,
+        validation should fail (to trigger re-download) rather than silently
+        continuing and causing timeout during actual loading.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create a broken symlink for the index file
+            index_path = os.path.join(tmpdir, "model.safetensors.index.json")
+            non_existent_blob = os.path.join(tmpdir, "blobs", "nonexistent_hash")
+            os.symlink(non_existent_blob, index_path)
+
+            # Verify it's a broken symlink
+            self.assertTrue(os.path.islink(index_path))
+            self.assertFalse(os.path.exists(index_path))
+
+            # Check should fail for broken symlink
+            is_valid, error_msg = _check_index_files_exist(tmpdir)
+
+            self.assertFalse(is_valid)
+            self.assertIn("Broken", error_msg)
+            # The broken symlink should have been cleaned up
+            self.assertFalse(os.path.exists(index_path))
+            self.assertFalse(os.path.islink(index_path))
+
+
+if __name__ == "__main__":
+    unittest.main()