diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml index f9dd81de9957..4203f3763857 100644 --- a/.github/workflows/nightly-test-nvidia.yml +++ b/.github/workflows/nightly-test-nvidia.yml @@ -115,6 +115,25 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-8-gpu-h200 --nightly --continue-on-error + - name: Run MiniMax-M2 nightly performance test + timeout-minutes: 180 + env: + TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }} + PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }} + GPU_CONFIG: "8-gpu-h200" + run: | + rm -rf test/performance_profiles_minimax_m2/ + cd test + python3 nightly/test_minimax_m2_perf.py + + - name: Publish MiniMax-M2 traces to storage repo + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + run: | + python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2 + - name: Run Qwen3-235B nightly performance test timeout-minutes: 180 env: @@ -172,25 +191,6 @@ jobs: run: | python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_glm_4_6 - - name: Run MiniMax-M2 nightly performance test - timeout-minutes: 180 - env: - TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }} - PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }} - GPU_CONFIG: "8-gpu-h200" - run: | - rm -rf test/performance_profiles_minimax_m2/ - cd test - python3 nightly/test_minimax_m2_perf.py - - - name: Publish MiniMax-M2 traces to storage repo - env: - GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} - GITHUB_RUN_ID: ${{ github.run_id }} - GITHUB_RUN_NUMBER: ${{ github.run_number }} - run: | - python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2 - # General tests - 8 GPU H20 nightly-test-general-8-gpu-h20: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-h20') diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index 82e69d88c725..b474780e113b 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -393,14 +393,17 @@ def _find_local_hf_snapshot_dir_unlocked( ) return None else: - # Cannot selectively clean (e.g., missing shards) - remove entire cache + # Missing shards (not corruption) - let snapshot_download handle it. + # IMPORTANT: Do NOT delete the entire cache here, as other processes + # (TP/EP ranks) may already be loading weights from these files. + # Deleting the cache while other processes are using it causes + # FileNotFoundError race conditions. Instead, just return None + # to trigger a download - snapshot_download will only fetch + # missing files without disturbing existing ones. log_info_on_rank0( logger, f"Validation failed for {model_name_or_path}: {error_msg}. " - "Will remove entire cache and re-download.", - ) - _cleanup_corrupted_model_cache( - model_name_or_path, found_local_snapshot_dir, error_msg + "Will attempt to download missing files.", ) return None diff --git a/python/sglang/srt/model_loader/weight_validation.py b/python/sglang/srt/model_loader/weight_validation.py index 3c145360d592..39e969dd9cd1 100644 --- a/python/sglang/srt/model_loader/weight_validation.py +++ b/python/sglang/srt/model_loader/weight_validation.py @@ -61,6 +61,26 @@ def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]: for index_file in index_files: index_path = os.path.join(snapshot_dir, index_file) + + # Check if index file is a broken symlink (exists in listing but blob missing) + if os.path.islink(index_path) and not os.path.exists(index_path): + # Broken symlink - clean it up so download can proceed + try: + blob_path = os.path.realpath(index_path) + os.remove(index_path) + logger.warning( + "Removed broken index symlink: %s (blob missing)", index_file + ) + # Also try to remove dangling blob reference if it somehow exists + if os.path.exists(blob_path): + os.remove(blob_path) + except Exception as e: + logger.error("Failed to remove broken symlink %s: %s", index_file, e) + return ( + False, + f"Broken index file symlink: {index_file} (cleaned up, will re-download)", + ) + try: with open(index_path) as f: index_data = json.load(f) @@ -85,6 +105,13 @@ def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]: f"Missing {len(missing_files)} file(s) from index {index_file}: {missing_files[:3]}{'...' if len(missing_files) > 3 else ''}", ) + except FileNotFoundError as e: + # Index file was listed but can't be read - could be race condition or broken state + logger.warning("Failed to read index file %s: %s", index_file, e) + return ( + False, + f"Index file {index_file} unreadable (will re-download)", + ) except Exception as e: logger.warning("Failed to read index file %s: %s", index_file, e) continue diff --git a/test/manual/test_weight_validation.py b/test/manual/test_weight_validation.py new file mode 100644 index 000000000000..59af6107eb51 --- /dev/null +++ b/test/manual/test_weight_validation.py @@ -0,0 +1,186 @@ +""" +Unit tests for weight validation and cache cleanup logic. + +Tests the fix for issue #14754 - ensuring that missing shards do not trigger +entire cache deletion, which can cause race conditions in multi-process scenarios. +""" + +import json +import os +import struct +import tempfile +import unittest + +from sglang.srt.model_loader.weight_validation import ( + _check_index_files_exist, + _validate_sharded_model, +) + + +class TestWeightValidation(unittest.TestCase): + """Tests for weight validation functions.""" + + def test_validate_sharded_model_missing_shard(self): + """ + Test that missing shards are detected correctly. + + This is the core test for issue #14754 fix: when a shard is missing, + the validation should return is_valid=False with an error message + containing "Missing", but corrupted_files should be empty (indicating + this is a missing shard issue, not a corruption issue). + + This distinction is critical because: + - Missing shards: should NOT delete cache (other processes may be using it) + - Corrupted files: should delete only the corrupted files selectively + """ + with tempfile.TemporaryDirectory() as tmpdir: + # Create partial shards (missing shard 3) + for i in [1, 2]: # Missing shard 3 + open( + os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors"), "w" + ).close() + + # Create index file + index_data = { + "weight_map": { + "layer1": "model-00001-of-00003.safetensors", + "layer2": "model-00002-of-00003.safetensors", + "layer3": "model-00003-of-00003.safetensors", + } + } + with open(os.path.join(tmpdir, "model.safetensors.index.json"), "w") as f: + json.dump(index_data, f) + + weight_files = [ + os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors") + for i in [1, 2] + ] + + is_valid, error_msg, corrupted_files = _validate_sharded_model( + tmpdir, weight_files + ) + + self.assertFalse(is_valid) + self.assertIn("Missing", error_msg) + # CRITICAL: corrupted_files should be empty for missing shards + # This is what prevents entire cache deletion + self.assertEqual(corrupted_files, []) + + def test_validate_sharded_model_all_present(self): + """Test that complete shards pass validation.""" + with tempfile.TemporaryDirectory() as tmpdir: + # Create all shards with valid safetensors header + for i in [1, 2, 3]: + filepath = os.path.join(tmpdir, f"model-0000{i}-of-00003.safetensors") + # Create a minimal valid safetensors file + # Header: 8 bytes for header size + JSON header + header = b'{"__metadata__":{}}' + header_size = len(header) + with open(filepath, "wb") as f: + f.write(struct.pack("