From 3e0c0ea8de6a66059ad89319b2ad26deed2104d9 Mon Sep 17 00:00:00 2001 From: Hanwen Xing <1277646412@qq.com> Date: Sat, 17 Jan 2026 09:12:05 +0000 Subject: [PATCH 1/2] fix(ci): recover from corrupted MMMU parquet cache When MMMU dataset parquet files are corrupted in HuggingFace cache (ArrowInvalid: Parquet magic bytes not found), the lmms_eval process fails without producing JSON results, causing CI test failures. This fix adds automatic recovery in mmmu_vlm_kit.py: - Detects parquet corruption errors in lmms_eval subprocess output - Cleans up the corrupted MMMU dataset cache directory (prioritizes CI path /hf_home, fallback to HF_HOME) - Retries once with HF_HUB_OFFLINE=0 and force_redownload mode - Only affects MMMU parquet corruption cases (minimal impact) Fixes the intermittent CI failures on MMMU evaluation tests. --- python/sglang/test/kits/mmmu_vlm_kit.py | 75 +++++++++++++++++++++---- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/python/sglang/test/kits/mmmu_vlm_kit.py b/python/sglang/test/kits/mmmu_vlm_kit.py index b7415b0d4745..b86c48032f4c 100644 --- a/python/sglang/test/kits/mmmu_vlm_kit.py +++ b/python/sglang/test/kits/mmmu_vlm_kit.py @@ -1,8 +1,10 @@ import glob import json import os +import shutil import subprocess import tempfile +from pathlib import Path from types import SimpleNamespace from sglang.srt.environ import temp_set_env @@ -18,6 +20,67 @@ DEFAULT_MEM_FRACTION_STATIC = 0.8 +def _is_mmmu_parquet_corruption(error_output: str) -> bool: + """Check if error is due to MMMU parquet file corruption.""" + return ( + "ArrowInvalid" in error_output + and "Parquet magic bytes not found" in error_output + and ("MMMU" in error_output or "lmms-lab--MMMU" in error_output) + ) + + +def _cleanup_mmmu_dataset_cache(): + """Clean up corrupted MMMU dataset cache to allow fresh download.""" + # Priority 1: Check CI convention path /hf_home first (used in Docker containers) + ci_hf_home = Path("/hf_home/hub/datasets--lmms-lab--MMMU") + if ci_hf_home.exists(): + mmmu_cache_path = ci_hf_home + else: + # Priority 2: Use HF_HOME env var or default user cache + hf_home = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface")) + mmmu_cache_path = Path(hf_home) / "hub" / "datasets--lmms-lab--MMMU" + + if mmmu_cache_path.exists(): + print(f"Detected corrupted MMMU parquet cache. Cleaning up: {mmmu_cache_path}") + try: + shutil.rmtree(mmmu_cache_path) + print(f"Successfully removed corrupted cache: {mmmu_cache_path}") + return True + except OSError as e: + print(f"Warning: Failed to remove cache {mmmu_cache_path}: {e}") + return False + else: + print(f"MMMU cache not found at {mmmu_cache_path}, skipping cleanup") + return False + + +def _run_lmms_eval_with_retry(cmd: list[str], timeout: int = 3600) -> None: + """Run lmms_eval command with automatic retry on MMMU parquet corruption.""" + try: + subprocess.run( + cmd, + check=True, + timeout=timeout, + capture_output=True, + text=True, + ) + except subprocess.CalledProcessError as e: + error_output = e.stderr + e.stdout + if _is_mmmu_parquet_corruption(error_output): + print("Detected MMMU parquet corruption error. Attempting recovery...") + if _cleanup_mmmu_dataset_cache(): + print("Retrying lmms_eval with fresh download...") + with temp_set_env( + HF_HUB_OFFLINE="0", + HF_DATASETS_DOWNLOAD_MODE="force_redownload", + ): + subprocess.run(cmd, check=True, timeout=timeout) + else: + raise + else: + raise + + class MMMUMixin: """Mixin for MMMU evaluation. @@ -81,11 +144,7 @@ def run_mmmu_eval( OPENAI_API_KEY=self.api_key, OPENAI_API_BASE=f"{self.base_url}/v1", ): - subprocess.run( - cmd, - check=True, - timeout=3600, - ) + _run_lmms_eval_with_retry(cmd) def test_mmmu(self: CustomTestCase): """Run MMMU evaluation test.""" @@ -209,11 +268,7 @@ def run_mmmu_eval( *self.mmmu_args, ] - subprocess.run( - cmd, - check=True, - timeout=3600, - ) + _run_lmms_eval_with_retry(cmd) def _run_vlm_mmmu_test( self, From 21b763c0adc70ce917b43768f3be45be22e3f2ba Mon Sep 17 00:00:00 2001 From: Hanwen Xing <1277646412@qq.com> Date: Sat, 17 Jan 2026 09:21:48 +0000 Subject: [PATCH 2/2] refactor: improve error logging in MMMU retry logic Address code review feedback: - Print captured stdout/stderr on successful runs for visibility - Log error output before re-raising on cleanup failure - Log error output before re-raising on non-parquet errors This ensures subprocess outputs are preserved for debugging. --- python/sglang/test/kits/mmmu_vlm_kit.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/sglang/test/kits/mmmu_vlm_kit.py b/python/sglang/test/kits/mmmu_vlm_kit.py index b86c48032f4c..a1ca28fedc2d 100644 --- a/python/sglang/test/kits/mmmu_vlm_kit.py +++ b/python/sglang/test/kits/mmmu_vlm_kit.py @@ -57,13 +57,18 @@ def _cleanup_mmmu_dataset_cache(): def _run_lmms_eval_with_retry(cmd: list[str], timeout: int = 3600) -> None: """Run lmms_eval command with automatic retry on MMMU parquet corruption.""" try: - subprocess.run( + result = subprocess.run( cmd, check=True, timeout=timeout, capture_output=True, text=True, ) + # Print captured output to maintain visibility of successful runs + if result.stdout: + print(result.stdout, end="") + if result.stderr: + print(result.stderr, end="") except subprocess.CalledProcessError as e: error_output = e.stderr + e.stdout if _is_mmmu_parquet_corruption(error_output): @@ -76,8 +81,14 @@ def _run_lmms_eval_with_retry(cmd: list[str], timeout: int = 3600) -> None: ): subprocess.run(cmd, check=True, timeout=timeout) else: + print( + f"Failed to cleanup corrupted MMMU cache. Error from lmms_eval:\nStdout:\n{e.stdout}\nStderr:\n{e.stderr}" + ) raise else: + print( + f"lmms_eval failed with an unhandled error.\nStdout:\n{e.stdout}\nStderr:\n{e.stderr}" + ) raise