From 3e0c0ea8de6a66059ad89319b2ad26deed2104d9 Mon Sep 17 00:00:00 2001
From: Hanwen Xing <1277646412@qq.com>
Date: Sat, 17 Jan 2026 09:12:05 +0000
Subject: [PATCH 1/2] fix(ci): recover from corrupted MMMU parquet cache

When MMMU dataset parquet files in the HuggingFace cache are corrupted
(ArrowInvalid: Parquet magic bytes not found), the lmms_eval process
fails without producing JSON results, causing CI test failures.

This fix adds automatic recovery in mmmu_vlm_kit.py:
- Detects parquet corruption errors in lmms_eval subprocess output
- Cleans up the corrupted MMMU dataset cache directory
  (prefers the CI path /hf_home; otherwise falls back to $HF_HOME,
  or ~/.cache/huggingface when HF_HOME is unset)
- Retries once with HF_HUB_OFFLINE=0 and
  HF_DATASETS_DOWNLOAD_MODE=force_redownload set around the retry
- The recovery path triggers only on MMMU parquet corruption; note that
  lmms_eval output is now captured (capture_output=True) on every run

Fixes the intermittent CI failures on MMMU evaluation tests.
---
 python/sglang/test/kits/mmmu_vlm_kit.py | 75 +++++++++++++++++++++----
 1 file changed, 65 insertions(+), 10 deletions(-)

diff --git a/python/sglang/test/kits/mmmu_vlm_kit.py b/python/sglang/test/kits/mmmu_vlm_kit.py
index b7415b0d4745..b86c48032f4c 100644
--- a/python/sglang/test/kits/mmmu_vlm_kit.py
+++ b/python/sglang/test/kits/mmmu_vlm_kit.py
@@ -1,8 +1,10 @@
 import glob
 import json
 import os
+import shutil
 import subprocess
 import tempfile
+from pathlib import Path
 from types import SimpleNamespace
 
 from sglang.srt.environ import temp_set_env
@@ -18,6 +20,67 @@
 DEFAULT_MEM_FRACTION_STATIC = 0.8
 
 
+def _is_mmmu_parquet_corruption(error_output: str) -> bool:
+    """Return True if error_output indicates MMMU parquet file corruption."""
+    return (  # plain substring checks; all three markers must be present
+        "ArrowInvalid" in error_output
+        and "Parquet magic bytes not found" in error_output
+        and ("MMMU" in error_output or "lmms-lab--MMMU" in error_output)
+    )  # NOTE: "lmms-lab--MMMU" contains "MMMU", so the second test is redundant
+
+
+def _cleanup_mmmu_dataset_cache():
+    """Delete the MMMU hub cache dir; return True only if it was removed."""
+    # Priority 1: CI convention path /hf_home (Docker); wins even if HF_HOME is set
+    ci_hf_home = Path("/hf_home/hub/datasets--lmms-lab--MMMU")
+    if ci_hf_home.exists():
+        mmmu_cache_path = ci_hf_home
+    else:
+        # Priority 2: $HF_HOME, or ~/.cache/huggingface when HF_HOME is unset
+        hf_home = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
+        mmmu_cache_path = Path(hf_home) / "hub" / "datasets--lmms-lab--MMMU"
+
+    if mmmu_cache_path.exists():
+        print(f"Detected corrupted MMMU parquet cache. Cleaning up: {mmmu_cache_path}")
+        try:
+            shutil.rmtree(mmmu_cache_path)
+            print(f"Successfully removed corrupted cache: {mmmu_cache_path}")
+            return True
+        except OSError as e:  # deletion is best-effort: report it, do not raise
+            print(f"Warning: Failed to remove cache {mmmu_cache_path}: {e}")
+            return False
+    else:
+        print(f"MMMU cache not found at {mmmu_cache_path}, skipping cleanup")
+        return False  # nothing was deleted
+
+
+def _run_lmms_eval_with_retry(cmd: list[str], timeout: int = 3600) -> None:
+    """Run lmms_eval command with automatic retry on MMMU parquet corruption."""
+    try:
+        subprocess.run(
+            cmd,
+            check=True,
+            timeout=timeout,
+            capture_output=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as e:
+        error_output = e.stderr + e.stdout
+        if _is_mmmu_parquet_corruption(error_output):
+            print("Detected MMMU parquet corruption error. Attempting recovery...")
+            if _cleanup_mmmu_dataset_cache():  # True only if the cache was removed
+                print("Retrying lmms_eval with fresh download...")
+                with temp_set_env(  # env overrides wrap only the single retry
+                    HF_HUB_OFFLINE="0",  # presumably re-enables Hub access
+                    HF_DATASETS_DOWNLOAD_MODE="force_redownload",  # TODO verify env var
+                ):
+                    subprocess.run(cmd, check=True, timeout=timeout)
+            else:
+                raise
+        else:
+            raise
+
+
 class MMMUMixin:
     """Mixin for MMMU evaluation.
 
@@ -81,11 +144,7 @@ def run_mmmu_eval(
             OPENAI_API_KEY=self.api_key,
             OPENAI_API_BASE=f"{self.base_url}/v1",
         ):
-            subprocess.run(
-                cmd,
-                check=True,
-                timeout=3600,
-            )
+            _run_lmms_eval_with_retry(cmd)
 
     def test_mmmu(self: CustomTestCase):
         """Run MMMU evaluation test."""
@@ -209,11 +268,7 @@ def run_mmmu_eval(
             *self.mmmu_args,
         ]
 
-        subprocess.run(
-            cmd,
-            check=True,
-            timeout=3600,
-        )
+        _run_lmms_eval_with_retry(cmd)
 
     def _run_vlm_mmmu_test(
         self,

From 21b763c0adc70ce917b43768f3be45be22e3f2ba Mon Sep 17 00:00:00 2001
From: Hanwen Xing <1277646412@qq.com>
Date: Sat, 17 Jan 2026 09:21:48 +0000
Subject: [PATCH 2/2] refactor: improve error logging in MMMU retry logic

Address code review feedback:
- Print captured stdout/stderr on successful runs for visibility
- Log error output before re-raising on cleanup failure
- Log error output before re-raising on non-parquet errors

This keeps subprocess output available for debugging on the success
path and on the CalledProcessError paths that re-raise.
---
 python/sglang/test/kits/mmmu_vlm_kit.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/python/sglang/test/kits/mmmu_vlm_kit.py b/python/sglang/test/kits/mmmu_vlm_kit.py
index b86c48032f4c..a1ca28fedc2d 100644
--- a/python/sglang/test/kits/mmmu_vlm_kit.py
+++ b/python/sglang/test/kits/mmmu_vlm_kit.py
@@ -57,13 +57,18 @@ def _cleanup_mmmu_dataset_cache():
 def _run_lmms_eval_with_retry(cmd: list[str], timeout: int = 3600) -> None:
     """Run lmms_eval command with automatic retry on MMMU parquet corruption."""
     try:
-        subprocess.run(
+        result = subprocess.run(
             cmd,
             check=True,
             timeout=timeout,
             capture_output=True,
             text=True,
         )
+        # Print captured output to maintain visibility of successful runs
+        if result.stdout:
+            print(result.stdout, end="")
+        if result.stderr:
+            print(result.stderr, end="")
     except subprocess.CalledProcessError as e:
         error_output = e.stderr + e.stdout
         if _is_mmmu_parquet_corruption(error_output):
@@ -76,8 +81,14 @@ def _run_lmms_eval_with_retry(cmd: list[str], timeout: int = 3600) -> None:
                 ):
                     subprocess.run(cmd, check=True, timeout=timeout)
             else:
+                print(
+                    f"Failed to cleanup corrupted MMMU cache. Error from lmms_eval:\nStdout:\n{e.stdout}\nStderr:\n{e.stderr}"
+                )
                 raise
         else:
+            print(
+                f"lmms_eval failed with an unhandled error.\nStdout:\n{e.stdout}\nStderr:\n{e.stderr}"
+            )
             raise