sgl-project · Fridge003 · Apr 4, 2026 · Mar 31, 2026 · Mar 31, 2026 · Apr 1, 2026
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
@@ -77,7 +77,8 @@
 # NOTE: please sort the test cases alphabetically by the test file name
 suite_xpu = {
     "per-commit-xpu": [
-        TestFile("xpu/test_deepseek_ocr.py"),
+        TestFile("xpu/test_deepseek_ocr.py", 360),
+        TestFile("xpu/test_deepseek_ocr_triton.py", 360),
         # TestFile("xpu/test_internvl.py"),
         TestFile("xpu/test_intel_xpu_backend.py"),
     ],

diff --git a/test/srt/xpu/test_deepseek_ocr.py b/test/srt/xpu/test_deepseek_ocr.py
@@ -2,9 +2,11 @@
 python3 -m unittest test_deepseek_ocr.py
 """
 
+import gc
 import json
 import os
 import unittest
+from pathlib import Path
 
 import requests
 from transformers import AutoTokenizer
@@ -19,11 +21,32 @@
 
 
 class TestDeepSeekOCR(CustomTestCase):
+    @classmethod
+    def _cleanup_xpu_memory(cls):
+        """Best-effort cleanup: collect garbage and empty the torch XPU cache."""
+        gc.collect()
+        try:
+            import torch
+
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                torch.xpu.synchronize()
+                torch.xpu.empty_cache()
+        except Exception:  # best-effort only; tests continue if cleanup is unavailable
+            pass
+
     @classmethod
     def setUpClass(cls):
+        cls._cleanup_xpu_memory()
         cls.model = "deepseek-ai/DeepSeek-OCR"
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model, use_fast=False)
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model, use_fast=False, trust_remote_code=True
+        )
         cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.image_path = str(
+            (Path(__file__).resolve().parents[3] / "examples/assets/example_image.png")
+        )
+        if not os.path.exists(cls.image_path):
+            raise FileNotFoundError(f"Image not found: {cls.image_path}")
         cls.common_args = [
             "--device",
             "xpu",
@@ -43,14 +66,16 @@ def setUpClass(cls):
     @classmethod
     def tearDownClass(cls):
         """Fixture that is run once after all tests in the class."""
-        kill_process_tree(cls.process.pid)
+        if hasattr(cls, "process") and cls.process:  # skip the kill if no server was recorded
+            kill_process_tree(cls.process.pid)
+        cls._cleanup_xpu_memory()  # runs whether or not a server was killed
 
     def get_request_json(self, max_new_tokens=32, n=1):
         response = requests.post(
             self.base_url + "/generate",
             json={
                 "text": "<image>\n<|grounding|>Convert the document to pure text.",
-                "image_data": "../../examples/assets/example_image.png",
+                "image_data": self.image_path,
                 "sampling_params": {
                     "temperature": 0 if n == 1 else 0.5,
                     "max_new_tokens": max_new_tokens,
@@ -94,28 +119,5 @@ def test_moe(self):
         self.run_decode()
 
 
-class TestDeepSeekOCRTriton(TestDeepSeekOCR):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "deepseek-ai/DeepSeek-OCR"
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model, use_fast=False)
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.common_args = [
-            "--device",
-            "xpu",
-            "--attention-backend",
-            "intel_xpu",
-        ]
-        os.environ["SGLANG_USE_SGL_XPU"] = "0"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                *cls.common_args,
-            ],
-        )
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/srt/xpu/test_deepseek_ocr_triton.py b/test/srt/xpu/test_deepseek_ocr_triton.py
@@ -0,0 +1,51 @@
+"""
+python3 -m unittest test_deepseek_ocr_triton.py
+"""
+
+import os
+import unittest
+from pathlib import Path
+
+import test_deepseek_ocr as deepseek_ocr
+from transformers import AutoTokenizer
+
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestDeepSeekOCRTriton(deepseek_ocr.TestDeepSeekOCR):  # only setUpClass is overridden
+    @classmethod
+    def setUpClass(cls):
+        cls._cleanup_xpu_memory()  # inherited best-effort XPU cache cleanup
+        cls.model = "deepseek-ai/DeepSeek-OCR"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model, use_fast=False, trust_remote_code=True
+        )
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.image_path = str(  # parents[3] of test/srt/xpu/<this file> is the repo root
+            (Path(__file__).resolve().parents[3] / "examples/assets/example_image.png")
+        )
+        if not os.path.exists(cls.image_path):  # fail before the server is launched
+            raise FileNotFoundError(f"Image not found: {cls.image_path}")
+        cls.common_args = [
+            "--device",
+            "xpu",
+            "--attention-backend",
+            "intel_xpu",
+        ]
+        os.environ["SGLANG_USE_SGL_XPU"] = "0"  # NOTE(review): presumably selects the Triton path; not reset in this class
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                *cls.common_args,
+            ],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/xpu/test_intel_xpu_backend.py b/test/srt/xpu/test_intel_xpu_backend.py
@@ -3,6 +3,7 @@
 python3 -m unittest test_intel_xpu_backend.TestIntelXPUBackend.test_latency_qwen_model
 """
 
+import gc
 import unittest
 from functools import wraps
 
@@ -15,26 +16,44 @@
 )
 
 
+def _cleanup_xpu_memory():
+    """Best-effort cleanup: collect garbage and empty the torch XPU cache."""
+    gc.collect()
+    try:
+        import torch
+
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.synchronize()
+            torch.xpu.empty_cache()
+    except Exception:  # best-effort cleanup only; import/XPU errors are ignored
+        pass
+
+
 def intel_xpu_benchmark(extra_args=None, min_throughput=None):
     def decorator(test_func):
         @wraps(test_func)
         def wrapper(self):
+            _cleanup_xpu_memory()
             common_args = [
                 "--disable-radix",
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.3",
+                "0.4",
                 "--batch-size",
                 "1",
                 "--device",
                 "xpu",
             ]
-            full_args = common_args + (extra_args or [])
+            ci_args = ["--input", "64", "--output", "4"] if is_in_ci() else []
+            full_args = common_args + ci_args + (extra_args or [])
 
             model = test_func(self)
-            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
-                model, full_args
-            )
+            try:
+                prefill_latency, decode_throughput, decode_latency = (
+                    run_bench_one_batch(model, full_args)
+                )
+            finally:
+                _cleanup_xpu_memory()
 
             print(f"{model=}")
             print(f"{prefill_latency=}")