sgl-project · mickqian · May 10, 2026 · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026
diff --git a/python/sglang/multimodal_gen/test/server/configs/cache_dit_config_1gpu.yaml b/python/sglang/multimodal_gen/test/server/configs/cache_dit_config_1gpu.yaml
@@ -0,0 +1,10 @@
+cache_config:  # cache-dit settings; this file is passed via --cache-dit-config by the qwen_image_t2i_cache_dit_config_diffusers_1gpu test case
+  max_warmup_steps: 8
+  warmup_interval: 2
+  max_cached_steps: -1  # NOTE(review): presumably -1 means "no limit" -- confirm against the cache-dit docs
+  max_continuous_cached_steps: 2
+  Fn_compute_blocks: 1
+  Bn_compute_blocks: 0
+  residual_diff_threshold: 0.12
+  enable_taylorseer: true
+  taylorseer_order: 1  # presumably only consulted when enable_taylorseer is true -- TODO confirm
@@ -0,0 +1,13 @@
+cache_config:  # NOTE(review): presumably configs/cache_dit_scm_config.yaml (file header not visible here), used by the qwen_image_t2i_cache_dit_scm_config_diffusers_1gpu test case
+  max_warmup_steps: 8
+  warmup_interval: 2
+  max_cached_steps: -1  # NOTE(review): presumably -1 means "no limit" -- confirm against the cache-dit docs
+  max_continuous_cached_steps: 2
+  Fn_compute_blocks: 1
+  Bn_compute_blocks: 0
+  residual_diff_threshold: 0.12
+  enable_taylorseer: true
+  taylorseer_order: 1
+  num_inference_steps: 50  # NOTE(review): presumably must agree with the step count the request actually runs -- verify
+  steps_computation_mask: "medium"  # the "scm" in the test-case name presumably abbreviates this option -- confirm
+  steps_computation_policy: dynamic
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from sglang.multimodal_gen.runtime.platforms import current_platform
 from sglang.multimodal_gen.test.server.testcase_configs import (
     MODELOPT_FLUX1_FP8_TRANSFORMER,
@@ -68,6 +70,36 @@
         ),
         T2I_sampling_params,
     ),
+    DiffusionTestCase(  # Qwen-Image T2I on the diffusers backend, with cache-dit configured from a YAML file
+        "qwen_image_t2i_cache_dit_config_diffusers_1gpu",
+        DiffusionServerArgs(
+            model_path=DEFAULT_QWEN_IMAGE_MODEL_NAME_FOR_TEST,
+            extras=[
+                "--backend",
+                "diffusers",
+                "--cache-dit-config",
+                str(Path(__file__).parent / "configs" / "cache_dit_config_1gpu.yaml"),  # resolved relative to this module, not the working directory
+            ],
+        ),
+        T2I_sampling_params,
+        run_consistency_check=False,
+        run_component_accuracy_check=False,
+    ),
+    DiffusionTestCase(  # same setup as the cache_dit_config case, but pointing at the SCM variant of the YAML config
+        "qwen_image_t2i_cache_dit_scm_config_diffusers_1gpu",
+        DiffusionServerArgs(
+            model_path=DEFAULT_QWEN_IMAGE_MODEL_NAME_FOR_TEST,
+            extras=[
+                "--backend",
+                "diffusers",
+                "--cache-dit-config",
+                str(Path(__file__).parent / "configs" / "cache_dit_scm_config.yaml"),  # resolved relative to this module, not the working directory
+            ],
+        ),
+        T2I_sampling_params,
+        run_consistency_check=False,
+        run_component_accuracy_check=False,
+    ),
     DiffusionTestCase(
         "flux_image_t2i",
         DiffusionServerArgs(model_path=DEFAULT_FLUX_1_DEV_MODEL_NAME_FOR_TEST),
@@ -521,6 +553,17 @@
         ),
         T2V_sampling_params,
     ),
+    DiffusionTestCase(  # Wan2.1 T2V 1.3B with cache-dit enabled via the server arg rather than a YAML config
+        "wan2_1_t2v_1_3b_cache_dit_sp_only_2gpu",
+        DiffusionServerArgs(
+            model_path=DEFAULT_WAN_2_1_T2V_1_3B_MODEL_NAME_FOR_TEST,
+            ulysses_degree=2,  # NOTE(review): presumably the only parallelism in use ("sp_only" in the case name) -- confirm
+            enable_cache_dit=True,
+        ),
+        T2V_sampling_params,
+        run_consistency_check=False,
+        run_component_accuracy_check=False,
+    ),
     DiffusionTestCase(
         "fsdp-inference",
         DiffusionServerArgs(

diff --git a/python/sglang/multimodal_gen/test/server/perf_baselines.json b/python/sglang/multimodal_gen/test/server/perf_baselines.json
@@ -1073,6 +1073,73 @@
             "expected_median_denoise_ms": 100.05,
             "estimated_full_test_time_s": 127.6
         },
+        "wan2_1_t2v_1_3b_cache_dit_sp_only_2gpu": {
+            "stages_ms": {
+                "InputValidationStage": 0.08,
+                "TextEncodingStage": 853.63,
+                "LatentPreparationStage": 0.09,
+                "TimestepPreparationStage": 1.51,
+                "DenoisingStage": 2102.64,
+                "DecodingStage": 438.71,
+                "per_frame_generation": null
+            },
+            "denoise_step_ms": {
+                "0": 107.59,
+                "1": 103.11,
+                "2": 98.13,
+                "3": 94.12,
+                "4": 45.26,
+                "5": 8.66,
+                "6": 8.87,
+                "7": 105.03,
+                "8": 17.53,
+                "9": 9.26,
+                "10": 15.2,
+                "11": 93.17,
+                "12": 27.27,
+                "13": 7.78,
+                "14": 8.8,
+                "15": 96.54,
+                "16": 24.72,
+                "17": 9.27,
+                "18": 8.53,
+                "19": 106.58,
+                "20": 13.47,
+                "21": 10.13,
+                "22": 10.23,
+                "23": 96.49,
+                "24": 21.93,
+                "25": 8.73,
+                "26": 9.72,
+                "27": 103.29,
+                "28": 14.4,
+                "29": 9.07,
+                "30": 8.71,
+                "31": 105.88,
+                "32": 16.93,
+                "33": 9.15,
+                "34": 9.83,
+                "35": 100.03,
+                "36": 21.5,
+                "37": 9.41,
+                "38": 9.18,
+                "39": 105.14,
+                "40": 12.36,
+                "41": 10.1,
+                "42": 8.29,
+                "43": 102.88,
+                "44": 18.36,
+                "45": 7.67,
+                "46": 10.27,
+                "47": 103.19,
+                "48": 25.53,
+                "49": 106.23
+            },
+            "expected_e2e_ms": 3402.6,
+            "expected_avg_denoise_ms": 41.87,
+            "expected_median_denoise_ms": 16.07,
+            "estimated_full_test_time_s": 94.0
+        },
         "turbo_wan2_1_t2v_1.3b": {
             "stages_ms": {
                 "InputValidationStage": 0.06,
@@ -2042,6 +2109,26 @@
             "expected_median_denoise_ms": 268.51,
             "estimated_full_test_time_s": 122.7
         },
+        "qwen_image_t2i_cache_dit_config_diffusers_1gpu": {
+            "stages_ms": {
+                "DiffusersExecutionStage": 64520.18
+            },
+            "denoise_step_ms": {},
+            "expected_e2e_ms": 64523.93,
+            "expected_avg_denoise_ms": 0.0,
+            "expected_median_denoise_ms": 0.0,
+            "estimated_full_test_time_s": 142.2
+        },
+        "qwen_image_t2i_cache_dit_scm_config_diffusers_1gpu": {
+            "stages_ms": {
+                "DiffusersExecutionStage": 61106.66
+            },
+            "denoise_step_ms": {},
+            "expected_e2e_ms": 61110.0,
+            "expected_avg_denoise_ms": 0.0,
+            "expected_median_denoise_ms": 0.0,
+            "estimated_full_test_time_s": 135.3
+        },
         "hunyuan3d_shape_gen": {
             "stages_ms": {
                 "Hunyuan3DShapeBeforeDenoisingStage": 544.59,

diff --git a/python/sglang/multimodal_gen/test/server/test_server_utils.py b/python/sglang/multimodal_gen/test/server/test_server_utils.py
@@ -943,7 +943,8 @@ def generate_image(case_id, client) -> tuple[str, bytes]:
 
         # Build extra_body for optional features
         extra_body = dict(sampling_params.extras)
-
+        if sampling_params.diffusers_kwargs:
+            extra_body["diffusers_kwargs"] = sampling_params.diffusers_kwargs
         response = client.images.with_raw_response.generate(
             model=model_path,
             prompt=sampling_params.prompt,

diff --git a/python/sglang/multimodal_gen/test/server/testcase_configs.py b/python/sglang/multimodal_gen/test/server/testcase_configs.py
@@ -247,6 +247,8 @@ class DiffusionSamplingParams:
     # merged directly into the OpenAI extra_body dict.
     extras: dict = field(default_factory=dict)
 
+    diffusers_kwargs: dict | None = None  # when non-empty, generate_image in test_server_utils copies it into extra_body["diffusers_kwargs"]
+
 
 @dataclass(frozen=True)
 class DiffusionTestCase: