diff --git a/python/sglang/multimodal_gen/runtime/platforms/cuda.py b/python/sglang/multimodal_gen/runtime/platforms/cuda.py
index 4f1d41b11d4e..2df5495b1bf7 100644
--- a/python/sglang/multimodal_gen/runtime/platforms/cuda.py
+++ b/python/sglang/multimodal_gen/runtime/platforms/cuda.py
@@ -234,9 +234,6 @@ def get_attn_backend_cls_str(
 
                 set_fa_ver(4)
             target_backend = AttentionBackendEnum.FA
-            logger.debug(
-                f"Using FlashAttention (FA3 for hopper, FA4 for blackwell) as default backend"
-            )
 
         if not cls.has_device_capability(80):
             logger.info(
diff --git a/python/sglang/multimodal_gen/test/server/perf_baselines.json b/python/sglang/multimodal_gen/test/server/perf_baselines.json
index 4b06fcb08884..cc668582aab6 100644
--- a/python/sglang/multimodal_gen/test/server/perf_baselines.json
+++ b/python/sglang/multimodal_gen/test/server/perf_baselines.json
@@ -97,25 +97,69 @@
         },
         "flux_image_t2i": {
             "stages_ms": {
-                "InputValidationStage": 0.08,
-                "TextEncodingStage": 87.36,
-                "ConditioningStage": 0.024,
-                "TimestepPreparationStage": 2.53,
-                "LatentPreparationStage": 6.21,
-                "DenoisingStage": 8161.97,
-                "DecodingStage": 378.21
+                "InputValidationStage": 1.06,
+                "TextEncodingStage": 56.29,
+                "ConditioningStage": 0.01,
+                "TimestepPreparationStage": 1.91,
+                "LatentPreparationStage": 4.92,
+                "DenoisingStage": 8154.07,
+                "DecodingStage": 302.01
             },
             "denoise_step_ms": {
-                "0": 56.16,
-                "10": 163.54,
-                "20": 165.94,
-                "29": 165.26,
-                "39": 165.16,
-                "49": 165.36
+                "0": 56.03,
+                "1": 57.19,
+                "2": 162.79,
+                "3": 162.71,
+                "4": 163.85,
+                "5": 163.34,
+                "6": 170.21,
+                "7": 165.57,
+                "8": 163.95,
+                "9": 164.27,
+                "10": 163.75,
+                "11": 163.55,
+                "12": 166.11,
+                "13": 166.15,
+                "14": 166.15,
+                "15": 164.95,
+                "16": 163.63,
+                "17": 164.53,
+                "18": 165.59,
+                "19": 165.06,
+                "20": 165.73,
+                "21": 164.28,
+                "22": 164.29,
+                "23": 165.02,
+                "24": 165.53,
+                "25": 164.88,
+                "26": 165.92,
+                "27": 164.68,
+                "28": 164.75,
+                "29": 165.02,
+                "30": 164.16,
+                "31": 164.61,
+                "32": 165.43,
+                "33": 165.0,
+                "34": 164.6,
+                "35": 164.78,
+                "36": 165.14,
+                "37": 164.53,
+                "38": 165.46,
+                "39": 165.48,
+                "40": 164.8,
+                "41": 166.22,
+                "42": 166.08,
+                "43": 164.82,
+                "44": 165.29,
+                "45": 165.72,
+                "46": 165.4,
+                "47": 165.57,
+                "48": 164.96,
+                "49": 166.05
             },
-            "expected_e2e_ms": 8764.73,
-            "expected_avg_denoise_ms": 160.79,
-            "expected_median_denoise_ms": 165.14
+            "expected_e2e_ms": 8677.09,
+            "expected_avg_denoise_ms": 160.67,
+            "expected_median_denoise_ms": 164.96
         },
         "qwen_image_edit_ti2i": {
             "notes": "single uploaded reference image, Qwen/Qwen-Image-Edit",
@@ -185,57 +229,132 @@
                 "49": 720.0
             }
         },
-        "fastwan2_1_t2v": {
-            "notes": "Single-video generation using the default prompt",
-            "expected_e2e_ms": 95616.59,
-            "expected_avg_denoise_ms": 1798.77,
-            "expected_median_denoise_ms": 1786.78,
+        "wan2_1_t2v_1.3b": {
             "stages_ms": {
-                "InputValidationStage": 1.03,
-                "TextEncodingStage": 3450.0,
-                "ConditioningStage": 1.0,
-                "TimestepPreparationStage": 6.0,
-                "LatentPreparationStage": 15.0,
-                "DenoisingStage": 90100.0,
-                "DecodingStage": 3650.0
+                "InputValidationStage": 0.08,
+                "TextEncodingStage": 2140.81,
+                "ConditioningStage": 0.01,
+                "TimestepPreparationStage": 1.97,
+                "LatentPreparationStage": 6.85,
+                "DenoisingStage": 69372.83,
+                "DecodingStage": 3617.56,
+                "per_frame_generation": null
             },
             "denoise_step_ms": {
-                "0": 3500.0,
-                "10": 1800.0,
-                "20": 1800.0,
-                "29": 1800.0,
-                "39": 1800.0,
-                "49": 1800.0
+                "0": 1998.19,
+                "1": 1356.14,
+                "2": 1368.21,
+                "3": 1369.72,
+                "4": 1369.99,
+                "5": 1369.59,
+                "6": 1373.51,
+                "7": 1370.87,
+                "8": 1372.93,
+                "9": 1374.36,
+                "10": 1377.02,
+                "11": 1369.3,
+                "12": 1371.37,
+                "13": 1372.18,
+                "14": 1374.18,
+                "15": 1372.42,
+                "16": 1372.78,
+                "17": 1373.88,
+                "18": 1374.75,
+                "19": 1373.75,
+                "20": 1374.48,
+                "21": 1373.56,
+                "22": 1372.96,
+                "23": 1373.94,
+                "24": 1375.37,
+                "25": 1374.7,
+                "26": 1375.98,
+                "27": 1375.46,
+                "28": 1375.66,
+                "29": 1374.0,
+                "30": 1374.51,
+                "31": 1378.0,
+                "32": 1377.06,
+                "33": 1373.76,
+                "34": 1373.49,
+                "35": 1374.65,
+                "36": 1376.83,
+                "37": 1376.62,
+                "38": 1376.01,
+                "39": 1376.99,
+                "40": 1374.0,
+                "41": 1374.25,
+                "42": 1374.06,
+                "43": 1373.53,
+                "44": 1372.87,
+                "45": 1374.37,
+                "46": 1376.22,
+                "47": 1375.73,
+                "48": 1376.33,
+                "49": 1373.45
             },
-            "frames_per_second": 0.51,
-            "total_frames": 49,
-            "avg_frame_time_ms": 1951.36
+            "expected_e2e_ms": 75157.98,
+            "expected_avg_denoise_ms": 1386.08,
+            "expected_median_denoise_ms": 1374.03
         },
+
         "wan2_2_i2v_a14b": {
-            "notes": "Wan-AI/Wan2.2-I2V-A14B",
-            "expected_e2e_ms": 282500.0,
-            "expected_avg_denoise_ms": 7000.0,
-            "expected_median_denoise_ms": 7000.19,
             "stages_ms": {
-                "InputValidationStage": 26.26,
-                "TextEncodingStage": 2749.6,
-                "ConditioningStage": 2.0,
-                "TimestepPreparationStage": 2.0,
-                "LatentPreparationStage": 10.0,
-                "ImageVAEEncodingStage": 2031.0,
-                "DenoisingStage": 278000.0,
-                "DecodingStage": 2849.6
+                "InputValidationStage": 33.55,
+                "TextEncodingStage": 2236.59,
+                "ConditioningStage": 0.01,
+                "TimestepPreparationStage": 1.52,
+                "LatentPreparationStage": 7.01,
+                "ImageVAEEncodingStage": 1904.33,
+                "DenoisingStage": 231725.24,
+                "DecodingStage": 2655.39,
+                "per_frame_generation": null
             },
             "denoise_step_ms": {
-                "0": 24000.0,
-                "8": 7000.0,
-                "16": 7000.0,
-                "23": 7000.0,
-                "31": 7000.0,
-                "39": 7000.0
-            }
+                "0": 18686.84,
+                "1": 4872.1,
+                "2": 4907.91,
+                "3": 4912.87,
+                "4": 4916.4,
+                "5": 4899.71,
+                "6": 4898.83,
+                "7": 4899.0,
+                "8": 4913.25,
+                "9": 4904.2,
+                "10": 4908.81,
+                "11": 4907.16,
+                "12": 4909.81,
+                "13": 4909.15,
+                "14": 4907.37,
+                "15": 26749.56,
+                "16": 4891.89,
+                "17": 4883.41,
+                "18": 4885.81,
+                "19": 4892.38,
+                "20": 4894.36,
+                "21": 4894.3,
+                "22": 4902.87,
+                "23": 4901.43,
+                "24": 4901.22,
+                "25": 4900.06,
+                "26": 4900.19,
+                "27": 4908.87,
+                "28": 4902.73,
+                "29": 4900.45,
+                "30": 4897.09,
+                "31": 4901.3,
+                "32": 4900.94,
+                "33": 4898.13,
+                "34": 4900.62,
+                "35": 4897.47,
+                "36": 4900.57,
+                "37": 4897.09,
+                "38": 4897.7,
+                "39": 4888.4
+            },
+            "expected_e2e_ms": 238587.95,
+            "expected_avg_denoise_ms": 5791.06,
+            "expected_median_denoise_ms": 4900.59
         },
-
         "wan2_1_i2v_14b_480P": {
             "stages_ms": {
                 "per_frame_generation": null
@@ -298,54 +417,139 @@
         },
         "wan2_2_i2v_14b_720P": {
             "stages_ms": {
-                "InputValidationStage": 47.04,
-                "TextEncodingStage": 2321.27,
-                "ImageEncodingStage": 3244.34,
-                "ConditioningStage": 0.0234,
-                "TimestepPreparationStage": 2.88,
-                "LatentPreparationStage": 5.24,
-                "ImageVAEEncodingStage": 1887.64,
-                "DenoisingStage": 245826.78,
-                "DecodingStage": 2882.45,
+                "InputValidationStage": 53.67,
+                "TextEncodingStage": 2228.89,
+                "ImageEncodingStage": 1908.14,
+                "ConditioningStage": 0.01,
+                "TimestepPreparationStage": 2.25,
+                "LatentPreparationStage": 5.27,
+                "ImageVAEEncodingStage": 1887.72,
+                "DenoisingStage": 245828.78,
+                "DecodingStage": 2872.0,
                 "per_frame_generation": null
             },
             "denoise_step_ms": {
-                "0": 5429.38,
-                "10": 4901.39,
-                "20": 4912.89,
-                "29": 4900.75,
-                "39": 4906.23,
-                "49": 4892.55
+                "0": 5432.36,
+                "1": 4867.45,
+                "2": 4903.52,
+                "3": 4916.85,
+                "4": 4928.58,
+                "5": 4927.72,
+                "6": 4916.65,
+                "7": 4910.61,
+                "8": 4900.32,
+                "9": 4901.14,
+                "10": 4907.02,
+                "11": 4908.95,
+                "12": 4903.16,
+                "13": 4910.68,
+                "14": 4907.48,
+                "15": 4901.52,
+                "16": 4909.76,
+                "17": 4905.68,
+                "18": 4904.09,
+                "19": 4908.87,
+                "20": 4897.14,
+                "21": 4905.94,
+                "22": 4916.29,
+                "23": 4904.12,
+                "24": 4912.86,
+                "25": 4904.16,
+                "26": 4916.7,
+                "27": 4918.63,
+                "28": 4903.83,
+                "29": 4909.2,
+                "30": 4898.29,
+                "31": 4901.82,
+                "32": 4902.38,
+                "33": 4910.32,
+                "34": 4903.55,
+                "35": 4902.21,
+                "36": 4907.05,
+                "37": 4911.2,
+                "38": 4898.33,
+                "39": 4902.63,
+                "40": 4895.15,
+                "41": 4898.9,
+                "42": 4901.14,
+                "43": 4896.49,
+                "44": 4899.01,
+                "45": 4896.71,
+                "46": 4887.24,
+                "47": 4892.91,
+                "48": 4886.71,
+                "49": 4885.16
             },
-            "expected_e2e_ms": 254850.94,
-            "expected_avg_denoise_ms": 4914.73,
-            "expected_median_denoise_ms": 4903.27
+            "expected_e2e_ms": 254809.55,
+            "expected_avg_denoise_ms": 4914.77,
+            "expected_median_denoise_ms": 4903.96
         },
         "wan2_2_ti2v_5b": {
-            "notes": "Text-and-Image-to-Video generation baseline for Wan2.2-TI2V-5B",
-            "expected_e2e_ms": 178300.0,
-            "expected_avg_denoise_ms": 3250.0,
-            "expected_median_denoise_ms": 3260.0,
             "stages_ms": {
-                "InputValidationStage": 150.0,
-                "TextEncodingStage": 3000.0,
-                "ConditioningStage": 1.0,
-                "TimestepPreparationStage": 6.0,
-                "LatentPreparationStage": 30.0,
-                "DenoisingStage": 162900.0,
-                "DecodingStage": 14767.0
+                "InputValidationStage": 79.9,
+                "TextEncodingStage": 2241.88,
+                "ConditioningStage": 0.01,
+                "TimestepPreparationStage": 1.63,
+                "LatentPreparationStage": 22.52,
+                "DenoisingStage": 130094.11,
+                "DecodingStage": 12793.58,
+                "per_frame_generation": null
             },
             "denoise_step_ms": {
-                "0": 3700.0,
-                "10": 3300.0,
-                "20": 3300.0,
-                "29": 3300.0,
-                "39": 3300.0,
-                "49": 3300.0
+                "0": 3092.88,
+                "1": 2526.43,
+                "2": 2544.76,
+                "3": 2543.33,
+                "4": 2545.33,
+                "5": 2542.44,
+                "6": 2540.33,
+                "7": 2542.23,
+                "8": 2544.87,
+                "9": 2547.86,
+                "10": 2548.18,
+                "11": 2551.13,
+                "12": 2547.02,
+                "13": 2551.31,
+                "14": 2551.97,
+                "15": 2549.61,
+                "16": 2551.75,
+                "17": 2552.97,
+                "18": 2551.2,
+                "19": 2555.07,
+                "20": 2552.72,
+                "21": 2551.24,
+                "22": 2554.63,
+                "23": 2555.52,
+                "24": 2555.06,
+                "25": 2550.04,
+                "26": 2554.88,
+                "27": 2553.69,
+                "28": 2550.75,
+                "29": 2555.17,
+                "30": 2556.75,
+                "31": 2554.22,
+                "32": 2552.74,
+                "33": 2554.31,
+                "34": 2554.98,
+                "35": 2553.65,
+                "36": 2552.21,
+                "37": 2554.85,
+                "38": 2555.96,
+                "39": 2553.78,
+                "40": 2553.5,
+                "41": 2550.98,
+                "42": 2555.66,
+                "43": 2551.91,
+                "44": 2551.23,
+                "45": 2555.91,
+                "46": 2556.11,
+                "47": 2548.55,
+                "48": 2552.78,
+                "49": 2553.49
             },
-            "frames_per_second": null,
-            "total_frames": null,
-            "avg_frame_time_ms": null
+            "expected_e2e_ms": 145253.72,
+            "expected_avg_denoise_ms": 2561.76,
+            "expected_median_denoise_ms": 2552.46
         }
     }
 }
diff --git a/python/sglang/multimodal_gen/test/server/test_server_performance.py b/python/sglang/multimodal_gen/test/server/test_server_performance.py
index 45c62433f6c4..ce517403aa1f 100644
--- a/python/sglang/multimodal_gen/test/server/test_server_performance.py
+++ b/python/sglang/multimodal_gen/test/server/test_server_performance.py
@@ -1,5 +1,8 @@
 """
 Config-driven diffusion performance test with pytest parametrization.
+
+
+If a run is significantly faster than its baseline, the improved cases are printed together with their suggested updated baselines.
 """
 
 from __future__ import annotations
@@ -27,6 +30,7 @@
     DIFFUSION_CASES,
     DiffusionTestCase,
     PerformanceSummary,
+    ScenarioConfig,
 )
 from sglang.multimodal_gen.test.test_utils import (
     get_dynamic_server_port,
@@ -110,10 +114,12 @@ class TestDiffusionPerformance:
     """
 
     _perf_results: list[dict[str, Any]] = []
+    _improved_baselines: list[dict[str, Any]] = []
 
     @classmethod
     def setup_class(cls):
         cls._perf_results = []
+        cls._improved_baselines = []
 
     @classmethod
     def teardown_class(cls):
@@ -121,6 +127,20 @@ def teardown_class(cls):
             result["class_name"] = cls.__name__
             _GLOBAL_PERF_RESULTS.append(result)
 
+        if cls._improved_baselines:
+            import json
+
+            output = """
+--- POTENTIAL BASELINE IMPROVEMENTS DETECTED ---
+The following test cases performed significantly better than their baselines.
+Consider updating perf_baselines.json with the snippets below:
+"""
+            for item in cls._improved_baselines:
+                output += (
+                    f'\n"{item["id"]}": {json.dumps(item["baseline"], indent=4)},\n'
+                )
+            print(output)
+
     def _client(self, ctx: ServerContext) -> OpenAI:
         """Get OpenAI client for the server."""
         return OpenAI(
@@ -150,7 +170,6 @@ def _run_and_collect(
 
         stage_metrics = {}
         if perf_record:
-
             stage_metrics, _ = wait_for_stage_metrics(
                 perf_record.get("request_id", ""),
                 prev_len,
@@ -358,11 +377,23 @@ def _validate_and_record(
         is_baseline_generation_mode = os.environ.get("SGLANG_GEN_BASELINE", "0") == "1"
 
         scenario = BASELINE_CONFIG.scenarios.get(case.id)
+        missing_scenario = False
         if scenario is None:
-            if is_baseline_generation_mode:
-                scenario = {}  # Dummy scenario
-            else:
-                pytest.fail(f"Testcase '{case.id}' not in perf_baselines.json")
+            # Create dummy scenario to allow metric collection
+            scenario = type(
+                "DummyScenario",
+                (),
+                {
+                    "expected_e2e_ms": 0,
+                    "expected_avg_denoise_ms": 0,
+                    "expected_median_denoise_ms": 0,
+                    "stages_ms": {},
+                    "denoise_step_ms": {},
+                },
+            )()
+            if not is_baseline_generation_mode:
+                missing_scenario = True
+
         validator_name = case.custom_validator or "default"
         validator_class = VALIDATOR_REGISTRY.get(validator_name, PerformanceValidator)
 
@@ -374,10 +405,14 @@ def _validate_and_record(
 
         summary = validator.collect_metrics(perf_record, stage_metrics)
 
-        if is_baseline_generation_mode:
+        if is_baseline_generation_mode or missing_scenario:
             self._dump_baseline_scenario(case, summary)
+            if missing_scenario:
+                pytest.fail(f"Testcase '{case.id}' not found in perf_baselines.json")
             return
 
+        self._check_for_improvement(case, summary, scenario)
+
         try:
             validator.validate(perf_record, stage_metrics, case.num_frames)
         except AssertionError:
@@ -458,6 +493,80 @@ def _validate_and_record(
                 summary.avg_frame_time_ms,
             )
 
+    def _check_for_improvement(
+        self,
+        case: DiffusionTestCase,
+        summary: PerformanceSummary,
+        scenario: "ScenarioConfig",
+    ) -> None:
+        """Record a suggested baseline when this run is significantly faster.
+
+        A metric is flagged when it is below
+        ``expected * (1 - BASELINE_CONFIG.improvement_threshold)``. Metrics whose
+        baseline is missing, ``None`` or ``0`` are never flagged. When anything is
+        flagged, the lower of measured/baseline for every metric is appended to
+        ``self._improved_baselines`` together with ``case.id``.
+        """
+        threshold = BASELINE_CONFIG.improvement_threshold
+
+        def is_sig_faster(actual, expected):
+            # Without a usable reference (missing, null, zero) nothing is compared.
+            if actual is None or not expected:
+                return False
+            return actual < expected * (1 - threshold)
+
+        def merge_best(measured, baseline):
+            # Measured keys first, then baseline-only ones: stable printed order.
+            merged = {}
+            for key in dict.fromkeys([*measured, *baseline]):
+                known = [
+                    v for v in (measured.get(key), baseline.get(key)) if v is not None
+                ]
+                # Keep null placeholders (e.g. "per_frame_generation") as null.
+                merged[key] = round(min(known), 2) if known else None
+            return merged
+
+        # Only metrics present in the baseline can count as an improvement; a
+        # stage that is new in this run has nothing to be compared against.
+        is_improved = (
+            is_sig_faster(summary.e2e_ms, scenario.expected_e2e_ms)
+            or is_sig_faster(summary.avg_denoise_ms, scenario.expected_avg_denoise_ms)
+            or is_sig_faster(
+                summary.median_denoise_ms, scenario.expected_median_denoise_ms
+            )
+            or any(
+                is_sig_faster(summary.stage_metrics.get(stage), expected)
+                for stage, expected in scenario.stages_ms.items()
+            )
+            or any(
+                is_sig_faster(summary.all_denoise_steps.get(step), expected)
+                for step, expected in scenario.denoise_step_ms.items()
+            )
+        )
+        if not is_improved:
+            return
+
+        # Suggested values: always the better (lower) of measured and baseline.
+        new_stages = merge_best(summary.stage_metrics, scenario.stages_ms)
+        new_denoise_steps = merge_best(
+            summary.all_denoise_steps, scenario.denoise_step_ms
+        )
+        new_baseline = {
+            "stages_ms": new_stages,
+            "denoise_step_ms": {str(k): v for k, v in new_denoise_steps.items()},
+            "expected_e2e_ms": round(
+                min(summary.e2e_ms, scenario.expected_e2e_ms), 2
+            ),
+            "expected_avg_denoise_ms": round(
+                min(summary.avg_denoise_ms, scenario.expected_avg_denoise_ms), 2
+            ),
+            "expected_median_denoise_ms": round(
+                min(summary.median_denoise_ms, scenario.expected_median_denoise_ms),
+                2,
+            ),
+        }
+        self._improved_baselines.append({"id": case.id, "baseline": new_baseline})
+
     def _dump_baseline_scenario(
         self, case: DiffusionTestCase, summary: "PerformanceSummary"
     ) -> None:
@@ -465,7 +574,7 @@ def _dump_baseline_scenario(
         import json
 
         denoise_steps_formatted = {
-            str(k): round(v, 2) for k, v in summary.sampled_steps.items()
+            str(k): round(v, 2) for k, v in summary.all_denoise_steps.items()
         }
         stages_formatted = {k: round(v, 2) for k, v in summary.stage_metrics.items()}
 
diff --git a/python/sglang/multimodal_gen/test/server/test_server_utils.py b/python/sglang/multimodal_gen/test/server/test_server_utils.py
index eb32bc98ac6c..34f593096cd6 100644
--- a/python/sglang/multimodal_gen/test/server/test_server_utils.py
+++ b/python/sglang/multimodal_gen/test/server/test_server_utils.py
@@ -365,6 +365,7 @@ def collect_metrics(
             median_denoise_ms=median_denoise,
             stage_metrics=stage_metrics,
             sampled_steps=sampled_steps,
+            all_denoise_steps=per_step,
         )
 
     def _validate_e2e(self, summary: PerformanceSummary) -> None:
diff --git a/python/sglang/multimodal_gen/test/server/testcase_configs.py b/python/sglang/multimodal_gen/test/server/testcase_configs.py
index 94bfcb8b3d23..10d30e2e883e 100644
--- a/python/sglang/multimodal_gen/test/server/testcase_configs.py
+++ b/python/sglang/multimodal_gen/test/server/testcase_configs.py
@@ -54,6 +54,7 @@ class BaselineConfig:
     step_fractions: Sequence[float]
     warmup_defaults: dict[str, int]
     tolerances: ToleranceConfig
+    improvement_threshold: float
 
     @classmethod
     def load(cls, path: Path) -> BaselineConfig:
@@ -88,6 +89,9 @@ def load(cls, path: Path) -> BaselineConfig:
             step_fractions=tuple(data["sampling"]["step_fractions"]),
             warmup_defaults=data["sampling"].get("warmup_requests", {}),
             tolerances=tolerances,
+            improvement_threshold=data.get("improvement_reporting", {}).get(
+                "threshold", 0.2
+            ),
         )
 
 
@@ -134,6 +138,7 @@ class PerformanceSummary:
     median_denoise_ms: float
     stage_metrics: dict[str, float]
     sampled_steps: dict[int, float]
+    all_denoise_steps: dict[int, float]
     frames_per_second: float | None = None
     total_frames: int | None = None
     avg_frame_time_ms: float | None = None
@@ -182,7 +187,7 @@ class PerformanceSummary:
     # === Text to Video (T2V) ===
     # TODO: FastWan2.1, FastWan2.2
     DiffusionTestCase(
-        id="fastwan2_1_t2v",
+        id="wan2_1_t2v_1.3b",
         model_path="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
         modality="video",
         prompt="A curious raccoon",