diff --git a/python/sglang/multimodal_gen/runtime/platforms/cuda.py b/python/sglang/multimodal_gen/runtime/platforms/cuda.py index 4f1d41b11d4e..2df5495b1bf7 100644 --- a/python/sglang/multimodal_gen/runtime/platforms/cuda.py +++ b/python/sglang/multimodal_gen/runtime/platforms/cuda.py @@ -234,9 +234,6 @@ def get_attn_backend_cls_str( set_fa_ver(4) target_backend = AttentionBackendEnum.FA - logger.debug( - f"Using FlashAttention (FA3 for hopper, FA4 for blackwell) as default backend" - ) if not cls.has_device_capability(80): logger.info( diff --git a/python/sglang/multimodal_gen/test/server/perf_baselines.json b/python/sglang/multimodal_gen/test/server/perf_baselines.json index 4b06fcb08884..cc668582aab6 100644 --- a/python/sglang/multimodal_gen/test/server/perf_baselines.json +++ b/python/sglang/multimodal_gen/test/server/perf_baselines.json @@ -97,25 +97,69 @@ }, "flux_image_t2i": { "stages_ms": { - "InputValidationStage": 0.08, - "TextEncodingStage": 87.36, - "ConditioningStage": 0.024, - "TimestepPreparationStage": 2.53, - "LatentPreparationStage": 6.21, - "DenoisingStage": 8161.97, - "DecodingStage": 378.21 + "InputValidationStage": 1.06, + "TextEncodingStage": 56.29, + "ConditioningStage": 0.01, + "TimestepPreparationStage": 1.91, + "LatentPreparationStage": 4.92, + "DenoisingStage": 8154.07, + "DecodingStage": 302.01 }, "denoise_step_ms": { - "0": 56.16, - "10": 163.54, - "20": 165.94, - "29": 165.26, - "39": 165.16, - "49": 165.36 + "0": 56.03, + "1": 57.19, + "2": 162.79, + "3": 162.71, + "4": 163.85, + "5": 163.34, + "6": 170.21, + "7": 165.57, + "8": 163.95, + "9": 164.27, + "10": 163.75, + "11": 163.55, + "12": 166.11, + "13": 166.15, + "14": 166.15, + "15": 164.95, + "16": 163.63, + "17": 164.53, + "18": 165.59, + "19": 165.06, + "20": 165.73, + "21": 164.28, + "22": 164.29, + "23": 165.02, + "24": 165.53, + "25": 164.88, + "26": 165.92, + "27": 164.68, + "28": 164.75, + "29": 165.02, + "30": 164.16, + "31": 164.61, + "32": 165.43, + "33": 165.0, + "34": 164.6, + "35": 164.78, + "36": 165.14, + "37": 164.53, + "38": 165.46, + "39": 165.48, + "40": 164.8, + "41": 166.22, + "42": 166.08, + "43": 164.82, + "44": 165.29, + "45": 165.72, + "46": 165.4, + "47": 165.57, + "48": 164.96, + "49": 166.05 }, - "expected_e2e_ms": 8764.73, - "expected_avg_denoise_ms": 160.79, - "expected_median_denoise_ms": 165.14 + "expected_e2e_ms": 8677.09, + "expected_avg_denoise_ms": 160.67, + "expected_median_denoise_ms": 164.96 }, "qwen_image_edit_ti2i": { "notes": "single uploaded reference image, Qwen/Qwen-Image-Edit", @@ -185,57 +229,132 @@ "49": 720.0 } }, - "fastwan2_1_t2v": { - "notes": "Single-video generation using the default prompt", - "expected_e2e_ms": 95616.59, - "expected_avg_denoise_ms": 1798.77, - "expected_median_denoise_ms": 1786.78, + "wan2_1_t2v_1.3b": { "stages_ms": { - "InputValidationStage": 1.03, - "TextEncodingStage": 3450.0, - "ConditioningStage": 1.0, - "TimestepPreparationStage": 6.0, - "LatentPreparationStage": 15.0, - "DenoisingStage": 90100.0, - "DecodingStage": 3650.0 + "InputValidationStage": 0.08, + "TextEncodingStage": 2140.81, + "ConditioningStage": 0.01, + "TimestepPreparationStage": 1.97, + "LatentPreparationStage": 6.85, + "DenoisingStage": 69372.83, + "DecodingStage": 3617.56, + "per_frame_generation": null }, "denoise_step_ms": { - "0": 3500.0, - "10": 1800.0, - "20": 1800.0, - "29": 1800.0, - "39": 1800.0, - "49": 1800.0 + "0": 1998.19, + "1": 1356.14, + "2": 1368.21, + "3": 1369.72, + "4": 1369.99, + "5": 1369.59, + "6": 1373.51, + "7": 1370.87, + "8": 1372.93, + "9": 1374.36, + "10": 1377.02, + "11": 1369.3, + "12": 1371.37, + "13": 1372.18, + "14": 1374.18, + "15": 1372.42, + "16": 1372.78, + "17": 1373.88, + "18": 1374.75, + "19": 1373.75, + "20": 1374.48, + "21": 1373.56, + "22": 1372.96, + "23": 1373.94, + "24": 1375.37, + "25": 1374.7, + "26": 1375.98, + "27": 1375.46, + "28": 1375.66, + "29": 1374.0, + "30": 1374.51, + "31": 1378.0, + "32": 1377.06, + "33": 1373.76, + "34": 1373.49, + "35": 1374.65, + "36": 1376.83, + "37": 1376.62, + "38": 1376.01, + "39": 1376.99, + "40": 1374.0, + "41": 1374.25, + "42": 1374.06, + "43": 1373.53, + "44": 1372.87, + "45": 1374.37, + "46": 1376.22, + "47": 1375.73, + "48": 1376.33, + "49": 1373.45 }, - "frames_per_second": 0.51, - "total_frames": 49, - "avg_frame_time_ms": 1951.36 + "expected_e2e_ms": 75157.98, + "expected_avg_denoise_ms": 1386.08, + "expected_median_denoise_ms": 1374.03 }, + "wan2_2_i2v_a14b": { - "notes": "Wan-AI/Wan2.2-I2V-A14B", - "expected_e2e_ms": 282500.0, - "expected_avg_denoise_ms": 7000.0, - "expected_median_denoise_ms": 7000.19, "stages_ms": { - "InputValidationStage": 26.26, - "TextEncodingStage": 2749.6, - "ConditioningStage": 2.0, - "TimestepPreparationStage": 2.0, - "LatentPreparationStage": 10.0, - "ImageVAEEncodingStage": 2031.0, - "DenoisingStage": 278000.0, - "DecodingStage": 2849.6 + "InputValidationStage": 33.55, + "TextEncodingStage": 2236.59, + "ConditioningStage": 0.01, + "TimestepPreparationStage": 1.52, + "LatentPreparationStage": 7.01, + "ImageVAEEncodingStage": 1904.33, + "DenoisingStage": 231725.24, + "DecodingStage": 2655.39, + "per_frame_generation": null }, "denoise_step_ms": { - "0": 24000.0, - "8": 7000.0, - "16": 7000.0, - "23": 7000.0, - "31": 7000.0, - "39": 7000.0 - } + "0": 18686.84, + "1": 4872.1, + "2": 4907.91, + "3": 4912.87, + "4": 4916.4, + "5": 4899.71, + "6": 4898.83, + "7": 4899.0, + "8": 4913.25, + "9": 4904.2, + "10": 4908.81, + "11": 4907.16, + "12": 4909.81, + "13": 4909.15, + "14": 4907.37, + "15": 26749.56, + "16": 4891.89, + "17": 4883.41, + "18": 4885.81, + "19": 4892.38, + "20": 4894.36, + "21": 4894.3, + "22": 4902.87, + "23": 4901.43, + "24": 4901.22, + "25": 4900.06, + "26": 4900.19, + "27": 4908.87, + "28": 4902.73, + "29": 4900.45, + "30": 4897.09, + "31": 4901.3, + "32": 4900.94, + "33": 4898.13, + "34": 4900.62, + "35": 4897.47, + "36": 4900.57, + "37": 4897.09, + "38": 4897.7, + "39": 4888.4 + }, + "expected_e2e_ms": 238587.95, + "expected_avg_denoise_ms": 5791.06, + "expected_median_denoise_ms": 4900.59 }, - "wan2_1_i2v_14b_480P": { "stages_ms": { "per_frame_generation": null @@ -298,54 +417,139 @@ }, "wan2_2_i2v_14b_720P": { "stages_ms": { - "InputValidationStage": 47.04, - "TextEncodingStage": 2321.27, - "ImageEncodingStage": 3244.34, - "ConditioningStage": 0.0234, - "TimestepPreparationStage": 2.88, - "LatentPreparationStage": 5.24, - "ImageVAEEncodingStage": 1887.64, - "DenoisingStage": 245826.78, - "DecodingStage": 2882.45, + "InputValidationStage": 53.67, + "TextEncodingStage": 2228.89, + "ImageEncodingStage": 1908.14, + "ConditioningStage": 0.01, + "TimestepPreparationStage": 2.25, + "LatentPreparationStage": 5.27, + "ImageVAEEncodingStage": 1887.72, + "DenoisingStage": 245828.78, + "DecodingStage": 2872.0, "per_frame_generation": null }, "denoise_step_ms": { - "0": 5429.38, - "10": 4901.39, - "20": 4912.89, - "29": 4900.75, - "39": 4906.23, - "49": 4892.55 + "0": 5432.36, + "1": 4867.45, + "2": 4903.52, + "3": 4916.85, + "4": 4928.58, + "5": 4927.72, + "6": 4916.65, + "7": 4910.61, + "8": 4900.32, + "9": 4901.14, + "10": 4907.02, + "11": 4908.95, + "12": 4903.16, + "13": 4910.68, + "14": 4907.48, + "15": 4901.52, + "16": 4909.76, + "17": 4905.68, + "18": 4904.09, + "19": 4908.87, + "20": 4897.14, + "21": 4905.94, + "22": 4916.29, + "23": 4904.12, + "24": 4912.86, + "25": 4904.16, + "26": 4916.7, + "27": 4918.63, + "28": 4903.83, + "29": 4909.2, + "30": 4898.29, + "31": 4901.82, + "32": 4902.38, + "33": 4910.32, + "34": 4903.55, + "35": 4902.21, + "36": 4907.05, + "37": 4911.2, + "38": 4898.33, + "39": 4902.63, + "40": 4895.15, + "41": 4898.9, + "42": 4901.14, + "43": 4896.49, + "44": 4899.01, + "45": 4896.71, + "46": 4887.24, + "47": 4892.91, + "48": 4886.71, + "49": 4885.16 }, - "expected_e2e_ms": 254850.94, - "expected_avg_denoise_ms": 4914.73, - "expected_median_denoise_ms": 4903.27 + "expected_e2e_ms": 254809.55, + "expected_avg_denoise_ms": 4914.77, + "expected_median_denoise_ms": 4903.96 }, "wan2_2_ti2v_5b": { - "notes": "Text-and-Image-to-Video generation baseline for Wan2.2-TI2V-5B", - "expected_e2e_ms": 178300.0, - "expected_avg_denoise_ms": 3250.0, - "expected_median_denoise_ms": 3260.0, "stages_ms": { - "InputValidationStage": 150.0, - "TextEncodingStage": 3000.0, - "ConditioningStage": 1.0, - "TimestepPreparationStage": 6.0, - "LatentPreparationStage": 30.0, - "DenoisingStage": 162900.0, - "DecodingStage": 14767.0 + "InputValidationStage": 79.9, + "TextEncodingStage": 2241.88, + "ConditioningStage": 0.01, + "TimestepPreparationStage": 1.63, + "LatentPreparationStage": 22.52, + "DenoisingStage": 130094.11, + "DecodingStage": 12793.58, + "per_frame_generation": null }, "denoise_step_ms": { - "0": 3700.0, - "10": 3300.0, - "20": 3300.0, - "29": 3300.0, - "39": 3300.0, - "49": 3300.0 + "0": 3092.88, + "1": 2526.43, + "2": 2544.76, + "3": 2543.33, + "4": 2545.33, + "5": 2542.44, + "6": 2540.33, + "7": 2542.23, + "8": 2544.87, + "9": 2547.86, + "10": 2548.18, + "11": 2551.13, + "12": 2547.02, + "13": 2551.31, + "14": 2551.97, + "15": 2549.61, + "16": 2551.75, + "17": 2552.97, + "18": 2551.2, + "19": 2555.07, + "20": 2552.72, + "21": 2551.24, + "22": 2554.63, + "23": 2555.52, + "24": 2555.06, + "25": 2550.04, + "26": 2554.88, + "27": 2553.69, + "28": 2550.75, + "29": 2555.17, + "30": 2556.75, + "31": 2554.22, + "32": 2552.74, + "33": 2554.31, + "34": 2554.98, + "35": 2553.65, + "36": 2552.21, + "37": 2554.85, + "38": 2555.96, + "39": 2553.78, + "40": 2553.5, + "41": 2550.98, + "42": 2555.66, + "43": 2551.91, + "44": 2551.23, + "45": 2555.91, + "46": 2556.11, + "47": 2548.55, + "48": 2552.78, + "49": 2553.49 }, - "frames_per_second": null, - "total_frames": null, - "avg_frame_time_ms": null + "expected_e2e_ms": 145253.72, + "expected_avg_denoise_ms": 2561.76, + "expected_median_denoise_ms": 2552.46 } } } diff --git a/python/sglang/multimodal_gen/test/server/test_server_performance.py b/python/sglang/multimodal_gen/test/server/test_server_performance.py index 45c62433f6c4..ce517403aa1f 100644 --- a/python/sglang/multimodal_gen/test/server/test_server_performance.py +++ b/python/sglang/multimodal_gen/test/server/test_server_performance.py @@ -1,5 +1,8 @@ """ Config-driven diffusion performance test with pytest parametrization. + + +If the actual run is significantly better than the baseline, the improved cases with their updated baseline will be printed """ from __future__ import annotations @@ -27,6 +30,7 @@ DIFFUSION_CASES, DiffusionTestCase, PerformanceSummary, + ScenarioConfig, ) from sglang.multimodal_gen.test.test_utils import ( get_dynamic_server_port, @@ -110,10 +114,12 @@ class TestDiffusionPerformance: """ _perf_results: list[dict[str, Any]] = [] + _improved_baselines: list[dict[str, Any]] = [] @classmethod def setup_class(cls): cls._perf_results = [] + cls._improved_baselines = [] @classmethod def teardown_class(cls): @@ -121,6 +127,20 @@ def teardown_class(cls): result["class_name"] = cls.__name__ _GLOBAL_PERF_RESULTS.append(result) + if cls._improved_baselines: + import json + + output = """ +--- POTENTIAL BASELINE IMPROVEMENTS DETECTED --- +The following test cases performed significantly better than their baselines. +Consider updating perf_baselines.json with the snippets below: +""" + for item in cls._improved_baselines: + output += ( + f'\n"{item["id"]}": {json.dumps(item["baseline"], indent=4)},\n' + ) + print(output) + def _client(self, ctx: ServerContext) -> OpenAI: """Get OpenAI client for the server.""" return OpenAI( @@ -150,7 +170,6 @@ def _run_and_collect( stage_metrics = {} if perf_record: - stage_metrics, _ = wait_for_stage_metrics( perf_record.get("request_id", ""), prev_len, @@ -358,11 +377,23 @@ def _validate_and_record( is_baseline_generation_mode = os.environ.get("SGLANG_GEN_BASELINE", "0") == "1" scenario = BASELINE_CONFIG.scenarios.get(case.id) + missing_scenario = False if scenario is None: - if is_baseline_generation_mode: - scenario = {} # Dummy scenario - else: - pytest.fail(f"Testcase '{case.id}' not in perf_baselines.json") + # Create dummy scenario to allow metric collection + scenario = type( + "DummyScenario", + (), + { + "expected_e2e_ms": 0, + "expected_avg_denoise_ms": 0, + "expected_median_denoise_ms": 0, + "stages_ms": {}, + "denoise_step_ms": {}, + }, + )() + if not is_baseline_generation_mode: + missing_scenario = True + validator_name = case.custom_validator or "default" validator_class = VALIDATOR_REGISTRY.get(validator_name, PerformanceValidator) @@ -374,10 +405,14 @@ def _validate_and_record( summary = validator.collect_metrics(perf_record, stage_metrics) - if is_baseline_generation_mode: + if is_baseline_generation_mode or missing_scenario: self._dump_baseline_scenario(case, summary) + if missing_scenario: + pytest.fail(f"Testcase '{case.id}' not found in perf_baselines.json") return + self._check_for_improvement(case, summary, scenario) + try: validator.validate(perf_record, stage_metrics, case.num_frames) except AssertionError: @@ -458,6 +493,80 @@ def _validate_and_record( summary.avg_frame_time_ms, ) + def _check_for_improvement( + self, + case: DiffusionTestCase, + summary: PerformanceSummary, + scenario: "ScenarioConfig", + ) -> None: + """Check for potential significant performance improvements and record them.""" + is_improved = False + threshold = BASELINE_CONFIG.improvement_threshold + + def is_sig_faster(actual, expected): + if expected == 0: + return False + return actual < expected * (1 - threshold) + + # Check for any significant improvement + if ( + is_sig_faster(summary.e2e_ms, scenario.expected_e2e_ms) + or is_sig_faster(summary.avg_denoise_ms, scenario.expected_avg_denoise_ms) + or is_sig_faster( + summary.median_denoise_ms, scenario.expected_median_denoise_ms + ) + ): + is_improved = True + + # Combine metrics, always taking the better (lower) value + new_stages = { + stage: min( + summary.stage_metrics.get(stage, float("inf")), + scenario.stages_ms.get(stage, float("inf")), + ) + for stage in set(summary.stage_metrics) | set(scenario.stages_ms) + } + new_denoise_steps = { + step: min( + summary.all_denoise_steps.get(step, float("inf")), + scenario.denoise_step_ms.get(step, float("inf")), + ) + for step in set(summary.all_denoise_steps) | set(scenario.denoise_step_ms) + } + + # Check for stage-level improvements + if not is_improved: + for stage, new_val in new_stages.items(): + if is_sig_faster(new_val, scenario.stages_ms.get(stage, float("inf"))): + is_improved = True + break + if not is_improved: + for step, new_val in new_denoise_steps.items(): + if is_sig_faster( + new_val, scenario.denoise_step_ms.get(step, float("inf")) + ): + is_improved = True + break + + if is_improved: + new_baseline = { + "stages_ms": {k: round(v, 2) for k, v in new_stages.items()}, + "denoise_step_ms": { + str(k): round(v, 2) for k, v in new_denoise_steps.items() + }, + "expected_e2e_ms": round( + min(summary.e2e_ms, scenario.expected_e2e_ms), 2 + ), + "expected_avg_denoise_ms": round( + min(summary.avg_denoise_ms, scenario.expected_avg_denoise_ms), 2 + ), + "expected_median_denoise_ms": round( + min(summary.median_denoise_ms, scenario.expected_median_denoise_ms), + 2, + ), + } + self._improved_baselines.append({"id": case.id, "baseline": new_baseline}) + def _dump_baseline_scenario( self, case: DiffusionTestCase, summary: "PerformanceSummary" ) -> None: @@ -465,7 +574,7 @@ def _dump_baseline_scenario( import json denoise_steps_formatted = { - str(k): round(v, 2) for k, v in summary.sampled_steps.items() + str(k): round(v, 2) for k, v in summary.all_denoise_steps.items() } stages_formatted = {k: round(v, 2) for k, v in summary.stage_metrics.items()} diff --git a/python/sglang/multimodal_gen/test/server/test_server_utils.py b/python/sglang/multimodal_gen/test/server/test_server_utils.py index eb32bc98ac6c..34f593096cd6 100644 --- a/python/sglang/multimodal_gen/test/server/test_server_utils.py +++ b/python/sglang/multimodal_gen/test/server/test_server_utils.py @@ -365,6 +365,7 @@ def collect_metrics( median_denoise_ms=median_denoise, stage_metrics=stage_metrics, sampled_steps=sampled_steps, + all_denoise_steps=per_step, ) def _validate_e2e(self, summary: PerformanceSummary) -> None: diff --git a/python/sglang/multimodal_gen/test/server/testcase_configs.py b/python/sglang/multimodal_gen/test/server/testcase_configs.py index 94bfcb8b3d23..10d30e2e883e 100644 --- a/python/sglang/multimodal_gen/test/server/testcase_configs.py +++ b/python/sglang/multimodal_gen/test/server/testcase_configs.py @@ -54,6 +54,7 @@ class BaselineConfig: step_fractions: Sequence[float] warmup_defaults: dict[str, int] tolerances: ToleranceConfig + improvement_threshold: float @classmethod def load(cls, path: Path) -> BaselineConfig: @@ -88,6 +89,9 @@ def load(cls, path: Path) -> BaselineConfig: step_fractions=tuple(data["sampling"]["step_fractions"]), warmup_defaults=data["sampling"].get("warmup_requests", {}), tolerances=tolerances, + improvement_threshold=data.get("improvement_reporting", {}).get( + "threshold", 0.2 + ), ) @@ -134,6 +138,7 @@ class PerformanceSummary: median_denoise_ms: float stage_metrics: dict[str, float] sampled_steps: dict[int, float] + all_denoise_steps: dict[int, float] frames_per_second: float | None = None total_frames: int | None = None avg_frame_time_ms: float | None = None @@ -182,7 +187,7 @@ class PerformanceSummary: # === Text to Video (T2V) === # TODO: FastWan2.1, FastWan2.2 DiffusionTestCase( - id="fastwan2_1_t2v", + id="wan2_1_t2v_1.3b", model_path="Wan-AI/Wan2.1-T2V-1.3B-Diffusers", modality="video", prompt="A curious raccoon",