diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index fa99c1ff559..5c9f3b89a94 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -82,10 +82,10 @@ class TestFile: TestFile("test_triton_moe_channel_fp8_kernel.py", 25), ], "per-commit-2-gpu": [ - TestFile("models/lora/test_lora_tp.py", 300), + TestFile("models/lora/test_lora_tp.py", 150), TestFile("test_data_parallelism.py", 90), - TestFile("test_dp_attention.py", 90), - TestFile("test_mla_tp.py", 420), + TestFile("test_dp_attention.py", 150), + TestFile("test_mla_tp.py", 174), TestFile("test_moe_ep.py", 220), TestFile("test_patch_torch.py", 30), TestFile("test_update_weights_from_distributed.py", 100), diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index 9973b1fa99e..098e2df9f76 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -51,7 +51,7 @@ def test_torch_compile_tp2_bs1(self): f"### test_torch_compile_tp2_bs1\n" f"output_throughput : {output_throughput:.2f} token/s\n" ) - self.assertGreater(output_throughput, 235) + self.assertGreater(output_throughput, 225) if __name__ == "__main__": diff --git a/test/srt/test_torch_native_attention_backend.py b/test/srt/test_torch_native_attention_backend.py index 3af0557d04c..2a80440542b 100644 --- a/test/srt/test_torch_native_attention_backend.py +++ b/test/srt/test_torch_native_attention_backend.py @@ -28,7 +28,7 @@ def test_latency(self): if is_in_ci(): # Torch native backend is expected to be slower - assert output_throughput > 50, f"{output_throughput=}" + self.assertGreater(output_throughput, 40) def test_mmlu(self): model = DEFAULT_MODEL_NAME_FOR_TEST