From a82569e1005396e85f7fb95fc8c70b23003a3f71 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 2 Sep 2025 03:20:47 +0000
Subject: [PATCH 1/2] Done

Signed-off-by: Jee Jee Li
---
 benchmarks/kernels/benchmark_moe.py           |  4 +-
 ...,dtype=fp8_w8a8,block_shape=[128,128].json | 80 +++++++++----------
 2 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 710d30adfd84..ada70f90c86e 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -678,7 +678,9 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]:
     is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
     search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
     print(f"Start tuning over {len(search_space)} configurations...")
-
+    if use_deep_gemm:
+        print("Only supports tuning triton kernels, set use_deep_gemm=False.")
+        use_deep_gemm = False
     start = time.time()
     configs = _distribute(
         "tune",
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
index 307c9240938c..c7998718dab4 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -18,18 +18,18 @@
     "4": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 3
     },
     "8": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 64,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
     },
     "16": {
         "BLOCK_SIZE_M": 16,
@@ -58,7 +58,7 @@
     "48": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 64,
         "num_warps": 4,
         "num_stages": 4
@@ -74,73 +74,73 @@
     "96": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 4
     },
     "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 64,
         "num_warps": 4,
-        "num_stages": 2
+        "num_stages": 4
     },
     "256": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 4
     },
     "512": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 64,
-        "num_warps": 8,
+        "num_warps": 4,
         "num_stages": 4
     },
     "1024": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 64,
         "num_warps": 4,
         "num_stages": 4
     },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 4
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 5
+        "num_warps": 4,
+        "num_stages": 3
     },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
+    "2048": {
+        "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 }, "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 5 + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 } } From cc160c9b1745603db9a1bfc724f0e25b6e4cac00 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 6 Sep 2025 01:40:02 +0000 Subject: [PATCH 2/2] Fix Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index ada70f90c86e..6259aa0dd629 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -679,8 +679,10 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: search_space = get_configs_compute_bound(is_fp16, block_quant_shape) print(f"Start tuning over {len(search_space)} configurations...") if use_deep_gemm: - print("Only supports tuning triton kernels, set use_deep_gemm=False.") - use_deep_gemm = False + raise ValueError( + "Tuning with --use-deep-gemm is not supported as it only tunes Triton " + "kernels. Please remove the flag." + ) start = time.time() configs = _distribute( "tune",