From 6c1b110784266c5c88c9b7c7ac3c0d82d8eaae4d Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 24 Sep 2025 01:30:34 -0700 Subject: [PATCH 01/10] refine kernel benchmark structure --- benchmark/bench_moe_topk_softmax.py | 111 ++++++++++---------- benchmark/utils.py | 155 ++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+), 52 deletions(-) create mode 100644 benchmark/utils.py diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index eebecdb..d50a231 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -1,77 +1,76 @@ -import itertools +# benchmark_topk_softmax.py +# Benchmark script for topk_softmax operator: compares VLLM vs SGLang implementations +# Supports two modes: +# 1. --model-name provided → load config from HF model +# 2. No --model-name → use default hardcoded test configurations +import itertools import torch import triton +from utils import parse_args, get_model_config from sgl_kernel import topk_softmax def vllm_topk_softmax(gating_output, topk): + """ + Simulate vLLM's topk_softmax using torch.ops._moe_C (mock if not available). + Output: topk_weights, topk_indices + """ num_tokens, num_experts = gating_output.shape - topk_weights = torch.empty( - (num_tokens, topk), device=gating_output.device, dtype=torch.float32 - ) - topk_indices = torch.empty( - (num_tokens, topk), dtype=torch.int32, device=gating_output.device - ) - token_expert_indices = torch.empty( - (num_tokens, topk), dtype=torch.int32, device=gating_output.device - ) - torch.ops._moe_C.topk_softmax( - topk_weights, topk_indices, token_expert_indices, gating_output - ) + topk_weights = torch.empty((num_tokens, topk), device=gating_output.device, dtype=torch.float32) + topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) + token_expert_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) + + try: + torch.ops._moe_C.topk_softmax(topk_weights, topk_indices, token_expert_indices, gating_output) + except (AttributeError, ImportError): + # Mock behavior if vLLM ops not available + scores = torch.softmax(gating_output, dim=-1) + topk_vals, topk_idx = torch.topk(scores, topk, dim=-1) + topk_weights.copy_(topk_vals) + topk_indices.copy_(topk_idx) + return topk_weights, topk_indices def sglang_topk_softmax(gating_output, topk): + """ + Call SGLang's custom topk_softmax kernel. + Output: topk_weights, topk_indices + """ num_tokens, num_experts = gating_output.shape - topk_weights = torch.empty( - (num_tokens, topk), device=gating_output.device, dtype=torch.float32 - ) - topk_indices = torch.empty( - (num_tokens, topk), dtype=torch.int32, device=gating_output.device - ) - token_expert_indices = torch.empty( - (num_tokens, topk), dtype=torch.int32, device=gating_output.device - ) + topk_weights = torch.empty((num_tokens, topk), device=gating_output.device, dtype=torch.float32) + topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) + # Call the actual SGLang kernel topk_softmax( topk_weights=topk_weights, topk_ids=topk_indices, - token_expert_indices=token_expert_indices, gating_output=gating_output, + renormalize=True, ) return topk_weights, topk_indices def calculate_diff(num_tokens, num_experts, topk): - gating_output = torch.randn( - (num_tokens, num_experts), device="cuda", dtype=torch.float32 - ) + """ + Compare output difference between VLLM and SGLang implementations. 
+ """ + gating_output = torch.randn((num_tokens, num_experts), device="cuda", dtype=torch.float32) + weights_vllm, indices_vllm = vllm_topk_softmax(gating_output.clone(), topk) weights_sglang, indices_sglang = sglang_topk_softmax(gating_output.clone(), topk) weights_diff = torch.abs(weights_vllm - weights_sglang).mean().item() indices_match = torch.equal(indices_vllm, indices_sglang) - if ( - torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3) - and indices_match - ): - print("✅ VLLM and SGLang topk_softmax implementations match") + if torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3) and indices_match: + print(f"✅ Match | Tokens={num_tokens}, Experts={num_experts}, TopK={topk}") else: - print( - f"❌ Implementations differ: Weights diff={weights_diff}, Indices match={indices_match}" - ) - - -num_tokens_range = [128, 512, 1024, 2048, 4096, 8192, 16384, 32768] -num_experts_range = [32, 64, 128, 256, 12, 512] -topk_range = [1, 2, 4, 8] - -configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range)) + print(f"❌ Diff | Tokens={num_tokens}, Δ={weights_diff:.6f}, Indices={indices_match}") @triton.testing.perf_report( @@ -105,14 +104,22 @@ def benchmark(num_tokens, num_experts, topk, provider): if __name__ == "__main__": - configs = [ - (20, 256, 4), - (20, 256, 8), - (20, 12, 4), - (20, 12, 1), - (20, 512, 4), - (20, 512, 1), - ] - for num_tokens, num_experts, topk in configs: - calculate_diff(num_tokens, num_experts, topk) - benchmark.run(print_data=True) + # Run correctness test on small configs if not using a real model + args = parse_args() + config = get_model_config(args) + if args.model_name is None: + print("🧪 Running correctness tests on default configs...") + test_configs = [ + (20, 256, 4), + (20, 256, 8), + (20, 12, 4), + (20, 12, 1), + (20, 512, 4), + (20, 512, 1), + ] + for n, e, k in test_configs: + calculate_diff(n, e, k) + + # Run benchmark + print("🚀 Starting performance benchmark...") + benchmark.run(print_data=True, show_plots=False, save_path=".") \ No newline at end of file diff --git a/benchmark/utils.py b/benchmark/utils.py new file mode 100644 index 0000000..56eaa58 --- /dev/null +++ b/benchmark/utils.py @@ -0,0 +1,155 @@ +# utils.py +# Flexible config loader: supports +# 1. Hugging Face model config (--model-name) +# 2. Manual override via CLI args (e.g., --num-experts) +# 3. Safe fallback defaults + +from transformers import AutoConfig +import argparse + + +def get_model_config(args): + """ + Get model config with priority: + 1. CLI args override (e.g., --num-experts) + 2. Hugging Face config (if --model-name given) + 3. 
Hardcoded defaults (last resort) + + Args: + args: Parsed command-line arguments + + Returns: + dict: Standardized model config + """ + config_dict = {} + + # Step 1: Load from Hugging Face model (if provided) + if args.model_name: + print(f"📡 Loading config from Hugging Face: {args.model_name}") + try: + hf_config = AutoConfig.from_pretrained(args.model_name) + except Exception as e: + raise ValueError(f"Failed to load {args.model_name}: {e}") + + # Extract with fallbacks + config_dict.update({ + "num_experts": getattr(hf_config, "moe_num_experts", None) or \ + getattr(hf_config, "num_experts", None) or \ + getattr(hf_config, "num_local_experts", None), + "top_k": getattr(hf_config, "moe_top_k", None) or \ + getattr(hf_config, "top_k", None) or \ + getattr(hf_config, "num_experts_per_tok", None), + "num_layers": getattr(hf_config, "num_hidden_layers", None) or \ + getattr(hf_config, "num_layers", None), + "hidden_size": getattr(hf_config, "hidden_size", None) or \ + getattr(hf_config, "d_model", None), + "ffn_hidden_size": getattr(hf_config, "intermediate_size", None) or \ + getattr(hf_config, "ffn_dim", None), + "num_heads": getattr(hf_config, "num_attention_heads", None), + "num_kv_heads": getattr(hf_config, "num_key_value_heads", None) or \ + getattr(hf_config, "num_attention_heads", None), + "head_dim": getattr(hf_config, "head_dim", None) or \ + (getattr(hf_config, "hidden_size", None) // getattr(hf_config, "num_attention_heads", 1) if getattr(hf_config, "hidden_size") and getattr(hf_config, "num_attention_heads") else None), + "vocab_size": getattr(hf_config, "vocab_size", None), + "max_seq_len": getattr(hf_config, "max_position_embeddings", None) or \ + getattr(hf_config, "n_positions", 32768), + "norm_eps": getattr(hf_config, "rms_norm_eps", None) or \ + getattr(hf_config, "layer_norm_eps", 1e-6), + "architectures": getattr(hf_config, "architectures", ["Unknown"]), + "dtype": getattr(hf_config, "torch_dtype", "float16"), + }) + else: + print("🔧 No --model-name provided. 
Using CLI args or defaults.") + + # Step 2: CLI args override everything + cli_overrides = { + "num_experts": args.num_experts, + "top_k": args.top_k, + "num_layers": args.num_layers, + "hidden_size": args.hidden_size, + "ffn_hidden_size": args.ffn_hidden_size, + "num_heads": args.num_heads, + "num_kv_heads": args.num_kv_heads, + "head_dim": args.head_dim, + "vocab_size": args.vocab_size, + "max_seq_len": args.max_seq_len, + "norm_eps": args.norm_eps, + } + + for k, v in cli_overrides.items(): + if v is not None: + config_dict[k] = v + print(f"⚙️ Overriding {k} = {v} (from CLI)") + + # Step 3: Fill missing with safe defaults + defaults = { + "num_experts": 64, + "top_k": 2, + "num_layers": 32, + "hidden_size": 4096, + "ffn_hidden_size": 11008, + "num_heads": 32, + "num_kv_heads": 8, + "head_dim": 128, + "vocab_size": 32000, + "max_seq_len": 32768, + "norm_eps": 1e-6, + "architectures": ["LlamaForCausalLM"], + "dtype": "float16", + } + + for k, v in defaults.items(): + if k not in config_dict or config_dict[k] is None: + config_dict[k] = v + if args.model_name or any(getattr(args, field) is not None + for field in ["num_experts", "top_k", "num_layers"]): + pass # Don't log if user expected override + else: + print(f"💡 Using default {k} = {v}") + + # Add model name + config_dict["model_name"] = args.model_name + + return config_dict + + +def parse_args(): + """Parse all possible model and benchmark arguments.""" + parser = argparse.ArgumentParser(description="Flexible benchmark with model config support") + + # Model source + parser.add_argument("--model-name", type=str, default=None, + help="Hugging Face model name (e.g., deepseek-ai/DeepSeek-R1). If not set, use CLI args.") + + # MoE parameters + parser.add_argument("--num-experts", type=int, default=None, + help="Number of experts (override if not from model)") + parser.add_argument("--top-k", type=int, default=None, + help="Top-k experts per token") + + # Transformer parameters + parser.add_argument("--num-layers", type=int, default=None, + help="Number of transformer layers") + parser.add_argument("--hidden-size", type=int, default=None, + help="Hidden size (d_model)") + parser.add_argument("--ffn-hidden-size", type=int, default=None, + help="FFN/intermediate size") + parser.add_argument("--num-heads", type=int, default=None, + help="Number of attention heads") + parser.add_argument("--num-kv-heads", type=int, default=None, + help="Number of KV heads (for GQA)") + parser.add_argument("--head-dim", type=int, default=None, + help="Dimension per attention head") + parser.add_argument("--vocab-size", type=int, default=None, + help="Vocabulary size") + parser.add_argument("--max-seq-len", type=int, default=None, + help="Maximum sequence length") + parser.add_argument("--norm-eps", type=float, default=None, + help="Normalization epsilon (rms_norm_eps)") + + # Benchmark settings + parser.add_argument("--device", type=str, default="cuda", help="Device (default: cuda)") + parser.add_argument("--dtype", type=str, default="float32", + choices=["float32", "float16", "bfloat16"], help="Data type") + + return parser.parse_args() \ No newline at end of file From ec92d0b5d1339e3ebb109e46e288e9593049c18c Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 24 Sep 2025 20:16:15 -0700 Subject: [PATCH 02/10] refine --- benchmark/bench_moe_topk_softmax.py | 169 ++++++++++++++-------------- benchmark/utils.py | 40 +++---- 2 files changed, 106 insertions(+), 103 deletions(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py 
index d50a231..400f8b4 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -1,125 +1,128 @@ -# benchmark_topk_softmax.py -# Benchmark script for topk_softmax operator: compares VLLM vs SGLang implementations -# Supports two modes: -# 1. --model-name provided → load config from HF model -# 2. No --model-name → use default hardcoded test configurations - import itertools + import torch import triton -from utils import parse_args, get_model_config +from utils import parse_args from sgl_kernel import topk_softmax def vllm_topk_softmax(gating_output, topk): - """ - Simulate vLLM's topk_softmax using torch.ops._moe_C (mock if not available). - Output: topk_weights, topk_indices - """ num_tokens, num_experts = gating_output.shape - topk_weights = torch.empty((num_tokens, topk), device=gating_output.device, dtype=torch.float32) - topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) - token_expert_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) - - try: - torch.ops._moe_C.topk_softmax(topk_weights, topk_indices, token_expert_indices, gating_output) - except (AttributeError, ImportError): - # Mock behavior if vLLM ops not available - scores = torch.softmax(gating_output, dim=-1) - topk_vals, topk_idx = torch.topk(scores, topk, dim=-1) - topk_weights.copy_(topk_vals) - topk_indices.copy_(topk_idx) - + topk_weights = torch.empty( + (num_tokens, topk), device=gating_output.device, dtype=torch.float32 + ) + topk_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) + token_expert_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) + torch.ops._moe_C.topk_softmax( + topk_weights, topk_indices, token_expert_indices, gating_output + ) return topk_weights, topk_indices + def sglang_topk_softmax(gating_output, topk): - """ - Call SGLang's custom topk_softmax kernel. - Output: topk_weights, topk_indices - """ num_tokens, num_experts = gating_output.shape - topk_weights = torch.empty((num_tokens, topk), device=gating_output.device, dtype=torch.float32) - topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) + topk_weights = torch.empty( + (num_tokens, topk), device=gating_output.device, dtype=torch.float32 + ) + topk_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) + token_expert_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) - # Call the actual SGLang kernel topk_softmax( - topk_weights=topk_weights, - topk_ids=topk_indices, - gating_output=gating_output, - renormalize=True, + topk_weights, + topk_indices, + gating_output, + renormalize=False, ) return topk_weights, topk_indices def calculate_diff(num_tokens, num_experts, topk): - """ - Compare output difference between VLLM and SGLang implementations. 
- """ - gating_output = torch.randn((num_tokens, num_experts), device="cuda", dtype=torch.float32) - + gating_output = torch.randn( + (num_tokens, num_experts), device=gating_output.device, dtype=torch.float32 + ) weights_vllm, indices_vllm = vllm_topk_softmax(gating_output.clone(), topk) weights_sglang, indices_sglang = sglang_topk_softmax(gating_output.clone(), topk) weights_diff = torch.abs(weights_vllm - weights_sglang).mean().item() indices_match = torch.equal(indices_vllm, indices_sglang) - if torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3) and indices_match: - print(f"✅ Match | Tokens={num_tokens}, Experts={num_experts}, TopK={topk}") + if ( + torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3) + and indices_match + ): + print("✅ VLLM and SGLang topk_softmax implementations match") else: - print(f"❌ Diff | Tokens={num_tokens}, Δ={weights_diff:.6f}, Indices={indices_match}") - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk"], - x_vals=configs, - line_arg="provider", - line_vals=["sglang", "vllm"], - line_names=["SGLang", "VLLM"], - styles=[("blue", "-"), ("green", "-")], - ylabel="Latency (us)", - plot_name="topk-softmax-performance", - args={}, + print( + f"❌ Implementations differ: Weights diff={weights_diff}, Indices match={indices_match}" + ) + +def get_benchmark(device='xpu'): + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["num_tokens", "num_experts", "topk", "dtype"], + x_vals=configs, + line_arg="provider", + line_vals=["sglang", "vllm"], + line_names=["SGLang", "VLLM"], + styles=[("blue", "-"), ("green", "-")], + ylabel="Latency (us)", + plot_name="topk-softmax-performance", + args={}, + ) ) -) -def benchmark(num_tokens, num_experts, topk, provider): + def benchmark(num_tokens, num_experts, topk, dtype, provider): - gating_output = torch.randn( - (num_tokens, num_experts), device="cuda", dtype=torch.float32 - ) + gating_output = torch.randn( + (num_tokens, num_experts), device=device, dtype=dtype + ) + + if provider == "vllm" or provider == "vllm1": + fn = lambda: vllm_topk_softmax(gating_output, topk) + elif provider == "sglang" or provider == "sglang1": + fn = lambda: sglang_topk_softmax(gating_output, topk) - if provider == "vllm" or provider == "vllm1": - fn = lambda: vllm_topk_softmax(gating_output, topk) - elif provider == "sglang" or provider == "sglang1": - fn = lambda: sglang_topk_softmax(gating_output, topk) + quantiles = [0.5, 0.2, 0.8] + ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles) - quantiles = [0.5, 0.2, 0.8] - ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles) + return 1000 * ms, 1000 * max_ms, 1000 * min_ms - return 1000 * ms, 1000 * max_ms, 1000 * min_ms + return benchmark if __name__ == "__main__": # Run correctness test on small configs if not using a real model args = parse_args() - config = get_model_config(args) - if args.model_name is None: - print("🧪 Running correctness tests on default configs...") - test_configs = [ - (20, 256, 4), - (20, 256, 8), - (20, 12, 4), - (20, 12, 1), - (20, 512, 4), - (20, 512, 1), - ] - for n, e, k in test_configs: - calculate_diff(n, e, k) + sweep_params = { + "num_tokens": [1, 32, 128, 512], + "num_experts": args.num_experts or [64], + "top_k": args.top_k or [2, 4], + "dtype": [torch.float16, torch.bfloat16], + } + keys = sweep_params.keys() + configs = list(itertools.product(*sweep_params.values())) + print(f"Testing {len(configs)} configurations...") + for 
config in configs: + num_tokens, num_experts, topk, dtype = config + print(f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}") + + calculate_diff(num_tokens, num_experts, topk) + + global benchmark_configs + benchmark_configs = configs # Run benchmark - print("🚀 Starting performance benchmark...") - benchmark.run(print_data=True, show_plots=False, save_path=".") \ No newline at end of file + print("Starting performance benchmark...") + benchmark = get_benchmark() + benchmark.run(print_data=True, show_plots=False, save_path=".") diff --git a/benchmark/utils.py b/benchmark/utils.py index 56eaa58..c1a8baf 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -114,42 +114,42 @@ def get_model_config(args): def parse_args(): - """Parse all possible model and benchmark arguments.""" + """Parse all possible model and benchmark arguments (support list values).""" parser = argparse.ArgumentParser(description="Flexible benchmark with model config support") # Model source parser.add_argument("--model-name", type=str, default=None, help="Hugging Face model name (e.g., deepseek-ai/DeepSeek-R1). If not set, use CLI args.") - # MoE parameters - parser.add_argument("--num-experts", type=int, default=None, - help="Number of experts (override if not from model)") - parser.add_argument("--top-k", type=int, default=None, - help="Top-k experts per token") + # MoE parameters (support list) + parser.add_argument("--num-experts", type=int, default=None, nargs='*', + help="Number of experts (can provide multiple values for sweep)") + parser.add_argument("--top-k", type=int, default=None, nargs='*', + help="Top-k experts per token (multiple values allowed)") - # Transformer parameters - parser.add_argument("--num-layers", type=int, default=None, - help="Number of transformer layers") - parser.add_argument("--hidden-size", type=int, default=None, + # Transformer parameters (support list) + parser.add_argument("--num-layers", type=int, default=None, nargs='*', + help="Number of transformer layers (multiple values)") + parser.add_argument("--hidden-size", type=int, default=None, nargs='*', help="Hidden size (d_model)") - parser.add_argument("--ffn-hidden-size", type=int, default=None, + parser.add_argument("--ffn-hidden-size", type=int, default=None, nargs='*', help="FFN/intermediate size") - parser.add_argument("--num-heads", type=int, default=None, + parser.add_argument("--num-heads", type=int, default=None, nargs='*', help="Number of attention heads") - parser.add_argument("--num-kv-heads", type=int, default=None, + parser.add_argument("--num-kv-heads", type=int, default=None, nargs='*', help="Number of KV heads (for GQA)") - parser.add_argument("--head-dim", type=int, default=None, + parser.add_argument("--head-dim", type=int, default=None, nargs='*', help="Dimension per attention head") - parser.add_argument("--vocab-size", type=int, default=None, + parser.add_argument("--vocab-size", type=int, default=None, nargs='*', help="Vocabulary size") - parser.add_argument("--max-seq-len", type=int, default=None, + parser.add_argument("--max-seq-len", type=int, default=None, nargs='*', help="Maximum sequence length") - parser.add_argument("--norm-eps", type=float, default=None, + parser.add_argument("--norm-eps", type=float, default=None, nargs='*', help="Normalization epsilon (rms_norm_eps)") # Benchmark settings - parser.add_argument("--device", type=str, default="cuda", help="Device (default: cuda)") - parser.add_argument("--dtype", type=str, default="float32", - 
choices=["float32", "float16", "bfloat16"], help="Data type") + parser.add_argument("--device", type=str, default="xpu", help="Device (default: xpu)") + parser.add_argument("--dtype", type=str, default="torch.float32", + choices=["torch.float32", "torch.float16", "torch.bfloat16"], help="Data type") return parser.parse_args() \ No newline at end of file From 6029859a658f14472f2e174e67fa041f05321ae0 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 24 Sep 2025 20:16:41 -0700 Subject: [PATCH 03/10] format fix --- benchmark/bench_moe_topk_softmax.py | 12 +- benchmark/utils.py | 189 +++++++++++++++++++--------- 2 files changed, 137 insertions(+), 64 deletions(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index 400f8b4..7d43ca3 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -2,8 +2,8 @@ import torch import triton -from utils import parse_args from sgl_kernel import topk_softmax +from utils import parse_args def vllm_topk_softmax(gating_output, topk): @@ -24,7 +24,6 @@ def vllm_topk_softmax(gating_output, topk): return topk_weights, topk_indices - def sglang_topk_softmax(gating_output, topk): num_tokens, num_experts = gating_output.shape @@ -68,10 +67,11 @@ def calculate_diff(num_tokens, num_experts, topk): f"❌ Implementations differ: Weights diff={weights_diff}, Indices match={indices_match}" ) -def get_benchmark(device='xpu'): + +def get_benchmark(device="xpu"): @triton.testing.perf_report( triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk", "dtype"], + x_names=["num_tokens", "num_experts", "topk", "dtype"], x_vals=configs, line_arg="provider", line_vals=["sglang", "vllm"], @@ -115,7 +115,9 @@ def benchmark(num_tokens, num_experts, topk, dtype, provider): print(f"Testing {len(configs)} configurations...") for config in configs: num_tokens, num_experts, topk, dtype = config - print(f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}") + print( + f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}" + ) calculate_diff(num_tokens, num_experts, topk) diff --git a/benchmark/utils.py b/benchmark/utils.py index c1a8baf..f890f5e 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -4,9 +4,10 @@ # 2. Manual override via CLI args (e.g., --num-experts) # 3. 
Safe fallback defaults -from transformers import AutoConfig import argparse +from transformers import AutoConfig + def get_model_config(args): """ @@ -32,32 +33,40 @@ def get_model_config(args): raise ValueError(f"Failed to load {args.model_name}: {e}") # Extract with fallbacks - config_dict.update({ - "num_experts": getattr(hf_config, "moe_num_experts", None) or \ - getattr(hf_config, "num_experts", None) or \ - getattr(hf_config, "num_local_experts", None), - "top_k": getattr(hf_config, "moe_top_k", None) or \ - getattr(hf_config, "top_k", None) or \ - getattr(hf_config, "num_experts_per_tok", None), - "num_layers": getattr(hf_config, "num_hidden_layers", None) or \ - getattr(hf_config, "num_layers", None), - "hidden_size": getattr(hf_config, "hidden_size", None) or \ - getattr(hf_config, "d_model", None), - "ffn_hidden_size": getattr(hf_config, "intermediate_size", None) or \ - getattr(hf_config, "ffn_dim", None), - "num_heads": getattr(hf_config, "num_attention_heads", None), - "num_kv_heads": getattr(hf_config, "num_key_value_heads", None) or \ - getattr(hf_config, "num_attention_heads", None), - "head_dim": getattr(hf_config, "head_dim", None) or \ - (getattr(hf_config, "hidden_size", None) // getattr(hf_config, "num_attention_heads", 1) if getattr(hf_config, "hidden_size") and getattr(hf_config, "num_attention_heads") else None), - "vocab_size": getattr(hf_config, "vocab_size", None), - "max_seq_len": getattr(hf_config, "max_position_embeddings", None) or \ - getattr(hf_config, "n_positions", 32768), - "norm_eps": getattr(hf_config, "rms_norm_eps", None) or \ - getattr(hf_config, "layer_norm_eps", 1e-6), - "architectures": getattr(hf_config, "architectures", ["Unknown"]), - "dtype": getattr(hf_config, "torch_dtype", "float16"), - }) + config_dict.update( + { + "num_experts": getattr(hf_config, "moe_num_experts", None) + or getattr(hf_config, "num_experts", None) + or getattr(hf_config, "num_local_experts", None), + "top_k": getattr(hf_config, "moe_top_k", None) + or getattr(hf_config, "top_k", None) + or getattr(hf_config, "num_experts_per_tok", None), + "num_layers": getattr(hf_config, "num_hidden_layers", None) + or getattr(hf_config, "num_layers", None), + "hidden_size": getattr(hf_config, "hidden_size", None) + or getattr(hf_config, "d_model", None), + "ffn_hidden_size": getattr(hf_config, "intermediate_size", None) + or getattr(hf_config, "ffn_dim", None), + "num_heads": getattr(hf_config, "num_attention_heads", None), + "num_kv_heads": getattr(hf_config, "num_key_value_heads", None) + or getattr(hf_config, "num_attention_heads", None), + "head_dim": getattr(hf_config, "head_dim", None) + or ( + getattr(hf_config, "hidden_size", None) + // getattr(hf_config, "num_attention_heads", 1) + if getattr(hf_config, "hidden_size") + and getattr(hf_config, "num_attention_heads") + else None + ), + "vocab_size": getattr(hf_config, "vocab_size", None), + "max_seq_len": getattr(hf_config, "max_position_embeddings", None) + or getattr(hf_config, "n_positions", 32768), + "norm_eps": getattr(hf_config, "rms_norm_eps", None) + or getattr(hf_config, "layer_norm_eps", 1e-6), + "architectures": getattr(hf_config, "architectures", ["Unknown"]), + "dtype": getattr(hf_config, "torch_dtype", "float16"), + } + ) else: print("🔧 No --model-name provided. 
Using CLI args or defaults.") @@ -101,8 +110,10 @@ def get_model_config(args): for k, v in defaults.items(): if k not in config_dict or config_dict[k] is None: config_dict[k] = v - if args.model_name or any(getattr(args, field) is not None - for field in ["num_experts", "top_k", "num_layers"]): + if args.model_name or any( + getattr(args, field) is not None + for field in ["num_experts", "top_k", "num_layers"] + ): pass # Don't log if user expected override else: print(f"💡 Using default {k} = {v}") @@ -115,41 +126,101 @@ def get_model_config(args): def parse_args(): """Parse all possible model and benchmark arguments (support list values).""" - parser = argparse.ArgumentParser(description="Flexible benchmark with model config support") + parser = argparse.ArgumentParser( + description="Flexible benchmark with model config support" + ) # Model source - parser.add_argument("--model-name", type=str, default=None, - help="Hugging Face model name (e.g., deepseek-ai/DeepSeek-R1). If not set, use CLI args.") + parser.add_argument( + "--model-name", + type=str, + default=None, + help="Hugging Face model name (e.g., deepseek-ai/DeepSeek-R1). If not set, use CLI args.", + ) # MoE parameters (support list) - parser.add_argument("--num-experts", type=int, default=None, nargs='*', - help="Number of experts (can provide multiple values for sweep)") - parser.add_argument("--top-k", type=int, default=None, nargs='*', - help="Top-k experts per token (multiple values allowed)") + parser.add_argument( + "--num-experts", + type=int, + default=None, + nargs="*", + help="Number of experts (can provide multiple values for sweep)", + ) + parser.add_argument( + "--top-k", + type=int, + default=None, + nargs="*", + help="Top-k experts per token (multiple values allowed)", + ) # Transformer parameters (support list) - parser.add_argument("--num-layers", type=int, default=None, nargs='*', - help="Number of transformer layers (multiple values)") - parser.add_argument("--hidden-size", type=int, default=None, nargs='*', - help="Hidden size (d_model)") - parser.add_argument("--ffn-hidden-size", type=int, default=None, nargs='*', - help="FFN/intermediate size") - parser.add_argument("--num-heads", type=int, default=None, nargs='*', - help="Number of attention heads") - parser.add_argument("--num-kv-heads", type=int, default=None, nargs='*', - help="Number of KV heads (for GQA)") - parser.add_argument("--head-dim", type=int, default=None, nargs='*', - help="Dimension per attention head") - parser.add_argument("--vocab-size", type=int, default=None, nargs='*', - help="Vocabulary size") - parser.add_argument("--max-seq-len", type=int, default=None, nargs='*', - help="Maximum sequence length") - parser.add_argument("--norm-eps", type=float, default=None, nargs='*', - help="Normalization epsilon (rms_norm_eps)") + parser.add_argument( + "--num-layers", + type=int, + default=None, + nargs="*", + help="Number of transformer layers (multiple values)", + ) + parser.add_argument( + "--hidden-size", type=int, default=None, nargs="*", help="Hidden size (d_model)" + ) + parser.add_argument( + "--ffn-hidden-size", + type=int, + default=None, + nargs="*", + help="FFN/intermediate size", + ) + parser.add_argument( + "--num-heads", + type=int, + default=None, + nargs="*", + help="Number of attention heads", + ) + parser.add_argument( + "--num-kv-heads", + type=int, + default=None, + nargs="*", + help="Number of KV heads (for GQA)", + ) + parser.add_argument( + "--head-dim", + type=int, + default=None, + nargs="*", + help="Dimension per 
attention head", + ) + parser.add_argument( + "--vocab-size", type=int, default=None, nargs="*", help="Vocabulary size" + ) + parser.add_argument( + "--max-seq-len", + type=int, + default=None, + nargs="*", + help="Maximum sequence length", + ) + parser.add_argument( + "--norm-eps", + type=float, + default=None, + nargs="*", + help="Normalization epsilon (rms_norm_eps)", + ) # Benchmark settings - parser.add_argument("--device", type=str, default="xpu", help="Device (default: xpu)") - parser.add_argument("--dtype", type=str, default="torch.float32", - choices=["torch.float32", "torch.float16", "torch.bfloat16"], help="Data type") - - return parser.parse_args() \ No newline at end of file + parser.add_argument( + "--device", type=str, default="xpu", help="Device (default: xpu)" + ) + parser.add_argument( + "--dtype", + type=str, + default="torch.float32", + choices=["torch.float32", "torch.float16", "torch.bfloat16"], + help="Data type", + ) + + return parser.parse_args() From 8f351c0b980d7444a4fcfb5482b34a14c3921b22 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 24 Sep 2025 23:59:42 -0700 Subject: [PATCH 04/10] enable native pytorch op path --- benchmark/bench_moe_topk_softmax.py | 28 +++++++++++++++++++++++----- benchmark/utils.py | 9 +++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index 7d43ca3..36ad89d 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -3,7 +3,7 @@ import torch import triton from sgl_kernel import topk_softmax -from utils import parse_args +from utils import HAS_VLLM, parse_args def vllm_topk_softmax(gating_output, topk): @@ -24,6 +24,22 @@ def vllm_topk_softmax(gating_output, topk): return topk_weights, topk_indices +def navtive_topk_softmax(gating_output, topk): + num_tokens, num_experts = gating_output.shape + + import torch.nn.functional as F + + topk_weights = torch.empty( + (num_tokens, topk), device=gating_output.device, dtype=torch.float32 + ) + topk_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) + topk_weights = F.softmax(gating_output.float(), dim=-1) + topk_weights, topk_indices = torch.topk(topk_weights, topk, dim=-1) + return topk_weights, topk_indices + + def sglang_topk_softmax(gating_output, topk): num_tokens, num_experts = gating_output.shape @@ -74,8 +90,8 @@ def get_benchmark(device="xpu"): x_names=["num_tokens", "num_experts", "topk", "dtype"], x_vals=configs, line_arg="provider", - line_vals=["sglang", "vllm"], - line_names=["SGLang", "VLLM"], + line_vals=["sglang", "native"], + line_names=["SGLang", "native"], styles=[("blue", "-"), ("green", "-")], ylabel="Latency (us)", plot_name="topk-softmax-performance", @@ -88,10 +104,12 @@ def benchmark(num_tokens, num_experts, topk, dtype, provider): (num_tokens, num_experts), device=device, dtype=dtype ) - if provider == "vllm" or provider == "vllm1": + if HAS_VLLM and (provider == "vllm" or provider == "vllm1"): fn = lambda: vllm_topk_softmax(gating_output, topk) elif provider == "sglang" or provider == "sglang1": fn = lambda: sglang_topk_softmax(gating_output, topk) + elif provider == "native": + fn = lambda: navtive_topk_softmax(gating_output, topk) quantiles = [0.5, 0.2, 0.8] ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles) @@ -119,7 +137,7 @@ def benchmark(num_tokens, num_experts, topk, dtype, provider): f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, 
dtype={dtype}" ) - calculate_diff(num_tokens, num_experts, topk) + # calculate_diff(num_tokens, num_experts, topk) global benchmark_configs benchmark_configs = configs diff --git a/benchmark/utils.py b/benchmark/utils.py index f890f5e..d64215b 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -9,6 +9,15 @@ from transformers import AutoConfig +def HAS_VLLM(): + try: + import vllm + + return True + except ImportError: + return False + + def get_model_config(args): """ Get model config with priority: From b6623ba44d27c949f1cb930bb47ab5286d064f72 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Tue, 21 Oct 2025 20:25:52 -0700 Subject: [PATCH 05/10] fix format --- benchmark/utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/benchmark/utils.py b/benchmark/utils.py index d64215b..f890f5e 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -9,15 +9,6 @@ from transformers import AutoConfig -def HAS_VLLM(): - try: - import vllm - - return True - except ImportError: - return False - - def get_model_config(args): """ Get model config with priority: From e569f7de7b3b92da2c14878b3b0312ce8a01c7d9 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 5 Nov 2025 01:32:52 -0800 Subject: [PATCH 06/10] refine kernellevel benchmarking for topk --- benchmark/bench_moe_topk_softmax.py | 49 +++++++++++++++++++---------- benchmark/utils.py | 21 ++++++++++--- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index 36ad89d..de282b4 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -3,7 +3,7 @@ import torch import triton from sgl_kernel import topk_softmax -from utils import HAS_VLLM, parse_args +from utils import get_model_config, parse_args def vllm_topk_softmax(gating_output, topk): @@ -24,7 +24,11 @@ def vllm_topk_softmax(gating_output, topk): return topk_weights, topk_indices -def navtive_topk_softmax(gating_output, topk): +def navtive_topk_softmax( + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +): num_tokens, num_experts = gating_output.shape import torch.nn.functional as F @@ -37,10 +41,18 @@ def navtive_topk_softmax(gating_output, topk): ) topk_weights = F.softmax(gating_output.float(), dim=-1) topk_weights, topk_indices = torch.topk(topk_weights, topk, dim=-1) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + return topk_weights, topk_indices -def sglang_topk_softmax(gating_output, topk): +def sglang_topk_softmax( + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +): num_tokens, num_experts = gating_output.shape topk_weights = torch.empty( @@ -57,7 +69,7 @@ def sglang_topk_softmax(gating_output, topk): topk_weights, topk_indices, gating_output, - renormalize=False, + renormalize=renormalize, ) return topk_weights, topk_indices @@ -87,7 +99,7 @@ def calculate_diff(num_tokens, num_experts, topk): def get_benchmark(device="xpu"): @triton.testing.perf_report( triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk", "dtype"], + x_names=["num_tokens", "num_experts", "topk", "dtype", "renormalize"], x_vals=configs, line_arg="provider", line_vals=["sglang", "native"], @@ -98,18 +110,16 @@ def get_benchmark(device="xpu"): args={}, ) ) - def benchmark(num_tokens, num_experts, topk, dtype, provider): + def benchmark(num_tokens, num_experts, topk, dtype, renormalize, provider): gating_output = torch.randn( (num_tokens, num_experts), device=device, dtype=dtype ) - if 
HAS_VLLM and (provider == "vllm" or provider == "vllm1"): - fn = lambda: vllm_topk_softmax(gating_output, topk) - elif provider == "sglang" or provider == "sglang1": - fn = lambda: sglang_topk_softmax(gating_output, topk) + if provider == "sglang" or provider == "sglang1": + fn = lambda: sglang_topk_softmax(gating_output, topk, renormalize) elif provider == "native": - fn = lambda: navtive_topk_softmax(gating_output, topk) + fn = lambda: navtive_topk_softmax(gating_output, topk, renormalize) quantiles = [0.5, 0.2, 0.8] ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles) @@ -122,19 +132,24 @@ def benchmark(num_tokens, num_experts, topk, dtype, provider): if __name__ == "__main__": # Run correctness test on small configs if not using a real model args = parse_args() + params = get_model_config(args) + sweep_params = { - "num_tokens": [1, 32, 128, 512], - "num_experts": args.num_experts or [64], - "top_k": args.top_k or [2, 4], - "dtype": [torch.float16, torch.bfloat16], + "num_tokens": args.num_tokens, + "num_experts": params["num_experts"] or [64], + "top_k": params["top_k"] or [2, 4], + "dtype": [torch.bfloat16], + "renormalize": [False], } + + print("sweep_params", sweep_params) keys = sweep_params.keys() configs = list(itertools.product(*sweep_params.values())) print(f"Testing {len(configs)} configurations...") for config in configs: - num_tokens, num_experts, topk, dtype = config + num_tokens, num_experts, topk, dtype, renormalize = config print( - f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}" + f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}, renormalize={renormalize}" ) # calculate_diff(num_tokens, num_experts, topk) diff --git a/benchmark/utils.py b/benchmark/utils.py index f890f5e..eed0c3b 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -54,7 +54,7 @@ def get_model_config(args): or ( getattr(hf_config, "hidden_size", None) // getattr(hf_config, "num_attention_heads", 1) - if getattr(hf_config, "hidden_size") + if getattr(hf_config, "hidden_size", None) and getattr(hf_config, "num_attention_heads") else None ), @@ -64,7 +64,7 @@ def get_model_config(args): "norm_eps": getattr(hf_config, "rms_norm_eps", None) or getattr(hf_config, "layer_norm_eps", 1e-6), "architectures": getattr(hf_config, "architectures", ["Unknown"]), - "dtype": getattr(hf_config, "torch_dtype", "float16"), + "dtype": str(getattr(hf_config, "torch_dtype", "float16")), } ) else: @@ -121,7 +121,12 @@ def get_model_config(args): # Add model name config_dict["model_name"] = args.model_name - return config_dict + sweepable_config = { + k: [v] if isinstance(v, (int, float, str)) else v + for k, v in config_dict.items() + } + + return sweepable_config def parse_args(): @@ -154,6 +159,14 @@ def parse_args(): help="Top-k experts per token (multiple values allowed)", ) + parser.add_argument( + "--num-tokens", + type=int, + default=[100], + nargs="*", + help="Number of tokens (multiple values)", + ) + # Transformer parameters (support list) parser.add_argument( "--num-layers", @@ -218,7 +231,7 @@ def parse_args(): parser.add_argument( "--dtype", type=str, - default="torch.float32", + default="torch.bfloat16", choices=["torch.float32", "torch.float16", "torch.bfloat16"], help="Data type", ) From ea43f57b20e15f150db1de71ba01b676751f754e Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:43:59 +0800 Subject: [PATCH 07/10] Update bench_moe_topk_softmax.py 
From 0ac9f935fa84e129522eaecbe123571c990834cb Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:44:23 +0800 Subject: [PATCH 08/10] Update bench_moe_topk_softmax.py --- benchmark/bench_moe_topk_softmax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index de282b4..7255306 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -142,7 +142,6 @@ def benchmark(num_tokens, num_experts, topk, dtype, renormalize, provider): "renormalize": [False], } - print("sweep_params", sweep_params) keys = sweep_params.keys() configs = list(itertools.product(*sweep_params.values())) print(f"Testing {len(configs)} configurations...") From c2dbfbe6343901cb559f889189322acbd0f5744e Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:50:12 +0800 Subject: [PATCH 09/10] Update pr-test-xpu.yml --- .github/workflows/pr-test-xpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 0f2f2c9..a78802d 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -55,7 +55,7 @@ jobs: timeout-minutes: 20 run: | docker exec -w /root/sglang ci_sglang_xpu \ - /bin/bash -c "cd /root/sglang/sgl-kernel-xpu/benchmark && python3 bench_flash_attn.py " + /bin/bash -c "cd /root/sglang/sgl-kernel-xpu/benchmark && python3 bench_flash_attn.py && python3 bench_moe_topk_softmax.py " - name: Run E2E Bfloat16 tests timeout-minutes: 20 @@ -71,3 +71,4 @@ jobs: if: always() run: | docker rm -f ci_sglang_xpu || true + From 405a522fa43e080a96117218f28898d009fd23f3 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 5 Nov 2025 18:52:13 -0800 Subject: [PATCH 10/10] format fix --- .github/workflows/pr-test-xpu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index a78802d..05e97e5 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -71,4 +71,3 @@ jobs: if: always() run: | docker rm -f ci_sglang_xpu || true -
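
# Usage sketch (illustrative): how the refactored benchmark can be invoked once this
# series is applied. The flag names, the list-valued (nargs="*") arguments, and the
# "xpu" default device are the ones defined in benchmark/utils.py above; the concrete
# sweep values and the model name below are examples only, not values mandated by the
# patches.
#
#   # sweep CLI-provided MoE configs (no Hugging Face lookup)
#   python3 benchmark/bench_moe_topk_softmax.py --num-tokens 1 128 512 --num-experts 64 256 --top-k 2 4
#
#   # derive num_experts / top_k from a Hugging Face model config instead
#   python3 benchmark/bench_moe_topk_softmax.py --model-name deepseek-ai/DeepSeek-R1 --num-tokens 1 128 512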