From 6c1b110784266c5c88c9b7c7ac3c0d82d8eaae4d Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 24 Sep 2025 01:30:34 -0700 Subject: [PATCH 01/10] refine kernel benchmark structure --- benchmark/bench_moe_topk_softmax.py | 111 ++++++++++---------- benchmark/utils.py | 155 ++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+), 52 deletions(-) create mode 100644 benchmark/utils.py diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index eebecdb..d50a231 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -1,77 +1,76 @@ -import itertools +# benchmark_topk_softmax.py +# Benchmark script for topk_softmax operator: compares VLLM vs SGLang implementations +# Supports two modes: +# 1. --model-name provided → load config from HF model +# 2. No --model-name → use default hardcoded test configurations +import itertools import torch import triton +from utils import parse_args, get_model_config from sgl_kernel import topk_softmax def vllm_topk_softmax(gating_output, topk): + """ + Simulate vLLM's topk_softmax using torch.ops._moe_C (mock if not available). + Output: topk_weights, topk_indices + """ num_tokens, num_experts = gating_output.shape - topk_weights = torch.empty( - (num_tokens, topk), device=gating_output.device, dtype=torch.float32 - ) - topk_indices = torch.empty( - (num_tokens, topk), dtype=torch.int32, device=gating_output.device - ) - token_expert_indices = torch.empty( - (num_tokens, topk), dtype=torch.int32, device=gating_output.device - ) - torch.ops._moe_C.topk_softmax( - topk_weights, topk_indices, token_expert_indices, gating_output - ) + topk_weights = torch.empty((num_tokens, topk), device=gating_output.device, dtype=torch.float32) + topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) + token_expert_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) + + try: + torch.ops._moe_C.topk_softmax(topk_weights, topk_indices, token_expert_indices, gating_output) + except (AttributeError, ImportError): + # Mock behavior if vLLM ops not available + scores = torch.softmax(gating_output, dim=-1) + topk_vals, topk_idx = torch.topk(scores, topk, dim=-1) + topk_weights.copy_(topk_vals) + topk_indices.copy_(topk_idx) + return topk_weights, topk_indices def sglang_topk_softmax(gating_output, topk): + """ + Call SGLang's custom topk_softmax kernel. + Output: topk_weights, topk_indices + """ num_tokens, num_experts = gating_output.shape - topk_weights = torch.empty( - (num_tokens, topk), device=gating_output.device, dtype=torch.float32 - ) - topk_indices = torch.empty( - (num_tokens, topk), dtype=torch.int32, device=gating_output.device - ) - token_expert_indices = torch.empty( - (num_tokens, topk), dtype=torch.int32, device=gating_output.device - ) + topk_weights = torch.empty((num_tokens, topk), device=gating_output.device, dtype=torch.float32) + topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) + # Call the actual SGLang kernel topk_softmax( topk_weights=topk_weights, topk_ids=topk_indices, - token_expert_indices=token_expert_indices, gating_output=gating_output, + renormalize=True, ) return topk_weights, topk_indices def calculate_diff(num_tokens, num_experts, topk): - gating_output = torch.randn( - (num_tokens, num_experts), device="cuda", dtype=torch.float32 - ) + """ + Compare output difference between VLLM and SGLang implementations. 
+ """ + gating_output = torch.randn((num_tokens, num_experts), device="cuda", dtype=torch.float32) + weights_vllm, indices_vllm = vllm_topk_softmax(gating_output.clone(), topk) weights_sglang, indices_sglang = sglang_topk_softmax(gating_output.clone(), topk) weights_diff = torch.abs(weights_vllm - weights_sglang).mean().item() indices_match = torch.equal(indices_vllm, indices_sglang) - if ( - torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3) - and indices_match - ): - print("✅ VLLM and SGLang topk_softmax implementations match") + if torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3) and indices_match: + print(f"✅ Match | Tokens={num_tokens}, Experts={num_experts}, TopK={topk}") else: - print( - f"❌ Implementations differ: Weights diff={weights_diff}, Indices match={indices_match}" - ) - - -num_tokens_range = [128, 512, 1024, 2048, 4096, 8192, 16384, 32768] -num_experts_range = [32, 64, 128, 256, 12, 512] -topk_range = [1, 2, 4, 8] - -configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range)) + print(f"❌ Diff | Tokens={num_tokens}, Δ={weights_diff:.6f}, Indices={indices_match}") @triton.testing.perf_report( @@ -105,14 +104,22 @@ def benchmark(num_tokens, num_experts, topk, provider): if __name__ == "__main__": - configs = [ - (20, 256, 4), - (20, 256, 8), - (20, 12, 4), - (20, 12, 1), - (20, 512, 4), - (20, 512, 1), - ] - for num_tokens, num_experts, topk in configs: - calculate_diff(num_tokens, num_experts, topk) - benchmark.run(print_data=True) + # Run correctness test on small configs if not using a real model + args = parse_args() + config = get_model_config(args) + if args.model_name is None: + print("🧪 Running correctness tests on default configs...") + test_configs = [ + (20, 256, 4), + (20, 256, 8), + (20, 12, 4), + (20, 12, 1), + (20, 512, 4), + (20, 512, 1), + ] + for n, e, k in test_configs: + calculate_diff(n, e, k) + + # Run benchmark + print("🚀 Starting performance benchmark...") + benchmark.run(print_data=True, show_plots=False, save_path=".") \ No newline at end of file diff --git a/benchmark/utils.py b/benchmark/utils.py new file mode 100644 index 0000000..56eaa58 --- /dev/null +++ b/benchmark/utils.py @@ -0,0 +1,155 @@ +# utils.py +# Flexible config loader: supports +# 1. Hugging Face model config (--model-name) +# 2. Manual override via CLI args (e.g., --num-experts) +# 3. Safe fallback defaults + +from transformers import AutoConfig +import argparse + + +def get_model_config(args): + """ + Get model config with priority: + 1. CLI args override (e.g., --num-experts) + 2. Hugging Face config (if --model-name given) + 3. 
Hardcoded defaults (last resort) + + Args: + args: Parsed command-line arguments + + Returns: + dict: Standardized model config + """ + config_dict = {} + + # Step 1: Load from Hugging Face model (if provided) + if args.model_name: + print(f"📡 Loading config from Hugging Face: {args.model_name}") + try: + hf_config = AutoConfig.from_pretrained(args.model_name) + except Exception as e: + raise ValueError(f"Failed to load {args.model_name}: {e}") + + # Extract with fallbacks + config_dict.update({ + "num_experts": getattr(hf_config, "moe_num_experts", None) or \ + getattr(hf_config, "num_experts", None) or \ + getattr(hf_config, "num_local_experts", None), + "top_k": getattr(hf_config, "moe_top_k", None) or \ + getattr(hf_config, "top_k", None) or \ + getattr(hf_config, "num_experts_per_tok", None), + "num_layers": getattr(hf_config, "num_hidden_layers", None) or \ + getattr(hf_config, "num_layers", None), + "hidden_size": getattr(hf_config, "hidden_size", None) or \ + getattr(hf_config, "d_model", None), + "ffn_hidden_size": getattr(hf_config, "intermediate_size", None) or \ + getattr(hf_config, "ffn_dim", None), + "num_heads": getattr(hf_config, "num_attention_heads", None), + "num_kv_heads": getattr(hf_config, "num_key_value_heads", None) or \ + getattr(hf_config, "num_attention_heads", None), + "head_dim": getattr(hf_config, "head_dim", None) or \ + (getattr(hf_config, "hidden_size", None) // getattr(hf_config, "num_attention_heads", 1) if getattr(hf_config, "hidden_size") and getattr(hf_config, "num_attention_heads") else None), + "vocab_size": getattr(hf_config, "vocab_size", None), + "max_seq_len": getattr(hf_config, "max_position_embeddings", None) or \ + getattr(hf_config, "n_positions", 32768), + "norm_eps": getattr(hf_config, "rms_norm_eps", None) or \ + getattr(hf_config, "layer_norm_eps", 1e-6), + "architectures": getattr(hf_config, "architectures", ["Unknown"]), + "dtype": getattr(hf_config, "torch_dtype", "float16"), + }) + else: + print("🔧 No --model-name provided. 
Using CLI args or defaults.") + + # Step 2: CLI args override everything + cli_overrides = { + "num_experts": args.num_experts, + "top_k": args.top_k, + "num_layers": args.num_layers, + "hidden_size": args.hidden_size, + "ffn_hidden_size": args.ffn_hidden_size, + "num_heads": args.num_heads, + "num_kv_heads": args.num_kv_heads, + "head_dim": args.head_dim, + "vocab_size": args.vocab_size, + "max_seq_len": args.max_seq_len, + "norm_eps": args.norm_eps, + } + + for k, v in cli_overrides.items(): + if v is not None: + config_dict[k] = v + print(f"⚙️ Overriding {k} = {v} (from CLI)") + + # Step 3: Fill missing with safe defaults + defaults = { + "num_experts": 64, + "top_k": 2, + "num_layers": 32, + "hidden_size": 4096, + "ffn_hidden_size": 11008, + "num_heads": 32, + "num_kv_heads": 8, + "head_dim": 128, + "vocab_size": 32000, + "max_seq_len": 32768, + "norm_eps": 1e-6, + "architectures": ["LlamaForCausalLM"], + "dtype": "float16", + } + + for k, v in defaults.items(): + if k not in config_dict or config_dict[k] is None: + config_dict[k] = v + if args.model_name or any(getattr(args, field) is not None + for field in ["num_experts", "top_k", "num_layers"]): + pass # Don't log if user expected override + else: + print(f"💡 Using default {k} = {v}") + + # Add model name + config_dict["model_name"] = args.model_name + + return config_dict + + +def parse_args(): + """Parse all possible model and benchmark arguments.""" + parser = argparse.ArgumentParser(description="Flexible benchmark with model config support") + + # Model source + parser.add_argument("--model-name", type=str, default=None, + help="Hugging Face model name (e.g., deepseek-ai/DeepSeek-R1). If not set, use CLI args.") + + # MoE parameters + parser.add_argument("--num-experts", type=int, default=None, + help="Number of experts (override if not from model)") + parser.add_argument("--top-k", type=int, default=None, + help="Top-k experts per token") + + # Transformer parameters + parser.add_argument("--num-layers", type=int, default=None, + help="Number of transformer layers") + parser.add_argument("--hidden-size", type=int, default=None, + help="Hidden size (d_model)") + parser.add_argument("--ffn-hidden-size", type=int, default=None, + help="FFN/intermediate size") + parser.add_argument("--num-heads", type=int, default=None, + help="Number of attention heads") + parser.add_argument("--num-kv-heads", type=int, default=None, + help="Number of KV heads (for GQA)") + parser.add_argument("--head-dim", type=int, default=None, + help="Dimension per attention head") + parser.add_argument("--vocab-size", type=int, default=None, + help="Vocabulary size") + parser.add_argument("--max-seq-len", type=int, default=None, + help="Maximum sequence length") + parser.add_argument("--norm-eps", type=float, default=None, + help="Normalization epsilon (rms_norm_eps)") + + # Benchmark settings + parser.add_argument("--device", type=str, default="cuda", help="Device (default: cuda)") + parser.add_argument("--dtype", type=str, default="float32", + choices=["float32", "float16", "bfloat16"], help="Data type") + + return parser.parse_args() \ No newline at end of file From ec92d0b5d1339e3ebb109e46e288e9593049c18c Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 24 Sep 2025 20:16:15 -0700 Subject: [PATCH 02/10] refine --- benchmark/bench_moe_topk_softmax.py | 169 ++++++++++++++-------------- benchmark/utils.py | 40 +++---- 2 files changed, 106 insertions(+), 103 deletions(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py 
index d50a231..400f8b4 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -1,125 +1,128 @@ -# benchmark_topk_softmax.py -# Benchmark script for topk_softmax operator: compares VLLM vs SGLang implementations -# Supports two modes: -# 1. --model-name provided → load config from HF model -# 2. No --model-name → use default hardcoded test configurations - import itertools + import torch import triton -from utils import parse_args, get_model_config +from utils import parse_args from sgl_kernel import topk_softmax def vllm_topk_softmax(gating_output, topk): - """ - Simulate vLLM's topk_softmax using torch.ops._moe_C (mock if not available). - Output: topk_weights, topk_indices - """ num_tokens, num_experts = gating_output.shape - topk_weights = torch.empty((num_tokens, topk), device=gating_output.device, dtype=torch.float32) - topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) - token_expert_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) - - try: - torch.ops._moe_C.topk_softmax(topk_weights, topk_indices, token_expert_indices, gating_output) - except (AttributeError, ImportError): - # Mock behavior if vLLM ops not available - scores = torch.softmax(gating_output, dim=-1) - topk_vals, topk_idx = torch.topk(scores, topk, dim=-1) - topk_weights.copy_(topk_vals) - topk_indices.copy_(topk_idx) - + topk_weights = torch.empty( + (num_tokens, topk), device=gating_output.device, dtype=torch.float32 + ) + topk_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) + token_expert_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) + torch.ops._moe_C.topk_softmax( + topk_weights, topk_indices, token_expert_indices, gating_output + ) return topk_weights, topk_indices + def sglang_topk_softmax(gating_output, topk): - """ - Call SGLang's custom topk_softmax kernel. - Output: topk_weights, topk_indices - """ num_tokens, num_experts = gating_output.shape - topk_weights = torch.empty((num_tokens, topk), device=gating_output.device, dtype=torch.float32) - topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device=gating_output.device) + topk_weights = torch.empty( + (num_tokens, topk), device=gating_output.device, dtype=torch.float32 + ) + topk_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) + token_expert_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) - # Call the actual SGLang kernel topk_softmax( - topk_weights=topk_weights, - topk_ids=topk_indices, - gating_output=gating_output, - renormalize=True, + topk_weights, + topk_indices, + gating_output, + renormalize=False, ) return topk_weights, topk_indices def calculate_diff(num_tokens, num_experts, topk): - """ - Compare output difference between VLLM and SGLang implementations. 
- """ - gating_output = torch.randn((num_tokens, num_experts), device="cuda", dtype=torch.float32) - + gating_output = torch.randn( + (num_tokens, num_experts), device=gating_output.device, dtype=torch.float32 + ) weights_vllm, indices_vllm = vllm_topk_softmax(gating_output.clone(), topk) weights_sglang, indices_sglang = sglang_topk_softmax(gating_output.clone(), topk) weights_diff = torch.abs(weights_vllm - weights_sglang).mean().item() indices_match = torch.equal(indices_vllm, indices_sglang) - if torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3) and indices_match: - print(f"✅ Match | Tokens={num_tokens}, Experts={num_experts}, TopK={topk}") + if ( + torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3) + and indices_match + ): + print("✅ VLLM and SGLang topk_softmax implementations match") else: - print(f"❌ Diff | Tokens={num_tokens}, Δ={weights_diff:.6f}, Indices={indices_match}") - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk"], - x_vals=configs, - line_arg="provider", - line_vals=["sglang", "vllm"], - line_names=["SGLang", "VLLM"], - styles=[("blue", "-"), ("green", "-")], - ylabel="Latency (us)", - plot_name="topk-softmax-performance", - args={}, + print( + f"❌ Implementations differ: Weights diff={weights_diff}, Indices match={indices_match}" + ) + +def get_benchmark(device='xpu'): + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["num_tokens", "num_experts", "topk", "dtype"], + x_vals=configs, + line_arg="provider", + line_vals=["sglang", "vllm"], + line_names=["SGLang", "VLLM"], + styles=[("blue", "-"), ("green", "-")], + ylabel="Latency (us)", + plot_name="topk-softmax-performance", + args={}, + ) ) -) -def benchmark(num_tokens, num_experts, topk, provider): + def benchmark(num_tokens, num_experts, topk, dtype, provider): - gating_output = torch.randn( - (num_tokens, num_experts), device="cuda", dtype=torch.float32 - ) + gating_output = torch.randn( + (num_tokens, num_experts), device=device, dtype=dtype + ) + + if provider == "vllm" or provider == "vllm1": + fn = lambda: vllm_topk_softmax(gating_output, topk) + elif provider == "sglang" or provider == "sglang1": + fn = lambda: sglang_topk_softmax(gating_output, topk) - if provider == "vllm" or provider == "vllm1": - fn = lambda: vllm_topk_softmax(gating_output, topk) - elif provider == "sglang" or provider == "sglang1": - fn = lambda: sglang_topk_softmax(gating_output, topk) + quantiles = [0.5, 0.2, 0.8] + ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles) - quantiles = [0.5, 0.2, 0.8] - ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles) + return 1000 * ms, 1000 * max_ms, 1000 * min_ms - return 1000 * ms, 1000 * max_ms, 1000 * min_ms + return benchmark if __name__ == "__main__": # Run correctness test on small configs if not using a real model args = parse_args() - config = get_model_config(args) - if args.model_name is None: - print("🧪 Running correctness tests on default configs...") - test_configs = [ - (20, 256, 4), - (20, 256, 8), - (20, 12, 4), - (20, 12, 1), - (20, 512, 4), - (20, 512, 1), - ] - for n, e, k in test_configs: - calculate_diff(n, e, k) + sweep_params = { + "num_tokens": [1, 32, 128, 512], + "num_experts": args.num_experts or [64], + "top_k": args.top_k or [2, 4], + "dtype": [torch.float16, torch.bfloat16], + } + keys = sweep_params.keys() + configs = list(itertools.product(*sweep_params.values())) + print(f"Testing {len(configs)} configurations...") + for 
config in configs: + num_tokens, num_experts, topk, dtype = config + print(f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}") + + calculate_diff(num_tokens, num_experts, topk) + + global benchmark_configs + benchmark_configs = configs # Run benchmark - print("🚀 Starting performance benchmark...") - benchmark.run(print_data=True, show_plots=False, save_path=".") \ No newline at end of file + print("Starting performance benchmark...") + benchmark = get_benchmark() + benchmark.run(print_data=True, show_plots=False, save_path=".") diff --git a/benchmark/utils.py b/benchmark/utils.py index 56eaa58..c1a8baf 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -114,42 +114,42 @@ def get_model_config(args): def parse_args(): - """Parse all possible model and benchmark arguments.""" + """Parse all possible model and benchmark arguments (support list values).""" parser = argparse.ArgumentParser(description="Flexible benchmark with model config support") # Model source parser.add_argument("--model-name", type=str, default=None, help="Hugging Face model name (e.g., deepseek-ai/DeepSeek-R1). If not set, use CLI args.") - # MoE parameters - parser.add_argument("--num-experts", type=int, default=None, - help="Number of experts (override if not from model)") - parser.add_argument("--top-k", type=int, default=None, - help="Top-k experts per token") + # MoE parameters (support list) + parser.add_argument("--num-experts", type=int, default=None, nargs='*', + help="Number of experts (can provide multiple values for sweep)") + parser.add_argument("--top-k", type=int, default=None, nargs='*', + help="Top-k experts per token (multiple values allowed)") - # Transformer parameters - parser.add_argument("--num-layers", type=int, default=None, - help="Number of transformer layers") - parser.add_argument("--hidden-size", type=int, default=None, + # Transformer parameters (support list) + parser.add_argument("--num-layers", type=int, default=None, nargs='*', + help="Number of transformer layers (multiple values)") + parser.add_argument("--hidden-size", type=int, default=None, nargs='*', help="Hidden size (d_model)") - parser.add_argument("--ffn-hidden-size", type=int, default=None, + parser.add_argument("--ffn-hidden-size", type=int, default=None, nargs='*', help="FFN/intermediate size") - parser.add_argument("--num-heads", type=int, default=None, + parser.add_argument("--num-heads", type=int, default=None, nargs='*', help="Number of attention heads") - parser.add_argument("--num-kv-heads", type=int, default=None, + parser.add_argument("--num-kv-heads", type=int, default=None, nargs='*', help="Number of KV heads (for GQA)") - parser.add_argument("--head-dim", type=int, default=None, + parser.add_argument("--head-dim", type=int, default=None, nargs='*', help="Dimension per attention head") - parser.add_argument("--vocab-size", type=int, default=None, + parser.add_argument("--vocab-size", type=int, default=None, nargs='*', help="Vocabulary size") - parser.add_argument("--max-seq-len", type=int, default=None, + parser.add_argument("--max-seq-len", type=int, default=None, nargs='*', help="Maximum sequence length") - parser.add_argument("--norm-eps", type=float, default=None, + parser.add_argument("--norm-eps", type=float, default=None, nargs='*', help="Normalization epsilon (rms_norm_eps)") # Benchmark settings - parser.add_argument("--device", type=str, default="cuda", help="Device (default: cuda)") - parser.add_argument("--dtype", type=str, default="float32", - 
choices=["float32", "float16", "bfloat16"], help="Data type") + parser.add_argument("--device", type=str, default="xpu", help="Device (default: xpu)") + parser.add_argument("--dtype", type=str, default="torch.float32", + choices=["torch.float32", "torch.float16", "torch.bfloat16"], help="Data type") return parser.parse_args() \ No newline at end of file From 6029859a658f14472f2e174e67fa041f05321ae0 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 24 Sep 2025 20:16:41 -0700 Subject: [PATCH 03/10] format fix --- benchmark/bench_moe_topk_softmax.py | 12 +- benchmark/utils.py | 189 +++++++++++++++++++--------- 2 files changed, 137 insertions(+), 64 deletions(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index 400f8b4..7d43ca3 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -2,8 +2,8 @@ import torch import triton -from utils import parse_args from sgl_kernel import topk_softmax +from utils import parse_args def vllm_topk_softmax(gating_output, topk): @@ -24,7 +24,6 @@ def vllm_topk_softmax(gating_output, topk): return topk_weights, topk_indices - def sglang_topk_softmax(gating_output, topk): num_tokens, num_experts = gating_output.shape @@ -68,10 +67,11 @@ def calculate_diff(num_tokens, num_experts, topk): f"❌ Implementations differ: Weights diff={weights_diff}, Indices match={indices_match}" ) -def get_benchmark(device='xpu'): + +def get_benchmark(device="xpu"): @triton.testing.perf_report( triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk", "dtype"], + x_names=["num_tokens", "num_experts", "topk", "dtype"], x_vals=configs, line_arg="provider", line_vals=["sglang", "vllm"], @@ -115,7 +115,9 @@ def benchmark(num_tokens, num_experts, topk, dtype, provider): print(f"Testing {len(configs)} configurations...") for config in configs: num_tokens, num_experts, topk, dtype = config - print(f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}") + print( + f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}" + ) calculate_diff(num_tokens, num_experts, topk) diff --git a/benchmark/utils.py b/benchmark/utils.py index c1a8baf..f890f5e 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -4,9 +4,10 @@ # 2. Manual override via CLI args (e.g., --num-experts) # 3. 
Safe fallback defaults -from transformers import AutoConfig import argparse +from transformers import AutoConfig + def get_model_config(args): """ @@ -32,32 +33,40 @@ def get_model_config(args): raise ValueError(f"Failed to load {args.model_name}: {e}") # Extract with fallbacks - config_dict.update({ - "num_experts": getattr(hf_config, "moe_num_experts", None) or \ - getattr(hf_config, "num_experts", None) or \ - getattr(hf_config, "num_local_experts", None), - "top_k": getattr(hf_config, "moe_top_k", None) or \ - getattr(hf_config, "top_k", None) or \ - getattr(hf_config, "num_experts_per_tok", None), - "num_layers": getattr(hf_config, "num_hidden_layers", None) or \ - getattr(hf_config, "num_layers", None), - "hidden_size": getattr(hf_config, "hidden_size", None) or \ - getattr(hf_config, "d_model", None), - "ffn_hidden_size": getattr(hf_config, "intermediate_size", None) or \ - getattr(hf_config, "ffn_dim", None), - "num_heads": getattr(hf_config, "num_attention_heads", None), - "num_kv_heads": getattr(hf_config, "num_key_value_heads", None) or \ - getattr(hf_config, "num_attention_heads", None), - "head_dim": getattr(hf_config, "head_dim", None) or \ - (getattr(hf_config, "hidden_size", None) // getattr(hf_config, "num_attention_heads", 1) if getattr(hf_config, "hidden_size") and getattr(hf_config, "num_attention_heads") else None), - "vocab_size": getattr(hf_config, "vocab_size", None), - "max_seq_len": getattr(hf_config, "max_position_embeddings", None) or \ - getattr(hf_config, "n_positions", 32768), - "norm_eps": getattr(hf_config, "rms_norm_eps", None) or \ - getattr(hf_config, "layer_norm_eps", 1e-6), - "architectures": getattr(hf_config, "architectures", ["Unknown"]), - "dtype": getattr(hf_config, "torch_dtype", "float16"), - }) + config_dict.update( + { + "num_experts": getattr(hf_config, "moe_num_experts", None) + or getattr(hf_config, "num_experts", None) + or getattr(hf_config, "num_local_experts", None), + "top_k": getattr(hf_config, "moe_top_k", None) + or getattr(hf_config, "top_k", None) + or getattr(hf_config, "num_experts_per_tok", None), + "num_layers": getattr(hf_config, "num_hidden_layers", None) + or getattr(hf_config, "num_layers", None), + "hidden_size": getattr(hf_config, "hidden_size", None) + or getattr(hf_config, "d_model", None), + "ffn_hidden_size": getattr(hf_config, "intermediate_size", None) + or getattr(hf_config, "ffn_dim", None), + "num_heads": getattr(hf_config, "num_attention_heads", None), + "num_kv_heads": getattr(hf_config, "num_key_value_heads", None) + or getattr(hf_config, "num_attention_heads", None), + "head_dim": getattr(hf_config, "head_dim", None) + or ( + getattr(hf_config, "hidden_size", None) + // getattr(hf_config, "num_attention_heads", 1) + if getattr(hf_config, "hidden_size") + and getattr(hf_config, "num_attention_heads") + else None + ), + "vocab_size": getattr(hf_config, "vocab_size", None), + "max_seq_len": getattr(hf_config, "max_position_embeddings", None) + or getattr(hf_config, "n_positions", 32768), + "norm_eps": getattr(hf_config, "rms_norm_eps", None) + or getattr(hf_config, "layer_norm_eps", 1e-6), + "architectures": getattr(hf_config, "architectures", ["Unknown"]), + "dtype": getattr(hf_config, "torch_dtype", "float16"), + } + ) else: print("🔧 No --model-name provided. 
Using CLI args or defaults.") @@ -101,8 +110,10 @@ def get_model_config(args): for k, v in defaults.items(): if k not in config_dict or config_dict[k] is None: config_dict[k] = v - if args.model_name or any(getattr(args, field) is not None - for field in ["num_experts", "top_k", "num_layers"]): + if args.model_name or any( + getattr(args, field) is not None + for field in ["num_experts", "top_k", "num_layers"] + ): pass # Don't log if user expected override else: print(f"💡 Using default {k} = {v}") @@ -115,41 +126,101 @@ def get_model_config(args): def parse_args(): """Parse all possible model and benchmark arguments (support list values).""" - parser = argparse.ArgumentParser(description="Flexible benchmark with model config support") + parser = argparse.ArgumentParser( + description="Flexible benchmark with model config support" + ) # Model source - parser.add_argument("--model-name", type=str, default=None, - help="Hugging Face model name (e.g., deepseek-ai/DeepSeek-R1). If not set, use CLI args.") + parser.add_argument( + "--model-name", + type=str, + default=None, + help="Hugging Face model name (e.g., deepseek-ai/DeepSeek-R1). If not set, use CLI args.", + ) # MoE parameters (support list) - parser.add_argument("--num-experts", type=int, default=None, nargs='*', - help="Number of experts (can provide multiple values for sweep)") - parser.add_argument("--top-k", type=int, default=None, nargs='*', - help="Top-k experts per token (multiple values allowed)") + parser.add_argument( + "--num-experts", + type=int, + default=None, + nargs="*", + help="Number of experts (can provide multiple values for sweep)", + ) + parser.add_argument( + "--top-k", + type=int, + default=None, + nargs="*", + help="Top-k experts per token (multiple values allowed)", + ) # Transformer parameters (support list) - parser.add_argument("--num-layers", type=int, default=None, nargs='*', - help="Number of transformer layers (multiple values)") - parser.add_argument("--hidden-size", type=int, default=None, nargs='*', - help="Hidden size (d_model)") - parser.add_argument("--ffn-hidden-size", type=int, default=None, nargs='*', - help="FFN/intermediate size") - parser.add_argument("--num-heads", type=int, default=None, nargs='*', - help="Number of attention heads") - parser.add_argument("--num-kv-heads", type=int, default=None, nargs='*', - help="Number of KV heads (for GQA)") - parser.add_argument("--head-dim", type=int, default=None, nargs='*', - help="Dimension per attention head") - parser.add_argument("--vocab-size", type=int, default=None, nargs='*', - help="Vocabulary size") - parser.add_argument("--max-seq-len", type=int, default=None, nargs='*', - help="Maximum sequence length") - parser.add_argument("--norm-eps", type=float, default=None, nargs='*', - help="Normalization epsilon (rms_norm_eps)") + parser.add_argument( + "--num-layers", + type=int, + default=None, + nargs="*", + help="Number of transformer layers (multiple values)", + ) + parser.add_argument( + "--hidden-size", type=int, default=None, nargs="*", help="Hidden size (d_model)" + ) + parser.add_argument( + "--ffn-hidden-size", + type=int, + default=None, + nargs="*", + help="FFN/intermediate size", + ) + parser.add_argument( + "--num-heads", + type=int, + default=None, + nargs="*", + help="Number of attention heads", + ) + parser.add_argument( + "--num-kv-heads", + type=int, + default=None, + nargs="*", + help="Number of KV heads (for GQA)", + ) + parser.add_argument( + "--head-dim", + type=int, + default=None, + nargs="*", + help="Dimension per 
attention head", + ) + parser.add_argument( + "--vocab-size", type=int, default=None, nargs="*", help="Vocabulary size" + ) + parser.add_argument( + "--max-seq-len", + type=int, + default=None, + nargs="*", + help="Maximum sequence length", + ) + parser.add_argument( + "--norm-eps", + type=float, + default=None, + nargs="*", + help="Normalization epsilon (rms_norm_eps)", + ) # Benchmark settings - parser.add_argument("--device", type=str, default="xpu", help="Device (default: xpu)") - parser.add_argument("--dtype", type=str, default="torch.float32", - choices=["torch.float32", "torch.float16", "torch.bfloat16"], help="Data type") - - return parser.parse_args() \ No newline at end of file + parser.add_argument( + "--device", type=str, default="xpu", help="Device (default: xpu)" + ) + parser.add_argument( + "--dtype", + type=str, + default="torch.float32", + choices=["torch.float32", "torch.float16", "torch.bfloat16"], + help="Data type", + ) + + return parser.parse_args() From 8f351c0b980d7444a4fcfb5482b34a14c3921b22 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 24 Sep 2025 23:59:42 -0700 Subject: [PATCH 04/10] enable native pytorch op path --- benchmark/bench_moe_topk_softmax.py | 28 +++++++++++++++++++++++----- benchmark/utils.py | 9 +++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index 7d43ca3..36ad89d 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -3,7 +3,7 @@ import torch import triton from sgl_kernel import topk_softmax -from utils import parse_args +from utils import HAS_VLLM, parse_args def vllm_topk_softmax(gating_output, topk): @@ -24,6 +24,22 @@ def vllm_topk_softmax(gating_output, topk): return topk_weights, topk_indices +def navtive_topk_softmax(gating_output, topk): + num_tokens, num_experts = gating_output.shape + + import torch.nn.functional as F + + topk_weights = torch.empty( + (num_tokens, topk), device=gating_output.device, dtype=torch.float32 + ) + topk_indices = torch.empty( + (num_tokens, topk), dtype=torch.int32, device=gating_output.device + ) + topk_weights = F.softmax(gating_output.float(), dim=-1) + topk_weights, topk_indices = torch.topk(topk_weights, topk, dim=-1) + return topk_weights, topk_indices + + def sglang_topk_softmax(gating_output, topk): num_tokens, num_experts = gating_output.shape @@ -74,8 +90,8 @@ def get_benchmark(device="xpu"): x_names=["num_tokens", "num_experts", "topk", "dtype"], x_vals=configs, line_arg="provider", - line_vals=["sglang", "vllm"], - line_names=["SGLang", "VLLM"], + line_vals=["sglang", "native"], + line_names=["SGLang", "native"], styles=[("blue", "-"), ("green", "-")], ylabel="Latency (us)", plot_name="topk-softmax-performance", @@ -88,10 +104,12 @@ def benchmark(num_tokens, num_experts, topk, dtype, provider): (num_tokens, num_experts), device=device, dtype=dtype ) - if provider == "vllm" or provider == "vllm1": + if HAS_VLLM and (provider == "vllm" or provider == "vllm1"): fn = lambda: vllm_topk_softmax(gating_output, topk) elif provider == "sglang" or provider == "sglang1": fn = lambda: sglang_topk_softmax(gating_output, topk) + elif provider == "native": + fn = lambda: navtive_topk_softmax(gating_output, topk) quantiles = [0.5, 0.2, 0.8] ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles) @@ -119,7 +137,7 @@ def benchmark(num_tokens, num_experts, topk, dtype, provider): f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, 
dtype={dtype}" ) - calculate_diff(num_tokens, num_experts, topk) + # calculate_diff(num_tokens, num_experts, topk) global benchmark_configs benchmark_configs = configs diff --git a/benchmark/utils.py b/benchmark/utils.py index f890f5e..d64215b 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -9,6 +9,15 @@ from transformers import AutoConfig +def HAS_VLLM(): + try: + import vllm + + return True + except ImportError: + return False + + def get_model_config(args): """ Get model config with priority: From b6623ba44d27c949f1cb930bb47ab5286d064f72 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Tue, 21 Oct 2025 20:25:52 -0700 Subject: [PATCH 05/10] fix format --- benchmark/utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/benchmark/utils.py b/benchmark/utils.py index d64215b..f890f5e 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -9,15 +9,6 @@ from transformers import AutoConfig -def HAS_VLLM(): - try: - import vllm - - return True - except ImportError: - return False - - def get_model_config(args): """ Get model config with priority: From e569f7de7b3b92da2c14878b3b0312ce8a01c7d9 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 5 Nov 2025 01:32:52 -0800 Subject: [PATCH 06/10] refine kernellevel benchmarking for topk --- benchmark/bench_moe_topk_softmax.py | 49 +++++++++++++++++++---------- benchmark/utils.py | 21 ++++++++++--- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index 36ad89d..de282b4 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -3,7 +3,7 @@ import torch import triton from sgl_kernel import topk_softmax -from utils import HAS_VLLM, parse_args +from utils import get_model_config, parse_args def vllm_topk_softmax(gating_output, topk): @@ -24,7 +24,11 @@ def vllm_topk_softmax(gating_output, topk): return topk_weights, topk_indices -def navtive_topk_softmax(gating_output, topk): +def navtive_topk_softmax( + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +): num_tokens, num_experts = gating_output.shape import torch.nn.functional as F @@ -37,10 +41,18 @@ def navtive_topk_softmax(gating_output, topk): ) topk_weights = F.softmax(gating_output.float(), dim=-1) topk_weights, topk_indices = torch.topk(topk_weights, topk, dim=-1) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + return topk_weights, topk_indices -def sglang_topk_softmax(gating_output, topk): +def sglang_topk_softmax( + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +): num_tokens, num_experts = gating_output.shape topk_weights = torch.empty( @@ -57,7 +69,7 @@ def sglang_topk_softmax(gating_output, topk): topk_weights, topk_indices, gating_output, - renormalize=False, + renormalize=renormalize, ) return topk_weights, topk_indices @@ -87,7 +99,7 @@ def calculate_diff(num_tokens, num_experts, topk): def get_benchmark(device="xpu"): @triton.testing.perf_report( triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk", "dtype"], + x_names=["num_tokens", "num_experts", "topk", "dtype", "renormalize"], x_vals=configs, line_arg="provider", line_vals=["sglang", "native"], @@ -98,18 +110,16 @@ def get_benchmark(device="xpu"): args={}, ) ) - def benchmark(num_tokens, num_experts, topk, dtype, provider): + def benchmark(num_tokens, num_experts, topk, dtype, renormalize, provider): gating_output = torch.randn( (num_tokens, num_experts), device=device, dtype=dtype ) - if 
HAS_VLLM and (provider == "vllm" or provider == "vllm1"): - fn = lambda: vllm_topk_softmax(gating_output, topk) - elif provider == "sglang" or provider == "sglang1": - fn = lambda: sglang_topk_softmax(gating_output, topk) + if provider == "sglang" or provider == "sglang1": + fn = lambda: sglang_topk_softmax(gating_output, topk, renormalize) elif provider == "native": - fn = lambda: navtive_topk_softmax(gating_output, topk) + fn = lambda: navtive_topk_softmax(gating_output, topk, renormalize) quantiles = [0.5, 0.2, 0.8] ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles) @@ -122,19 +132,24 @@ def benchmark(num_tokens, num_experts, topk, dtype, provider): if __name__ == "__main__": # Run correctness test on small configs if not using a real model args = parse_args() + params = get_model_config(args) + sweep_params = { - "num_tokens": [1, 32, 128, 512], - "num_experts": args.num_experts or [64], - "top_k": args.top_k or [2, 4], - "dtype": [torch.float16, torch.bfloat16], + "num_tokens": args.num_tokens, + "num_experts": params["num_experts"] or [64], + "top_k": params["top_k"] or [2, 4], + "dtype": [torch.bfloat16], + "renormalize": [False], } + + print("sweep_params", sweep_params) keys = sweep_params.keys() configs = list(itertools.product(*sweep_params.values())) print(f"Testing {len(configs)} configurations...") for config in configs: - num_tokens, num_experts, topk, dtype = config + num_tokens, num_experts, topk, dtype, renormalize = config print( - f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}" + f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}, renormalize={renormalize}" ) # calculate_diff(num_tokens, num_experts, topk) diff --git a/benchmark/utils.py b/benchmark/utils.py index f890f5e..eed0c3b 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -54,7 +54,7 @@ def get_model_config(args): or ( getattr(hf_config, "hidden_size", None) // getattr(hf_config, "num_attention_heads", 1) - if getattr(hf_config, "hidden_size") + if getattr(hf_config, "hidden_size", None) and getattr(hf_config, "num_attention_heads") else None ), @@ -64,7 +64,7 @@ def get_model_config(args): "norm_eps": getattr(hf_config, "rms_norm_eps", None) or getattr(hf_config, "layer_norm_eps", 1e-6), "architectures": getattr(hf_config, "architectures", ["Unknown"]), - "dtype": getattr(hf_config, "torch_dtype", "float16"), + "dtype": str(getattr(hf_config, "torch_dtype", "float16")), } ) else: @@ -121,7 +121,12 @@ def get_model_config(args): # Add model name config_dict["model_name"] = args.model_name - return config_dict + sweepable_config = { + k: [v] if isinstance(v, (int, float, str)) else v + for k, v in config_dict.items() + } + + return sweepable_config def parse_args(): @@ -154,6 +159,14 @@ def parse_args(): help="Top-k experts per token (multiple values allowed)", ) + parser.add_argument( + "--num-tokens", + type=int, + default=[100], + nargs="*", + help="Number of tokens (multiple values)", + ) + # Transformer parameters (support list) parser.add_argument( "--num-layers", @@ -218,7 +231,7 @@ def parse_args(): parser.add_argument( "--dtype", type=str, - default="torch.float32", + default="torch.bfloat16", choices=["torch.float32", "torch.float16", "torch.bfloat16"], help="Data type", ) From ea43f57b20e15f150db1de71ba01b676751f754e Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:43:59 +0800 Subject: [PATCH 07/10] Update bench_moe_topk_softmax.py 
From 0ac9f935fa84e129522eaecbe123571c990834cb Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:44:23 +0800 Subject: [PATCH 08/10] Update bench_moe_topk_softmax.py --- benchmark/bench_moe_topk_softmax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmark/bench_moe_topk_softmax.py b/benchmark/bench_moe_topk_softmax.py index de282b4..7255306 100644 --- a/benchmark/bench_moe_topk_softmax.py +++ b/benchmark/bench_moe_topk_softmax.py @@ -142,7 +142,6 @@ def benchmark(num_tokens, num_experts, topk, dtype, renormalize, provider): "renormalize": [False], } - print("sweep_params", sweep_params) keys = sweep_params.keys() configs = list(itertools.product(*sweep_params.values())) print(f"Testing {len(configs)} configurations...") From c2dbfbe6343901cb559f889189322acbd0f5744e Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:50:12 +0800 Subject: [PATCH 09/10] Update pr-test-xpu.yml --- .github/workflows/pr-test-xpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 0f2f2c9..a78802d 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -55,7 +55,7 @@ jobs: timeout-minutes: 20 run: | docker exec -w /root/sglang ci_sglang_xpu \ - /bin/bash -c "cd /root/sglang/sgl-kernel-xpu/benchmark && python3 bench_flash_attn.py " + /bin/bash -c "cd /root/sglang/sgl-kernel-xpu/benchmark && python3 bench_flash_attn.py && python3 bench_moe_topk_softmax.py " - name: Run E2E Bfloat16 tests timeout-minutes: 20 @@ -71,3 +71,4 @@ jobs: if: always() run: | docker rm -f ci_sglang_xpu || true + From 405a522fa43e080a96117218f28898d009fd23f3 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 5 Nov 2025 18:52:13 -0800 Subject: [PATCH 10/10] format fix --- .github/workflows/pr-test-xpu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index a78802d..05e97e5 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -71,4 +71,3 @@ jobs: if: always() run: | docker rm -f ci_sglang_xpu || true -
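
# Usage sketch (illustrative): how the refactored benchmark can be invoked once this
# series is applied. The flag names, the list-valued (nargs="*") arguments, and the
# "xpu" default device are the ones defined in benchmark/utils.py above; the concrete
# sweep values and the model name below are examples only, not values mandated by the
# patches.
#
#   # sweep CLI-provided MoE configs (no Hugging Face lookup)
#   python3 benchmark/bench_moe_topk_softmax.py --num-tokens 1 128 512 --num-experts 64 256 --top-k 2 4
#
#   # derive num_experts / top_k from a Hugging Face model config instead
#   python3 benchmark/bench_moe_topk_softmax.py --model-name deepseek-ai/DeepSeek-R1 --num-tokens 1 128 512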