From 6f48bbd558d6476eafd94bee91dbaf91ac75e63b Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Sat, 29 Nov 2025 13:57:58 +0800
Subject: [PATCH 1/5] add moe_wna16_marlin_gemm_v2

---
 .../fused_moe_triton/moe_align_block_size.py  |  16 +-
 .../benchmark/bench_moe_align_block_size.py   |  15 +-
 sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu   |   1 +
 sgl-kernel/csrc/moe/moe_align_kernel.cu       |  90 ++++++---
 sgl-kernel/tests/test_moe_align.py            | 181 ++++++++++++++++++
 5 files changed, 278 insertions(+), 25 deletions(-)

diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
index ce1cae66e9e8..f86867077783 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
@@ -15,7 +15,10 @@
 
 
 def moe_align_block_size(
-    topk_ids: torch.Tensor, block_size: int, num_experts: int
+    topk_ids: torch.Tensor, 
+    block_size: int, 
+    num_experts: int,
+    pad_to_block_size: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Aligns the token distribution across experts to be compatible with block
@@ -26,6 +29,9 @@ def moe_align_block_size(
         top-k expert indices for each token.
     - block_size: The block size used in block matrix multiplication.
     - num_experts: The total number of experts.
+    - pad_to_block_size: Whether to pad the sorted_ids size to a multiple
+        of block_size. For small batch sizes, setting this to False can
+        save memory.
 
     Returns:
     - sorted_token_ids: A tensor containing the sorted token indices according
@@ -54,7 +60,15 @@ def moe_align_block_size(
     - The padding ensures that the total number of tokens is now divisible
         by block_size for proper block matrix operations.
     """
+    # Optimization 1: More precise memory allocation for small batches
+    # Calculate the minimum required size
     max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+    
+    # Only round up to block_size if explicitly requested
+    # This saves memory for small batch sizes
+    if pad_to_block_size:
+        max_num_tokens_padded = triton.cdiv(max_num_tokens_padded, block_size) * block_size
+    
     sorted_ids = torch.empty(
         (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
     )
diff --git a/sgl-kernel/benchmark/bench_moe_align_block_size.py b/sgl-kernel/benchmark/bench_moe_align_block_size.py
index 2156c5cd41a7..a99e58c0f79d 100644
--- a/sgl-kernel/benchmark/bench_moe_align_block_size.py
+++ b/sgl-kernel/benchmark/bench_moe_align_block_size.py
@@ -380,7 +380,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(
+        description="Benchmark moe_align_block_size kernel. "
+                    "Includes optimizations: "
+                    "1) Precise memory allocation 2) Parallel init 3) EP mode filtering 4) expert_ids padding"
+    )
     parser.add_argument(
         "--save_path",
         type=str,
@@ -418,6 +422,15 @@ def benchmark(num_tokens, num_experts, topk, provider):
         num_experts = args.num_experts
         topk = args.topk
 
+    print("\n" + "=" * 80)
+    print("MoE Align Block Size Kernel Benchmark")
+    print("Includes optimizations:")
+    print("  1. Precise memory allocation for small batches")
+    print("  2. Parallel initialization of sorted_token_ids")
+    print("  3. EP mode invalid expert filtering")
+    print("  4. expert_ids padding")
+    print("=" * 80 + "\n")
+
     calculate_diff(num_tokens=num_tokens, num_experts=num_experts, topk=topk)
 
     if not args.skip_full_benchmark and not IS_CI:  # Skip full benchmark in CI
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu
index b249f64156da..84148e7df526 100644
--- a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu
@@ -24,6 +24,7 @@
 #endif
 
 #include "kernel.h"
+#include "marlin_template.h"
 #include "kernel_marlin.cuh"
 
 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)                                        \
diff --git a/sgl-kernel/csrc/moe/moe_align_kernel.cu b/sgl-kernel/csrc/moe/moe_align_kernel.cu
index 92fd342707e6..bcca07033f4a 100644
--- a/sgl-kernel/csrc/moe/moe_align_kernel.cu
+++ b/sgl-kernel/csrc/moe/moe_align_kernel.cu
@@ -29,12 +29,17 @@ __global__ void count_and_sort_expert_tokens_kernel(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids,
     int32_t* __restrict__ cumsum_buffer,
-    size_t numel) {
+    size_t numel,
+    int32_t num_experts) {
   const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t stride = blockDim.x * gridDim.x;
 
   for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i] + 1;
+    // Filter out invalid experts (for EP mode)
+    if (expert_id < 0 || expert_id > num_experts) {
+      continue;
+    }
     int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
     sorted_token_ids[rank_post_pad] = i;
   }
@@ -63,7 +68,8 @@ __global__ void moe_align_block_size_kernel(
     size_t numel,
     int32_t* __restrict__ cumsum,
     bool pad_sorted_token_ids,
-    const int32_t scan_size) {
+    const int32_t scan_size,
+    const int32_t max_num_tokens_padded) {
   extern __shared__ int32_t smem[];
   int32_t* shared_counts = smem;                  // [num_experts]
   int32_t* prefix = shared_counts + num_experts;  // [num_experts + 1]
@@ -73,14 +79,29 @@ __global__ void moe_align_block_size_kernel(
   const size_t tid = threadIdx.x;
   const size_t stride = blockDim.x;
 
+  // Optimization 2: Parallel initialization of sorted_token_ids
+  if (pad_sorted_token_ids) {
+    Vec fill_vec;
+    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
+    int32_t total_vecs = (max_num_tokens_padded + VEC_SIZE - 1) / VEC_SIZE;
+    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
+    for (int32_t i = tid; i < total_vecs; i += stride) {
+      out_ptr[i] = fill_vec;
+    }
+  }
+
   if (tid < num_experts) {
     shared_counts[tid] = 0;
   }
 
   __syncthreads();
 
+  // Optimization 3: Filter out invalid experts
   for (size_t i = tid; i < numel; i += stride) {
     int expert_id = topk_ids[i] + 1;
+    if (expert_id < 0 || expert_id > num_experts) {
+      continue;
+    }
     atomicAdd(&shared_counts[expert_id], 1);
   }
 
@@ -200,6 +221,7 @@ __global__ void moe_align_block_size_kernel(
   if (tid <= num_experts) {
     cumsum[tid] = prefix[tid];
   }
+  
   // fill expert_ids
   const int32_t num_blocks = s_total_tokens_post_pad / block_size;
   for (int32_t i = tid; i < num_blocks; i += stride) {
@@ -216,14 +238,11 @@ __global__ void moe_align_block_size_kernel(
     expert_ids[i] = left - 2;
   }
 
-  if (pad_sorted_token_ids) {
-    Vec fill_vec;
-    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
-    int32_t total_vecs = (s_total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE;
-    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
-    for (int32_t i = tid; i < total_vecs; i += stride) {
-      out_ptr[i] = fill_vec;
-    }
+  // Optimization 4: Fill remaining expert_ids with -1 (invalid expert)
+  const int32_t expert_ids_size = (max_num_tokens_padded + block_size - 1) / block_size;
+  const int32_t fill_start_idx = num_blocks + tid;
+  for (int32_t i = fill_start_idx; i < expert_ids_size; i += stride) {
+    expert_ids[i] = -1;
   }
 }
 
@@ -236,7 +255,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
     int32_t num_experts,
     int32_t block_size,
     size_t numel,
-    bool pad_sorted_token_ids) {
+    bool pad_sorted_token_ids,
+    const int32_t max_num_tokens_padded) {
   const size_t tid = threadIdx.x;
   const size_t stride = blockDim.x;
 
@@ -244,12 +264,28 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
   int32_t* cumsum = shared_mem;
   int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
 
+  // Optimization 2: Parallel initialization of sorted_token_ids
+  if (pad_sorted_token_ids) {
+    Vec fill_vec;
+    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
+    int32_t total_vecs = (max_num_tokens_padded + VEC_SIZE - 1) / VEC_SIZE;
+    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
+    for (int32_t i = tid; i < total_vecs; i += stride) {
+      out_ptr[i] = fill_vec;
+    }
+  }
+
   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
   }
 
+  // Optimization 3: Filter out invalid experts
   for (size_t i = tid; i < numel; i += stride) {
-    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i] + 1];
+    int32_t expert_id = topk_ids[i] + 1;
+    if (expert_id < 0 || expert_id > num_experts) {
+      continue;
+    }
+    ++tokens_cnts[(threadIdx.x + 1) * num_experts + expert_id];
   }
 
   __syncthreads();
@@ -279,20 +315,22 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
     }
   }
 
-  if (pad_sorted_token_ids) {
-    Vec fill_vec;
-    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
-    int32_t total_vecs = (*total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE;
-    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
-    for (int32_t i = tid; i < total_vecs; i += stride) {
-      out_ptr[i] = fill_vec;
-    }
+  // Optimization 4: Fill remaining expert_ids with -1
+  const int32_t num_valid_blocks = (*total_tokens_post_pad + block_size - 1) / block_size;
+  const int32_t expert_ids_size = (max_num_tokens_padded + block_size - 1) / block_size;
+  const int32_t fill_start_idx = num_valid_blocks + tid;
+  for (int32_t i = fill_start_idx; i < expert_ids_size; i += stride) {
+    expert_ids[i] = -1;
   }
 
   __syncthreads();
 
+  // Optimization 3: Filter out invalid experts in sorting phase
   for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i] + 1;
+    if (expert_id < 0 || expert_id > num_experts) {
+      continue;
+    }
     int32_t rank_post_pad = tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
     sorted_token_ids[rank_post_pad] = i;
     ++tokens_cnts[threadIdx.x * num_experts + expert_id];
@@ -314,6 +352,9 @@ void moe_align_block_size(
 
   threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
 
+  // Optimization 1: Pass the actual allocated size to kernel
+  const int32_t max_num_tokens_padded = sorted_token_ids.size(0);
+
   DISPATCH_INTEGRAL_TYPES(topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
     bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64);
 
@@ -330,7 +371,8 @@ void moe_align_block_size(
           num_experts,
           block_size,
           topk_ids.numel(),
-          pad_sorted_token_ids);
+          pad_sorted_token_ids,
+          max_num_tokens_padded);
     } else {
       auto align_kernel = moe_align_block_size_kernel<scalar_t>;
 
@@ -346,7 +388,8 @@ void moe_align_block_size(
           topk_ids.numel(),
           cumsum_buffer.data_ptr<int32_t>(),
           pad_sorted_token_ids,
-          scan_size);
+          scan_size,
+          max_num_tokens_padded);
 
       const int block_threads = std::min(256, (int)threads);
       const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads;
@@ -358,7 +401,8 @@ void moe_align_block_size(
           topk_ids.data_ptr<scalar_t>(),
           sorted_token_ids.data_ptr<int32_t>(),
           cumsum_buffer.data_ptr<int32_t>(),
-          topk_ids.numel());
+          topk_ids.numel(),
+          num_experts);
     }
   });
 }
diff --git a/sgl-kernel/tests/test_moe_align.py b/sgl-kernel/tests/test_moe_align.py
index 40a37f563278..25ce5c98e332 100644
--- a/sgl-kernel/tests/test_moe_align.py
+++ b/sgl-kernel/tests/test_moe_align.py
@@ -268,5 +268,186 @@ def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
     torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0)
 
 
+# Additional optimization tests
+def test_memory_allocation_optimization():
+    """
+    Test precise memory allocation for small batches
+    """
+    num_tokens = 10
+    num_experts = 8
+    topk = 2
+    block_size = 64
+    
+    topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
+    
+    # Test with pad_to_block_size=False (should use less memory)
+    max_num_tokens_padded_no_pad = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+    
+    # Test with pad_to_block_size=True (should round up)
+    max_num_tokens_padded_with_pad = triton.cdiv(max_num_tokens_padded_no_pad, block_size) * block_size
+    
+    assert max_num_tokens_padded_no_pad < max_num_tokens_padded_with_pad, \
+        "Without padding should use less memory"
+
+
+def test_parallel_initialization():
+    """
+    Test parallel initialization of sorted_token_ids
+    """
+    num_tokens = 100
+    num_experts = 16
+    topk = 4
+    block_size = 64
+    
+    topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+    
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
+    expert_ids = torch.empty(
+        (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda"
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
+    cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda")
+    
+    # Run with pad_sorted_token_ids=True
+    moe_align_block_size(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        cumsum_buffer,
+        pad_sorted_token_ids=True,
+    )
+    
+    # Check that padding values are correctly set to numel
+    valid_count = num_tokens_post_pad.item()
+    padding_values = sorted_ids[valid_count:valid_count+10]
+    
+    # All padding values should be equal to numel (the sentinel value)
+    assert torch.all(padding_values == topk_ids.numel()), \
+        f"Padding values should all be {topk_ids.numel()}, got {padding_values}"
+
+
+def test_invalid_expert_filtering():
+    """
+    Test filtering out invalid experts (for EP mode)
+    """
+    num_tokens = 50
+    num_experts = 16
+    topk = 2
+    block_size = 32
+    
+    # Create topk_ids with some invalid expert IDs
+    topk_ids = torch.randint(0, num_experts + 5, (num_tokens, topk), dtype=torch.int32, device="cuda")
+    
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
+    expert_ids = torch.empty(
+        (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda"
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
+    cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda")
+    
+    # Should not crash even with invalid expert IDs
+    moe_align_block_size(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        cumsum_buffer,
+        pad_sorted_token_ids=True,
+    )
+    
+    # Count how many invalid expert IDs we had
+    invalid_count = torch.sum(topk_ids >= num_experts).item()
+    valid_count = topk_ids.numel() - invalid_count
+
+
+def test_expert_ids_padding():
+    """
+    Test filling remaining expert_ids with -1
+    """
+    num_tokens = 30
+    num_experts = 8
+    topk = 2
+    block_size = 32
+    
+    topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+    
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
+    expert_ids = torch.empty(
+        (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda"
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
+    cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda")
+    
+    moe_align_block_size(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        cumsum_buffer,
+        pad_sorted_token_ids=True,
+    )
+    
+    # Calculate the number of valid blocks
+    valid_blocks = (num_tokens_post_pad.item() + block_size - 1) // block_size
+    
+    # Check that remaining expert_ids are filled with -1
+    remaining_expert_ids = expert_ids[valid_blocks:]
+    
+    # All remaining blocks should have expert_id = -1
+    if len(remaining_expert_ids) > 0:
+        assert torch.all(remaining_expert_ids == -1), \
+            f"Remaining expert_ids should be -1, got {remaining_expert_ids[:10]}"
+
+
+@pytest.mark.parametrize(
+    "num_tokens,num_experts,topk,block_size",
+    [
+        (1, 8, 1, 32),      # Small batch
+        (10, 16, 2, 64),    # Medium batch
+        (100, 32, 4, 128),  # Larger batch
+        (1000, 64, 8, 256), # Large batch
+    ]
+)
+def test_all_optimizations_combined(num_tokens, num_experts, topk, block_size):
+    """
+    Test all optimizations work together correctly
+    """
+    topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+    
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
+    expert_ids = torch.empty(
+        (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda"
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
+    cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda")
+    
+    # Should complete successfully with all optimizations
+    moe_align_block_size(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        cumsum_buffer,
+        pad_sorted_token_ids=True,
+    )
+    
+    # Basic sanity checks
+    assert num_tokens_post_pad.item() > 0, "Should have some valid tokens"
+    assert num_tokens_post_pad.item() % block_size == 0, "Result should be aligned to block_size"
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From eeea208cb0b9af4ed9fd70e58c4969f12b625684 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Sat, 29 Nov 2025 14:04:31 +0800
Subject: [PATCH 2/5] Revert "add moe_wna16_marlin_gemm_v2"

This reverts commit 6f48bbd558d6476eafd94bee91dbaf91ac75e63b.
---
 .../fused_moe_triton/moe_align_block_size.py  |  16 +-
 .../benchmark/bench_moe_align_block_size.py   |  15 +-
 sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu   |   1 -
 sgl-kernel/csrc/moe/moe_align_kernel.cu       |  90 +++------
 sgl-kernel/tests/test_moe_align.py            | 181 ------------------
 5 files changed, 25 insertions(+), 278 deletions(-)

diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
index f86867077783..ce1cae66e9e8 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
@@ -15,10 +15,7 @@
 
 
 def moe_align_block_size(
-    topk_ids: torch.Tensor, 
-    block_size: int, 
-    num_experts: int,
-    pad_to_block_size: bool = False,
+    topk_ids: torch.Tensor, block_size: int, num_experts: int
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Aligns the token distribution across experts to be compatible with block
@@ -29,9 +26,6 @@ def moe_align_block_size(
         top-k expert indices for each token.
     - block_size: The block size used in block matrix multiplication.
     - num_experts: The total number of experts.
-    - pad_to_block_size: Whether to pad the sorted_ids size to a multiple
-        of block_size. For small batch sizes, setting this to False can
-        save memory.
 
     Returns:
     - sorted_token_ids: A tensor containing the sorted token indices according
@@ -60,15 +54,7 @@ def moe_align_block_size(
     - The padding ensures that the total number of tokens is now divisible
         by block_size for proper block matrix operations.
     """
-    # Optimization 1: More precise memory allocation for small batches
-    # Calculate the minimum required size
     max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
-    
-    # Only round up to block_size if explicitly requested
-    # This saves memory for small batch sizes
-    if pad_to_block_size:
-        max_num_tokens_padded = triton.cdiv(max_num_tokens_padded, block_size) * block_size
-    
     sorted_ids = torch.empty(
         (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
     )
diff --git a/sgl-kernel/benchmark/bench_moe_align_block_size.py b/sgl-kernel/benchmark/bench_moe_align_block_size.py
index a99e58c0f79d..2156c5cd41a7 100644
--- a/sgl-kernel/benchmark/bench_moe_align_block_size.py
+++ b/sgl-kernel/benchmark/bench_moe_align_block_size.py
@@ -380,11 +380,7 @@ def benchmark(num_tokens, num_experts, topk, provider):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Benchmark moe_align_block_size kernel. "
-                    "Includes optimizations: "
-                    "1) Precise memory allocation 2) Parallel init 3) EP mode filtering 4) expert_ids padding"
-    )
+    parser = argparse.ArgumentParser()
     parser.add_argument(
         "--save_path",
         type=str,
@@ -422,15 +418,6 @@ def benchmark(num_tokens, num_experts, topk, provider):
         num_experts = args.num_experts
         topk = args.topk
 
-    print("\n" + "=" * 80)
-    print("MoE Align Block Size Kernel Benchmark")
-    print("Includes optimizations:")
-    print("  1. Precise memory allocation for small batches")
-    print("  2. Parallel initialization of sorted_token_ids")
-    print("  3. EP mode invalid expert filtering")
-    print("  4. expert_ids padding")
-    print("=" * 80 + "\n")
-
     calculate_diff(num_tokens=num_tokens, num_experts=num_experts, topk=topk)
 
     if not args.skip_full_benchmark and not IS_CI:  # Skip full benchmark in CI
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu
index 84148e7df526..b249f64156da 100644
--- a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu
@@ -24,7 +24,6 @@
 #endif
 
 #include "kernel.h"
-#include "marlin_template.h"
 #include "kernel_marlin.cuh"
 
 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)                                        \
diff --git a/sgl-kernel/csrc/moe/moe_align_kernel.cu b/sgl-kernel/csrc/moe/moe_align_kernel.cu
index bcca07033f4a..92fd342707e6 100644
--- a/sgl-kernel/csrc/moe/moe_align_kernel.cu
+++ b/sgl-kernel/csrc/moe/moe_align_kernel.cu
@@ -29,17 +29,12 @@ __global__ void count_and_sort_expert_tokens_kernel(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids,
     int32_t* __restrict__ cumsum_buffer,
-    size_t numel,
-    int32_t num_experts) {
+    size_t numel) {
   const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t stride = blockDim.x * gridDim.x;
 
   for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i] + 1;
-    // Filter out invalid experts (for EP mode)
-    if (expert_id < 0 || expert_id > num_experts) {
-      continue;
-    }
     int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
     sorted_token_ids[rank_post_pad] = i;
   }
@@ -68,8 +63,7 @@ __global__ void moe_align_block_size_kernel(
     size_t numel,
     int32_t* __restrict__ cumsum,
     bool pad_sorted_token_ids,
-    const int32_t scan_size,
-    const int32_t max_num_tokens_padded) {
+    const int32_t scan_size) {
   extern __shared__ int32_t smem[];
   int32_t* shared_counts = smem;                  // [num_experts]
   int32_t* prefix = shared_counts + num_experts;  // [num_experts + 1]
@@ -79,29 +73,14 @@ __global__ void moe_align_block_size_kernel(
   const size_t tid = threadIdx.x;
   const size_t stride = blockDim.x;
 
-  // Optimization 2: Parallel initialization of sorted_token_ids
-  if (pad_sorted_token_ids) {
-    Vec fill_vec;
-    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
-    int32_t total_vecs = (max_num_tokens_padded + VEC_SIZE - 1) / VEC_SIZE;
-    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
-    for (int32_t i = tid; i < total_vecs; i += stride) {
-      out_ptr[i] = fill_vec;
-    }
-  }
-
   if (tid < num_experts) {
     shared_counts[tid] = 0;
   }
 
   __syncthreads();
 
-  // Optimization 3: Filter out invalid experts
   for (size_t i = tid; i < numel; i += stride) {
     int expert_id = topk_ids[i] + 1;
-    if (expert_id < 0 || expert_id > num_experts) {
-      continue;
-    }
     atomicAdd(&shared_counts[expert_id], 1);
   }
 
@@ -221,7 +200,6 @@ __global__ void moe_align_block_size_kernel(
   if (tid <= num_experts) {
     cumsum[tid] = prefix[tid];
   }
-  
   // fill expert_ids
   const int32_t num_blocks = s_total_tokens_post_pad / block_size;
   for (int32_t i = tid; i < num_blocks; i += stride) {
@@ -238,11 +216,14 @@ __global__ void moe_align_block_size_kernel(
     expert_ids[i] = left - 2;
   }
 
-  // Optimization 4: Fill remaining expert_ids with -1 (invalid expert)
-  const int32_t expert_ids_size = (max_num_tokens_padded + block_size - 1) / block_size;
-  const int32_t fill_start_idx = num_blocks + tid;
-  for (int32_t i = fill_start_idx; i < expert_ids_size; i += stride) {
-    expert_ids[i] = -1;
+  if (pad_sorted_token_ids) {
+    Vec fill_vec;
+    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
+    int32_t total_vecs = (s_total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE;
+    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
+    for (int32_t i = tid; i < total_vecs; i += stride) {
+      out_ptr[i] = fill_vec;
+    }
   }
 }
 
@@ -255,8 +236,7 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
     int32_t num_experts,
     int32_t block_size,
     size_t numel,
-    bool pad_sorted_token_ids,
-    const int32_t max_num_tokens_padded) {
+    bool pad_sorted_token_ids) {
   const size_t tid = threadIdx.x;
   const size_t stride = blockDim.x;
 
@@ -264,28 +244,12 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
   int32_t* cumsum = shared_mem;
   int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
 
-  // Optimization 2: Parallel initialization of sorted_token_ids
-  if (pad_sorted_token_ids) {
-    Vec fill_vec;
-    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
-    int32_t total_vecs = (max_num_tokens_padded + VEC_SIZE - 1) / VEC_SIZE;
-    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
-    for (int32_t i = tid; i < total_vecs; i += stride) {
-      out_ptr[i] = fill_vec;
-    }
-  }
-
   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
   }
 
-  // Optimization 3: Filter out invalid experts
   for (size_t i = tid; i < numel; i += stride) {
-    int32_t expert_id = topk_ids[i] + 1;
-    if (expert_id < 0 || expert_id > num_experts) {
-      continue;
-    }
-    ++tokens_cnts[(threadIdx.x + 1) * num_experts + expert_id];
+    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i] + 1];
   }
 
   __syncthreads();
@@ -315,22 +279,20 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
     }
   }
 
-  // Optimization 4: Fill remaining expert_ids with -1
-  const int32_t num_valid_blocks = (*total_tokens_post_pad + block_size - 1) / block_size;
-  const int32_t expert_ids_size = (max_num_tokens_padded + block_size - 1) / block_size;
-  const int32_t fill_start_idx = num_valid_blocks + tid;
-  for (int32_t i = fill_start_idx; i < expert_ids_size; i += stride) {
-    expert_ids[i] = -1;
+  if (pad_sorted_token_ids) {
+    Vec fill_vec;
+    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
+    int32_t total_vecs = (*total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE;
+    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
+    for (int32_t i = tid; i < total_vecs; i += stride) {
+      out_ptr[i] = fill_vec;
+    }
   }
 
   __syncthreads();
 
-  // Optimization 3: Filter out invalid experts in sorting phase
   for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i] + 1;
-    if (expert_id < 0 || expert_id > num_experts) {
-      continue;
-    }
     int32_t rank_post_pad = tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
     sorted_token_ids[rank_post_pad] = i;
     ++tokens_cnts[threadIdx.x * num_experts + expert_id];
@@ -352,9 +314,6 @@ void moe_align_block_size(
 
   threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
 
-  // Optimization 1: Pass the actual allocated size to kernel
-  const int32_t max_num_tokens_padded = sorted_token_ids.size(0);
-
   DISPATCH_INTEGRAL_TYPES(topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
     bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64);
 
@@ -371,8 +330,7 @@ void moe_align_block_size(
           num_experts,
           block_size,
           topk_ids.numel(),
-          pad_sorted_token_ids,
-          max_num_tokens_padded);
+          pad_sorted_token_ids);
     } else {
       auto align_kernel = moe_align_block_size_kernel<scalar_t>;
 
@@ -388,8 +346,7 @@ void moe_align_block_size(
           topk_ids.numel(),
           cumsum_buffer.data_ptr<int32_t>(),
           pad_sorted_token_ids,
-          scan_size,
-          max_num_tokens_padded);
+          scan_size);
 
       const int block_threads = std::min(256, (int)threads);
       const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads;
@@ -401,8 +358,7 @@ void moe_align_block_size(
           topk_ids.data_ptr<scalar_t>(),
           sorted_token_ids.data_ptr<int32_t>(),
           cumsum_buffer.data_ptr<int32_t>(),
-          topk_ids.numel(),
-          num_experts);
+          topk_ids.numel());
     }
   });
 }
diff --git a/sgl-kernel/tests/test_moe_align.py b/sgl-kernel/tests/test_moe_align.py
index 25ce5c98e332..40a37f563278 100644
--- a/sgl-kernel/tests/test_moe_align.py
+++ b/sgl-kernel/tests/test_moe_align.py
@@ -268,186 +268,5 @@ def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
     torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0)
 
 
-# Additional optimization tests
-def test_memory_allocation_optimization():
-    """
-    Test precise memory allocation for small batches
-    """
-    num_tokens = 10
-    num_experts = 8
-    topk = 2
-    block_size = 64
-    
-    topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
-    
-    # Test with pad_to_block_size=False (should use less memory)
-    max_num_tokens_padded_no_pad = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
-    
-    # Test with pad_to_block_size=True (should round up)
-    max_num_tokens_padded_with_pad = triton.cdiv(max_num_tokens_padded_no_pad, block_size) * block_size
-    
-    assert max_num_tokens_padded_no_pad < max_num_tokens_padded_with_pad, \
-        "Without padding should use less memory"
-
-
-def test_parallel_initialization():
-    """
-    Test parallel initialization of sorted_token_ids
-    """
-    num_tokens = 100
-    num_experts = 16
-    topk = 4
-    block_size = 64
-    
-    topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
-    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
-    
-    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
-    expert_ids = torch.empty(
-        (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda"
-    )
-    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
-    cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda")
-    
-    # Run with pad_sorted_token_ids=True
-    moe_align_block_size(
-        topk_ids,
-        num_experts + 1,
-        block_size,
-        sorted_ids,
-        expert_ids,
-        num_tokens_post_pad,
-        cumsum_buffer,
-        pad_sorted_token_ids=True,
-    )
-    
-    # Check that padding values are correctly set to numel
-    valid_count = num_tokens_post_pad.item()
-    padding_values = sorted_ids[valid_count:valid_count+10]
-    
-    # All padding values should be equal to numel (the sentinel value)
-    assert torch.all(padding_values == topk_ids.numel()), \
-        f"Padding values should all be {topk_ids.numel()}, got {padding_values}"
-
-
-def test_invalid_expert_filtering():
-    """
-    Test filtering out invalid experts (for EP mode)
-    """
-    num_tokens = 50
-    num_experts = 16
-    topk = 2
-    block_size = 32
-    
-    # Create topk_ids with some invalid expert IDs
-    topk_ids = torch.randint(0, num_experts + 5, (num_tokens, topk), dtype=torch.int32, device="cuda")
-    
-    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
-    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
-    expert_ids = torch.empty(
-        (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda"
-    )
-    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
-    cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda")
-    
-    # Should not crash even with invalid expert IDs
-    moe_align_block_size(
-        topk_ids,
-        num_experts + 1,
-        block_size,
-        sorted_ids,
-        expert_ids,
-        num_tokens_post_pad,
-        cumsum_buffer,
-        pad_sorted_token_ids=True,
-    )
-    
-    # Count how many invalid expert IDs we had
-    invalid_count = torch.sum(topk_ids >= num_experts).item()
-    valid_count = topk_ids.numel() - invalid_count
-
-
-def test_expert_ids_padding():
-    """
-    Test filling remaining expert_ids with -1
-    """
-    num_tokens = 30
-    num_experts = 8
-    topk = 2
-    block_size = 32
-    
-    topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
-    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
-    
-    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
-    expert_ids = torch.empty(
-        (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda"
-    )
-    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
-    cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda")
-    
-    moe_align_block_size(
-        topk_ids,
-        num_experts + 1,
-        block_size,
-        sorted_ids,
-        expert_ids,
-        num_tokens_post_pad,
-        cumsum_buffer,
-        pad_sorted_token_ids=True,
-    )
-    
-    # Calculate the number of valid blocks
-    valid_blocks = (num_tokens_post_pad.item() + block_size - 1) // block_size
-    
-    # Check that remaining expert_ids are filled with -1
-    remaining_expert_ids = expert_ids[valid_blocks:]
-    
-    # All remaining blocks should have expert_id = -1
-    if len(remaining_expert_ids) > 0:
-        assert torch.all(remaining_expert_ids == -1), \
-            f"Remaining expert_ids should be -1, got {remaining_expert_ids[:10]}"
-
-
-@pytest.mark.parametrize(
-    "num_tokens,num_experts,topk,block_size",
-    [
-        (1, 8, 1, 32),      # Small batch
-        (10, 16, 2, 64),    # Medium batch
-        (100, 32, 4, 128),  # Larger batch
-        (1000, 64, 8, 256), # Large batch
-    ]
-)
-def test_all_optimizations_combined(num_tokens, num_experts, topk, block_size):
-    """
-    Test all optimizations work together correctly
-    """
-    topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
-    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
-    
-    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
-    expert_ids = torch.empty(
-        (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda"
-    )
-    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
-    cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda")
-    
-    # Should complete successfully with all optimizations
-    moe_align_block_size(
-        topk_ids,
-        num_experts + 1,
-        block_size,
-        sorted_ids,
-        expert_ids,
-        num_tokens_post_pad,
-        cumsum_buffer,
-        pad_sorted_token_ids=True,
-    )
-    
-    # Basic sanity checks
-    assert num_tokens_post_pad.item() > 0, "Should have some valid tokens"
-    assert num_tokens_post_pad.item() % block_size == 0, "Result should be aligned to block_size"
-
-
 if __name__ == "__main__":
     pytest.main([__file__])

From 0f87c8ba8feb97f46ac02a387caf63ca32f89645 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Sat, 6 Dec 2025 23:20:30 +0800
Subject: [PATCH 3/5] Add CUDA kernel size analysis tool for sgl-kernel
 optimization

---
 docker/Dockerfile                      |   3 +-
 sgl-kernel/README.md                   |  22 +++
 sgl-kernel/analyze_whl_kernel_sizes.py | 261 +++++++++++++++++++++++++
 3 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 sgl-kernel/analyze_whl_kernel_sizes.py

diff --git a/docker/Dockerfile b/docker/Dockerfile
index f83620fdcbfc..00e98192cf29 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -279,7 +279,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     wheel \
     scikit-build-core \
     nixl \
-    py-spy
+    py-spy \
+    cubloaty
 
 # Build and install sgl-model-gateway (install Rust, build, then remove to save space)
 RUN --mount=type=cache,target=/root/.cache/pip \
diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md
index 51fe3ad38535..aa20923141c5 100644
--- a/sgl-kernel/README.md
+++ b/sgl-kernel/README.md
@@ -102,6 +102,28 @@ m.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd));
 
 3. Run test suite
 
+## Kernel Size Analysis
+
+Analyze CUDA kernel sizes in compiled wheel files to identify optimization opportunities:
+
+```bash
+# Install cubloaty
+pip install cubloaty
+
+# Analyze a wheel file
+python analyze_whl_kernel_sizes.py path/to/sgl_kernel-*.whl
+
+# Custom output file
+python analyze_whl_kernel_sizes.py path/to/sgl_kernel-*.whl --output my_analysis.txt
+```
+
+The tool generates:
+- Text report with kernel groups (by name prefix) and individual kernel sizes
+- JSON file with detailed structured data
+- Timing information for each analysis step
+
+Use this to identify large kernels and potential template instantiation bloat.
+
 ## FAQ
 - Q: Segmentation fault with CUDA 12.6
 - A: Update ptxas to 12.8, reference: [segment fault error](https://github.com/Dao-AILab/flash-attention/issues/1453)
diff --git a/sgl-kernel/analyze_whl_kernel_sizes.py b/sgl-kernel/analyze_whl_kernel_sizes.py
new file mode 100644
index 000000000000..a45322e05c7c
--- /dev/null
+++ b/sgl-kernel/analyze_whl_kernel_sizes.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+import zipfile
+from pathlib import Path
+
+
+def extract_whl(whl_file, extract_dir):
+    with zipfile.ZipFile(whl_file, "r") as zip_ref:
+        zip_ref.extractall(extract_dir)
+
+
+def find_binary_files(extract_dir):
+    binary_files = []
+    extract_path = Path(extract_dir)
+
+    for so_file in extract_path.rglob("*.so"):
+        binary_files.append(str(so_file))
+
+    for cubin_file in extract_path.rglob("*.cubin"):
+        binary_files.append(str(cubin_file))
+
+    return sorted(binary_files)
+
+
+def run_cubloaty(binary_file):
+    result = subprocess.run(
+        ["cubloaty", binary_file, "--format", "json"],
+        capture_output=True,
+        text=True,
+        timeout=60,
+    )
+
+    if result.returncode != 0:
+        if (
+            "No CUDA binary sections found" in result.stderr
+            or "does not contain device code" in result.stderr
+        ):
+            return {}
+        raise subprocess.CalledProcessError(
+            result.returncode, result.args, result.stdout, result.stderr
+        )
+
+    return json.loads(result.stdout)
+
+
+def analyze_whl(whl_file):
+    temp_dir = tempfile.mkdtemp(prefix="sgl_kernel_analysis_")
+
+    try:
+        t0 = time.time()
+        print(f"Extracting {whl_file}...")
+        extract_whl(whl_file, temp_dir)
+        print(f"  Extraction took {time.time() - t0:.2f}s\n")
+
+        t0 = time.time()
+        binary_files = find_binary_files(temp_dir)
+        if not binary_files:
+            print(f"No .so or .cubin files found in {whl_file}")
+            return []
+
+        print(
+            f"Found {len(binary_files)} binary files (took {time.time() - t0:.2f}s)\n"
+        )
+
+        all_kernels = []
+        total_analyzed = 0
+        total_skipped = 0
+
+        for binary_file in binary_files:
+            file_name = os.path.basename(binary_file)
+            t0 = time.time()
+            print(f"Analyzing {file_name}...", end=" ", flush=True)
+
+            data = run_cubloaty(binary_file)
+            elapsed = time.time() - t0
+
+            if not data or "kernels" not in data:
+                print(f"skipped (no CUDA code, {elapsed:.2f}s)")
+                total_skipped += 1
+                continue
+
+            kernel_count = 0
+            for kernel in data["kernels"]:
+                all_kernels.append(
+                    {
+                        "file": file_name,
+                        "name": kernel.get("name", "unknown"),
+                        "size": kernel.get("size", 0),
+                        "size_kb": kernel.get("size", 0) / 1024,
+                        "size_mb": kernel.get("size", 0) / 1024 / 1024,
+                    }
+                )
+                kernel_count += 1
+
+            print(f"found {kernel_count} kernels ({elapsed:.2f}s)")
+            total_analyzed += 1
+
+        print(
+            f"\nSummary: {total_analyzed} files analyzed, {total_skipped} files skipped\n"
+        )
+        return all_kernels
+
+    finally:
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+def extract_kernel_prefix(kernel_name):
+    if "<" in kernel_name:
+        return kernel_name.split("<")[0]
+    return kernel_name
+
+
+def generate_report(all_kernels, output_file):
+    if not all_kernels:
+        print("No kernels found")
+        return
+
+    t0 = time.time()
+    print("Generating report...")
+
+    sorted_kernels = sorted(all_kernels, key=lambda x: x["size"], reverse=True)
+    total_size = sum(k["size"] for k in all_kernels)
+    total_size_mb = total_size / 1024 / 1024
+
+    # Group by kernel prefix
+    from collections import defaultdict
+
+    kernel_groups = defaultdict(lambda: {"size": 0, "count": 0})
+    for kernel in all_kernels:
+        prefix = extract_kernel_prefix(kernel["name"])
+        kernel_groups[prefix]["size"] += kernel["size"]
+        kernel_groups[prefix]["count"] += 1
+
+    sorted_groups = sorted(
+        kernel_groups.items(), key=lambda x: x[1]["size"], reverse=True
+    )
+
+    lines = []
+    lines.append("=" * 140)
+    lines.append("CUDA Kernel Size Analysis")
+    lines.append("=" * 140)
+    lines.append("")
+    lines.append(f"Total kernels: {len(all_kernels)}")
+    lines.append(f"Total size: {total_size_mb:.2f} MB ({total_size:,} bytes)")
+    lines.append(f"Average kernel size: {total_size / len(all_kernels) / 1024:.2f} KB")
+    lines.append("")
+
+    # Grouped by kernel name prefix
+    lines.append("=" * 140)
+    lines.append("Kernel Groups (by name prefix)")
+    lines.append("=" * 140)
+    lines.append(
+        f"{'Rank':<6} {'Kernel Prefix':<80} {'Count':<8} {'Total (MB)':<12} {'%':<8}"
+    )
+    lines.append("-" * 140)
+
+    for i, (prefix, stats) in enumerate(sorted_groups, 1):
+        percentage = (stats["size"] / total_size * 100) if total_size > 0 else 0
+        size_mb = stats["size"] / 1024 / 1024
+
+        display_prefix = prefix
+        if len(display_prefix) > 77:
+            display_prefix = display_prefix[:74] + "..."
+
+        lines.append(
+            f"{i:<6} {display_prefix:<80} {stats['count']:<8} {size_mb:<12.2f} {percentage:<8.2f}"
+        )
+
+    lines.append("")
+    lines.append("=" * 140)
+    lines.append("Individual Kernels (sorted by size)")
+    lines.append("=" * 140)
+    lines.append(
+        f"{'Rank':<6} {'File':<40} {'Kernel Name':<70} {'Size (KB)':<12} {'Size (MB)':<12} {'%':<8}"
+    )
+    lines.append("-" * 140)
+
+    for i, kernel in enumerate(sorted_kernels, 1):
+        percentage = (kernel["size"] / total_size * 100) if total_size > 0 else 0
+        kernel_name = kernel["name"]
+        if len(kernel_name) > 67:
+            kernel_name = kernel_name[:64] + "..."
+
+        file_name = kernel["file"]
+        if len(file_name) > 37:
+            file_name = file_name[:34] + "..."
+
+        lines.append(
+            f"{i:<6} {file_name:<40} {kernel_name:<70} "
+            f"{kernel['size_kb']:<12.2f} {kernel['size_mb']:<12.4f} {percentage:<8.2f}"
+        )
+
+    report_text = "\n".join(lines)
+
+    with open(output_file, "w") as f:
+        f.write(report_text)
+    print(f"Report saved to: {output_file}")
+
+    json_output = output_file.replace(".txt", ".json")
+    with open(json_output, "w") as f:
+        json.dump(
+            {
+                "total_kernels": len(all_kernels),
+                "total_size_bytes": total_size,
+                "total_size_mb": total_size_mb,
+                "kernel_groups": [
+                    {
+                        "prefix": prefix,
+                        "count": stats["count"],
+                        "size_bytes": stats["size"],
+                        "size_mb": stats["size"] / 1024 / 1024,
+                        "percentage": (
+                            (stats["size"] / total_size * 100) if total_size > 0 else 0
+                        ),
+                    }
+                    for prefix, stats in sorted_groups
+                ],
+                "kernels": sorted_kernels,
+            },
+            f,
+            indent=2,
+        )
+    print(f"JSON data saved to: {json_output}")
+    print(f"Report generation took {time.time() - t0:.2f}s")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Analyze CUDA kernel sizes in sgl-kernel whl file"
+    )
+    parser.add_argument("whl", type=str, help="Path to whl file")
+    parser.add_argument(
+        "--output", type=str, default="kernel_analysis.txt", help="Output report file"
+    )
+    args = parser.parse_args()
+
+    if not os.path.exists(args.whl):
+        print(f"Error: {args.whl} not found")
+        sys.exit(1)
+
+    total_start = time.time()
+    print(f"Analyzing {args.whl}\n")
+    all_kernels = analyze_whl(args.whl)
+
+    if all_kernels:
+        generate_report(all_kernels, args.output)
+        print(f"\nTotal time: {time.time() - total_start:.2f}s")
+    else:
+        print("No kernel information extracted")
+
+
+if __name__ == "__main__":
+    main()

From 57fa71a5c0348f0dfb68b9e9d7fc9b3a813498cc Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Sun, 7 Dec 2025 15:21:59 +0800
Subject: [PATCH 4/5] Make analyze_whl_kernel_sizes.py executable

---
 sgl-kernel/analyze_whl_kernel_sizes.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 sgl-kernel/analyze_whl_kernel_sizes.py

diff --git a/sgl-kernel/analyze_whl_kernel_sizes.py b/sgl-kernel/analyze_whl_kernel_sizes.py
old mode 100644
new mode 100755

From f975e99da194a66b91553b0644dcb4003e550389 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Sun, 7 Dec 2025 15:25:11 +0800
Subject: [PATCH 5/5] ud

---
 sgl-kernel/analyze_whl_kernel_sizes.py | 2 --
 1 file changed, 2 deletions(-)
 mode change 100755 => 100644 sgl-kernel/analyze_whl_kernel_sizes.py

diff --git a/sgl-kernel/analyze_whl_kernel_sizes.py b/sgl-kernel/analyze_whl_kernel_sizes.py
old mode 100755
new mode 100644
index a45322e05c7c..f845c81c9a18
--- a/sgl-kernel/analyze_whl_kernel_sizes.py
+++ b/sgl-kernel/analyze_whl_kernel_sizes.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 import argparse
 import json
 import os