From 6f48bbd558d6476eafd94bee91dbaf91ac75e63b Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Sat, 29 Nov 2025 13:57:58 +0800 Subject: [PATCH 1/5] add moe_wna16_marlin_gemm_v2 --- .../fused_moe_triton/moe_align_block_size.py | 16 +- .../benchmark/bench_moe_align_block_size.py | 15 +- sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu | 1 + sgl-kernel/csrc/moe/moe_align_kernel.cu | 90 ++++++--- sgl-kernel/tests/test_moe_align.py | 181 ++++++++++++++++++ 5 files changed, 278 insertions(+), 25 deletions(-) diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py index ce1cae66e9e8..f86867077783 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py @@ -15,7 +15,10 @@ def moe_align_block_size( - topk_ids: torch.Tensor, block_size: int, num_experts: int + topk_ids: torch.Tensor, + block_size: int, + num_experts: int, + pad_to_block_size: bool = False, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Aligns the token distribution across experts to be compatible with block @@ -26,6 +29,9 @@ def moe_align_block_size( top-k expert indices for each token. - block_size: The block size used in block matrix multiplication. - num_experts: The total number of experts. + - pad_to_block_size: Whether to pad the sorted_ids size to a multiple + of block_size. For small batch sizes, setting this to False can + save memory. Returns: - sorted_token_ids: A tensor containing the sorted token indices according @@ -54,7 +60,15 @@ def moe_align_block_size( - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. """ + # Optimization 1: More precise memory allocation for small batches + # Calculate the minimum required size max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) + + # Only round up to block_size if explicitly requested + # This saves memory for small batch sizes + if pad_to_block_size: + max_num_tokens_padded = triton.cdiv(max_num_tokens_padded, block_size) * block_size + sorted_ids = torch.empty( (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device ) diff --git a/sgl-kernel/benchmark/bench_moe_align_block_size.py b/sgl-kernel/benchmark/bench_moe_align_block_size.py index 2156c5cd41a7..a99e58c0f79d 100644 --- a/sgl-kernel/benchmark/bench_moe_align_block_size.py +++ b/sgl-kernel/benchmark/bench_moe_align_block_size.py @@ -380,7 +380,11 @@ def benchmark(num_tokens, num_experts, topk, provider): if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + description="Benchmark moe_align_block_size kernel. " + "Includes optimizations: " + "1) Precise memory allocation 2) Parallel init 3) EP mode filtering 4) expert_ids padding" + ) parser.add_argument( "--save_path", type=str, @@ -418,6 +422,15 @@ def benchmark(num_tokens, num_experts, topk, provider): num_experts = args.num_experts topk = args.topk + print("\n" + "=" * 80) + print("MoE Align Block Size Kernel Benchmark") + print("Includes optimizations:") + print(" 1. Precise memory allocation for small batches") + print(" 2. Parallel initialization of sorted_token_ids") + print(" 3. EP mode invalid expert filtering") + print(" 4. expert_ids padding") + print("=" * 80 + "\n") + calculate_diff(num_tokens=num_tokens, num_experts=num_experts, topk=topk) if not args.skip_full_benchmark and not IS_CI: # Skip full benchmark in CI diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu index b249f64156da..84148e7df526 100644 --- a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu +++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu @@ -24,6 +24,7 @@ #endif #include "kernel.h" +#include "marlin_template.h" #include "kernel_marlin.cuh" #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ diff --git a/sgl-kernel/csrc/moe/moe_align_kernel.cu b/sgl-kernel/csrc/moe/moe_align_kernel.cu index 92fd342707e6..bcca07033f4a 100644 --- a/sgl-kernel/csrc/moe/moe_align_kernel.cu +++ b/sgl-kernel/csrc/moe/moe_align_kernel.cu @@ -29,12 +29,17 @@ __global__ void count_and_sort_expert_tokens_kernel( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, - size_t numel) { + size_t numel, + int32_t num_experts) { const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; const size_t stride = blockDim.x * gridDim.x; for (size_t i = tid; i < numel; i += stride) { int32_t expert_id = topk_ids[i] + 1; + // Filter out invalid experts (for EP mode) + if (expert_id < 0 || expert_id > num_experts) { + continue; + } int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); sorted_token_ids[rank_post_pad] = i; } @@ -63,7 +68,8 @@ __global__ void moe_align_block_size_kernel( size_t numel, int32_t* __restrict__ cumsum, bool pad_sorted_token_ids, - const int32_t scan_size) { + const int32_t scan_size, + const int32_t max_num_tokens_padded) { extern __shared__ int32_t smem[]; int32_t* shared_counts = smem; // [num_experts] int32_t* prefix = shared_counts + num_experts; // [num_experts + 1] @@ -73,14 +79,29 @@ __global__ void moe_align_block_size_kernel( const size_t tid = threadIdx.x; const size_t stride = blockDim.x; + // Optimization 2: Parallel initialization of sorted_token_ids + if (pad_sorted_token_ids) { + Vec fill_vec; + fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel; + int32_t total_vecs = (max_num_tokens_padded + VEC_SIZE - 1) / VEC_SIZE; + Vec* out_ptr = reinterpret_cast(sorted_token_ids); + for (int32_t i = tid; i < total_vecs; i += stride) { + out_ptr[i] = fill_vec; + } + } + if (tid < num_experts) { shared_counts[tid] = 0; } __syncthreads(); + // Optimization 3: Filter out invalid experts for (size_t i = tid; i < numel; i += stride) { int expert_id = topk_ids[i] + 1; + if (expert_id < 0 || expert_id > num_experts) { + continue; + } atomicAdd(&shared_counts[expert_id], 1); } @@ -200,6 +221,7 @@ __global__ void moe_align_block_size_kernel( if (tid <= num_experts) { cumsum[tid] = prefix[tid]; } + // fill expert_ids const int32_t num_blocks = s_total_tokens_post_pad / block_size; for (int32_t i = tid; i < num_blocks; i += stride) { @@ -216,14 +238,11 @@ __global__ void moe_align_block_size_kernel( expert_ids[i] = left - 2; } - if (pad_sorted_token_ids) { - Vec fill_vec; - fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel; - int32_t total_vecs = (s_total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE; - Vec* out_ptr = reinterpret_cast(sorted_token_ids); - for (int32_t i = tid; i < total_vecs; i += stride) { - out_ptr[i] = fill_vec; - } + // Optimization 4: Fill remaining expert_ids with -1 (invalid expert) + const int32_t expert_ids_size = (max_num_tokens_padded + block_size - 1) / block_size; + const int32_t fill_start_idx = num_blocks + tid; + for (int32_t i = fill_start_idx; i < expert_ids_size; i += stride) { + expert_ids[i] = -1; } } @@ -236,7 +255,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( int32_t num_experts, int32_t block_size, size_t numel, - bool pad_sorted_token_ids) { + bool pad_sorted_token_ids, + const int32_t max_num_tokens_padded) { const size_t tid = threadIdx.x; const size_t stride = blockDim.x; @@ -244,12 +264,28 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( int32_t* cumsum = shared_mem; int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1); + // Optimization 2: Parallel initialization of sorted_token_ids + if (pad_sorted_token_ids) { + Vec fill_vec; + fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel; + int32_t total_vecs = (max_num_tokens_padded + VEC_SIZE - 1) / VEC_SIZE; + Vec* out_ptr = reinterpret_cast(sorted_token_ids); + for (int32_t i = tid; i < total_vecs; i += stride) { + out_ptr[i] = fill_vec; + } + } + for (int i = 0; i < num_experts; ++i) { tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0; } + // Optimization 3: Filter out invalid experts for (size_t i = tid; i < numel; i += stride) { - ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i] + 1]; + int32_t expert_id = topk_ids[i] + 1; + if (expert_id < 0 || expert_id > num_experts) { + continue; + } + ++tokens_cnts[(threadIdx.x + 1) * num_experts + expert_id]; } __syncthreads(); @@ -279,20 +315,22 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( } } - if (pad_sorted_token_ids) { - Vec fill_vec; - fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel; - int32_t total_vecs = (*total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE; - Vec* out_ptr = reinterpret_cast(sorted_token_ids); - for (int32_t i = tid; i < total_vecs; i += stride) { - out_ptr[i] = fill_vec; - } + // Optimization 4: Fill remaining expert_ids with -1 + const int32_t num_valid_blocks = (*total_tokens_post_pad + block_size - 1) / block_size; + const int32_t expert_ids_size = (max_num_tokens_padded + block_size - 1) / block_size; + const int32_t fill_start_idx = num_valid_blocks + tid; + for (int32_t i = fill_start_idx; i < expert_ids_size; i += stride) { + expert_ids[i] = -1; } __syncthreads(); + // Optimization 3: Filter out invalid experts in sorting phase for (size_t i = tid; i < numel; i += stride) { int32_t expert_id = topk_ids[i] + 1; + if (expert_id < 0 || expert_id > num_experts) { + continue; + } int32_t rank_post_pad = tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id]; sorted_token_ids[rank_post_pad] = i; ++tokens_cnts[threadIdx.x * num_experts + expert_id]; @@ -314,6 +352,9 @@ void moe_align_block_size( threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; + // Optimization 1: Pass the actual allocated size to kernel + const int32_t max_num_tokens_padded = sorted_token_ids.size(0); + DISPATCH_INTEGRAL_TYPES(topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64); @@ -330,7 +371,8 @@ void moe_align_block_size( num_experts, block_size, topk_ids.numel(), - pad_sorted_token_ids); + pad_sorted_token_ids, + max_num_tokens_padded); } else { auto align_kernel = moe_align_block_size_kernel; @@ -346,7 +388,8 @@ void moe_align_block_size( topk_ids.numel(), cumsum_buffer.data_ptr(), pad_sorted_token_ids, - scan_size); + scan_size, + max_num_tokens_padded); const int block_threads = std::min(256, (int)threads); const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads; @@ -358,7 +401,8 @@ void moe_align_block_size( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), cumsum_buffer.data_ptr(), - topk_ids.numel()); + topk_ids.numel(), + num_experts); } }); } diff --git a/sgl-kernel/tests/test_moe_align.py b/sgl-kernel/tests/test_moe_align.py index 40a37f563278..25ce5c98e332 100644 --- a/sgl-kernel/tests/test_moe_align.py +++ b/sgl-kernel/tests/test_moe_align.py @@ -268,5 +268,186 @@ def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype): torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0) +# Additional optimization tests +def test_memory_allocation_optimization(): + """ + Test precise memory allocation for small batches + """ + num_tokens = 10 + num_experts = 8 + topk = 2 + block_size = 64 + + topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda") + + # Test with pad_to_block_size=False (should use less memory) + max_num_tokens_padded_no_pad = topk_ids.numel() + (num_experts + 1) * (block_size - 1) + + # Test with pad_to_block_size=True (should round up) + max_num_tokens_padded_with_pad = triton.cdiv(max_num_tokens_padded_no_pad, block_size) * block_size + + assert max_num_tokens_padded_no_pad < max_num_tokens_padded_with_pad, \ + "Without padding should use less memory" + + +def test_parallel_initialization(): + """ + Test parallel initialization of sorted_token_ids + """ + num_tokens = 100 + num_experts = 16 + topk = 4 + block_size = 64 + + topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda") + max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) + + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") + expert_ids = torch.empty( + (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda" + ) + num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda") + cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda") + + # Run with pad_sorted_token_ids=True + moe_align_block_size( + topk_ids, + num_experts + 1, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + cumsum_buffer, + pad_sorted_token_ids=True, + ) + + # Check that padding values are correctly set to numel + valid_count = num_tokens_post_pad.item() + padding_values = sorted_ids[valid_count:valid_count+10] + + # All padding values should be equal to numel (the sentinel value) + assert torch.all(padding_values == topk_ids.numel()), \ + f"Padding values should all be {topk_ids.numel()}, got {padding_values}" + + +def test_invalid_expert_filtering(): + """ + Test filtering out invalid experts (for EP mode) + """ + num_tokens = 50 + num_experts = 16 + topk = 2 + block_size = 32 + + # Create topk_ids with some invalid expert IDs + topk_ids = torch.randint(0, num_experts + 5, (num_tokens, topk), dtype=torch.int32, device="cuda") + + max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") + expert_ids = torch.empty( + (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda" + ) + num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda") + cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda") + + # Should not crash even with invalid expert IDs + moe_align_block_size( + topk_ids, + num_experts + 1, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + cumsum_buffer, + pad_sorted_token_ids=True, + ) + + # Count how many invalid expert IDs we had + invalid_count = torch.sum(topk_ids >= num_experts).item() + valid_count = topk_ids.numel() - invalid_count + + +def test_expert_ids_padding(): + """ + Test filling remaining expert_ids with -1 + """ + num_tokens = 30 + num_experts = 8 + topk = 2 + block_size = 32 + + topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda") + max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) + + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") + expert_ids = torch.empty( + (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda" + ) + num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda") + cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda") + + moe_align_block_size( + topk_ids, + num_experts + 1, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + cumsum_buffer, + pad_sorted_token_ids=True, + ) + + # Calculate the number of valid blocks + valid_blocks = (num_tokens_post_pad.item() + block_size - 1) // block_size + + # Check that remaining expert_ids are filled with -1 + remaining_expert_ids = expert_ids[valid_blocks:] + + # All remaining blocks should have expert_id = -1 + if len(remaining_expert_ids) > 0: + assert torch.all(remaining_expert_ids == -1), \ + f"Remaining expert_ids should be -1, got {remaining_expert_ids[:10]}" + + +@pytest.mark.parametrize( + "num_tokens,num_experts,topk,block_size", + [ + (1, 8, 1, 32), # Small batch + (10, 16, 2, 64), # Medium batch + (100, 32, 4, 128), # Larger batch + (1000, 64, 8, 256), # Large batch + ] +) +def test_all_optimizations_combined(num_tokens, num_experts, topk, block_size): + """ + Test all optimizations work together correctly + """ + topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda") + max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) + + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") + expert_ids = torch.empty( + (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda" + ) + num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda") + cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda") + + # Should complete successfully with all optimizations + moe_align_block_size( + topk_ids, + num_experts + 1, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + cumsum_buffer, + pad_sorted_token_ids=True, + ) + + # Basic sanity checks + assert num_tokens_post_pad.item() > 0, "Should have some valid tokens" + assert num_tokens_post_pad.item() % block_size == 0, "Result should be aligned to block_size" + + if __name__ == "__main__": pytest.main([__file__]) From eeea208cb0b9af4ed9fd70e58c4969f12b625684 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Sat, 29 Nov 2025 14:04:31 +0800 Subject: [PATCH 2/5] Revert "add moe_wna16_marlin_gemm_v2" This reverts commit 6f48bbd558d6476eafd94bee91dbaf91ac75e63b. --- .../fused_moe_triton/moe_align_block_size.py | 16 +- .../benchmark/bench_moe_align_block_size.py | 15 +- sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu | 1 - sgl-kernel/csrc/moe/moe_align_kernel.cu | 90 +++------ sgl-kernel/tests/test_moe_align.py | 181 ------------------ 5 files changed, 25 insertions(+), 278 deletions(-) diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py index f86867077783..ce1cae66e9e8 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py @@ -15,10 +15,7 @@ def moe_align_block_size( - topk_ids: torch.Tensor, - block_size: int, - num_experts: int, - pad_to_block_size: bool = False, + topk_ids: torch.Tensor, block_size: int, num_experts: int ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Aligns the token distribution across experts to be compatible with block @@ -29,9 +26,6 @@ def moe_align_block_size( top-k expert indices for each token. - block_size: The block size used in block matrix multiplication. - num_experts: The total number of experts. - - pad_to_block_size: Whether to pad the sorted_ids size to a multiple - of block_size. For small batch sizes, setting this to False can - save memory. Returns: - sorted_token_ids: A tensor containing the sorted token indices according @@ -60,15 +54,7 @@ def moe_align_block_size( - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. """ - # Optimization 1: More precise memory allocation for small batches - # Calculate the minimum required size max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) - - # Only round up to block_size if explicitly requested - # This saves memory for small batch sizes - if pad_to_block_size: - max_num_tokens_padded = triton.cdiv(max_num_tokens_padded, block_size) * block_size - sorted_ids = torch.empty( (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device ) diff --git a/sgl-kernel/benchmark/bench_moe_align_block_size.py b/sgl-kernel/benchmark/bench_moe_align_block_size.py index a99e58c0f79d..2156c5cd41a7 100644 --- a/sgl-kernel/benchmark/bench_moe_align_block_size.py +++ b/sgl-kernel/benchmark/bench_moe_align_block_size.py @@ -380,11 +380,7 @@ def benchmark(num_tokens, num_experts, topk, provider): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Benchmark moe_align_block_size kernel. " - "Includes optimizations: " - "1) Precise memory allocation 2) Parallel init 3) EP mode filtering 4) expert_ids padding" - ) + parser = argparse.ArgumentParser() parser.add_argument( "--save_path", type=str, @@ -422,15 +418,6 @@ def benchmark(num_tokens, num_experts, topk, provider): num_experts = args.num_experts topk = args.topk - print("\n" + "=" * 80) - print("MoE Align Block Size Kernel Benchmark") - print("Includes optimizations:") - print(" 1. Precise memory allocation for small batches") - print(" 2. Parallel initialization of sorted_token_ids") - print(" 3. EP mode invalid expert filtering") - print(" 4. expert_ids padding") - print("=" * 80 + "\n") - calculate_diff(num_tokens=num_tokens, num_experts=num_experts, topk=topk) if not args.skip_full_benchmark and not IS_CI: # Skip full benchmark in CI diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu index 84148e7df526..b249f64156da 100644 --- a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu +++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu @@ -24,7 +24,6 @@ #endif #include "kernel.h" -#include "marlin_template.h" #include "kernel_marlin.cuh" #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ diff --git a/sgl-kernel/csrc/moe/moe_align_kernel.cu b/sgl-kernel/csrc/moe/moe_align_kernel.cu index bcca07033f4a..92fd342707e6 100644 --- a/sgl-kernel/csrc/moe/moe_align_kernel.cu +++ b/sgl-kernel/csrc/moe/moe_align_kernel.cu @@ -29,17 +29,12 @@ __global__ void count_and_sort_expert_tokens_kernel( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, - size_t numel, - int32_t num_experts) { + size_t numel) { const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; const size_t stride = blockDim.x * gridDim.x; for (size_t i = tid; i < numel; i += stride) { int32_t expert_id = topk_ids[i] + 1; - // Filter out invalid experts (for EP mode) - if (expert_id < 0 || expert_id > num_experts) { - continue; - } int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); sorted_token_ids[rank_post_pad] = i; } @@ -68,8 +63,7 @@ __global__ void moe_align_block_size_kernel( size_t numel, int32_t* __restrict__ cumsum, bool pad_sorted_token_ids, - const int32_t scan_size, - const int32_t max_num_tokens_padded) { + const int32_t scan_size) { extern __shared__ int32_t smem[]; int32_t* shared_counts = smem; // [num_experts] int32_t* prefix = shared_counts + num_experts; // [num_experts + 1] @@ -79,29 +73,14 @@ __global__ void moe_align_block_size_kernel( const size_t tid = threadIdx.x; const size_t stride = blockDim.x; - // Optimization 2: Parallel initialization of sorted_token_ids - if (pad_sorted_token_ids) { - Vec fill_vec; - fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel; - int32_t total_vecs = (max_num_tokens_padded + VEC_SIZE - 1) / VEC_SIZE; - Vec* out_ptr = reinterpret_cast(sorted_token_ids); - for (int32_t i = tid; i < total_vecs; i += stride) { - out_ptr[i] = fill_vec; - } - } - if (tid < num_experts) { shared_counts[tid] = 0; } __syncthreads(); - // Optimization 3: Filter out invalid experts for (size_t i = tid; i < numel; i += stride) { int expert_id = topk_ids[i] + 1; - if (expert_id < 0 || expert_id > num_experts) { - continue; - } atomicAdd(&shared_counts[expert_id], 1); } @@ -221,7 +200,6 @@ __global__ void moe_align_block_size_kernel( if (tid <= num_experts) { cumsum[tid] = prefix[tid]; } - // fill expert_ids const int32_t num_blocks = s_total_tokens_post_pad / block_size; for (int32_t i = tid; i < num_blocks; i += stride) { @@ -238,11 +216,14 @@ __global__ void moe_align_block_size_kernel( expert_ids[i] = left - 2; } - // Optimization 4: Fill remaining expert_ids with -1 (invalid expert) - const int32_t expert_ids_size = (max_num_tokens_padded + block_size - 1) / block_size; - const int32_t fill_start_idx = num_blocks + tid; - for (int32_t i = fill_start_idx; i < expert_ids_size; i += stride) { - expert_ids[i] = -1; + if (pad_sorted_token_ids) { + Vec fill_vec; + fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel; + int32_t total_vecs = (s_total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE; + Vec* out_ptr = reinterpret_cast(sorted_token_ids); + for (int32_t i = tid; i < total_vecs; i += stride) { + out_ptr[i] = fill_vec; + } } } @@ -255,8 +236,7 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( int32_t num_experts, int32_t block_size, size_t numel, - bool pad_sorted_token_ids, - const int32_t max_num_tokens_padded) { + bool pad_sorted_token_ids) { const size_t tid = threadIdx.x; const size_t stride = blockDim.x; @@ -264,28 +244,12 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( int32_t* cumsum = shared_mem; int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1); - // Optimization 2: Parallel initialization of sorted_token_ids - if (pad_sorted_token_ids) { - Vec fill_vec; - fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel; - int32_t total_vecs = (max_num_tokens_padded + VEC_SIZE - 1) / VEC_SIZE; - Vec* out_ptr = reinterpret_cast(sorted_token_ids); - for (int32_t i = tid; i < total_vecs; i += stride) { - out_ptr[i] = fill_vec; - } - } - for (int i = 0; i < num_experts; ++i) { tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0; } - // Optimization 3: Filter out invalid experts for (size_t i = tid; i < numel; i += stride) { - int32_t expert_id = topk_ids[i] + 1; - if (expert_id < 0 || expert_id > num_experts) { - continue; - } - ++tokens_cnts[(threadIdx.x + 1) * num_experts + expert_id]; + ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i] + 1]; } __syncthreads(); @@ -315,22 +279,20 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( } } - // Optimization 4: Fill remaining expert_ids with -1 - const int32_t num_valid_blocks = (*total_tokens_post_pad + block_size - 1) / block_size; - const int32_t expert_ids_size = (max_num_tokens_padded + block_size - 1) / block_size; - const int32_t fill_start_idx = num_valid_blocks + tid; - for (int32_t i = fill_start_idx; i < expert_ids_size; i += stride) { - expert_ids[i] = -1; + if (pad_sorted_token_ids) { + Vec fill_vec; + fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel; + int32_t total_vecs = (*total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE; + Vec* out_ptr = reinterpret_cast(sorted_token_ids); + for (int32_t i = tid; i < total_vecs; i += stride) { + out_ptr[i] = fill_vec; + } } __syncthreads(); - // Optimization 3: Filter out invalid experts in sorting phase for (size_t i = tid; i < numel; i += stride) { int32_t expert_id = topk_ids[i] + 1; - if (expert_id < 0 || expert_id > num_experts) { - continue; - } int32_t rank_post_pad = tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id]; sorted_token_ids[rank_post_pad] = i; ++tokens_cnts[threadIdx.x * num_experts + expert_id]; @@ -352,9 +314,6 @@ void moe_align_block_size( threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; - // Optimization 1: Pass the actual allocated size to kernel - const int32_t max_num_tokens_padded = sorted_token_ids.size(0); - DISPATCH_INTEGRAL_TYPES(topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64); @@ -371,8 +330,7 @@ void moe_align_block_size( num_experts, block_size, topk_ids.numel(), - pad_sorted_token_ids, - max_num_tokens_padded); + pad_sorted_token_ids); } else { auto align_kernel = moe_align_block_size_kernel; @@ -388,8 +346,7 @@ void moe_align_block_size( topk_ids.numel(), cumsum_buffer.data_ptr(), pad_sorted_token_ids, - scan_size, - max_num_tokens_padded); + scan_size); const int block_threads = std::min(256, (int)threads); const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads; @@ -401,8 +358,7 @@ void moe_align_block_size( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), cumsum_buffer.data_ptr(), - topk_ids.numel(), - num_experts); + topk_ids.numel()); } }); } diff --git a/sgl-kernel/tests/test_moe_align.py b/sgl-kernel/tests/test_moe_align.py index 25ce5c98e332..40a37f563278 100644 --- a/sgl-kernel/tests/test_moe_align.py +++ b/sgl-kernel/tests/test_moe_align.py @@ -268,186 +268,5 @@ def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype): torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0) -# Additional optimization tests -def test_memory_allocation_optimization(): - """ - Test precise memory allocation for small batches - """ - num_tokens = 10 - num_experts = 8 - topk = 2 - block_size = 64 - - topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda") - - # Test with pad_to_block_size=False (should use less memory) - max_num_tokens_padded_no_pad = topk_ids.numel() + (num_experts + 1) * (block_size - 1) - - # Test with pad_to_block_size=True (should round up) - max_num_tokens_padded_with_pad = triton.cdiv(max_num_tokens_padded_no_pad, block_size) * block_size - - assert max_num_tokens_padded_no_pad < max_num_tokens_padded_with_pad, \ - "Without padding should use less memory" - - -def test_parallel_initialization(): - """ - Test parallel initialization of sorted_token_ids - """ - num_tokens = 100 - num_experts = 16 - topk = 4 - block_size = 64 - - topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda") - max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) - - sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") - expert_ids = torch.empty( - (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda" - ) - num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda") - cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda") - - # Run with pad_sorted_token_ids=True - moe_align_block_size( - topk_ids, - num_experts + 1, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - cumsum_buffer, - pad_sorted_token_ids=True, - ) - - # Check that padding values are correctly set to numel - valid_count = num_tokens_post_pad.item() - padding_values = sorted_ids[valid_count:valid_count+10] - - # All padding values should be equal to numel (the sentinel value) - assert torch.all(padding_values == topk_ids.numel()), \ - f"Padding values should all be {topk_ids.numel()}, got {padding_values}" - - -def test_invalid_expert_filtering(): - """ - Test filtering out invalid experts (for EP mode) - """ - num_tokens = 50 - num_experts = 16 - topk = 2 - block_size = 32 - - # Create topk_ids with some invalid expert IDs - topk_ids = torch.randint(0, num_experts + 5, (num_tokens, topk), dtype=torch.int32, device="cuda") - - max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) - sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") - expert_ids = torch.empty( - (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda" - ) - num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda") - cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda") - - # Should not crash even with invalid expert IDs - moe_align_block_size( - topk_ids, - num_experts + 1, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - cumsum_buffer, - pad_sorted_token_ids=True, - ) - - # Count how many invalid expert IDs we had - invalid_count = torch.sum(topk_ids >= num_experts).item() - valid_count = topk_ids.numel() - invalid_count - - -def test_expert_ids_padding(): - """ - Test filling remaining expert_ids with -1 - """ - num_tokens = 30 - num_experts = 8 - topk = 2 - block_size = 32 - - topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda") - max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) - - sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") - expert_ids = torch.empty( - (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda" - ) - num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda") - cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda") - - moe_align_block_size( - topk_ids, - num_experts + 1, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - cumsum_buffer, - pad_sorted_token_ids=True, - ) - - # Calculate the number of valid blocks - valid_blocks = (num_tokens_post_pad.item() + block_size - 1) // block_size - - # Check that remaining expert_ids are filled with -1 - remaining_expert_ids = expert_ids[valid_blocks:] - - # All remaining blocks should have expert_id = -1 - if len(remaining_expert_ids) > 0: - assert torch.all(remaining_expert_ids == -1), \ - f"Remaining expert_ids should be -1, got {remaining_expert_ids[:10]}" - - -@pytest.mark.parametrize( - "num_tokens,num_experts,topk,block_size", - [ - (1, 8, 1, 32), # Small batch - (10, 16, 2, 64), # Medium batch - (100, 32, 4, 128), # Larger batch - (1000, 64, 8, 256), # Large batch - ] -) -def test_all_optimizations_combined(num_tokens, num_experts, topk, block_size): - """ - Test all optimizations work together correctly - """ - topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda") - max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1) - - sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") - expert_ids = torch.empty( - (triton.cdiv(max_num_tokens_padded, block_size),), dtype=torch.int32, device="cuda" - ) - num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda") - cumsum_buffer = torch.empty((num_experts + 2,), dtype=torch.int32, device="cuda") - - # Should complete successfully with all optimizations - moe_align_block_size( - topk_ids, - num_experts + 1, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - cumsum_buffer, - pad_sorted_token_ids=True, - ) - - # Basic sanity checks - assert num_tokens_post_pad.item() > 0, "Should have some valid tokens" - assert num_tokens_post_pad.item() % block_size == 0, "Result should be aligned to block_size" - - if __name__ == "__main__": pytest.main([__file__]) From 0f87c8ba8feb97f46ac02a387caf63ca32f89645 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Sat, 6 Dec 2025 23:20:30 +0800 Subject: [PATCH 3/5] Add CUDA kernel size analysis tool for sgl-kernel optimization --- docker/Dockerfile | 3 +- sgl-kernel/README.md | 22 +++ sgl-kernel/analyze_whl_kernel_sizes.py | 261 +++++++++++++++++++++++++ 3 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 sgl-kernel/analyze_whl_kernel_sizes.py diff --git a/docker/Dockerfile b/docker/Dockerfile index f83620fdcbfc..00e98192cf29 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -279,7 +279,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ wheel \ scikit-build-core \ nixl \ - py-spy + py-spy \ + cubloaty # Build and install sgl-model-gateway (install Rust, build, then remove to save space) RUN --mount=type=cache,target=/root/.cache/pip \ diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md index 51fe3ad38535..aa20923141c5 100644 --- a/sgl-kernel/README.md +++ b/sgl-kernel/README.md @@ -102,6 +102,28 @@ m.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd)); 3. Run test suite +## Kernel Size Analysis + +Analyze CUDA kernel sizes in compiled wheel files to identify optimization opportunities: + +```bash +# Install cubloaty +pip install cubloaty + +# Analyze a wheel file +python analyze_whl_kernel_sizes.py path/to/sgl_kernel-*.whl + +# Custom output file +python analyze_whl_kernel_sizes.py path/to/sgl_kernel-*.whl --output my_analysis.txt +``` + +The tool generates: +- Text report with kernel groups (by name prefix) and individual kernel sizes +- JSON file with detailed structured data +- Timing information for each analysis step + +Use this to identify large kernels and potential template instantiation bloat. + ## FAQ - Q: Segmentation fault with CUDA 12.6 - A: Update ptxas to 12.8, reference: [segment fault error](https://github.com/Dao-AILab/flash-attention/issues/1453) diff --git a/sgl-kernel/analyze_whl_kernel_sizes.py b/sgl-kernel/analyze_whl_kernel_sizes.py new file mode 100644 index 000000000000..a45322e05c7c --- /dev/null +++ b/sgl-kernel/analyze_whl_kernel_sizes.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import shutil +import subprocess +import sys +import tempfile +import time +import zipfile +from pathlib import Path + + +def extract_whl(whl_file, extract_dir): + with zipfile.ZipFile(whl_file, "r") as zip_ref: + zip_ref.extractall(extract_dir) + + +def find_binary_files(extract_dir): + binary_files = [] + extract_path = Path(extract_dir) + + for so_file in extract_path.rglob("*.so"): + binary_files.append(str(so_file)) + + for cubin_file in extract_path.rglob("*.cubin"): + binary_files.append(str(cubin_file)) + + return sorted(binary_files) + + +def run_cubloaty(binary_file): + result = subprocess.run( + ["cubloaty", binary_file, "--format", "json"], + capture_output=True, + text=True, + timeout=60, + ) + + if result.returncode != 0: + if ( + "No CUDA binary sections found" in result.stderr + or "does not contain device code" in result.stderr + ): + return {} + raise subprocess.CalledProcessError( + result.returncode, result.args, result.stdout, result.stderr + ) + + return json.loads(result.stdout) + + +def analyze_whl(whl_file): + temp_dir = tempfile.mkdtemp(prefix="sgl_kernel_analysis_") + + try: + t0 = time.time() + print(f"Extracting {whl_file}...") + extract_whl(whl_file, temp_dir) + print(f" Extraction took {time.time() - t0:.2f}s\n") + + t0 = time.time() + binary_files = find_binary_files(temp_dir) + if not binary_files: + print(f"No .so or .cubin files found in {whl_file}") + return [] + + print( + f"Found {len(binary_files)} binary files (took {time.time() - t0:.2f}s)\n" + ) + + all_kernels = [] + total_analyzed = 0 + total_skipped = 0 + + for binary_file in binary_files: + file_name = os.path.basename(binary_file) + t0 = time.time() + print(f"Analyzing {file_name}...", end=" ", flush=True) + + data = run_cubloaty(binary_file) + elapsed = time.time() - t0 + + if not data or "kernels" not in data: + print(f"skipped (no CUDA code, {elapsed:.2f}s)") + total_skipped += 1 + continue + + kernel_count = 0 + for kernel in data["kernels"]: + all_kernels.append( + { + "file": file_name, + "name": kernel.get("name", "unknown"), + "size": kernel.get("size", 0), + "size_kb": kernel.get("size", 0) / 1024, + "size_mb": kernel.get("size", 0) / 1024 / 1024, + } + ) + kernel_count += 1 + + print(f"found {kernel_count} kernels ({elapsed:.2f}s)") + total_analyzed += 1 + + print( + f"\nSummary: {total_analyzed} files analyzed, {total_skipped} files skipped\n" + ) + return all_kernels + + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + +def extract_kernel_prefix(kernel_name): + if "<" in kernel_name: + return kernel_name.split("<")[0] + return kernel_name + + +def generate_report(all_kernels, output_file): + if not all_kernels: + print("No kernels found") + return + + t0 = time.time() + print("Generating report...") + + sorted_kernels = sorted(all_kernels, key=lambda x: x["size"], reverse=True) + total_size = sum(k["size"] for k in all_kernels) + total_size_mb = total_size / 1024 / 1024 + + # Group by kernel prefix + from collections import defaultdict + + kernel_groups = defaultdict(lambda: {"size": 0, "count": 0}) + for kernel in all_kernels: + prefix = extract_kernel_prefix(kernel["name"]) + kernel_groups[prefix]["size"] += kernel["size"] + kernel_groups[prefix]["count"] += 1 + + sorted_groups = sorted( + kernel_groups.items(), key=lambda x: x[1]["size"], reverse=True + ) + + lines = [] + lines.append("=" * 140) + lines.append("CUDA Kernel Size Analysis") + lines.append("=" * 140) + lines.append("") + lines.append(f"Total kernels: {len(all_kernels)}") + lines.append(f"Total size: {total_size_mb:.2f} MB ({total_size:,} bytes)") + lines.append(f"Average kernel size: {total_size / len(all_kernels) / 1024:.2f} KB") + lines.append("") + + # Grouped by kernel name prefix + lines.append("=" * 140) + lines.append("Kernel Groups (by name prefix)") + lines.append("=" * 140) + lines.append( + f"{'Rank':<6} {'Kernel Prefix':<80} {'Count':<8} {'Total (MB)':<12} {'%':<8}" + ) + lines.append("-" * 140) + + for i, (prefix, stats) in enumerate(sorted_groups, 1): + percentage = (stats["size"] / total_size * 100) if total_size > 0 else 0 + size_mb = stats["size"] / 1024 / 1024 + + display_prefix = prefix + if len(display_prefix) > 77: + display_prefix = display_prefix[:74] + "..." + + lines.append( + f"{i:<6} {display_prefix:<80} {stats['count']:<8} {size_mb:<12.2f} {percentage:<8.2f}" + ) + + lines.append("") + lines.append("=" * 140) + lines.append("Individual Kernels (sorted by size)") + lines.append("=" * 140) + lines.append( + f"{'Rank':<6} {'File':<40} {'Kernel Name':<70} {'Size (KB)':<12} {'Size (MB)':<12} {'%':<8}" + ) + lines.append("-" * 140) + + for i, kernel in enumerate(sorted_kernels, 1): + percentage = (kernel["size"] / total_size * 100) if total_size > 0 else 0 + kernel_name = kernel["name"] + if len(kernel_name) > 67: + kernel_name = kernel_name[:64] + "..." + + file_name = kernel["file"] + if len(file_name) > 37: + file_name = file_name[:34] + "..." + + lines.append( + f"{i:<6} {file_name:<40} {kernel_name:<70} " + f"{kernel['size_kb']:<12.2f} {kernel['size_mb']:<12.4f} {percentage:<8.2f}" + ) + + report_text = "\n".join(lines) + + with open(output_file, "w") as f: + f.write(report_text) + print(f"Report saved to: {output_file}") + + json_output = output_file.replace(".txt", ".json") + with open(json_output, "w") as f: + json.dump( + { + "total_kernels": len(all_kernels), + "total_size_bytes": total_size, + "total_size_mb": total_size_mb, + "kernel_groups": [ + { + "prefix": prefix, + "count": stats["count"], + "size_bytes": stats["size"], + "size_mb": stats["size"] / 1024 / 1024, + "percentage": ( + (stats["size"] / total_size * 100) if total_size > 0 else 0 + ), + } + for prefix, stats in sorted_groups + ], + "kernels": sorted_kernels, + }, + f, + indent=2, + ) + print(f"JSON data saved to: {json_output}") + print(f"Report generation took {time.time() - t0:.2f}s") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze CUDA kernel sizes in sgl-kernel whl file" + ) + parser.add_argument("whl", type=str, help="Path to whl file") + parser.add_argument( + "--output", type=str, default="kernel_analysis.txt", help="Output report file" + ) + args = parser.parse_args() + + if not os.path.exists(args.whl): + print(f"Error: {args.whl} not found") + sys.exit(1) + + total_start = time.time() + print(f"Analyzing {args.whl}\n") + all_kernels = analyze_whl(args.whl) + + if all_kernels: + generate_report(all_kernels, args.output) + print(f"\nTotal time: {time.time() - total_start:.2f}s") + else: + print("No kernel information extracted") + + +if __name__ == "__main__": + main() From 57fa71a5c0348f0dfb68b9e9d7fc9b3a813498cc Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Sun, 7 Dec 2025 15:21:59 +0800 Subject: [PATCH 4/5] Make analyze_whl_kernel_sizes.py executable --- sgl-kernel/analyze_whl_kernel_sizes.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 sgl-kernel/analyze_whl_kernel_sizes.py diff --git a/sgl-kernel/analyze_whl_kernel_sizes.py b/sgl-kernel/analyze_whl_kernel_sizes.py old mode 100644 new mode 100755 From f975e99da194a66b91553b0644dcb4003e550389 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Sun, 7 Dec 2025 15:25:11 +0800 Subject: [PATCH 5/5] ud --- sgl-kernel/analyze_whl_kernel_sizes.py | 2 -- 1 file changed, 2 deletions(-) mode change 100755 => 100644 sgl-kernel/analyze_whl_kernel_sizes.py diff --git a/sgl-kernel/analyze_whl_kernel_sizes.py b/sgl-kernel/analyze_whl_kernel_sizes.py old mode 100755 new mode 100644 index a45322e05c7c..f845c81c9a18 --- a/sgl-kernel/analyze_whl_kernel_sizes.py +++ b/sgl-kernel/analyze_whl_kernel_sizes.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import argparse import json import os