Open
Commits: 92 commits (changes shown from 85 commits)
1818b07
update to flashinfer 0.2.5
Edenzzzz May 29, 2025
c82cd7f
Merge branch 'main' into upgrade_act
Edenzzzz May 29, 2025
c6d6ebe
Update rope and bmm args
Edenzzzz May 29, 2025
f2bc110
Merge branch 'main' into upgrade_act
Edenzzzz May 30, 2025
199a7e4
update llama4 chat template and pythonic parser (#6679)
upfixer May 31, 2025
dd79f42
feat(tool call): Enhance Llama32Detector for improved JSON parsing in…
CatherineSue May 31, 2025
d143c1e
Support token-level quantization for EP MoE (#6782)
ch-wan May 31, 2025
e13c073
Temporarily lower mmlu threshold for triton sliding window backend (#…
NorthmanPKU May 31, 2025
e622cca
ci: relax test_function_call_required (#6786)
CatherineSue May 31, 2025
51dadba
Add intel_amx backend for Radix Attention for CPU (#6408)
yanbing-j May 31, 2025
813e7f6
Fix incorrect LoRA weight loading for fused gate_up_proj (#6734)
lifuhuang May 31, 2025
ec8a3c9
fix(PD-disaggregation): Can not get local ip (#6792)
storyicon May 31, 2025
4eb04d5
[FIX] mmmu bench serving result display error (#6525) (#6791)
Arist12 May 31, 2025
a5abf60
Bump torch to 2.7.0 (#6788)
Qiaolin-Yu May 31, 2025
3e1d645
chore: bump sgl-kernel v0.1.5 (#6794)
zhyncs May 31, 2025
5403644
Improve profiler and integrate profiler in bench_one_batch_server (#6…
merrymercy May 31, 2025
a6774a3
chore: upgrade sgl-kernel v0.1.5 (#6795)
zhyncs Jun 1, 2025
9eb5162
[Minor] Always append newline after image token when parsing chat mes…
lifuhuang Jun 1, 2025
fc8b63b
Update CI tests for Llama4 models (#6421)
ravi03071991 Jun 1, 2025
0c42a31
[Feat] Enable PDL automatically on Hopper architecture (#5981)
PopSoda2002 Jun 1, 2025
6fd2dae
chore: update blackwell docker (#6800)
zhyncs Jun 1, 2025
814601b
misc: cache is_hopper_arch (#6799)
Edenzzzz Jun 1, 2025
2431560
set enable_pdl
Edenzzzz Jun 2, 2025
ce6fb05
Merge branch 'main' into upgrade_act
Edenzzzz Jun 2, 2025
68062c5
fix
Edenzzzz Jun 2, 2025
ec620df
Merge branch 'main' into upgrade_act
Edenzzzz Jun 2, 2025
866cfcf
Merge branch 'main' into upgrade_act
FlamingoPg Jun 4, 2025
6ba7d17
Merge branch 'main' into upgrade_act
Edenzzzz Jun 4, 2025
6ee99b8
Merge branch 'main' into upgrade_act
Edenzzzz Jun 6, 2025
0904613
fix args
Edenzzzz Jun 7, 2025
224d74e
fix
Edenzzzz Jun 7, 2025
6f8195e
fix
Edenzzzz Jun 7, 2025
b22d672
Merge branch 'main' into upgrade_act
Edenzzzz Jun 7, 2025
9891438
Merge branch 'main' into upgrade_act
Edenzzzz Jun 7, 2025
9d1239d
Merge branch 'main' into upgrade_act
Edenzzzz Jun 7, 2025
8c6295e
Merge branch 'main' into upgrade_act
Edenzzzz Jun 7, 2025
81c6fa1
fix dtype
Edenzzzz Jun 8, 2025
123b6ba
support blackwell
Edenzzzz Jun 8, 2025
13f81a2
Merge branch 'main' into upgrade_act
Edenzzzz Jun 9, 2025
cfe7732
Merge branch 'main' into upgrade_act
Edenzzzz Jun 9, 2025
d56f8ff
Merge branch 'main' into upgrade_act
Fridge003 Jun 9, 2025
185223e
Merge branch 'main' into upgrade_act
Fridge003 Jun 11, 2025
7d0af1c
Merge branch 'main' into upgrade_act
Edenzzzz Jun 12, 2025
67eae34
Merge branch 'main' into upgrade_act
Edenzzzz Jun 21, 2025
79f4146
Merge branch 'main' into upgrade_act
zhyncs Jun 21, 2025
182f046
Merge branch 'main' into upgrade_act
Edenzzzz Jun 21, 2025
9b116df
Merge branch 'main' into upgrade_act
Fridge003 Jul 1, 2025
efc6b14
Merge branch 'main' into upgrade_act
Fridge003 Jul 13, 2025
6158d77
Merge branch 'main' into upgrade_act
fzyzcjy Jul 15, 2025
20ab53d
Merge branch 'main' into upgrade_act
Fridge003 Jul 20, 2025
3cd9e1f
Merge main
Edenzzzz Jul 26, 2025
17700c7
fix
Edenzzzz Jul 26, 2025
12cc9e2
Merge branch 'main' into upgrade_act
Edenzzzz Jul 27, 2025
547f8fd
Merge branch 'main' into upgrade_act
Edenzzzz Jul 28, 2025
0ab683c
Merge branch 'main' into upgrade_act
Edenzzzz Jul 30, 2025
f958ca9
Merge branch 'main' into upgrade_act
Edenzzzz Jul 31, 2025
b65176d
fix
Edenzzzz Jul 31, 2025
a74ab9e
Update sgl-kernel/python/sgl_kernel/elementwise.py
Edenzzzz Jul 31, 2025
cf46798
Merge branch 'main' into upgrade_act
Edenzzzz Jul 31, 2025
075c7f6
fix
Edenzzzz Aug 1, 2025
334d797
Merge branch 'main' into upgrade_act
Edenzzzz Aug 3, 2025
044e24b
Merge branch 'main' into upgrade_act
Edenzzzz Aug 12, 2025
4b5835f
Merge main
Edenzzzz Sep 8, 2025
2ea1ef4
fix
Edenzzzz Sep 10, 2025
acbfda0
Merge branch 'main' into upgrade_act
Edenzzzz Sep 10, 2025
4eac47a
Merge branch 'main' into upgrade_act
Edenzzzz Sep 15, 2025
7d9f812
Merge branch 'main' into upgrade_act
Fridge003 Sep 15, 2025
dad9d0c
Merge branch 'main' into upgrade_act
Edenzzzz Sep 16, 2025
bae595f
fix
Edenzzzz Sep 16, 2025
b3e29e4
fix
Edenzzzz Sep 16, 2025
aeca0d0
Merge branch 'main' into upgrade_act
Edenzzzz Sep 16, 2025
3bf8ef3
fix
Edenzzzz Sep 16, 2025
ebb587f
Merge branch 'main' into upgrade_act
FlamingoPg Sep 18, 2025
ee883be
Merge branch 'main' into upgrade_act
Edenzzzz Sep 19, 2025
9fc56c2
Merge branch 'main' into upgrade_act
Edenzzzz Oct 25, 2025
2cc892a
Merge branch 'main' into upgrade_act
Fridge003 Oct 28, 2025
eda5473
Merge branch 'main' into upgrade_act
Edenzzzz Oct 28, 2025
124d7f8
Merge branch 'main' into upgrade_act
Edenzzzz Oct 28, 2025
ff1d2d2
Merge branch 'main' into upgrade_act
Fridge003 Oct 29, 2025
232dfe7
Merge branch 'main' into upgrade_act
Edenzzzz Oct 30, 2025
a3bc9dc
Merge branch 'main' into upgrade_act
Edenzzzz Oct 30, 2025
440e351
Merge branch 'main' into upgrade_act
Fridge003 Oct 30, 2025
9bda166
Merge branch 'main' into upgrade_act
Fridge003 Oct 31, 2025
c5b257a
Merge branch 'main' into upgrade_act
Edenzzzz Oct 31, 2025
59bbe7e
Merge branch 'main' into upgrade_act
Fridge003 Nov 1, 2025
5235888
Merge branch 'main' into upgrade_act
Edenzzzz Nov 4, 2025
5ff976c
Merge branch 'main' into upgrade_act
Edenzzzz Nov 4, 2025
b41e65d
Merge branch 'main' into upgrade_act
Fridge003 Nov 5, 2025
6bb801f
Merge branch 'main' into upgrade_act
Fridge003 Nov 5, 2025
0478776
Merge branch 'main' into upgrade_act
Fridge003 Nov 8, 2025
0f14bf6
try device_guard earlier
Edenzzzz Nov 9, 2025
f2c4eb5
Merge branch 'main' into upgrade_act
Fridge003 Nov 11, 2025
10 changes: 5 additions & 5 deletions sgl-kernel/csrc/common_extension.cc
@@ -79,18 +79,18 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
m.def("gemma_fused_add_rmsnorm(Tensor! input, Tensor! residual, Tensor weight, float eps, bool enable_pdl) -> ()");
m.impl("gemma_fused_add_rmsnorm", torch::kCUDA, &gemma_fused_add_rmsnorm);

m.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
m.def("silu_and_mul(Tensor! out, Tensor input, bool enable_pdl) -> ()");
m.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);

m.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
m.def("gelu_tanh_and_mul(Tensor! out, Tensor input, bool enable_pdl) -> ()");
m.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul);

m.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
m.def("gelu_and_mul(Tensor! out, Tensor input, bool enable_pdl) -> ()");
m.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);

m.def(
"apply_rope_pos_ids_cos_sin_cache(Tensor q, Tensor k, Tensor! q_rope, Tensor! k_rope, Tensor cos_sin_cache, "
"Tensor pos_ids, bool interleave, bool enable_pdl, int cuda_stream, "
"Tensor pos_ids, bool interleave, bool enable_pdl, "
"Tensor? v, Tensor!? k_buffer, Tensor!? v_buffer, Tensor? kv_cache_loc) -> ()");
m.impl("apply_rope_pos_ids_cos_sin_cache", torch::kCUDA, &apply_rope_pos_ids_cos_sin_cache);

@@ -401,7 +401,7 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
*/
m.def(
"bmm_fp8(Tensor A, Tensor B, Tensor! D, Tensor A_scale, Tensor B_scale, Tensor workspace_buffer, int "
"cublas_handle, int cuda_stream) -> ()",
"cublas_handle) -> ()",
{at::Tag::needs_fixed_stride_order});
m.impl("bmm_fp8", torch::kCUDA, &bmm_fp8);

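The schema strings registered here must stay in sync with the C++ declarations in sgl-kernel/include/sgl_kernel_ops.h (updated below), which is why both files change together. A minimal, self-contained sketch of the same registration pattern, using illustrative names rather than anything from sgl-kernel:

```cpp
// Minimal sketch of the registration pattern above (toy_ns and toy_silu_and_mul
// are illustrative names, not part of sgl-kernel): the schema string in m.def()
// and the C++ signature bound via m.impl() must agree, so adding `bool enable_pdl`
// to the def requires the matching parameter on the implementation.
#include <ATen/ATen.h>
#include <torch/library.h>

void toy_silu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl) {
  // A real implementation would launch the fused activation kernel here;
  // enable_pdl would select programmatic dependent launch on supported GPUs.
  (void)out;
  (void)input;
  (void)enable_pdl;
}

TORCH_LIBRARY_FRAGMENT(toy_ns, m) {
  m.def("toy_silu_and_mul(Tensor! out, Tensor input, bool enable_pdl) -> ()");
  m.impl("toy_silu_and_mul", torch::kCUDA, &toy_silu_and_mul);
}
```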
6 changes: 3 additions & 3 deletions sgl-kernel/csrc/common_extension_rocm.cc
@@ -22,13 +22,13 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
/*
* From csrc/activation
*/
m.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
m.def("silu_and_mul(Tensor! out, Tensor input, bool enable_pdl) -> ()");
m.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);

m.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
m.def("gelu_tanh_and_mul(Tensor! out, Tensor input, bool enable_pdl) -> ()");
m.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul);

m.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
m.def("gelu_and_mul(Tensor! out, Tensor input, bool enable_pdl) -> ()");
m.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);

m.def("gelu_quick(Tensor! out, Tensor input) -> ()");
96 changes: 69 additions & 27 deletions sgl-kernel/csrc/elementwise/activation.cu
@@ -82,67 +82,109 @@ __device__ __forceinline__ T gelu_tanh(const T& x) {
return detail::from_f32<T>(f32_val * cdf);
}

void silu_and_mul(at::Tensor& out, at::Tensor& input) {
void silu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl) {
int d = input.size(-1) / 2;
int64_t num_tokens = input.numel() / input.size(-1);
dim3 grid(num_tokens);

const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));

auto stream = at::cuda::getCurrentCUDAStream();
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
uint32_t vec_size = 16 / sizeof(c_type);
dim3 block(std::min(d / vec_size, 1024U));
#if USE_ROCM
dim3 grid(num_tokens);
dim3 block(std::min(d / vec_size, 1024U));
sgl_hip::activation::act_and_mul_kernel<c_type, silu>
<<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
#else
flashinfer::activation::act_and_mul_kernel<c_type, silu>
<<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
const c10::cuda::OptionalCUDAGuard device_guard(device_of(input));
cudaLaunchConfig_t config;
config.gridDim = num_tokens;
config.blockDim = std::min(d / vec_size, 1024U);
config.dynamicSmemBytes = 0;
config.stream = stream;
cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;

auto kernel = flashinfer::activation::act_and_mul_kernel<c_type, silu>;
cudaLaunchKernelEx(
&config, kernel, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);

cudaError_t err = cudaGetLastError();
TORCH_CHECK(err == cudaSuccess, "Failed to launch kernel: ", cudaGetErrorString(err));
#endif
return true;
});
}

void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input) {
void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl) {
int d = input.size(-1) / 2;
int64_t num_tokens = input.numel() / input.size(-1);
dim3 grid(num_tokens);

const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));

auto stream = at::cuda::getCurrentCUDAStream();
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
uint32_t vec_size = 16 / sizeof(c_type);
dim3 block(std::min(d / vec_size, 1024U));
#if USE_ROCM
dim3 grid(num_tokens);
dim3 block(std::min(d / vec_size, 1024U));
sgl_hip::activation::act_and_mul_kernel<c_type, gelu_tanh>
<<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
#else
flashinfer::activation::act_and_mul_kernel<c_type, gelu_tanh>
<<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
const c10::cuda::OptionalCUDAGuard device_guard(device_of(input));
cudaLaunchConfig_t config;
config.gridDim = num_tokens;
config.blockDim = std::min(d / vec_size, 1024U);
config.dynamicSmemBytes = 0;
config.stream = stream;
cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;

auto kernel = flashinfer::activation::act_and_mul_kernel<c_type, gelu_tanh>;

cudaLaunchKernelEx(
&config, kernel, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);

cudaError_t err = cudaGetLastError();
TORCH_CHECK(err == cudaSuccess, "Failed to launch kernel: ", cudaGetErrorString(err));
#endif

return true;
});
}

void gelu_and_mul(at::Tensor& out, at::Tensor& input) {
void gelu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl) {
int d = input.size(-1) / 2;
int64_t num_tokens = input.numel() / input.size(-1);
dim3 grid(num_tokens);

const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));

auto stream = at::cuda::getCurrentCUDAStream();
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
uint32_t vec_size = 16 / sizeof(c_type);
dim3 block(std::min(d / vec_size, 1024U));
#if USE_ROCM
dim3 grid(num_tokens);
dim3 block(std::min(d / vec_size, 1024U));
sgl_hip::activation::act_and_mul_kernel<c_type, gelu>
<<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
#else
flashinfer::activation::act_and_mul_kernel<c_type, gelu>
<<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
const c10::cuda::OptionalCUDAGuard device_guard(device_of(input));
cudaLaunchConfig_t config;
config.gridDim = num_tokens;
config.blockDim = std::min(d / vec_size, 1024U);
config.dynamicSmemBytes = 0;
config.stream = stream;
cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;

auto kernel = flashinfer::activation::act_and_mul_kernel<c_type, gelu>;

cudaLaunchKernelEx(
&config, kernel, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);

cudaError_t err = cudaGetLastError();
TORCH_CHECK(err == cudaSuccess, "Failed to launch kernel: ", cudaGetErrorString(err));
#endif

return true;
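For readers unfamiliar with programmatic dependent launch (PDL): the CUDA path above replaces the `<<<...>>>` launch with cudaLaunchKernelEx so that cudaLaunchAttributeProgrammaticStreamSerialization can be toggled per call via enable_pdl. Below is a standalone sketch of that launch pattern with an illustrative kernel (not sgl-kernel code). Fully benefiting from PDL also requires the participating kernels to use the device-side dependency primitives (cudaGridDependencySynchronize / cudaTriggerProgrammaticLaunchCompletion); whether a given kernel does is up to its implementation.

```cuda
// Standalone sketch of the cudaLaunchKernelEx + PDL-attribute pattern used
// above (illustrative kernel, not sgl-kernel code). With the attribute enabled,
// this kernel may begin its prologue before the preceding kernel on the same
// stream has fully finished (Hopper and newer GPUs).
#include <cuda_runtime.h>

__global__ void scale_kernel(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

cudaError_t launch_scale_with_pdl(float* data, float factor, int n,
                                  cudaStream_t stream, bool enable_pdl) {
  cudaLaunchConfig_t config = {};
  config.gridDim = dim3((n + 255) / 256);
  config.blockDim = dim3(256);
  config.dynamicSmemBytes = 0;
  config.stream = stream;

  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.attrs = attrs;
  config.numAttrs = 1;

  cudaError_t err = cudaLaunchKernelEx(&config, scale_kernel, data, factor, n);
  if (err != cudaSuccess) return err;
  return cudaGetLastError();
}
```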
5 changes: 2 additions & 3 deletions sgl-kernel/csrc/elementwise/rope.cu
@@ -18,7 +18,6 @@
#include "pytorch_extension_utils.h"

using namespace flashinfer;

void apply_rope_pos_ids_cos_sin_cache(
at::Tensor q,
at::Tensor k,
@@ -28,7 +27,6 @@ void apply_rope_pos_ids_cos_sin_cache(
at::Tensor pos_ids,
bool interleave,
bool enable_pdl,
int64_t cuda_stream,
const std::optional<at::Tensor>& v,
const std::optional<at::Tensor>& k_buffer,
const std::optional<at::Tensor>& v_buffer,
@@ -88,7 +86,8 @@ void apply_rope_pos_ids_cos_sin_cache(
size_t k_rope_stride_n = k_rope.stride(0);
size_t k_rope_stride_h = k_rope.stride(1);

cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
const c10::cuda::OptionalCUDAGuard device_guard(q.device());
auto stream = at::cuda::getCurrentCUDAStream();
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(q.scalar_type(), c_type, [&] {
// TODO temporarily only use `BatchQKApplyRotaryPosIdsCosSinCacheEnhanced` when save_kv_cache
// to avoid changing original code path; but this branch is feature-complete and should switch to this later
6 changes: 3 additions & 3 deletions sgl-kernel/csrc/gemm/bmm_fp8.cu
@@ -27,8 +27,7 @@ void bmm_fp8(
at::Tensor A_scale,
at::Tensor B_scale,
at::Tensor workspace_buffer,
int64_t cublas_handle,
int64_t cuda_stream) {
int64_t cublas_handle) {
TORCH_CHECK(A.is_cuda(), "A must be a CUDA tensor");
TORCH_CHECK(B.is_cuda(), "B must be a CUDA tensor");
TORCH_CHECK(D.is_cuda(), "D must be a CUDA tensor");
@@ -51,7 +50,8 @@
auto n = B.size(2);

auto lt_handle = reinterpret_cast<cublasLtHandle_t>(cublas_handle);
auto stream = reinterpret_cast<cudaStream_t>(cuda_stream);
const c10::cuda::OptionalCUDAGuard device_guard(A.device());
auto stream = at::cuda::getCurrentCUDAStream();

auto status = flashinfer::bmm_fp8::bmm_fp8_internal_cublaslt(
workspace_buffer.data_ptr(),
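Both rope.cu and bmm_fp8.cu drop the int64_t cuda_stream plumbing (and the reinterpret_cast of it) in favor of pinning the device from a tensor argument and reading the current PyTorch stream. A small host-side sketch of that pattern; the function name is illustrative:

```cpp
// Sketch of the stream/device handling now used in rope.cu and bmm_fp8.cu:
// pin the device from an input tensor, then take the current PyTorch CUDA
// stream instead of accepting a raw int64_t stream handle from the caller.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

void launch_on_current_stream(const at::Tensor& input) {
  const c10::cuda::OptionalCUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // ... launch work on `stream`, which belongs to input's device because the
  // guard above made that device current ...
  (void)stream;
}
```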
11 changes: 4 additions & 7 deletions sgl-kernel/include/sgl_kernel_ops.h
@@ -139,10 +139,9 @@ void sgl_fused_add_rmsnorm(
torch::Tensor input, torch::Tensor residual, torch::Tensor weight, double eps, bool enable_pdl);
void gemma_rmsnorm(at::Tensor& output, at::Tensor& input, at::Tensor& weight, double eps, bool enable_pdl);
void gemma_fused_add_rmsnorm(at::Tensor& input, at::Tensor& residual, at::Tensor& weight, double eps, bool enable_pdl);
void silu_and_mul(at::Tensor& out, at::Tensor& input);
void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input);
void gelu_and_mul(at::Tensor& out, at::Tensor& input);

void silu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl);
void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl);
void gelu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl);
void apply_rope_pos_ids_cos_sin_cache(
at::Tensor q,
at::Tensor k,
@@ -152,7 +151,6 @@ void apply_rope_pos_ids_cos_sin_cache(
at::Tensor pos_ids,
bool interleave,
bool enable_pdl,
int64_t cuda_stream,
const std::optional<at::Tensor>& v,
const std::optional<at::Tensor>& k_buffer,
const std::optional<at::Tensor>& v_buffer,
@@ -253,8 +251,7 @@ void bmm_fp8(
at::Tensor A_scale,
at::Tensor B_scale,
at::Tensor workspace_buffer,
int64_t cublas_handle,
int64_t cuda_stream);
int64_t cublas_handle);
void dsv3_router_gemm(torch::Tensor& output, const torch::Tensor& mat_a, const torch::Tensor& mat_b);
void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a, torch::Tensor const& mat_b);

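Net effect for callers of these declarations: the fused activation entry points now take an explicit enable_pdl flag, and the stream argument is gone from apply_rope_pos_ids_cos_sin_cache and bmm_fp8. A hedged caller-side sketch against the updated silu_and_mul signature; treating the halved last dimension as the output shape is my inference from `d = input.size(-1) / 2` in activation.cu, not documented behavior:

```cpp
// Caller-side sketch against the updated declaration
//   void silu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl);
// The halved last dimension mirrors `d = input.size(-1) / 2` in activation.cu.
#include <ATen/ATen.h>
#include "sgl_kernel_ops.h"

at::Tensor silu_and_mul_wrapper(at::Tensor input, bool enable_pdl) {
  auto sizes = input.sizes().vec();
  sizes.back() /= 2;                     // fused gate/up layout: output is half the last dim
  at::Tensor out = at::empty(sizes, input.options());
  silu_and_mul(out, input, enable_pdl);  // enable_pdl is now an explicit argument
  return out;
}
```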