diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce.cu b/sgl-kernel/csrc/allreduce/custom_all_reduce.cu
index c04481ea6b4..a1f5ed53e23 100644
--- a/sgl-kernel/csrc/allreduce/custom_all_reduce.cu
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce.cu
@@ -18,11 +18,11 @@ init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs, torch::Tensor& rank_dat
   if (world_size % 2 != 0) throw std::invalid_argument("Odd num gpus is not supported for now");
   if (rank < 0 || rank >= world_size) throw std::invalid_argument("invalid rank passed in");
 
-  vllm::Signal* ipc_ptrs[8];
+  sglang::Signal* ipc_ptrs[8];
   for (int i = 0; i < world_size; i++) {
-    ipc_ptrs[i] = reinterpret_cast<vllm::Signal*>(fake_ipc_ptrs[i]);
+    ipc_ptrs[i] = reinterpret_cast<sglang::Signal*>(fake_ipc_ptrs[i]);
   }
-  return (fptr_t) new vllm::CustomAllreduce(
+  return (fptr_t) new sglang::CustomAllreduce(
       ipc_ptrs, rank_data.data_ptr(), rank_data.numel(), rank, world_size, full_nvlink);
 }
 
@@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
  * copied into _reg_buffer.
  */
 void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
   auto stream = c10::cuda::getCurrentCUDAStream().stream();
 
@@ -98,15 +98,15 @@ void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, fptr_t _reg_
 }
 
 void dispose(fptr_t _fa) {
-  delete reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  delete reinterpret_cast<sglang::CustomAllreduce*>(_fa);
 }
 
 int64_t meta_size() {
-  return sizeof(vllm::Signal);
+  return sizeof(sglang::Signal);
 }
 
 void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_);
   void* ipc_ptrs[8];
   for (int i = 0; i < fake_ipc_ptrs.size(); i++) {
@@ -117,7 +117,7 @@ void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs) {
 
 // Use vector to represent byte data for python binding compatibility.
 std::tuple<std::vector<int64_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   auto [handle, offsets] = fa->get_graph_buffer_ipc_meta();
   std::vector<int64_t> bytes(handle.begin(), handle.end());
   return std::make_tuple(bytes, offsets);
@@ -126,7 +126,7 @@ std::tuple<std::vector<int64_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta
 // Use vector to represent byte data for python binding compatibility.
 void register_graph_buffers(
     fptr_t _fa, const std::vector<std::vector<int64_t>>& handles, const std::vector<std::vector<int64_t>>& offsets) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   std::vector<std::string> bytes;
   bytes.reserve(handles.size());
   for (int i = 0; i < handles.size(); i++) {
diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh b/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh
index 18468df1632..ec223bdebcc 100644
--- a/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh
@@ -15,7 +15,7 @@
 
 #include "utils.h"
 
-namespace vllm {
+namespace sglang {
 
 constexpr int kMaxBlocks = 36;
 // Counter may overflow, but it's fine since unsigned int overflow is
@@ -483,7 +483,7 @@ class CustomAllreduce {
 /**
  * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
  a template instantiation:
- * template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
+ * template void sglang::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
  half *, int, int, int);
  */
-}  // namespace vllm
+}  // namespace sglang
diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce.hip b/sgl-kernel/csrc/allreduce/custom_all_reduce.hip
index 6c1ef0d0682..7a1a586d40a 100644
--- a/sgl-kernel/csrc/allreduce/custom_all_reduce.hip
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce.hip
@@ -29,8 +29,8 @@ fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
   for (int i = 0; i < world_size; i++) {
     std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(hipIpcMemHandle_t));
   }
-  return (fptr_t) new vllm::CustomAllreduce(
-      reinterpret_cast<vllm::Signal*>(meta.data_ptr()), rank_data.data_ptr(),
+  return (fptr_t) new sglang::CustomAllreduce(
+      reinterpret_cast<sglang::Signal*>(meta.data_ptr()), rank_data.data_ptr(),
       rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
 }
 
@@ -58,7 +58,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
 
 void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                  hipStream_t stream) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   TORCH_CHECK(_is_weak_contiguous(out));
   switch (out.scalar_type()) {
     case at::ScalarType::Float: {
@@ -110,22 +110,22 @@ void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
 }
 
 void dispose(fptr_t _fa) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   delete fa;
 }
 
-int64_t meta_size() { return sizeof(vllm::Signal); }
+int64_t meta_size() { return sizeof(sglang::Signal); }
 
 void register_buffer(fptr_t _fa, torch::Tensor& t,
                      const std::vector<std::string>& handles,
                      const std::vector<int64_t>& offsets) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   fa->register_buffer(handles, offsets, t.data_ptr());
 }
 
 std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
     fptr_t _fa) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
   auto options =
       torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
@@ -137,7 +137,7 @@ std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
 
 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                             const std::vector<std::vector<int64_t>>& offsets) {
-  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
   fa->register_graph_buffers(handles, offsets);
 }
 
diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce_hip.cuh b/sgl-kernel/csrc/allreduce/custom_all_reduce_hip.cuh
index 7baf5f01ef4..ff4d28d29fc 100644
--- a/sgl-kernel/csrc/allreduce/custom_all_reduce_hip.cuh
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce_hip.cuh
@@ -26,7 +26,7 @@ typedef __hip_bfloat16 nv_bfloat16;
     }                                                               \
   } while (0)
 
-namespace vllm {
+namespace sglang {
 
 constexpr int kMaxBlocks = 64;
 // note: we don't want to use atomics for signals because peer atomics are no
@@ -572,11 +572,11 @@ class CustomAllreduce {
       CUDACHECK(hipIpcCloseMemHandle(ptr));
     }
   }
-};  // namespace vllm
+};  // namespace sglang
 /**
  * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
  a template instantiation:
- * template void vllm::CustomAllreduce::allreduce<half>(hipStream_t, half *,
+ * template void sglang::CustomAllreduce::allreduce<half>(hipStream_t, half *,
  half *, int, int, int);
  */
-}  // namespace vllm
+}  // namespace sglang
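
Note: the header comments touched above describe how to check the generated PTX/SASS: paste custom_all_reduce.cuh into Compiler Explorer and add an explicit template instantiation. A minimal sketch of that step after this rename, using only the half overload named in the comment (the class definition comes from the pasted header):

    // Paste the full contents of custom_all_reduce.cuh above this line, then add
    // the explicit instantiation under the renamed sglang namespace:
    template void sglang::CustomAllreduce::allreduce<half>(cudaStream_t, half *, half *, int, int, int);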