1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -167,6 +167,7 @@ set(VLLM_EXT_SRC
   "csrc/layernorm_kernels.cu"
   "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/fp8_cuda_kernels.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/moe_align_block_size_kernels.cu"
5 changes: 5 additions & 0 deletions csrc/ops.h
@@ -174,6 +174,11 @@ void dynamic_scaled_fp8_quant(
   torch::Tensor& input,
   torch::Tensor& scale);
 
+void quant_per_tensor(
+  torch::Tensor& out,
+  torch::Tensor& input,
+  float scale);
+
 void moe_align_block_size(
   torch::Tensor topk_ids,
   int num_experts,
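In plain terms, the new declaration commits to out[i] = saturate_int8(round(input[i] / scale)) for every element, with out a preallocated, contiguous int8 tensor of the same shape as input (the kernel added below checks contiguity via assert).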
9 changes: 9 additions & 0 deletions csrc/pybind.cpp
@@ -82,6 +82,15 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     &moe_align_block_size,
     "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size.");
 
+  ops.def(
+    "quant_per_tensor",
+    py::overload_cast<
+      torch::Tensor&,
+      torch::Tensor&,
+      float>(&quant_per_tensor),
+    "Per-tensor Quantization");
+
+
   // Cache ops
   pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
   cache_ops.def(
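For context, a host-side smoke test of the new op could look like the sketch below. This is not part of the PR; it assumes you build against the extension's headers, link the compiled op, and run on a CUDA device.

#include <torch/torch.h>

// Declaration from csrc/ops.h (added in this PR).
void quant_per_tensor(torch::Tensor& out, torch::Tensor& input, float scale);

int main() {
  // Float input and a matching, preallocated int8 output on the GPU.
  torch::Tensor input = torch::randn(
      {4, 128}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
  torch::Tensor out = torch::empty_like(input, torch::kInt8);

  // Each out[i] becomes the saturating round-to-nearest of input[i] / 0.05f.
  quant_per_tensor(out, input, /*scale=*/0.05f);
  return 0;
}

Judging by the def_submodule pattern visible above, the binding is presumably reachable from Python as the quant_per_tensor attribute of the extension's ops submodule.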
50 changes: 50 additions & 0 deletions csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -0,0 +1,50 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include <assert.h>
+
+#include "../../dispatch_utils.h"
+
+static inline __device__ int8_t float_to_int8_rn(float x)
+{
+  uint32_t dst;
+  asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x));
+  return reinterpret_cast<const int8_t&>(dst);
+}
+
+namespace vllm {
+
+template <typename scalar_t, typename scale_type>
+__global__ void quant_kernel(
+  const scalar_t* __restrict__ input,
+  int8_t* __restrict__ out,
+  scale_type scale,
+  const int hidden_size) {
+  const int tid = threadIdx.x;
+  const int token_idx = blockIdx.x;
+
+  for (int i = tid; i < hidden_size; i += blockDim.x) {
+    out[token_idx * hidden_size + i] =
+      float_to_int8_rn(((float)input[token_idx * hidden_size + i]) / scale);
+  }
+}
+} // namespace vllm
+
+void quant_per_tensor(
+  torch::Tensor& out,    // [..., hidden_size]
+  torch::Tensor& input,  // [..., hidden_size]
+  float scale) {
+  assert(input.is_contiguous());
+  assert(out.is_contiguous());
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+  dim3 grid(num_tokens);
+  dim3 block(std::min(hidden_size, 1024));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "quant_kernel", [&] {
+    vllm::quant_kernel<scalar_t, float><<<grid, block, 0, stream>>>(
+      input.data_ptr<scalar_t>(),
+      out.data_ptr<int8_t>(),
+      scale,
+      hidden_size);
+  });
+}
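A note on the conversion and launch: the inline PTX cvt.rni.sat.s8.f32 converts a float to a signed 8-bit integer using round-to-nearest-even (rni) and saturation to [-128, 127] (sat), and the grid puts one block per token with up to 1024 threads striding across hidden_size. A portable host reference of the same mapping, handy for unit tests, might look like this (a sketch; float_to_int8_rn_ref is a name introduced here, not in the PR, and it assumes the default round-to-nearest FP environment):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Mirrors cvt.rni.sat.s8.f32: round to nearest even, then saturate
// to the int8 range before the final cast.
static int8_t float_to_int8_rn_ref(float x) {
  float r = std::nearbyintf(x);                // nearest-even under default rounding
  r = std::min(std::max(r, -128.0f), 127.0f);  // saturate to the int8 range
  return static_cast<int8_t>(r);
}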
2 changes: 2 additions & 0 deletions requirements-cuda.txt
@@ -7,3 +7,5 @@ nvidia-ml-py # for pynvml package
 vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
 torch == 2.3.0
 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
+nvidia-cutlass == 3.5.0
+