diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4cb48c119e6c..ebea46103699 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -167,9 +167,6 @@ set(VLLM_EXT_SRC
   "csrc/layernorm_kernels.cu"
   "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.cc"
-  "csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.cc"
-  "csrc/quantization/smoothquant/int8gemm/cuda_utils.cc"
   "csrc/quantization/smoothquant/fused_kernels.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/moe_align_block_size_kernels.cu"
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
index 8c4fbdaed105..983455878023 100644
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -1,7 +1,6 @@
 #include "cache.h"
 #include "cuda_utils.h"
 #include "ops.h"
-#include "quantization/smoothquant/int8gemm/int8_gemm.h"
 #include <torch/extension.h>
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -50,21 +49,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     "fused_add_rms_norm",
     &fused_add_rms_norm,
     "In-place fused Add and RMS Normalization");
-  ops.def(
-    "dequant",
-    py::overload_cast<
-    torch::Tensor&,
-    torch::Tensor&,
-    float>(&dequant),
-    "Dequant.");
-  ops.def(
-    "dequant",
-    py::overload_cast<
-    torch::Tensor&,
-    torch::Tensor&,
-    torch::Tensor&,
-    float>(&dequant),
-    "Per-token dequant.");
   ops.def(
     "quant",
     py::overload_cast<
@@ -102,12 +86,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
   ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
   ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
-  pybind11::class_<I8CUGEMM>(ops, "I8CUGEMM")
-    .def(pybind11::init<>())
-    .def("linear_a8_w8_o32", &I8CUGEMM::linear_a8_w8_o32)
-    .def("linear_a8_w8_o8", &I8CUGEMM::linear_a8_w8_o8)
-    .def("linear_a8_w8_o8_", &I8CUGEMM::linear_a8_w8_o8_)
-    .def("linear_a8_w8_o32_", &I8CUGEMM::linear_a8_w8_o32_);
   ops.def(
     "moe_align_block_size",
     &moe_align_block_size,
diff --git a/csrc/quantization/smoothquant/fused_kernels.cu b/csrc/quantization/smoothquant/fused_kernels.cu
index 1e9d1acf8f47..1d23a5a0653c 100644
--- a/csrc/quantization/smoothquant/fused_kernels.cu
+++ b/csrc/quantization/smoothquant/fused_kernels.cu
@@ -7,27 +7,6 @@
 #include "quant_utils.cuh"
 
 namespace vllm {
-template <typename scalar_t, bool use_per_token_dequant>
-__global__ void dequant_kernel(
-  const int32_t* __restrict__ input,
-  scalar_t* __restrict__ out,
-  const float scale,
-  const int m,
-  const int hidden_size,
-  const int input_stride,
-  const int out_stride,
-  const float* __restrict__ act_scale = nullptr) {
-  const int tid = threadIdx.x;
-  const int token_idx = blockIdx.x;
-  float scale_ = scale;
-  if constexpr (use_per_token_dequant) {
-    scale_ = scale * act_scale[token_idx];
-  }
-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    out[token_idx * out_stride + i] =
-    (scalar_t)(((float)input[token_idx * input_stride + i]) * scale_);
-  }
-}
 
 template <typename scalar_t, typename scale_type, bool use_per_token_quant>
 __global__ void quant_kernel(
@@ -71,56 +50,6 @@ __global__ void quant_kernel(
 }
 } // namespace vllm
 
-void dequant(
-  torch::Tensor& out,   // [..., hidden_size]
-  torch::Tensor& input, // [..., hidden_size]
-  float scale) {
-  int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-  dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
-  int input_stride = input.stride(-2);
-  int out_stride = out.stride(-2);
-
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(out.scalar_type(), "dequant_kernel", [&] {
-    vllm::dequant_kernel<scalar_t, false><<<grid, block, 0, stream>>>(
-      input.data_ptr<int32_t>(),
-      out.data_ptr<scalar_t>(),
-      scale,
-      num_tokens,
-      hidden_size,
-      input_stride,
-      out_stride);
-  });
-}
-
-void dequant(
-  torch::Tensor& out,   // [..., hidden_size]
-  torch::Tensor& input, // [..., hidden_size]
-  torch::Tensor& scale,
-  float weight_dequant_scale) {
-  int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-  dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
-  int input_stride = input.stride(-2);
-  int out_stride = out.stride(-2);
-
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(out.scalar_type(), "dequant_kernel", [&] {
-    vllm::dequant_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
-      input.data_ptr<int32_t>(),
-      out.data_ptr<scalar_t>(),
-      weight_dequant_scale,
-      num_tokens,
-      hidden_size,
-      input_stride,
-      out_stride,
-      scale.data_ptr<float>());
-  });
-}
-
 void quant(
   torch::Tensor& out,   // [..., hidden_size]
   torch::Tensor& input, // [..., hidden_size]
diff --git a/csrc/quantization/smoothquant/int8gemm/allocator.h b/csrc/quantization/smoothquant/int8gemm/allocator.h
deleted file mode 100644
index 79be2e99e29c..000000000000
--- a/csrc/quantization/smoothquant/int8gemm/allocator.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * Memory Allocator
- **/
-
-#pragma once
-
-#include "cuda_utils.h"
-#include <cuda_runtime.h>
-#include <unordered_map>
-#include <vector>
-
-#if defined(CUDART_VERSION) && CUDART_VERSION < 11020
-#define CUDA_MEMORY_POOL_DISABLED
-#endif
-
-enum class AllocatorType { CUDA, TF, TH };
-
-enum class ReallocType {
-  INCREASE,
-  REUSE,
-  DECREASE,
-};
-
-class IAllocator {
-public:
-  virtual ~IAllocator(){};
-
-  virtual void *malloc(size_t size, const bool is_set_zero = true,
-                       bool is_host = false) = 0;
-  virtual void free(void **ptr, bool is_host = false) const = 0;
-  virtual void setStream(cudaStream_t stream) = 0;
-  virtual cudaStream_t returnStream() = 0;
-  virtual void memSet(void *ptr, const int val, const size_t size) = 0;
-
-  template <typename T>
-  void *reMalloc(T *ptr, size_t size, const bool is_set_zero = true,
-                 bool is_host = false) {
-    // FT_LOG_DEBUG(__PRETTY_FUNCTION__);
-    size = ((size + 31) / 32) * 32; // make the buffer align with 32 bytes
-    void *void_ptr = (void *)ptr;
-    void *ptr_address = getAddress(void_ptr);
-    if (isExist(ptr_address)) {
-      ReallocType realloc_type = isReMalloc(ptr_address, size);
-      if (realloc_type == ReallocType::INCREASE) {
-        // FT_LOG_DEBUG("ReMalloc the buffer %p since it is too small.",
-        // void_ptr);
-        free((void **)(&void_ptr), is_host);
-        return malloc(size, is_set_zero, is_host);
-      }
-#if !defined(CUDA_MEMORY_POOL_DISABLED)
-      else if (realloc_type == ReallocType::DECREASE) {
-        // FT_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to
-        // memory pools.", void_ptr);
-        free((void **)(&void_ptr), is_host);
-        return malloc(size, is_set_zero, is_host);
-      }
-#endif
-      else {
-        // FT_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing
-        // for reMalloc.", void_ptr, size);
-        if (is_set_zero) {
-          memSet(void_ptr, 0, size);
-        }
-        return void_ptr;
-      }
-    } else {
-      // FT_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
-      return malloc(size, is_set_zero, is_host);
-    }
-  }
-
-protected:
-  virtual bool isExist(void *address) const = 0;
-  virtual ReallocType isReMalloc(void *address, size_t size) const = 0;
-
-  void *getAddress(void *ptr) const { return ptr; }
-};
-
-template <AllocatorType AllocType_> class Allocator;
-
-template <> class Allocator<AllocatorType::CUDA> : public IAllocator {
-private:
-  const int device_id_;
-  cudaStream_t stream_ = 0; // initialize as default stream
-  std::unordered_map<void *, size_t> *pointer_mapping_;
-
-  bool isExist(void *address) const {
-    return pointer_mapping_->count(address) > 0;
-  }
-  ReallocType isReMalloc(void *address, size_t size) const {
-    FT_CHECK(isExist(address));
-    if (pointer_mapping_->at(address) < size) {
-      return ReallocType::INCREASE;
-    } else if (pointer_mapping_->at(address) == size) {
-      return ReallocType::REUSE;
-    } else {
-      return ReallocType::DECREASE;
-    }
-  }
-
-public:
-  Allocator(int device_id) : device_id_(device_id) {
-    // FT_LOG_DEBUG(__PRETTY_FUNCTION__);
-    pointer_mapping_ = new std::unordered_map<void *, size_t>();
-#if defined(CUDA_MEMORY_POOL_DISABLED)
-    // FT_LOG_WARNING(
-    //     "Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync
-    //     cudaMalloc/Free." "Note this may lead to hang with NCCL kernels
-    //     launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP");
-#else
-    int device_count = 1;
-    check_cuda_error(cudaGetDeviceCount(&device_count));
-    cudaMemPool_t mempool;
-    check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, device_id));
-    cudaMemAccessDesc desc = {};
-    int peer_access_available = 0;
-    for (int i = 0; i < device_count; i++) {
-      if (i == device_id) {
-        continue;
-      }
-      check_cuda_error(
-          cudaDeviceCanAccessPeer(&peer_access_available, device_id, i));
-      if (!peer_access_available) {
-        // FT_LOG_WARNING("Device " + std::to_string(device_id) + " peer access
-        // Device " + std::to_string(i)
-        //                + " is not available.");
-        continue;
-      }
-      desc.location.type = cudaMemLocationTypeDevice;
-      desc.location.id = i;
-      desc.flags = cudaMemAccessFlagsProtReadWrite;
-      check_cuda_error(cudaMemPoolSetAccess(mempool, &desc, 1));
-    }
-    // set memory pool threshold to avoid shrinking the pool
-    uint64_t setVal = UINT64_MAX;
-    check_cuda_error(cudaMemPoolSetAttribute(
-        mempool, cudaMemPoolAttrReleaseThreshold, &setVal));
-#endif
-  }
-
-  virtual ~Allocator() {
-    // FT_LOG_DEBUG(__PRETTY_FUNCTION__);
-    while (!pointer_mapping_->empty()) {
-      free((void **)(&pointer_mapping_->begin()->first));
-    }
-    delete pointer_mapping_;
-  }
-
-  void setStream(cudaStream_t stream) { stream_ = stream; }
-
-  cudaStream_t returnStream() { return stream_; };
-
-  void *malloc(size_t size, const bool is_set_zero = true,
-               bool is_host = false) {
-    // FT_LOG_DEBUG(__PRETTY_FUNCTION__);
-    if (size == 0) {
-      return nullptr;
-    }
-    void *ptr = nullptr;
-    int o_device = 0;
-
-    check_cuda_error(getSetDevice(device_id_, &o_device));
-    if (is_host) {
-      check_cuda_error(cudaMallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32));
-    } else {
-#if defined(CUDA_MEMORY_POOL_DISABLED)
-      check_cuda_error(cudaMalloc(&ptr, (size_t)(ceil(size / 32.)) * 32));
-#else
-      check_cuda_error(
-          cudaMallocAsync(&ptr, (size_t)(ceil(size / 32.)) * 32, stream_));
-#endif
-    }
-    if (is_set_zero) {
-      check_cuda_error(
-          cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_));
-    }
-    check_cuda_error(getSetDevice(o_device));
-    // FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
-
-    pointer_mapping_->insert({getAddress(ptr), size});
-
-    return ptr;
-  }
-
-  void free(void **ptr, bool is_host = false) const {
-    // FT_LOG_DEBUG(__PRETTY_FUNCTION__);
-    void *address = getAddress(*ptr);
-    if (*ptr != nullptr) {
-      int o_device = 0;
-      if (pointer_mapping_->count(address)) {
-        // FT_LOG_DEBUG("Free buffer %p", address);
-        check_cuda_error(getSetDevice(device_id_, &o_device));
-        if (is_host) {
-          check_cuda_error(cudaFreeHost(*ptr));
-        } else {
-#if defined(CUDA_MEMORY_POOL_DISABLED)
-          check_cuda_error(cudaFree(*ptr));
-#else
-          check_cuda_error(cudaFreeAsync(*ptr, stream_));
-          cudaStreamSynchronize(stream_);
-#endif
-        }
-        check_cuda_error(getSetDevice(o_device));
-        pointer_mapping_->erase(address);
-      } else {
-        // FT_LOG_WARNING("pointer_mapping_ does not have information of ptr at
-        // %p.", address);
-      }
-    }
-    *ptr = nullptr;
-    return;
-  }
-
-  void memSet(void *ptr, const int val, const size_t size) {
-    check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_));
-  }
-};
\ No newline at end of file
diff --git a/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.cc b/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.cc
deleted file mode 100644
index 61e41438c6a8..000000000000
--- a/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.cc
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "cublasAlgoMap.h"
-
-cublasAlgoMap::cublasAlgoMap(const std::string filename,
-                             const std::string sp_config_filename)
-    : config_filename_(filename), sp_config_filename_(sp_config_filename) {
-  loadGemmConfig();
-  loadSpGemmConfig();
-}
-
-cublasAlgoMap::cublasAlgoMap(const cublasAlgoMap &algo_map)
-    : config_filename_(algo_map.config_filename_),
-      sp_config_filename_(algo_map.sp_config_filename_),
-      algo_map_(algo_map.algo_map_), sp_algo_map_(algo_map.sp_algo_map_) {}
-
-cublasAlgoMap::~cublasAlgoMap() { algo_map_.clear(); }
-
-void cublasAlgoMap::loadGemmConfig() {
-  FILE *fd;
-  fd = fopen(config_filename_.c_str(), "r");
-  if (fd == NULL) {
-    std::cout << "[WARNING] " << config_filename_
-              << " is not found; using default GEMM algo" << std::endl;
-    return;
-  }
-
-  int batchCount2, m2, n2, k2, algoId, customOption, tile, splitK_val;
-  int batch_size, seq_len, head_num, size_per_head, dataType;
-  int swizzle, reductionScheme, workspaceSize, stages;
-  int inner_shapeId, cluster_shapeId, mma_shapeId, cga_shapeId, sche_mode;
-  float exec_time;
-  char tmp[1024];
-  if (!fgets(tmp, 1024, fd)) {
-    printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__);
-    exit(-1);
-  }
-  while (fscanf(fd,
-                "%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d "
-#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
-                "%d %d "
-#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
-                "%d %d %d "
-#endif
-                "%f\n",
-                &batch_size, &seq_len, &head_num, &size_per_head, &dataType,
-                &batchCount2, &n2, &m2, &k2, &algoId, &customOption, &tile,
-                &splitK_val, &swizzle, &reductionScheme, &workspaceSize,
-                &stages,
-#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
-                &inner_shapeId, &cluster_shapeId,
-#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
-                &mma_shapeId, &cga_shapeId, &sche_mode,
-#endif
-                &exec_time) != EOF) {
-    if (dataType != FLOAT_DATATYPE && dataType != HALF_DATATYPE &&
-        dataType != BFLOAT16_DATATYPE && dataType != INT8_DATATYPE &&
-        dataType != FP8_DATATYPE) {
-      printf("[WARNING][readAlgoFromConfig] wrong dataType %d!\n", dataType);
-      continue;
-    }
-    cublasAlgoConfig_t markStr{batchCount2, m2, n2, k2,
-                               static_cast<CublasDataType>(dataType)};
-    // workspaceSize should be zero
-    if (algo_map_.find(markStr) == algo_map_.end()) {
-      algo_map_[markStr].algoId = algoId;
-      algo_map_[markStr].customOption = customOption;
-      algo_map_[markStr].tile = tile;
-      algo_map_[markStr].splitK_val = splitK_val;
-      algo_map_[markStr].swizzle = swizzle;
-      algo_map_[markStr].reductionScheme = reductionScheme;
-      algo_map_[markStr].workspaceSize = workspaceSize;
-      algo_map_[markStr].stages = stages;
-#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
-      algo_map_[markStr].inner_shapeId = (uint16_t)inner_shapeId;
-      algo_map_[markStr].cluster_shapeId = (uint16_t)cluster_shapeId;
-#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
-      algo_map_[markStr].mma_shapeId = (uint16_t)mma_shapeId;
-      algo_map_[markStr].cga_shapeId = (uint16_t)cga_shapeId;
-      algo_map_[markStr].sche_mode = (uint16_t)sche_mode;
-#endif
-      algo_map_[markStr].exec_time = exec_time;
-    }
-  }
-  fclose(fd);
-}
-
-bool cublasAlgoMap::isExist(const int batch_count, const int m, const int n,
-                            const int k, const CublasDataType data_type) {
-  cublasAlgoConfig_t mark{batch_count, n, m, k, data_type};
-  return algo_map_.find(mark) != algo_map_.end();
-}
-
-cublasLtMatmulAlgo_info cublasAlgoMap::getAlgo(const int batch_count,
-                                               const int m, const int n,
-                                               const int k,
-                                               const CublasDataType data_type) {
-  cublasAlgoConfig_t mark{batch_count, n, m, k, data_type};
-  if (algo_map_.find(mark) != algo_map_.end()) {
-    return algo_map_[mark];
-  } else {
-    cublasLtMatmulAlgo_info tmp_algo;
-    tmp_algo.algoId = static_cast<int>(data_type == FLOAT_DATATYPE
-                                           ? CUBLAS_GEMM_DEFAULT
-                                           : CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    tmp_algo.customOption = -1;
-    tmp_algo.tile = -1;
-    tmp_algo.splitK_val = -1;
-    tmp_algo.swizzle = -1;
-    tmp_algo.reductionScheme = -1;
-    tmp_algo.workspaceSize = -1;
-    tmp_algo.stages = -1;
-    tmp_algo.exec_time = -1.0f;
-    return tmp_algo;
-  }
-}
-
-void cublasAlgoMap::loadSpGemmConfig() {
-  if (sp_config_filename_.empty()) {
-    return;
-  }
-  FILE *fd = fopen(sp_config_filename_.c_str(), "r");
-  if (fd == NULL) {
-    printf("[WARNING] %s is not found; using SPGEMM algo id 0\n",
-           sp_config_filename_.c_str());
-    return;
-  }
-  sp_algo_map_.clear();
-  int batch_size, seq_len, head_num, size_per_head, data_type;
-  int batchCount, m, n, k, algoId;
-  float exec_time;
-  char tmp[1024];
-  if (!fgets(tmp, 1024, fd)) {
-    printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__);
-    exit(-1);
-  }
-  while (fscanf(fd, "%d %d %d %d %d ### %d %d %d %d %d %f\n", &batch_size,
-                &seq_len, &head_num, &size_per_head, &data_type, &batchCount,
-                &m, &n, &k, &algoId, &exec_time) != EOF) {
-    char mark[256];
-    sprintf(mark, "%d_%d_%d_%d", batchCount, m, n, k);
-    std::string markStr(mark);
-    sp_algo_map_[markStr] = algoId;
-  }
-  fclose(fd);
-}
-
-int cublasAlgoMap::getSpAlgo(const int batch_count, const int m, const int n,
-                             const int k) {
-  char mark[256];
-  sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k);
-  if (sp_algo_map_.find(mark) != sp_algo_map_.end()) {
-    return sp_algo_map_[mark];
-  } else {
-    // for remove padding, select algo 1 for simplicity
-    return 0;
-  }
-}
-
-bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n,
-                                const int k) {
-  // not available to use cusparselt.
-  if (m % 8 != 0 || n % 8 != 0 || k % 8 != 0) {
-    return false;
-  }
-  char mark[256];
-  sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k);
-  if (sp_algo_map_.find(mark) != sp_algo_map_.end()) {
-    return sp_algo_map_[mark] != -1;
-  } else {
-    // no gemm test case, choose sparse according to sparse flag
-    return true;
-  }
-}
diff --git a/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.h b/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.h
deleted file mode 100644
index beb9d3a23d90..000000000000
--- a/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "cuda_utils.h"
-#include <cublasLt.h>
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <utility>
-
-#pragma once
-
-#define GEMM_NUM 6
-#define GEMM_CONFIG "gemm_config.in"
-#define IGEMM_CONFIG "igemm_config.in"
-#define SPGEMM_CONFIG "spgemm_config.in"
-#define SPIGEMM_CONFIG "spigemm_config.in"
-
-typedef struct {
-  int algoId, customOption, tile, splitK_val;
-  int swizzle, reductionScheme, workspaceSize;
-  // only used in cublasLt >= 11.0
-  int stages;
-#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
-  uint16_t inner_shapeId, cluster_shapeId;
-#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
-  uint16_t mma_shapeId, cga_shapeId, sche_mode;
-#endif
-  float exec_time;
-} cublasLtMatmulAlgo_info;
-
-/* Structure to store information about different run trials */
-typedef struct {
-  cublasLtMatmulAlgo_t algo;
-  cublasStatus_t status;
-  float time;
-  size_t workspaceSize; // actual memory workspace needed
-  cublasMath_t mathMode;
-  cublasLtReductionScheme_t reductionScheme;
-  int customOption;
-  float wavesCount;
-} customMatmulPerf_t;
-
-struct cublasAlgoConfig_t {
-  int batch_count;
-  int m;
-  int n;
-  int k;
-  CublasDataType data_type;
-  bool operator==(cublasAlgoConfig_t const &config) const {
-    return (batch_count == config.batch_count) && (m == config.m) &&
-           (n == config.n) && (k == config.k) &&
-           (data_type == config.data_type);
-  }
-};
-
-class cublasAlgoConfig_hasher {
-public:
-  std::size_t operator()(cublasAlgoConfig_t const &config) const {
-    return config.batch_count * 98317ull ^ config.m * 49157ull ^
-           config.n * 24593ull ^ config.k * 196613ull ^
-           static_cast<int>(config.data_type) * 6151ull;
-  }
-};
-
-class cublasAlgoMap {
-private:
-  std::unordered_map<cublasAlgoConfig_t, cublasLtMatmulAlgo_info,
-                     cublasAlgoConfig_hasher>
-      algo_map_;
-  std::string config_filename_;
-  std::string sp_config_filename_;
-  std::map<std::string, int> sp_algo_map_;
-
-public:
-  cublasAlgoMap(){};
-  explicit cublasAlgoMap(const std::string filename,
-                         const std::string sp_config_filename = "");
-  cublasAlgoMap(const cublasAlgoMap &map);
-  ~cublasAlgoMap();
-  void loadGemmConfig();
-  void loadSpGemmConfig();
-  int getSpAlgo(const int batch_count, const int m, const int n, const int k);
-  bool isUseSparse(const int batch_count, const int m, const int n,
-                   const int k);
-
-  bool isExist(const int batch_count, const int m, const int n, const int k,
-               const CublasDataType data_type);
-
-  cublasLtMatmulAlgo_info getAlgo(const int batch_count, const int m,
-                                  const int n, const int k,
-                                  const CublasDataType data_type);
-};
diff --git a/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.cc b/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.cc
deleted file mode 100644
index 03c656b10cbd..000000000000
--- a/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.cc
+++ /dev/null
@@ -1,676 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "cublasINT8MMWrapper.h"
-
-#ifndef CUDART_VERSION
-#error CUDART_VERSION Undefined!
-#endif
-
-cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle,
-                                         cudaStream_t stream,
-                                         cublasAlgoMap *cublas_algo_map,
-                                         std::mutex *mu,
-                                         bool use_ORDER_COL32_2R_4R4)
-    : cublas_handle_(nullptr), cublaslt_handle_(cublaslt_handle),
-      stream_(stream), cublas_algo_map_(cublas_algo_map), mu_(mu),
-      allocator_(nullptr), use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) {}
-
-cublasINT8MMWrapper::cublasINT8MMWrapper(cublasHandle_t cublas_handle,
-                                         cublasLtHandle_t cublaslt_handle,
-                                         cudaStream_t stream,
-                                         cublasAlgoMap *cublas_algo_map,
-                                         std::mutex *mu,
-                                         bool use_ORDER_COL32_2R_4R4)
-    : cublas_handle_(cublas_handle), cublaslt_handle_(cublaslt_handle),
-      stream_(stream), cublas_algo_map_(cublas_algo_map), mu_(mu),
-      allocator_(nullptr), use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) {}
-
-
-cublasINT8MMWrapper::~cublasINT8MMWrapper() { mu_ = nullptr; }
-
-cublasINT8MMWrapper::cublasINT8MMWrapper(const cublasINT8MMWrapper &wrapper)
-    : cublas_handle_(nullptr), cublaslt_handle_(wrapper.cublaslt_handle_),
-      stream_(wrapper.stream_), cublas_algo_map_(wrapper.cublas_algo_map_), mu_(wrapper.mu_),
-      allocator_(wrapper.allocator_), use_ORDER_COL32_2R_4R4_(wrapper.use_ORDER_COL32_2R_4R4_) {
-}
-
-// for int8 cublasLtMM with algo
-// ATransform should be m*n, CUBLASLT_ORDER_COL32
-// kernel should be n*k, CUBLASLT_ORDER_COL4_4R2_8C or
-// CUBLASLT_ORDER_COL32_2R_4R4 res is m*n, CUBLASLT_ORDER_COL32
-void cublasINT8MMWrapper::Gemm(int *res, int batchCount, int m, int n, int k,
-                               int64_t stridea, int64_t strideb,
-                               int64_t stridec, const int8_t *ATransform,
-                               const int8_t *kernel) {
-  mu_->lock();
-  cublasOperation_t opTranspose = CUBLAS_OP_T;
-#if (CUDART_VERSION >= 11000)
-  cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
-#else
-  cudaDataType_t computeType = CUDA_R_32I;
-#endif
-  cublasLtMatmulDesc_t matmulDesc;
-  cublasLtMatrixLayout_t AtransformDesc = NULL;
-  cublasLtMatrixLayout_t BtransformDesc = NULL;
-  cublasLtMatrixLayout_t CtransformDesc = NULL;
-  cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
-
-  cublasLtOrder_t order_matrixB;
-#if (CUDART_VERSION >= 11000)
-  if (use_ORDER_COL32_2R_4R4_) {
-    order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
-  } else {
-    order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
-  }
-#else
-  order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
-#endif
-
-  int ldaTransform = 32 * m;
-  int ldbTransform;
-  if (use_ORDER_COL32_2R_4R4_) {
-    ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
-  } else {
-    ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
-  }
-  int ldcTransform = 32 * m;
-
-  // create matmulDesc
-#if (CUDART_VERSION >= 11000)
-  cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I);
-#else
-  cublasLtMatmulDescCreate(&matmulDesc, computeType);
-#endif
-  cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB,
-                                 &opTranspose, sizeof(cublasOperation_t));
-  cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform);
-  cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER,
-                                   &order_COL32, sizeof(order_COL32));
-  cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform);
-  cublasLtMatrixLayoutSetAttribute(BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER,
-                                   &order_matrixB, sizeof(order_matrixB));
-  cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, m, n, ldcTransform);
-  cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER,
-                                   &order_COL32, sizeof(order_COL32));
-  if (batchCount > 1) {
-    cublasLtMatrixLayoutSetAttribute(AtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-        sizeof(stridea));
-    cublasLtMatrixLayoutSetAttribute(BtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-        sizeof(strideb));
-    cublasLtMatrixLayoutSetAttribute(CtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-        sizeof(stridec));
-  }
-
-  int alphaI = 1;
-  int betaI = 0;
-
-  // get algo
-  cublasLtMatmulAlgo_t algo;
-  int findAlgo = 0;
-  if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) {
-    // printf("find algo %s\n", markStr.c_str());
-    findAlgo = 1;
-
-    cublasLtMatmulAlgo_info tmp_info =
-        cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE);
-
-    cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I,
-                           CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, tmp_info.algoId,
-                           &algo);
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption),
-        sizeof(tmp_info.customOption));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID,
-                                         &(tmp_info.tile),
-                                         sizeof(tmp_info.tile));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
-                                         &(tmp_info.splitK_val),
-                                         sizeof(tmp_info.splitK_val));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle),
-        sizeof(tmp_info.swizzle));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
-        &(tmp_info.reductionScheme), sizeof(int));
-#if (CUDART_VERSION >= 11000)
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID,
-                                         &(tmp_info.stages),
-                                         sizeof(tmp_info.stages));
-#endif
-  } else {
-    findAlgo = 1;
-    int algoId;
-    if (use_ORDER_COL32_2R_4R4_) {
-      algoId = 7;
-    } else {
-      algoId = 6;
-    }
-    int swizzle = 0;
-    int customOption = 0;
-    int tile = 20;
-    int splitK_val = 0;
-    int reductionScheme = 0;
-    cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I,
-                           CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, algoId, &algo);
-    cublasLtMatmulAlgoConfigSetAttribute(&algo,
-                                         CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
-                                         &(customOption), sizeof(customOption));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID,
-                                         &(tile), sizeof(tile));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
-                                         &(splitK_val), sizeof(splitK_val));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo,
-                                         CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
-                                         &(reductionScheme), sizeof(int));
-#if (CUDART_VERSION >= 11000)
-    int stages;
-    if (use_ORDER_COL32_2R_4R4_) {
-      stages = 15;
-    } else {
-      stages = 13;
-    }
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID,
-                                         &(stages), sizeof(stages));
-#endif
-  }
-
-  cublasLtMatmul(cublaslt_handle_, matmulDesc, &alphaI, ATransform,
-                 AtransformDesc, kernel, BtransformDesc, &betaI, res,
-                 CtransformDesc, res, CtransformDesc,
-                 (findAlgo == 1 ? (&algo) : NULL), NULL, 0, stream_);
-
-  cublasLtMatmulDescDestroy(matmulDesc);
-  cublasLtMatrixLayoutDestroy(AtransformDesc);
-  cublasLtMatrixLayoutDestroy(BtransformDesc);
-  cublasLtMatrixLayoutDestroy(CtransformDesc);
-  sync_check_cuda_error();
-  mu_->unlock();
-}
-
-// Atransform: mxk CUDA_R_8I
-// kernel: nxk CUDA_R_8I
-// res: mxn CUDA_R_32I
-// alpha: CUDA_R_32I should be 1
-// beta: CUDA_R_32I should be 0
-// computeType: CUBLAS_COMPUTE_32I
-void cublasINT8MMWrapper::Gemm_(int *res, int batchCount, int m, int n, int k,
-                                int64_t stridea, int64_t strideb,
-                                int64_t stridec, const int8_t *ATransform,
-                                const int8_t *kernel) {
-  mu_->lock();
-  cublasOperation_t opTranspose = CUBLAS_OP_T;
-#if (CUDART_VERSION >= 11000)
-  cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
-#else
-  cudaDataType_t computeType = CUDA_R_32I;
-#endif
-  cublasLtMatmulDesc_t matmulDesc;
-  cublasLtMatrixLayout_t AtransformDesc = NULL;
-  cublasLtMatrixLayout_t BtransformDesc = NULL;
-  cublasLtMatrixLayout_t CtransformDesc = NULL;
-
-  // create matmulDesc
-#if (CUDART_VERSION >= 11000)
-  cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I);
-#else
-  cublasLtMatmulDescCreate(&matmulDesc, computeType);
-#endif
-  cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA,
-                                 &opTranspose, sizeof(cublasOperation_t));
-
-  cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, k, n, k);
-
-  cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, k, m, k);
-
-  cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, n, m, n);
-
-  if (batchCount > 1) {
-    cublasLtMatrixLayoutSetAttribute(AtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-        sizeof(stridea));
-    cublasLtMatrixLayoutSetAttribute(BtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-        sizeof(strideb));
-    cublasLtMatrixLayoutSetAttribute(CtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-        sizeof(stridec));
-  }
-
-  int alphaI = 1;
-  int betaI = 0;
-
-  // get algo
-  cublasLtMatmulAlgo_t algo;
-  int findAlgo = 0;
-  if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) {
-    // printf("find algo %s\n", markStr.c_str());
-    findAlgo = 1;
-
-    cublasLtMatmulAlgo_info tmp_info =
-        cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE);
-
-    cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I,
-                           CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, tmp_info.algoId,
-                           &algo);
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption),
-        sizeof(tmp_info.customOption));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID,
-                                         &(tmp_info.tile),
-                                         sizeof(tmp_info.tile));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
-                                         &(tmp_info.splitK_val),
-                                         sizeof(tmp_info.splitK_val));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle),
-        sizeof(tmp_info.swizzle));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
-        &(tmp_info.reductionScheme), sizeof(int));
-#if (CUDART_VERSION >= 11000)
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID,
-                                         &(tmp_info.stages),
-                                         sizeof(tmp_info.stages));
-#endif
-  } else {
-    findAlgo = 1;
-    int algoId;
-    algoId = 21;
-    int swizzle = 0;
-    int customOption = 0;
-    int tile = 20;
-    int splitK_val = 0;
-    int reductionScheme = 0;
-    cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I,
-                           CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, algoId, &algo);
-    cublasLtMatmulAlgoConfigSetAttribute(&algo,
-                                         CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
-                                         &(customOption), sizeof(customOption));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID,
-                                         &(tile), sizeof(tile));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
-                                         &(splitK_val), sizeof(splitK_val));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo,
-                                         CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
-                                         &(reductionScheme), sizeof(int));
-#if (CUDART_VERSION >= 11000)
-    int stages;
-    stages = 17;
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID,
-                                         &(stages), sizeof(stages));
-#endif
-  }
-
-  cublasLtMatmul(cublaslt_handle_, matmulDesc, &alphaI, kernel, AtransformDesc,
-                 ATransform, BtransformDesc, &betaI, res, CtransformDesc, res,
-                 CtransformDesc, (findAlgo == 1 ? (&algo) : NULL), NULL, 0,
-                 stream_);
-
-  cublasLtMatmulDescDestroy(matmulDesc);
-  cublasLtMatrixLayoutDestroy(AtransformDesc);
-  cublasLtMatrixLayoutDestroy(BtransformDesc);
-  cublasLtMatrixLayoutDestroy(CtransformDesc);
-  sync_check_cuda_error();
-  mu_->unlock();
-}
-
-// for int8 IO cublasLtMM with algo
-// ATransform should be m*k CUBLASLT_ORDER_COL32
-// kernel should be n*k CUBLASLT_ORDER_COL4_4R2_8C
-// res is m*n CUBLASLT_ORDER_COL32
-void cublasINT8MMWrapper::Gemm(int8_t *res, int batchCount, int m, int n, int k,
-                               int64_t stridea, int64_t strideb,
-                               int64_t stridec, const float alpha,
-                               const int8_t *ATransform, const int8_t *kernel) {
-  mu_->lock();
-  cublasOperation_t opTranspose = CUBLAS_OP_T;
-  // int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE
-  // cublasLtPointerMode_t pointerMode =
-  // CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
-  cudaDataType_t scaleType = CUDA_R_32F;
-#if (CUDART_VERSION >= 11000)
-  cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
-#else
-  cudaDataType_t computeType = CUDA_R_32I;
-#endif
-  cublasLtMatmulDesc_t matmulDesc;
-  cublasLtMatrixLayout_t AtransformDesc = NULL;
-  cublasLtMatrixLayout_t BtransformDesc = NULL;
-  cublasLtMatrixLayout_t CtransformDesc = NULL;
-  cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
-
-  cublasLtOrder_t order_matrixB;
-#if (CUDART_VERSION >= 11000)
-  if (use_ORDER_COL32_2R_4R4_) {
-    order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
-  } else {
-    order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
-  }
-#else
-  order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
-#endif
-
-  int ldaTransform = 32 * m;
-
-  int ldbTransform;
-  if (use_ORDER_COL32_2R_4R4_) {
-    ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
-  } else {
-    ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
-  }
-
-  int ldcTransform = 32 * m;
-
-  // create matmulDesc
-#if (CUDART_VERSION >= 11000)
-  cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType);
-#else
-  cublasLtMatmulDescCreate(&matmulDesc, computeType);
-#endif
-  cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB,
-                                 &opTranspose, sizeof(cublasOperation_t));
-  cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE,
-                                 &scaleType, sizeof(scaleType));
-  // cublasLtMatmulDescSetAttribute(matmulDesc,
-  // CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode,
-  // sizeof(cublasLtPointerMode_t));
-  cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform);
-  cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER,
-                                   &order_COL32, sizeof(order_COL32));
-  cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform);
-  cublasLtMatrixLayoutSetAttribute(BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER,
-                                   &order_matrixB, sizeof(order_matrixB));
-  cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_8I, m, n, ldcTransform);
-  cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER,
-                                   &order_COL32, sizeof(order_COL32));
-  if (batchCount > 1) {
-    cublasLtMatrixLayoutSetAttribute(AtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-        sizeof(stridea));
-    cublasLtMatrixLayoutSetAttribute(BtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-        sizeof(strideb));
-    cublasLtMatrixLayoutSetAttribute(CtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-        sizeof(stridec));
-  }
-
-  // get algo
-  cublasLtMatmulAlgo_t algo;
-  int findAlgo = 0;
-  if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) {
-    findAlgo = 1;
-
-    cublasLtMatmulAlgo_info tmp_info =
-        cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE);
-
-    cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I,
-                           CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, tmp_info.algoId,
-                           &algo);
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption),
-        sizeof(tmp_info.customOption));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID,
-                                         &(tmp_info.tile),
-                                         sizeof(tmp_info.tile));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
-                                         &(tmp_info.splitK_val),
-                                         sizeof(tmp_info.splitK_val));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle),
-        sizeof(tmp_info.swizzle));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
-        &(tmp_info.reductionScheme), sizeof(int));
-#if (CUDART_VERSION >= 11000)
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID,
-                                         &(tmp_info.stages),
-                                         sizeof(tmp_info.stages));
-#endif
-  } else {
-    findAlgo = 1;
-    int algoId;
-    if (use_ORDER_COL32_2R_4R4_) {
-      algoId = 7;
-    } else {
-      algoId = 6;
-    }
-    int swizzle = 0;
-    int customOption = 0;
-    int tile = 20;
-    int splitK_val = 0;
-    int reductionScheme = 0;
-    cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I,
-                           CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, algoId, &algo);
-    cublasLtMatmulAlgoConfigSetAttribute(&algo,
-                                         CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
-                                         &(customOption), sizeof(customOption));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID,
-                                         &(tile), sizeof(tile));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
-                                         &(splitK_val), sizeof(splitK_val));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo,
-                                         CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
-                                         &(reductionScheme), sizeof(int));
-#if (CUDART_VERSION >= 11000)
-    int stages;
-    if (use_ORDER_COL32_2R_4R4_) {
-      stages = 15;
-    } else {
-      stages = 13;
-    }
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID,
-                                         &(stages), sizeof(stages));
-#endif
-  }
-
-  float beta = 0.0f;
-  cublasLtMatmul(cublaslt_handle_, matmulDesc, &alpha, kernel, AtransformDesc,
-                 ATransform, BtransformDesc, &beta, res, CtransformDesc, res,
-                 CtransformDesc, (findAlgo == 1 ? (&algo) : NULL), NULL, 0,
-                 stream_);
-
-  cublasLtMatmulDescDestroy(matmulDesc);
-  cublasLtMatrixLayoutDestroy(AtransformDesc);
-  cublasLtMatrixLayoutDestroy(BtransformDesc);
-  cublasLtMatrixLayoutDestroy(CtransformDesc);
-  sync_check_cuda_error();
-  mu_->unlock();
-}
-
-// Atransform: mxk CUDA_R_8I
-// kernel: nxk CUDA_R_8I
-// res: mxn CUDA_R_8I
-// alpha: CUDA_R_32F
-// beta: CUDA_R_32F
-// computeType: CUBLAS_COMPUTE_32I
-void cublasINT8MMWrapper::Gemm_(int8_t *res, int batchCount, int m, int n,
-                                int k, int64_t stridea, int64_t strideb,
-                                int64_t stridec, const float alpha,
-                                const int8_t *ATransform,
-                                const int8_t *kernel) {
-  mu_->lock();
-  cublasOperation_t opTranspose = CUBLAS_OP_T;
-  // int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE
-  // cublasLtPointerMode_t pointerMode =
-  // CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
-  cudaDataType_t scaleType = CUDA_R_32F;
-#if (CUDART_VERSION >= 11000)
-  cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
-#else
-  cudaDataType_t computeType = CUDA_R_32I;
-#endif
-  cublasLtMatmulDesc_t matmulDesc;
-  cublasLtMatrixLayout_t AtransformDesc = NULL;
-  cublasLtMatrixLayout_t BtransformDesc = NULL;
-  cublasLtMatrixLayout_t CtransformDesc = NULL;
-
-  // create matmulDesc
-#if (CUDART_VERSION >= 11000)
-  cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType);
-#else
-  cublasLtMatmulDescCreate(&matmulDesc, computeType);
-#endif
-  cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA,
-                                 &opTranspose, sizeof(cublasOperation_t));
-  cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE,
-                                 &scaleType, sizeof(scaleType));
-  // cublasLtMatmulDescSetAttribute(matmulDesc,
-  // CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode,
-  // sizeof(cublasLtPointerMode_t));
-  cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, k, n, k);
-
-  cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, k, m, k);
-
-  cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_8I, n, m, n);
-
-  if (batchCount > 1) {
-    cublasLtMatrixLayoutSetAttribute(AtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-        sizeof(stridea));
-    cublasLtMatrixLayoutSetAttribute(BtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-        sizeof(strideb));
-    cublasLtMatrixLayoutSetAttribute(CtransformDesc,
-                                     CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
-                                     &batchCount, sizeof(batchCount));
-    cublasLtMatrixLayoutSetAttribute(
-        CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-        sizeof(stridec));
-  }
-
-  // get algo
-  cublasLtMatmulAlgo_t algo;
-  int findAlgo = 0;
-  if (cublas_algo_map_->isExist(batchCount, n, m, k, INT8_DATATYPE)) {
-    findAlgo = 1;
-    cublasLtMatmulAlgo_info tmp_info =
-        cublas_algo_map_->getAlgo(batchCount, n, m, k, INT8_DATATYPE);
-
-    cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I,
-                           CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, tmp_info.algoId,
-                           &algo);
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption),
-        sizeof(tmp_info.customOption));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID,
-                                         &(tmp_info.tile),
-                                         sizeof(tmp_info.tile));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
-                                         &(tmp_info.splitK_val),
-                                         sizeof(tmp_info.splitK_val));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle),
-        sizeof(tmp_info.swizzle));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
-        &(tmp_info.reductionScheme), sizeof(int));
-#if (CUDART_VERSION >= 11000)
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID,
-                                         &(tmp_info.stages),
-                                         sizeof(tmp_info.stages));
-#endif
-  } else {
-    findAlgo = 1;
-    int algoId;
-    algoId = 21;
-    int swizzle = 0;
-    int customOption = 0;
-    int tile = 20;
-    int splitK_val = 0;
-    int reductionScheme = 0;
-    cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I,
-                           CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, algoId, &algo);
-    cublasLtMatmulAlgoConfigSetAttribute(&algo,
-                                         CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
-                                         &(customOption), sizeof(customOption));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID,
-                                         &(tile), sizeof(tile));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
-                                         &(splitK_val), sizeof(splitK_val));
-    cublasLtMatmulAlgoConfigSetAttribute(
-        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
-    cublasLtMatmulAlgoConfigSetAttribute(&algo,
-                                         CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
-                                         &(reductionScheme), sizeof(int));
-#if (CUDART_VERSION >= 11000)
-    int stages;
-    stages = 17;
-    cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID,
-                                         &(stages), sizeof(stages));
-#endif
-  }
-
-  float beta = 0.0f;
-  cublasLtMatmul(cublaslt_handle_, matmulDesc, &alpha, kernel, AtransformDesc,
-                 ATransform, BtransformDesc, &beta, res, CtransformDesc, res,
-                 CtransformDesc, (findAlgo == 1 ? (&algo) : NULL), NULL, 0,
-                 stream_);
-
-  cublasLtMatmulDescDestroy(matmulDesc);
-  cublasLtMatrixLayoutDestroy(AtransformDesc);
-  cublasLtMatrixLayoutDestroy(BtransformDesc);
-  cublasLtMatrixLayoutDestroy(CtransformDesc);
-  sync_check_cuda_error();
-  mu_->unlock();
-}
-
-bool cublasINT8MMWrapper::getUseOrderCol322R4R4() {
-  return use_ORDER_COL32_2R_4R4_;
-}
diff --git a/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.h b/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.h
deleted file mode 100644
index 8bc209f58b91..000000000000
--- a/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "allocator.h"
-#include "cublasAlgoMap.h"
-#include <cublasLt.h>
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <map>
-#include <mutex>
-#include <string>
-
-#pragma once
-
-class cublasINT8MMWrapper{
-protected:
-  cublasHandle_t cublas_handle_;
-  cublasLtHandle_t cublaslt_handle_;
-  cudaStream_t stream_;
-  cublasAlgoMap *cublas_algo_map_;
-  std::mutex *mu_;
-  IAllocator *allocator_ = nullptr;
-  
-private:
-  bool use_ORDER_COL32_2R_4R4_;
-
-public:
-  cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_, cudaStream_t stream,
-                      cublasAlgoMap *map, std::mutex *mu,
-                      bool use_ORDER_COL32_2R_4R4);
-
-  cublasINT8MMWrapper(cublasHandle_t cublas_handle,
-                      cublasLtHandle_t cublaslt_handle, cudaStream_t stream,
-                      cublasAlgoMap *map, std::mutex *mu,
-                      bool use_ORDER_COL32_2R_4R4);
-
-  ~cublasINT8MMWrapper();
-
-  cublasINT8MMWrapper(const cublasINT8MMWrapper &wrapper);
-
-  void Gemm(int *res, int batchCount, int m, int n, int k, int64_t stridea,
-            int64_t strideb, int64_t stridec, const int8_t *ATransform,
-            const int8_t *kernel);
-
-  void Gemm_(int *res, int batchCount, int m, int n, int k, int64_t stridea,
-             int64_t strideb, int64_t stridec, const int8_t *ATransform,
-             const int8_t *kernel);
-
-  void Gemm(int8_t *res, int batchCount, int m, int n, int k, int64_t stridea,
-            int64_t strideb, int64_t stridec, const float alpha,
-            const int8_t *ATransform, const int8_t *kernel);
-
-  void Gemm_(int8_t *res, int batchCount, int m, int n, int k, int64_t stridea,
-             int64_t strideb, int64_t stridec, const float alpha,
-             const int8_t *ATransform, const int8_t *kernel);
-
-  bool getUseOrderCol322R4R4();
-};
\ No newline at end of file
diff --git a/csrc/quantization/smoothquant/int8gemm/cuda_utils.cc b/csrc/quantization/smoothquant/int8gemm/cuda_utils.cc
deleted file mode 100644
index 588375570937..000000000000
--- a/csrc/quantization/smoothquant/int8gemm/cuda_utils.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "cuda_utils.h"
-
-cudaError_t getSetDevice(int i_device, int *o_device) {
-  int current_dev_id = 0;
-  cudaError_t err = cudaSuccess;
-
-  if (o_device != NULL) {
-    err = cudaGetDevice(&current_dev_id);
-    if (err != cudaSuccess) {
-      return err;
-    }
-    if (current_dev_id == i_device) {
-      *o_device = i_device;
-    } else {
-      err = cudaSetDevice(i_device);
-      if (err != cudaSuccess) {
-        return err;
-      }
-      *o_device = current_dev_id;
-    }
-  } else {
-    err = cudaSetDevice(i_device);
-    if (err != cudaSuccess) {
-      return err;
-    }
-  }
-
-  return cudaSuccess;
-}
diff --git a/csrc/quantization/smoothquant/int8gemm/cuda_utils.h b/csrc/quantization/smoothquant/int8gemm/cuda_utils.h
deleted file mode 100644
index f1d9bba4ab06..000000000000
--- a/csrc/quantization/smoothquant/int8gemm/cuda_utils.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cublasLt.h>
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-
-enum CublasDataType {
-  FLOAT_DATATYPE = 0,
-  HALF_DATATYPE = 1,
-  BFLOAT16_DATATYPE = 2,
-  INT8_DATATYPE = 3,
-  FP8_DATATYPE = 4
-};
-
-static const char *_cudaGetErrorEnum(cudaError_t error) {
-  return cudaGetErrorString(error);
-}
-
-static const char *_cudaGetErrorEnum(cublasStatus_t error) {
-  switch (error) {
-  case CUBLAS_STATUS_SUCCESS:
-    return "CUBLAS_STATUS_SUCCESS";
-
-  case CUBLAS_STATUS_NOT_INITIALIZED:
-    return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-  case CUBLAS_STATUS_ALLOC_FAILED:
-    return "CUBLAS_STATUS_ALLOC_FAILED";
-
-  case CUBLAS_STATUS_INVALID_VALUE:
-    return "CUBLAS_STATUS_INVALID_VALUE";
-
-  case CUBLAS_STATUS_ARCH_MISMATCH:
-    return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-  case CUBLAS_STATUS_MAPPING_ERROR:
-    return "CUBLAS_STATUS_MAPPING_ERROR";
-
-  case CUBLAS_STATUS_EXECUTION_FAILED:
-    return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-  case CUBLAS_STATUS_INTERNAL_ERROR:
-    return "CUBLAS_STATUS_INTERNAL_ERROR";
-
-  case CUBLAS_STATUS_NOT_SUPPORTED:
-    return "CUBLAS_STATUS_NOT_SUPPORTED";
-
-  case CUBLAS_STATUS_LICENSE_ERROR:
-    return "CUBLAS_STATUS_LICENSE_ERROR";
-  }
-  return "<unknown>";
-}
-
-template <typename T>
-void check(T result, char const *const func, const char *const file,
-           int const line) {
-  if (result) {
-    throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") +
-                             (_cudaGetErrorEnum(result)) + " " + file + ":" +
-                             std::to_string(line) + " \n");
-  }
-}
-
-#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
-#define check_cuda_error_2(val, file, line) check((val), #val, file, line)
-
-inline void syncAndCheck(const char *const file, int const line) {
-  // When FT_DEBUG_LEVEL=DEBUG, must check error
-  static char *level_name = std::getenv("FT_DEBUG_LEVEL");
-  if (level_name != nullptr) {
-    static std::string level = std::string(level_name);
-    if (level == "DEBUG") {
-      cudaDeviceSynchronize();
-      cudaError_t result = cudaGetLastError();
-      if (result) {
-        throw std::runtime_error(
-            std::string("[FT][ERROR] CUDA runtime error: ") +
-            (_cudaGetErrorEnum(result)) + " " + file + ":" +
-            std::to_string(line) + " \n");
-      }
-      // FT_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line));
-    }
-  }
-
-#ifndef NDEBUG
-  cudaDeviceSynchronize();
-  cudaError_t result = cudaGetLastError();
-  if (result) {
-    throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") +
-                             (_cudaGetErrorEnum(result)) + " " + file + ":" +
-                             std::to_string(line) + " \n");
-  }
-#endif
-}
-
-#define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__)
-
-
-[[noreturn]] inline void throwRuntimeError(const char *const file,
-                                           int const line,
-                                           std::string const &info = "") {
-  throw std::runtime_error(std::string("[FT][ERROR] ") + info +
-                           " Assertion fail: " + file + ":" +
-                           std::to_string(line) + " \n");
-}
-
-inline void myAssert(bool result, const char *const file, int const line,
-                     std::string const &info = "") {
-  if (!result) {
-    throwRuntimeError(file, line, info);
-  }
-}
-
-#define FT_CHECK(val) myAssert(val, __FILE__, __LINE__)
-#define FT_CHECK_WITH_INFO(val, info)                                          \
-  do {                                                                         \
-    bool is_valid_val = (val);                                                 \
-    if (!is_valid_val) {                                                       \
-      fastertransformer::myAssert(is_valid_val, __FILE__, __LINE__, (info));   \
-    }                                                                          \
-  } while (0)
-
-#define FT_THROW(info) throwRuntimeError(__FILE__, __LINE__, info)
-
-cudaError_t getSetDevice(int i_device, int *o_device = NULL);
-
-inline int getDevice() {
-  int current_dev_id = 0;
-  check_cuda_error(cudaGetDevice(&current_dev_id));
-  return current_dev_id;
-}
-
-inline int getDeviceCount() {
-  int count = 0;
-  check_cuda_error(cudaGetDeviceCount(&count));
-  return count;
-}
\ No newline at end of file
diff --git a/csrc/quantization/smoothquant/int8gemm/int8_gemm.h b/csrc/quantization/smoothquant/int8gemm/int8_gemm.h
deleted file mode 100644
index 2e80d4efe22a..000000000000
--- a/csrc/quantization/smoothquant/int8gemm/int8_gemm.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
-  gemm methods are adapted from ft
-*/
-#include <ATen/cuda/CUDAContext.h>
-#include "cublasAlgoMap.h"
-#include "cublasINT8MMWrapper.h"
-
-class I8CUGEMM {
-private:
-  cublasINT8MMWrapper *int8_gemm_wrapper = nullptr;
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-public:
-  I8CUGEMM();
-  ~I8CUGEMM();
-
-  void linear_a8_w8_o32(
-    torch::Tensor& input,
-    torch::Tensor& weight,
-    torch::Tensor& output);
-  void linear_a8_w8_o32_(
-    torch::Tensor& input,
-    torch::Tensor& weight,
-    torch::Tensor& output);                     
-  void linear_a8_w8_o8(
-    torch::Tensor& input,
-    torch::Tensor& weight,
-    torch::Tensor& output,
-    float alpha);
-  void linear_a8_w8_o8_(
-    torch::Tensor& input,
-    torch::Tensor& weight,
-    torch::Tensor& output,
-    float alpha);
-};
-I8CUGEMM::I8CUGEMM() {
-  // cublasAlgoMap *cublas_algo_map = new cublasAlgoMap("igemm_config.in");
-  cublasAlgoMap *cublas_algo_map = new cublasAlgoMap();
-  std::mutex *cublas_wrapper_mutex = new std::mutex();
-  bool use_ORDER_COL32_2R_4R4 = true;
-
-  cublasLtHandle_t cublaslt_handle;
-  cublasLtCreate(&cublaslt_handle);
-
-  int8_gemm_wrapper = new cublasINT8MMWrapper(
-    cublaslt_handle,
-    this->stream,
-    cublas_algo_map,
-    cublas_wrapper_mutex,
-    use_ORDER_COL32_2R_4R4);
-}
-
-I8CUGEMM::~I8CUGEMM() {}
-
-void I8CUGEMM::linear_a8_w8_o32(
-  torch::Tensor& input,  // INT8
-  torch::Tensor& weight, // INT8
-  torch::Tensor& out // INT32
-) {
-  int m = input.size(0);
-  int n = weight.size(0);
-  int k = input.size(1);
-
-  // Set data types
-  int8_t* input_ptr = input.data_ptr<int8_t>();
-  int8_t* weight_ptr = weight.data_ptr<int8_t>();
-  int32_t* output_ptr = out.data_ptr<int32_t>();
-
-  int8_gemm_wrapper->Gemm(output_ptr, 1, m, n, k, 0, 0, 0, input_ptr,
-                          weight_ptr);
-}
-
-void I8CUGEMM::linear_a8_w8_o32_(
-  torch::Tensor& input,  // INT8
-  torch::Tensor& weight, // INT8
-  torch::Tensor& out // INT32
-) {
-  int m = input.size(0);
-  int n = weight.size(0);
-  int k = input.size(1);
-
-  // Set data types
-  int8_t* input_ptr = input.data_ptr<int8_t>();
-  int8_t* weight_ptr = weight.data_ptr<int8_t>();
-  int32_t* output_ptr = out.data_ptr<int32_t>();
-
-  int8_gemm_wrapper->Gemm_(output_ptr, 1, m, n, k, 0, 0, 0, input_ptr,
-                          weight_ptr);
-}
-
-void I8CUGEMM::linear_a8_w8_o8(
-  torch::Tensor& input,  // INT8
-  torch::Tensor& weight, // INT8
-  torch::Tensor& out,    // INT8
-  float alpha // FP32
-) {
-  int m = input.size(0);
-  int n = weight.size(0);
-  int k = input.size(1);
-
-  // Set data types
-  int8_t* input_ptr = input.data_ptr<int8_t>();
-  int8_t* weight_ptr = weight.data_ptr<int8_t>();
-  int8_t* output_ptr = out.data_ptr<int8_t>();
-
-  int8_gemm_wrapper->Gemm(output_ptr, 1, m, n, k, 0, 0, 0, alpha, input_ptr,
-                          weight_ptr);
-}
-
-void I8CUGEMM::linear_a8_w8_o8_(
-  torch::Tensor& input,  // INT8
-  torch::Tensor& weight, // INT8
-  torch::Tensor& out,    // INT8
-  float alpha // FP32
-) {
-  int m = input.size(0);
-  int n = weight.size(0);
-  int k = input.size(1);
-
-  // Set data types
-  int8_t* input_ptr = input.data_ptr<int8_t>();
-  int8_t* weight_ptr = weight.data_ptr<int8_t>();
-  int8_t* output_ptr = out.data_ptr<int8_t>();
-
-  int8_gemm_wrapper->Gemm_(output_ptr, 1, m, n, k, 0, 0, 0, alpha, input_ptr,
-                          weight_ptr);
-}
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 6ee75e8139c0..f3adcb519ed9 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -8,3 +8,4 @@ vllm-nccl-cu12>=2.18,<2.19  # for downloading nccl library
 torch == 2.2.1
 xformers == 0.0.25  # Requires PyTorch 2.2.1
 triton >= 2.1.0
+nvidia-cutlass == 3.5.0
diff --git a/tests/kernels/test_fusion.py b/tests/kernels/test_fusion.py
index 07d9ce60a403..7cebc76b248b 100644
--- a/tests/kernels/test_fusion.py
+++ b/tests/kernels/test_fusion.py
@@ -9,54 +9,6 @@
 SEEDS = [0]
 SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
 
-@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("scale", SCALE)
-@torch.inference_mode()
-def test_dequant(num_tokens: int, hidden_size: int, dtype: torch.dtype,
-                 seed: int, scale: float) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    x = torch.randint(
-        torch.iinfo(torch.int32).min,
-        torch.iinfo(torch.int32).max,
-        (num_tokens, hidden_size),
-        dtype=torch.int32,
-        device="cuda",
-    )
-
-    out1 = (x * scale).to(dtype)
-    out2 = torch.empty_like(x, dtype=dtype)
-    ops.dequant(out2, x, scale)
-    assert torch.allclose(out1, out2, atol=0.001)
-
-
-@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@torch.inference_mode()
-def test_per_token_dequant(num_tokens: int, hidden_size: int,
-                           dtype: torch.dtype, seed: int) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    x = torch.randint(
-        torch.iinfo(torch.int32).min,
-        torch.iinfo(torch.int32).max,
-        (num_tokens, hidden_size),
-        dtype=torch.int32,
-        device="cuda",
-    )
-    scale = torch.rand(num_tokens, 1, dtype=torch.float32, device="cuda")
-    out1 = (x * scale).to(dtype)
-    out2 = torch.empty_like(x, dtype=dtype)
-    scale = torch.squeeze(scale)
-    ops.dequant(out2, x, scale)
-    assert torch.allclose(out1, out2, atol=0.001)
-
-
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 2598156bbed3..95b36b782b88 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -166,9 +166,6 @@ class ColumnParallelLinear(torch.nn.Module):
                        skip adding bias but instead return it.
         params_dtype: Data type for the parameters.
         linear_method: (Maybe quantized) linear method.
-        logical_widths: Optional list of widths for logical weight matrices.
-                        E.g. for QKVParallelLinear, this parameter defines
-                        the width
     """
 
     def __init__(
@@ -308,6 +305,7 @@ def weight_loader(self,
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
         param_shard_splitter = getattr(param, "shard_splitter", None)
+
         if output_dim is not None and param_shard_splitter is not None:
             raise NotImplementedError(
                 "We do not currently support output_dim != None and "
@@ -373,8 +371,11 @@ def weight_loader(self,
                                                  shard_size)
         # If a param_shard_splitter is defined by the LinearMethod, use it.
         elif param_shard_splitter is not None:
+            logical_widths = getattr(param, "logical_widths")
             param_data, loaded_weight = param_shard_splitter(
-                param_data, loaded_weight, loaded_shard_id)
+                param_data, loaded_weight, loaded_shard_id, logical_widths
+            )
+
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
@@ -466,7 +467,7 @@ def weight_loader(self,
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
         param_shard_splitter = getattr(param, "shard_splitter", None)
-        
+
         if output_dim is not None and param_shard_splitter is not None:
             raise NotImplementedError(
                 "We do not currently support output_dim != None and "
@@ -548,8 +549,10 @@ def weight_loader(self,
                                                 shard_size)
         # If a param_shard_splitter is defined by the LinearMethod, use it.
         elif param_shard_splitter is not None:
+            logical_widths = getattr(param, "logical_widths")
             param_data, loaded_weight = param_shard_splitter(
-                param_data, loaded_weight, loaded_shard_id)
+                param_data, loaded_weight, loaded_shard_id, logical_widths)
+
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
diff --git a/vllm/model_executor/layers/quantization/smoothquant/config.py b/vllm/model_executor/layers/quantization/smoothquant/config.py
index 885ffce3e36d..788481a51ae1 100644
--- a/vllm/model_executor/layers/quantization/smoothquant/config.py
+++ b/vllm/model_executor/layers/quantization/smoothquant/config.py
@@ -1,10 +1,8 @@
 from typing import Any, Dict, List, Tuple, Type, Optional, Union
-import threading
 
 import torch
 from torch.nn.parameter import Parameter
 
-from vllm._C import ops
 from vllm.model_executor.layers.linear import (
     LinearMethodBase,
     set_weight_attrs)
@@ -15,6 +13,9 @@
     SmoothQuantDynamicPerToken,
     SmoothQuantStaticPerTensor,
 )
+from vllm.model_executor.layers.quantization.smoothquant.cutlass_gemm import (
+    cutlass_gemm_dq
+)
 
 LAYER_KEYS = ["qkv", "out", "fc1", "fc2"]
 FORMAT_REGISTRY = {
@@ -85,31 +86,10 @@ def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig":
     def get_linear_method(self) -> "SmoothQuantLinearMethod":
         return SmoothQuantLinearMethod(self)
 
-
-# TODO: why is this needed?
-class Int8GEMM(object):
-    _instance_lock = threading.Lock()
-
-    def __init__(self):
-        if not hasattr(self, "i8cugemm"):
-            self.i8cugemm = ops.I8CUGEMM()
-
-    def __new__(cls, *args, **kwargs):
-        if not hasattr(Int8GEMM, "_instance"):
-            with Int8GEMM._instance_lock:
-                if not hasattr(Int8GEMM, "_instance"):
-                    Int8GEMM._instance = object.__new__(cls)
-        return Int8GEMM._instance
-
-    def get_i8cugemm(self):
-        return self.i8cugemm
-
-
 class SmoothQuantLinearMethod(LinearMethodBase):
     def __init__(self, sq_config: SmoothQuantConfig) -> None:
         self.sq_config = sq_config
         self.sq_type = None
-        self.i8cugemm = Int8GEMM().get_i8cugemm()    
 
     def maybe_update_loaded_weight_name(self, 
                                         name: str) -> str:
@@ -123,24 +103,26 @@ def maybe_update_loaded_weight_name(self,
             name.replace(suffix, "dequant_scale")
         return name
 
-    def scales_shard_splitter(self, 
-                              param: torch.Tensor,
-                              loaded_weight: torch.Tensor, 
-                              shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Index into param for for loading.
-        
-        This function is called by QKVColumnLinear and MergedColumnParallelLinear
-        during weight loading to put the scales from disk in the right spot.
-        """
-        if type(shard_id) == str:
-            qkv_idxs = { "q": 0, "k": 1, "v": 2 }
-            if shard_id not in qkv_idxs:
-                raise ValueError(f"Invalid shard_id {shard_id}")
-            shard_id = qkv_idxs[shard_id]
-        elif type(shard_id) != int:
-            raise ValueError(f"Invalid shard id {shard_id}")
+    def shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+        if isinstance(shard_id, int):
+            return shard_id
 
-        return param[shard_id], loaded_weight
+        assert isinstance(shard_id, str)
+        qkv_idxs = { "q": 0, "k": 1, "v": 2 }
+        assert shard_id in qkv_idxs
+        return qkv_idxs[shard_id]
+
+    def scales_shard_splitter(self,
+                              param: torch.Tensor,
+                              loaded_weight: torch.Tensor,
+                              shard_id: Union[str, int],
+                              logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        shard_id = self.shard_id_as_int(shard_id)
+        offset = sum(logical_widths[:shard_id]) 
+        size = logical_widths[shard_id]
+        # update loaded weight with copies for broadcast.
+        loaded_weight = loaded_weight.repeat(size)
+        return param[offset : offset + size], loaded_weight
 
     def get_layer_format(self, layer_name: str) -> SmoothQuantFormat:
         """
@@ -213,16 +195,22 @@ def create_weights(self,
             "output_dim": 0,
         })
 
-        # Static scale for each logical weight (e.g. 3 for QKV).
-        dequant_scale = Parameter(
-            torch.empty(
-                len(output_sizes_per_partition), 
-                 device='cuda', dtype=params_dtype,
-            ), requires_grad=False
-        )
-        set_weight_attrs(dequant_scale, {
-            "shard_splitter": self.scales_shard_splitter,
-        })
+        if len(output_sizes_per_partition) == 1:
+            # Single static scale for the entire tensor.
+            dequant_scale = Parameter(
+                torch.empty((1),device='cuda', dtype=params_dtype),
+                requires_grad=False
+            )
+        else:
+            # Static scale for each logical weight (e.g. 3 for QKV).
+            dequant_scale = Parameter(
+                torch.empty((sum(output_sizes_per_partition)),
+                             device='cuda', dtype=params_dtype),
+                requires_grad=False
+            )
+            set_weight_attrs(dequant_scale,
+                             {"shard_splitter": self.scales_shard_splitter,
+                              "logical_widths" : output_sizes_per_partition})
 
         return {
             "weight": weight,
@@ -242,37 +230,10 @@ def _quantize(self,
             x_q: Quantized activation at INT8
             activation_scales: Optional dynamic scales for each token.
         """
-        x_q = torch.empty_like(x, dtype=torch.int8)
+        x_q = torch.empty_like(x, dtype=torch.int8, device="cuda")
         x_q, activation_scales = sq_format.quantize_op(x, x_q)
         return x_q, activation_scales
 
-    def _dequantize(self, 
-                    x_q: torch.Tensor, 
-                    dynamic_scales: Optional[torch.Tensor],
-                    static_scales: torch.Tensor,
-                    logical_widths: List[int],
-                    dtype: torch.dtype,
-                    sq_format: SmoothQuantFormat) -> torch.Tensor:
-        """Dequantize activations.
-
-        Args:
-            x_q: quantized activations.
-            dynamic_scales: Optional dynamic scales.
-            static_scales: Static dequantization scales.
-            logical_widths: Width of each logical activation (for QKV case).
-            dtype: Datatype to dequantize to.
-        Returns:
-            x_dq: dequantized activation at output_dtype precision
-        """
-        # Split X_q and X_dq buffer into logical activations (for QKV case).
-        x_q_split = x_q.split(logical_widths, dim=-1)
-        x_dq = torch.empty_like(x_q, dtype=dtype)
-        x_dq_split = x_dq.split(logical_widths, dim=-1)
-        # Dequantize in place and return.
-        sq_format.dequantize_op(x_q_split, x_dq_split, dynamic_scales, static_scales)
-        return x_dq
-
-
     def apply_weights(self,
                       weights: Dict[str, torch.Tensor],
                       x: torch.Tensor,
@@ -290,17 +251,11 @@ def apply_weights(self,
             raise NotImplementedError
         weight_q = weights["weight"]
         static_scales = weights["dequant_scale"]
-        logical_widths = weights["logical_widths"]
         sq_format = weights["sq_format"]
 
         # Q
         x_q, activation_scales = self._quantize(x, sq_format)
 
-        # GEMM
-        x_q = x_q.view(-1, x_q.shape[-1])
-        a_q = torch.empty((x_q.shape[0], weight_q.shape[0]), dtype=torch.int32, device="cuda")
-        self.i8cugemm.linear_a8_w8_o32_(x_q, weight_q, a_q)
-        a_q = a_q.view(*x_q.shape[:-1], -1)
+        # GEMM and DQ
+        return cutlass_gemm_dq(x_q, weight_q, x.dtype, static_scales, activation_scales)
 
-        # DQ
-        return self._dequantize(a_q, activation_scales, static_scales, logical_widths, x.dtype, sq_format)
diff --git a/vllm/model_executor/layers/quantization/smoothquant/cutlass_gemm.py b/vllm/model_executor/layers/quantization/smoothquant/cutlass_gemm.py
new file mode 100644
index 000000000000..05ae38c3343e
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/smoothquant/cutlass_gemm.py
@@ -0,0 +1,75 @@
+
+import cutlass
+from cutlass import Tensor as FakeTensor
+import cutlass.epilogue
+
+import torch
+from typing import Optional, Tuple, Dict
+
+from vllm.logger import init_logger
+
+logger = init_logger("cutlass_gemm")
+
+def setup_dequant_epilogue(plan : cutlass.op.Gemm,
+                           dq: torch.Tensor, 
+                           static_scales: Optional[torch.Tensor],
+                           activation_scales: Optional[torch.Tensor]) \
+                              -> Tuple[cutlass.op.Gemm, Dict]:
+
+    if all([static_scales is None, activation_scales is None]):
+        return plan, None
+    assert static_scales is not None
+
+    def epilog_with_scales_and_act_scales(accum, scales, act_scales):
+        D = accum * scales * act_scales
+        return D
+
+    def epilog_with_scales(accum, scales):
+        D = accum * scales
+        return D
+
+    epilog_tensors =  {
+        'scales' : static_scales,
+        'D' : dq
+    }
+    epilogue_trace_tensors = {
+        "accum": FakeTensor(element=torch.int32, shape=dq.shape,
+                            layout_tag=cutlass.LayoutType.RowMajor),
+        'scales' : static_scales,
+        'D' : dq,
+    }
+    epilog_fn = epilog_with_scales
+
+    if activation_scales is not None:
+        epilog_tensors['act_scales'] = activation_scales
+        epilogue_trace_tensors['act_scales'] = activation_scales
+        epilog_fn = epilog_with_scales_and_act_scales
+
+    plan.epilogue_visitor = cutlass.epilogue.trace(epilog_fn, epilogue_trace_tensors)
+    return plan, epilog_tensors 
+
+def cutlass_gemm_dq(x_q : torch.Tensor,
+                    w_q : torch.Tensor,
+                    dtype: torch.dtype,
+                    static_scales: torch.Tensor,
+                    activation_scales: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+    dq = torch.empty((x_q.shape[0], w_q.shape[0]),
+                      dtype=dtype, device="cuda")
+
+    plan = cutlass.op.Gemm(element_A=x_q.dtype, element_B=w_q.dtype,
+                           element_C=dq.dtype, element_D=dq.dtype,
+                           layout_A=cutlass.LayoutType.RowMajor, 
+                           layout_B=cutlass.LayoutType.ColumnMajor,
+                           layout_C=cutlass.LayoutType.RowMajor, 
+                           element_accumulator=torch.int32,
+                           # TODO (varun) : lets not have kernel cc here please.
+                           kernel_cc=80)
+
+    plan, visitor_args = setup_dequant_epilogue(plan, dq, static_scales, activation_scales)
+
+    plan.run(x_q, w_q.t(), dq, dq, alpha=1, beta=0,
+             visitor_args=visitor_args, print_module=False)
+
+    dq = dq.view(*x_q.shape[:-1], -1)
+    return dq
diff --git a/vllm/model_executor/layers/quantization/smoothquant/formats.py b/vllm/model_executor/layers/quantization/smoothquant/formats.py
index b8ddd642c888..4155ef64ffe3 100644
--- a/vllm/model_executor/layers/quantization/smoothquant/formats.py
+++ b/vllm/model_executor/layers/quantization/smoothquant/formats.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple, Type
+from typing import Optional, Tuple
 
 import torch
 
@@ -7,23 +7,6 @@
 
 
 class SmoothQuantFormat(ABC):
-    @abstractmethod
-    def dequantize_op(self,
-                      x_qs: List[torch.Tensor],
-                      x_dqs: List[torch.Tensor],
-                      dynamic_scales: Optional[torch.Tensor],
-                      static_scales: torch.Tensor) -> None:
-        """Dequantize the activations. x_dq is updated in place.
-
-        Args:
-            x_qs: List of N quantized activations.
-            x_dqs: List of N buffers to fill with dequantized values.
-            dynamic_scales: Optional dynamic scales for dequantization.
-            static_scales: Static scales for dequantization. N values.
-        """
-        raise NotImplementedError
-    
-
     @abstractmethod
     def quantize_op(self,
                     x: torch.Tensor,
@@ -41,55 +24,18 @@ def quantize_op(self,
 
 
 class SmoothQuantDynamicPerToken(SmoothQuantFormat):
-    def dequantize_op(self,
-                      x_qs: List[torch.Tensor],
-                      x_dqs: List[torch.Tensor],
-                      dynamic_scales: Optional[torch.Tensor],
-                      static_scales: torch.Tensor) -> None:
-        """Notes:
-        dynamic_scales: N scales for N tokens in the activation.
-        static_scales: K scales for K logical activations (equals just w_scale).
-        """
-        if dynamic_scales is None:
-            raise ValueError
-        
-        # Dequantize each logical activation.
-        # TODO: test this for case when logical_widths > 1 (may need to reshape)
-        for x_dq, x_q, dynamic_scale, static_scale in zip(
-            x_dqs, x_qs, dynamic_scales, static_scales):
-            
-            # Dequantize (updates x_dq in place).
-            ops.dequant(x_dq, x_q, dynamic_scale, static_scale)
-
-
     def quantize_op(self,
                     x: torch.Tensor,
                     x_q: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Notes:
         Returns quantized activaiton and dynamic activation scales.
         """
-        activation_scales = torch.empty(x.numel() // x.shape[-1], dtype=x.dtype, device=x.device)
+        activation_scales = torch.empty((x.numel() // x.shape[-1], 1), dtype=x.dtype, device=x.device)
         ops.quant(x_q, x, activation_scales)
         return x_q, activation_scales
     
 
 class SmoothQuantStaticPerTensor(SmoothQuantFormat):
-    def dequantize_op(self,
-                      x_qs: List[torch.Tensor],
-                      x_dqs: List[torch.Tensor],
-                      dynamic_scales: Optional[torch.Tensor],
-                      static_scales: torch.Tensor) -> None:
-        """Notes:
-        dynamic_scales: None
-        static_scales: K scales for K logical activations (equals w_scale * a_scale).
-        """
-        if dynamic_scales is not None:
-            raise ValueError
-        
-        # Dequantize each logical activation.
-        for xdq, xq, static_scale in zip(x_dqs, x_qs, static_scales):
-            ops.dequant(xdq, xq, static_scale)
-
     def quantize_op(self,
                     x: torch.Tensor,
                     x_q: torch.Tensor) -> Tuple[torch.Tensor, None]: