diff --git a/CMakeLists.txt b/CMakeLists.txt index 4cb48c119e6c..ebea46103699 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -167,9 +167,6 @@ set(VLLM_EXT_SRC "csrc/layernorm_kernels.cu" "csrc/quantization/squeezellm/quant_cuda_kernel.cu" "csrc/quantization/gptq/q_gemm.cu" - "csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.cc" - "csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.cc" - "csrc/quantization/smoothquant/int8gemm/cuda_utils.cc" "csrc/quantization/smoothquant/fused_kernels.cu" "csrc/cuda_utils_kernels.cu" "csrc/moe_align_block_size_kernels.cu" diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 8c4fbdaed105..983455878023 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -1,7 +1,6 @@ #include "cache.h" #include "cuda_utils.h" #include "ops.h" -#include "quantization/smoothquant/int8gemm/int8_gemm.h" #include PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { @@ -50,21 +49,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "fused_add_rms_norm", &fused_add_rms_norm, "In-place fused Add and RMS Normalization"); - ops.def( - "dequant", - py::overload_cast< - torch::Tensor&, - torch::Tensor&, - float>(&dequant), - "Dequant."); - ops.def( - "dequant", - py::overload_cast< - torch::Tensor&, - torch::Tensor&, - torch::Tensor&, - float>(&dequant), - "Per-token dequant."); ops.def( "quant", py::overload_cast< @@ -102,12 +86,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); - pybind11::class_(ops, "I8CUGEMM") - .def(pybind11::init<>()) - .def("linear_a8_w8_o32", &I8CUGEMM::linear_a8_w8_o32) - .def("linear_a8_w8_o8", &I8CUGEMM::linear_a8_w8_o8) - .def("linear_a8_w8_o8_", &I8CUGEMM::linear_a8_w8_o8_) - .def("linear_a8_w8_o32_", &I8CUGEMM::linear_a8_w8_o32_); ops.def( "moe_align_block_size", &moe_align_block_size, diff --git a/csrc/quantization/smoothquant/fused_kernels.cu b/csrc/quantization/smoothquant/fused_kernels.cu index 1e9d1acf8f47..1d23a5a0653c 100644 --- a/csrc/quantization/smoothquant/fused_kernels.cu +++ b/csrc/quantization/smoothquant/fused_kernels.cu @@ -7,27 +7,6 @@ #include "quant_utils.cuh" namespace vllm { -template -__global__ void dequant_kernel( - const int32_t* __restrict__ input, - scalar_t* __restrict__ out, - const float scale, - const int m, - const int hidden_size, - const int input_stride, - const int out_stride, - const float* __restrict__ act_scale = nullptr) { - const int tid = threadIdx.x; - const int token_idx = blockIdx.x; - float scale_ = scale; - if constexpr (use_per_token_dequant) { - scale_ = scale * act_scale[token_idx]; - } - for (int i = tid; i < hidden_size; i += blockDim.x) { - out[token_idx * out_stride + i] = - (scalar_t)(((float)input[token_idx * input_stride + i]) * scale_); - } -} template __global__ void quant_kernel( @@ -71,56 +50,6 @@ __global__ void quant_kernel( } } // namespace vllm -void dequant( - torch::Tensor& out, // [..., hidden_size] - torch::Tensor& input, // [..., hidden_size] - float scale) { - int hidden_size = input.size(-1); - int num_tokens = input.numel() / hidden_size; - dim3 grid(num_tokens); - dim3 block(std::min(hidden_size, 1024)); - int input_stride = input.stride(-2); - int out_stride = out.stride(-2); - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES(out.scalar_type(), "dequant_kernel", [&] { - vllm::dequant_kernel<<>>( - input.data_ptr(), - out.data_ptr(), - scale, - num_tokens, - hidden_size, - input_stride, - out_stride); - }); -} - -void dequant( - torch::Tensor& out, // [..., hidden_size] - torch::Tensor& input, // [..., hidden_size] - torch::Tensor& scale, - float weight_dequant_scale) { - int hidden_size = input.size(-1); - int num_tokens = input.numel() / hidden_size; - dim3 grid(num_tokens); - dim3 block(std::min(hidden_size, 1024)); - int input_stride = input.stride(-2); - int out_stride = out.stride(-2); - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES(out.scalar_type(), "dequant_kernel", [&] { - vllm::dequant_kernel<<>>( - input.data_ptr(), - out.data_ptr(), - weight_dequant_scale, - num_tokens, - hidden_size, - input_stride, - out_stride, - scale.data_ptr()); - }); -} - void quant( torch::Tensor& out, // [..., hidden_size] torch::Tensor& input, // [..., hidden_size] diff --git a/csrc/quantization/smoothquant/int8gemm/allocator.h b/csrc/quantization/smoothquant/int8gemm/allocator.h deleted file mode 100644 index 79be2e99e29c..000000000000 --- a/csrc/quantization/smoothquant/int8gemm/allocator.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Memory Allocator - **/ - -#pragma once - -#include "cuda_utils.h" -#include -#include -#include - -#if defined(CUDART_VERSION) && CUDART_VERSION < 11020 -#define CUDA_MEMORY_POOL_DISABLED -#endif - -enum class AllocatorType { CUDA, TF, TH }; - -enum class ReallocType { - INCREASE, - REUSE, - DECREASE, -}; - -class IAllocator { -public: - virtual ~IAllocator(){}; - - virtual void *malloc(size_t size, const bool is_set_zero = true, - bool is_host = false) = 0; - virtual void free(void **ptr, bool is_host = false) const = 0; - virtual void setStream(cudaStream_t stream) = 0; - virtual cudaStream_t returnStream() = 0; - virtual void memSet(void *ptr, const int val, const size_t size) = 0; - - template - void *reMalloc(T *ptr, size_t size, const bool is_set_zero = true, - bool is_host = false) { - // FT_LOG_DEBUG(__PRETTY_FUNCTION__); - size = ((size + 31) / 32) * 32; // make the buffer align with 32 bytes - void *void_ptr = (void *)ptr; - void *ptr_address = getAddress(void_ptr); - if (isExist(ptr_address)) { - ReallocType realloc_type = isReMalloc(ptr_address, size); - if (realloc_type == ReallocType::INCREASE) { - // FT_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", - // void_ptr); - free((void **)(&void_ptr), is_host); - return malloc(size, is_set_zero, is_host); - } -#if !defined(CUDA_MEMORY_POOL_DISABLED) - else if (realloc_type == ReallocType::DECREASE) { - // FT_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to - // memory pools.", void_ptr); - free((void **)(&void_ptr), is_host); - return malloc(size, is_set_zero, is_host); - } -#endif - else { - // FT_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing - // for reMalloc.", void_ptr, size); - if (is_set_zero) { - memSet(void_ptr, 0, size); - } - return void_ptr; - } - } else { - // FT_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr); - return malloc(size, is_set_zero, is_host); - } - } - -protected: - virtual bool isExist(void *address) const = 0; - virtual ReallocType isReMalloc(void *address, size_t size) const = 0; - - void *getAddress(void *ptr) const { return ptr; } -}; - -template class Allocator; - -template <> class Allocator : public IAllocator { -private: - const int device_id_; - cudaStream_t stream_ = 0; // initialize as default stream - std::unordered_map *pointer_mapping_; - - bool isExist(void *address) const { - return pointer_mapping_->count(address) > 0; - } - ReallocType isReMalloc(void *address, size_t size) const { - FT_CHECK(isExist(address)); - if (pointer_mapping_->at(address) < size) { - return ReallocType::INCREASE; - } else if (pointer_mapping_->at(address) == size) { - return ReallocType::REUSE; - } else { - return ReallocType::DECREASE; - } - } - -public: - Allocator(int device_id) : device_id_(device_id) { - // FT_LOG_DEBUG(__PRETTY_FUNCTION__); - pointer_mapping_ = new std::unordered_map(); -#if defined(CUDA_MEMORY_POOL_DISABLED) - // FT_LOG_WARNING( - // "Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync - // cudaMalloc/Free." "Note this may lead to hang with NCCL kernels - // launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP"); -#else - int device_count = 1; - check_cuda_error(cudaGetDeviceCount(&device_count)); - cudaMemPool_t mempool; - check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, device_id)); - cudaMemAccessDesc desc = {}; - int peer_access_available = 0; - for (int i = 0; i < device_count; i++) { - if (i == device_id) { - continue; - } - check_cuda_error( - cudaDeviceCanAccessPeer(&peer_access_available, device_id, i)); - if (!peer_access_available) { - // FT_LOG_WARNING("Device " + std::to_string(device_id) + " peer access - // Device " + std::to_string(i) - // + " is not available."); - continue; - } - desc.location.type = cudaMemLocationTypeDevice; - desc.location.id = i; - desc.flags = cudaMemAccessFlagsProtReadWrite; - check_cuda_error(cudaMemPoolSetAccess(mempool, &desc, 1)); - } - // set memory pool threshold to avoid shrinking the pool - uint64_t setVal = UINT64_MAX; - check_cuda_error(cudaMemPoolSetAttribute( - mempool, cudaMemPoolAttrReleaseThreshold, &setVal)); -#endif - } - - virtual ~Allocator() { - // FT_LOG_DEBUG(__PRETTY_FUNCTION__); - while (!pointer_mapping_->empty()) { - free((void **)(&pointer_mapping_->begin()->first)); - } - delete pointer_mapping_; - } - - void setStream(cudaStream_t stream) { stream_ = stream; } - - cudaStream_t returnStream() { return stream_; }; - - void *malloc(size_t size, const bool is_set_zero = true, - bool is_host = false) { - // FT_LOG_DEBUG(__PRETTY_FUNCTION__); - if (size == 0) { - return nullptr; - } - void *ptr = nullptr; - int o_device = 0; - - check_cuda_error(getSetDevice(device_id_, &o_device)); - if (is_host) { - check_cuda_error(cudaMallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32)); - } else { -#if defined(CUDA_MEMORY_POOL_DISABLED) - check_cuda_error(cudaMalloc(&ptr, (size_t)(ceil(size / 32.)) * 32)); -#else - check_cuda_error( - cudaMallocAsync(&ptr, (size_t)(ceil(size / 32.)) * 32, stream_)); -#endif - } - if (is_set_zero) { - check_cuda_error( - cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_)); - } - check_cuda_error(getSetDevice(o_device)); - // FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size); - - pointer_mapping_->insert({getAddress(ptr), size}); - - return ptr; - } - - void free(void **ptr, bool is_host = false) const { - // FT_LOG_DEBUG(__PRETTY_FUNCTION__); - void *address = getAddress(*ptr); - if (*ptr != nullptr) { - int o_device = 0; - if (pointer_mapping_->count(address)) { - // FT_LOG_DEBUG("Free buffer %p", address); - check_cuda_error(getSetDevice(device_id_, &o_device)); - if (is_host) { - check_cuda_error(cudaFreeHost(*ptr)); - } else { -#if defined(CUDA_MEMORY_POOL_DISABLED) - check_cuda_error(cudaFree(*ptr)); -#else - check_cuda_error(cudaFreeAsync(*ptr, stream_)); - cudaStreamSynchronize(stream_); -#endif - } - check_cuda_error(getSetDevice(o_device)); - pointer_mapping_->erase(address); - } else { - // FT_LOG_WARNING("pointer_mapping_ does not have information of ptr at - // %p.", address); - } - } - *ptr = nullptr; - return; - } - - void memSet(void *ptr, const int val, const size_t size) { - check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_)); - } -}; \ No newline at end of file diff --git a/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.cc b/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.cc deleted file mode 100644 index 61e41438c6a8..000000000000 --- a/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.cc +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cublasAlgoMap.h" - -cublasAlgoMap::cublasAlgoMap(const std::string filename, - const std::string sp_config_filename) - : config_filename_(filename), sp_config_filename_(sp_config_filename) { - loadGemmConfig(); - loadSpGemmConfig(); -} - -cublasAlgoMap::cublasAlgoMap(const cublasAlgoMap &algo_map) - : config_filename_(algo_map.config_filename_), - sp_config_filename_(algo_map.sp_config_filename_), - algo_map_(algo_map.algo_map_), sp_algo_map_(algo_map.sp_algo_map_) {} - -cublasAlgoMap::~cublasAlgoMap() { algo_map_.clear(); } - -void cublasAlgoMap::loadGemmConfig() { - FILE *fd; - fd = fopen(config_filename_.c_str(), "r"); - if (fd == NULL) { - std::cout << "[WARNING] " << config_filename_ - << " is not found; using default GEMM algo" << std::endl; - return; - } - - int batchCount2, m2, n2, k2, algoId, customOption, tile, splitK_val; - int batch_size, seq_len, head_num, size_per_head, dataType; - int swizzle, reductionScheme, workspaceSize, stages; - int inner_shapeId, cluster_shapeId, mma_shapeId, cga_shapeId, sche_mode; - float exec_time; - char tmp[1024]; - if (!fgets(tmp, 1024, fd)) { - printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__); - exit(-1); - } - while (fscanf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "%d %d " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "%d %d %d " -#endif - "%f\n", - &batch_size, &seq_len, &head_num, &size_per_head, &dataType, - &batchCount2, &n2, &m2, &k2, &algoId, &customOption, &tile, - &splitK_val, &swizzle, &reductionScheme, &workspaceSize, - &stages, -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - &inner_shapeId, &cluster_shapeId, -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - &mma_shapeId, &cga_shapeId, &sche_mode, -#endif - &exec_time) != EOF) { - if (dataType != FLOAT_DATATYPE && dataType != HALF_DATATYPE && - dataType != BFLOAT16_DATATYPE && dataType != INT8_DATATYPE && - dataType != FP8_DATATYPE) { - printf("[WARNING][readAlgoFromConfig] wrong dataType %d!\n", dataType); - continue; - } - cublasAlgoConfig_t markStr{batchCount2, m2, n2, k2, - static_cast(dataType)}; - // workspaceSize should be zero - if (algo_map_.find(markStr) == algo_map_.end()) { - algo_map_[markStr].algoId = algoId; - algo_map_[markStr].customOption = customOption; - algo_map_[markStr].tile = tile; - algo_map_[markStr].splitK_val = splitK_val; - algo_map_[markStr].swizzle = swizzle; - algo_map_[markStr].reductionScheme = reductionScheme; - algo_map_[markStr].workspaceSize = workspaceSize; - algo_map_[markStr].stages = stages; -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - algo_map_[markStr].inner_shapeId = (uint16_t)inner_shapeId; - algo_map_[markStr].cluster_shapeId = (uint16_t)cluster_shapeId; -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - algo_map_[markStr].mma_shapeId = (uint16_t)mma_shapeId; - algo_map_[markStr].cga_shapeId = (uint16_t)cga_shapeId; - algo_map_[markStr].sche_mode = (uint16_t)sche_mode; -#endif - algo_map_[markStr].exec_time = exec_time; - } - } - fclose(fd); -} - -bool cublasAlgoMap::isExist(const int batch_count, const int m, const int n, - const int k, const CublasDataType data_type) { - cublasAlgoConfig_t mark{batch_count, n, m, k, data_type}; - return algo_map_.find(mark) != algo_map_.end(); -} - -cublasLtMatmulAlgo_info cublasAlgoMap::getAlgo(const int batch_count, - const int m, const int n, - const int k, - const CublasDataType data_type) { - cublasAlgoConfig_t mark{batch_count, n, m, k, data_type}; - if (algo_map_.find(mark) != algo_map_.end()) { - return algo_map_[mark]; - } else { - cublasLtMatmulAlgo_info tmp_algo; - tmp_algo.algoId = static_cast(data_type == FLOAT_DATATYPE - ? CUBLAS_GEMM_DEFAULT - : CUBLAS_GEMM_DEFAULT_TENSOR_OP); - tmp_algo.customOption = -1; - tmp_algo.tile = -1; - tmp_algo.splitK_val = -1; - tmp_algo.swizzle = -1; - tmp_algo.reductionScheme = -1; - tmp_algo.workspaceSize = -1; - tmp_algo.stages = -1; - tmp_algo.exec_time = -1.0f; - return tmp_algo; - } -} - -void cublasAlgoMap::loadSpGemmConfig() { - if (sp_config_filename_.empty()) { - return; - } - FILE *fd = fopen(sp_config_filename_.c_str(), "r"); - if (fd == NULL) { - printf("[WARNING] %s is not found; using SPGEMM algo id 0\n", - sp_config_filename_.c_str()); - return; - } - sp_algo_map_.clear(); - int batch_size, seq_len, head_num, size_per_head, data_type; - int batchCount, m, n, k, algoId; - float exec_time; - char tmp[1024]; - if (!fgets(tmp, 1024, fd)) { - printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__); - exit(-1); - } - while (fscanf(fd, "%d %d %d %d %d ### %d %d %d %d %d %f\n", &batch_size, - &seq_len, &head_num, &size_per_head, &data_type, &batchCount, - &m, &n, &k, &algoId, &exec_time) != EOF) { - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batchCount, m, n, k); - std::string markStr(mark); - sp_algo_map_[markStr] = algoId; - } - fclose(fd); -} - -int cublasAlgoMap::getSpAlgo(const int batch_count, const int m, const int n, - const int k) { - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k); - if (sp_algo_map_.find(mark) != sp_algo_map_.end()) { - return sp_algo_map_[mark]; - } else { - // for remove padding, select algo 1 for simplicity - return 0; - } -} - -bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n, - const int k) { - // not available to use cusparselt. - if (m % 8 != 0 || n % 8 != 0 || k % 8 != 0) { - return false; - } - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k); - if (sp_algo_map_.find(mark) != sp_algo_map_.end()) { - return sp_algo_map_[mark] != -1; - } else { - // no gemm test case, choose sparse according to sparse flag - return true; - } -} diff --git a/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.h b/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.h deleted file mode 100644 index beb9d3a23d90..000000000000 --- a/csrc/quantization/smoothquant/int8gemm/cublasAlgoMap.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cuda_utils.h" -#include -#include -#include -#include -#include -#include -#include - -#pragma once - -#define GEMM_NUM 6 -#define GEMM_CONFIG "gemm_config.in" -#define IGEMM_CONFIG "igemm_config.in" -#define SPGEMM_CONFIG "spgemm_config.in" -#define SPIGEMM_CONFIG "spigemm_config.in" - -typedef struct { - int algoId, customOption, tile, splitK_val; - int swizzle, reductionScheme, workspaceSize; - // only used in cublasLt >= 11.0 - int stages; -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - uint16_t inner_shapeId, cluster_shapeId; -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - uint16_t mma_shapeId, cga_shapeId, sche_mode; -#endif - float exec_time; -} cublasLtMatmulAlgo_info; - -/* Structure to store information about different run trials */ -typedef struct { - cublasLtMatmulAlgo_t algo; - cublasStatus_t status; - float time; - size_t workspaceSize; // actual memory workspace needed - cublasMath_t mathMode; - cublasLtReductionScheme_t reductionScheme; - int customOption; - float wavesCount; -} customMatmulPerf_t; - -struct cublasAlgoConfig_t { - int batch_count; - int m; - int n; - int k; - CublasDataType data_type; - bool operator==(cublasAlgoConfig_t const &config) const { - return (batch_count == config.batch_count) && (m == config.m) && - (n == config.n) && (k == config.k) && - (data_type == config.data_type); - } -}; - -class cublasAlgoConfig_hasher { -public: - std::size_t operator()(cublasAlgoConfig_t const &config) const { - return config.batch_count * 98317ull ^ config.m * 49157ull ^ - config.n * 24593ull ^ config.k * 196613ull ^ - static_cast(config.data_type) * 6151ull; - } -}; - -class cublasAlgoMap { -private: - std::unordered_map - algo_map_; - std::string config_filename_; - std::string sp_config_filename_; - std::map sp_algo_map_; - -public: - cublasAlgoMap(){}; - explicit cublasAlgoMap(const std::string filename, - const std::string sp_config_filename = ""); - cublasAlgoMap(const cublasAlgoMap &map); - ~cublasAlgoMap(); - void loadGemmConfig(); - void loadSpGemmConfig(); - int getSpAlgo(const int batch_count, const int m, const int n, const int k); - bool isUseSparse(const int batch_count, const int m, const int n, - const int k); - - bool isExist(const int batch_count, const int m, const int n, const int k, - const CublasDataType data_type); - - cublasLtMatmulAlgo_info getAlgo(const int batch_count, const int m, - const int n, const int k, - const CublasDataType data_type); -}; diff --git a/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.cc b/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.cc deleted file mode 100644 index 03c656b10cbd..000000000000 --- a/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.cc +++ /dev/null @@ -1,676 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cublasINT8MMWrapper.h" - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#endif - -cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap *cublas_algo_map, - std::mutex *mu, - bool use_ORDER_COL32_2R_4R4) - : cublas_handle_(nullptr), cublaslt_handle_(cublaslt_handle), - stream_(stream), cublas_algo_map_(cublas_algo_map), mu_(mu), - allocator_(nullptr), use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) {} - -cublasINT8MMWrapper::cublasINT8MMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap *cublas_algo_map, - std::mutex *mu, - bool use_ORDER_COL32_2R_4R4) - : cublas_handle_(cublas_handle), cublaslt_handle_(cublaslt_handle), - stream_(stream), cublas_algo_map_(cublas_algo_map), mu_(mu), - allocator_(nullptr), use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) {} - - -cublasINT8MMWrapper::~cublasINT8MMWrapper() { mu_ = nullptr; } - -cublasINT8MMWrapper::cublasINT8MMWrapper(const cublasINT8MMWrapper &wrapper) - : cublas_handle_(nullptr), cublaslt_handle_(wrapper.cublaslt_handle_), - stream_(wrapper.stream_), cublas_algo_map_(wrapper.cublas_algo_map_), mu_(wrapper.mu_), - allocator_(wrapper.allocator_), use_ORDER_COL32_2R_4R4_(wrapper.use_ORDER_COL32_2R_4R4_) { -} - -// for int8 cublasLtMM with algo -// ATransform should be m*n, CUBLASLT_ORDER_COL32 -// kernel should be n*k, CUBLASLT_ORDER_COL4_4R2_8C or -// CUBLASLT_ORDER_COL32_2R_4R4 res is m*n, CUBLASLT_ORDER_COL32 -void cublasINT8MMWrapper::Gemm(int *res, int batchCount, int m, int n, int k, - int64_t stridea, int64_t strideb, - int64_t stridec, const int8_t *ATransform, - const int8_t *kernel) { - mu_->lock(); - cublasOperation_t opTranspose = CUBLAS_OP_T; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmulDesc; - cublasLtMatrixLayout_t AtransformDesc = NULL; - cublasLtMatrixLayout_t BtransformDesc = NULL; - cublasLtMatrixLayout_t CtransformDesc = NULL; - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4_) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - int ldbTransform; - if (use_ORDER_COL32_2R_4R4_) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - int ldcTransform = 32 * m; - - // create matmulDesc -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I); -#else - cublasLtMatmulDescCreate(&matmulDesc, computeType); -#endif - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, - &opTranspose, sizeof(cublasOperation_t)); - cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform); - cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, - &order_COL32, sizeof(order_COL32)); - cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform); - cublasLtMatrixLayoutSetAttribute(BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, - &order_matrixB, sizeof(order_matrixB)); - cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, m, n, ldcTransform); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, - &order_COL32, sizeof(order_COL32)); - if (batchCount > 1) { - cublasLtMatrixLayoutSetAttribute(AtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea)); - cublasLtMatrixLayoutSetAttribute(BtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb)); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec)); - } - - int alphaI = 1; - int betaI = 0; - - // get algo - cublasLtMatmulAlgo_t algo; - int findAlgo = 0; - if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) { - // printf("find algo %s\n", markStr.c_str()); - findAlgo = 1; - - cublasLtMatmulAlgo_info tmp_info = - cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE); - - cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I, - CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, tmp_info.algoId, - &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), - sizeof(tmp_info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, - &(tmp_info.tile), - sizeof(tmp_info.tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &(tmp_info.splitK_val), - sizeof(tmp_info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), - sizeof(tmp_info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(tmp_info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, - &(tmp_info.stages), - sizeof(tmp_info.stages)); -#endif - } else { - findAlgo = 1; - int algoId; - if (use_ORDER_COL32_2R_4R4_) { - algoId = 7; - } else { - algoId = 6; - } - int swizzle = 0; - int customOption = 0; - int tile = 20; - int splitK_val = 0; - int reductionScheme = 0; - cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I, - CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &(customOption), sizeof(customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, - &(tile), sizeof(tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &(splitK_val), sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - int stages; - if (use_ORDER_COL32_2R_4R4_) { - stages = 15; - } else { - stages = 13; - } - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, - &(stages), sizeof(stages)); -#endif - } - - cublasLtMatmul(cublaslt_handle_, matmulDesc, &alphaI, ATransform, - AtransformDesc, kernel, BtransformDesc, &betaI, res, - CtransformDesc, res, CtransformDesc, - (findAlgo == 1 ? (&algo) : NULL), NULL, 0, stream_); - - cublasLtMatmulDescDestroy(matmulDesc); - cublasLtMatrixLayoutDestroy(AtransformDesc); - cublasLtMatrixLayoutDestroy(BtransformDesc); - cublasLtMatrixLayoutDestroy(CtransformDesc); - sync_check_cuda_error(); - mu_->unlock(); -} - -// Atransform: mxk CUDA_R_8I -// kernel: nxk CUDA_R_8I -// res: mxn CUDA_R_32I -// alpha: CUDA_R_32I should be 1 -// beta: CUDA_R_32I should be 0 -// computeType: CUBLAS_COMPUTE_32I -void cublasINT8MMWrapper::Gemm_(int *res, int batchCount, int m, int n, int k, - int64_t stridea, int64_t strideb, - int64_t stridec, const int8_t *ATransform, - const int8_t *kernel) { - mu_->lock(); - cublasOperation_t opTranspose = CUBLAS_OP_T; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmulDesc; - cublasLtMatrixLayout_t AtransformDesc = NULL; - cublasLtMatrixLayout_t BtransformDesc = NULL; - cublasLtMatrixLayout_t CtransformDesc = NULL; - - // create matmulDesc -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I); -#else - cublasLtMatmulDescCreate(&matmulDesc, computeType); -#endif - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, - &opTranspose, sizeof(cublasOperation_t)); - - cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, k, n, k); - - cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, k, m, k); - - cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, n, m, n); - - if (batchCount > 1) { - cublasLtMatrixLayoutSetAttribute(AtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea)); - cublasLtMatrixLayoutSetAttribute(BtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb)); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec)); - } - - int alphaI = 1; - int betaI = 0; - - // get algo - cublasLtMatmulAlgo_t algo; - int findAlgo = 0; - if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) { - // printf("find algo %s\n", markStr.c_str()); - findAlgo = 1; - - cublasLtMatmulAlgo_info tmp_info = - cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE); - - cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I, - CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, tmp_info.algoId, - &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), - sizeof(tmp_info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, - &(tmp_info.tile), - sizeof(tmp_info.tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &(tmp_info.splitK_val), - sizeof(tmp_info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), - sizeof(tmp_info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(tmp_info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, - &(tmp_info.stages), - sizeof(tmp_info.stages)); -#endif - } else { - findAlgo = 1; - int algoId; - algoId = 21; - int swizzle = 0; - int customOption = 0; - int tile = 20; - int splitK_val = 0; - int reductionScheme = 0; - cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I, - CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &(customOption), sizeof(customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, - &(tile), sizeof(tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &(splitK_val), sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - int stages; - stages = 17; - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, - &(stages), sizeof(stages)); -#endif - } - - cublasLtMatmul(cublaslt_handle_, matmulDesc, &alphaI, kernel, AtransformDesc, - ATransform, BtransformDesc, &betaI, res, CtransformDesc, res, - CtransformDesc, (findAlgo == 1 ? (&algo) : NULL), NULL, 0, - stream_); - - cublasLtMatmulDescDestroy(matmulDesc); - cublasLtMatrixLayoutDestroy(AtransformDesc); - cublasLtMatrixLayoutDestroy(BtransformDesc); - cublasLtMatrixLayoutDestroy(CtransformDesc); - sync_check_cuda_error(); - mu_->unlock(); -} - -// for int8 IO cublasLtMM with algo -// ATransform should be m*k CUBLASLT_ORDER_COL32 -// kernel should be n*k CUBLASLT_ORDER_COL4_4R2_8C -// res is m*n CUBLASLT_ORDER_COL32 -void cublasINT8MMWrapper::Gemm(int8_t *res, int batchCount, int m, int n, int k, - int64_t stridea, int64_t strideb, - int64_t stridec, const float alpha, - const int8_t *ATransform, const int8_t *kernel) { - mu_->lock(); - cublasOperation_t opTranspose = CUBLAS_OP_T; - // int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE - // cublasLtPointerMode_t pointerMode = - // CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; - cudaDataType_t scaleType = CUDA_R_32F; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmulDesc; - cublasLtMatrixLayout_t AtransformDesc = NULL; - cublasLtMatrixLayout_t BtransformDesc = NULL; - cublasLtMatrixLayout_t CtransformDesc = NULL; - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4_) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - - int ldbTransform; - if (use_ORDER_COL32_2R_4R4_) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - - int ldcTransform = 32 * m; - - // create matmulDesc -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType); -#else - cublasLtMatmulDescCreate(&matmulDesc, computeType); -#endif - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, - &opTranspose, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, - &scaleType, sizeof(scaleType)); - // cublasLtMatmulDescSetAttribute(matmulDesc, - // CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode, - // sizeof(cublasLtPointerMode_t)); - cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform); - cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, - &order_COL32, sizeof(order_COL32)); - cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform); - cublasLtMatrixLayoutSetAttribute(BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, - &order_matrixB, sizeof(order_matrixB)); - cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_8I, m, n, ldcTransform); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, - &order_COL32, sizeof(order_COL32)); - if (batchCount > 1) { - cublasLtMatrixLayoutSetAttribute(AtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea)); - cublasLtMatrixLayoutSetAttribute(BtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb)); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec)); - } - - // get algo - cublasLtMatmulAlgo_t algo; - int findAlgo = 0; - if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) { - findAlgo = 1; - - cublasLtMatmulAlgo_info tmp_info = - cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE); - - cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I, - CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, tmp_info.algoId, - &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), - sizeof(tmp_info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, - &(tmp_info.tile), - sizeof(tmp_info.tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &(tmp_info.splitK_val), - sizeof(tmp_info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), - sizeof(tmp_info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(tmp_info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, - &(tmp_info.stages), - sizeof(tmp_info.stages)); -#endif - } else { - findAlgo = 1; - int algoId; - if (use_ORDER_COL32_2R_4R4_) { - algoId = 7; - } else { - algoId = 6; - } - int swizzle = 0; - int customOption = 0; - int tile = 20; - int splitK_val = 0; - int reductionScheme = 0; - cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I, - CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &(customOption), sizeof(customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, - &(tile), sizeof(tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &(splitK_val), sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - int stages; - if (use_ORDER_COL32_2R_4R4_) { - stages = 15; - } else { - stages = 13; - } - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, - &(stages), sizeof(stages)); -#endif - } - - float beta = 0.0f; - cublasLtMatmul(cublaslt_handle_, matmulDesc, &alpha, kernel, AtransformDesc, - ATransform, BtransformDesc, &beta, res, CtransformDesc, res, - CtransformDesc, (findAlgo == 1 ? (&algo) : NULL), NULL, 0, - stream_); - - cublasLtMatmulDescDestroy(matmulDesc); - cublasLtMatrixLayoutDestroy(AtransformDesc); - cublasLtMatrixLayoutDestroy(BtransformDesc); - cublasLtMatrixLayoutDestroy(CtransformDesc); - sync_check_cuda_error(); - mu_->unlock(); -} - -// Atransform: mxk CUDA_R_8I -// kernel: nxk CUDA_R_8I -// res: mxn CUDA_R_8I -// alpha: CUDA_R_32F -// beta: CUDA_R_32F -// computeType: CUBLAS_COMPUTE_32I -void cublasINT8MMWrapper::Gemm_(int8_t *res, int batchCount, int m, int n, - int k, int64_t stridea, int64_t strideb, - int64_t stridec, const float alpha, - const int8_t *ATransform, - const int8_t *kernel) { - mu_->lock(); - cublasOperation_t opTranspose = CUBLAS_OP_T; - // int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE - // cublasLtPointerMode_t pointerMode = - // CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; - cudaDataType_t scaleType = CUDA_R_32F; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmulDesc; - cublasLtMatrixLayout_t AtransformDesc = NULL; - cublasLtMatrixLayout_t BtransformDesc = NULL; - cublasLtMatrixLayout_t CtransformDesc = NULL; - - // create matmulDesc -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType); -#else - cublasLtMatmulDescCreate(&matmulDesc, computeType); -#endif - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, - &opTranspose, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, - &scaleType, sizeof(scaleType)); - // cublasLtMatmulDescSetAttribute(matmulDesc, - // CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode, - // sizeof(cublasLtPointerMode_t)); - cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, k, n, k); - - cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, k, m, k); - - cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_8I, n, m, n); - - if (batchCount > 1) { - cublasLtMatrixLayoutSetAttribute(AtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea)); - cublasLtMatrixLayoutSetAttribute(BtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb)); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec)); - } - - // get algo - cublasLtMatmulAlgo_t algo; - int findAlgo = 0; - if (cublas_algo_map_->isExist(batchCount, n, m, k, INT8_DATATYPE)) { - findAlgo = 1; - cublasLtMatmulAlgo_info tmp_info = - cublas_algo_map_->getAlgo(batchCount, n, m, k, INT8_DATATYPE); - - cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I, - CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, tmp_info.algoId, - &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), - sizeof(tmp_info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, - &(tmp_info.tile), - sizeof(tmp_info.tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &(tmp_info.splitK_val), - sizeof(tmp_info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), - sizeof(tmp_info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(tmp_info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, - &(tmp_info.stages), - sizeof(tmp_info.stages)); -#endif - } else { - findAlgo = 1; - int algoId; - algoId = 21; - int swizzle = 0; - int customOption = 0; - int tile = 20; - int splitK_val = 0; - int reductionScheme = 0; - cublasLtMatmulAlgoInit(cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I, - CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &(customOption), sizeof(customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, - &(tile), sizeof(tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &(splitK_val), sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - int stages; - stages = 17; - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, - &(stages), sizeof(stages)); -#endif - } - - float beta = 0.0f; - cublasLtMatmul(cublaslt_handle_, matmulDesc, &alpha, kernel, AtransformDesc, - ATransform, BtransformDesc, &beta, res, CtransformDesc, res, - CtransformDesc, (findAlgo == 1 ? (&algo) : NULL), NULL, 0, - stream_); - - cublasLtMatmulDescDestroy(matmulDesc); - cublasLtMatrixLayoutDestroy(AtransformDesc); - cublasLtMatrixLayoutDestroy(BtransformDesc); - cublasLtMatrixLayoutDestroy(CtransformDesc); - sync_check_cuda_error(); - mu_->unlock(); -} - -bool cublasINT8MMWrapper::getUseOrderCol322R4R4() { - return use_ORDER_COL32_2R_4R4_; -} diff --git a/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.h b/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.h deleted file mode 100644 index 8bc209f58b91..000000000000 --- a/csrc/quantization/smoothquant/int8gemm/cublasINT8MMWrapper.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "allocator.h" -#include "cublasAlgoMap.h" -#include -#include -#include -#include -#include -#include - -#pragma once - -class cublasINT8MMWrapper{ -protected: - cublasHandle_t cublas_handle_; - cublasLtHandle_t cublaslt_handle_; - cudaStream_t stream_; - cublasAlgoMap *cublas_algo_map_; - std::mutex *mu_; - IAllocator *allocator_ = nullptr; - -private: - bool use_ORDER_COL32_2R_4R4_; - -public: - cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_, cudaStream_t stream, - cublasAlgoMap *map, std::mutex *mu, - bool use_ORDER_COL32_2R_4R4); - - cublasINT8MMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, cudaStream_t stream, - cublasAlgoMap *map, std::mutex *mu, - bool use_ORDER_COL32_2R_4R4); - - ~cublasINT8MMWrapper(); - - cublasINT8MMWrapper(const cublasINT8MMWrapper &wrapper); - - void Gemm(int *res, int batchCount, int m, int n, int k, int64_t stridea, - int64_t strideb, int64_t stridec, const int8_t *ATransform, - const int8_t *kernel); - - void Gemm_(int *res, int batchCount, int m, int n, int k, int64_t stridea, - int64_t strideb, int64_t stridec, const int8_t *ATransform, - const int8_t *kernel); - - void Gemm(int8_t *res, int batchCount, int m, int n, int k, int64_t stridea, - int64_t strideb, int64_t stridec, const float alpha, - const int8_t *ATransform, const int8_t *kernel); - - void Gemm_(int8_t *res, int batchCount, int m, int n, int k, int64_t stridea, - int64_t strideb, int64_t stridec, const float alpha, - const int8_t *ATransform, const int8_t *kernel); - - bool getUseOrderCol322R4R4(); -}; \ No newline at end of file diff --git a/csrc/quantization/smoothquant/int8gemm/cuda_utils.cc b/csrc/quantization/smoothquant/int8gemm/cuda_utils.cc deleted file mode 100644 index 588375570937..000000000000 --- a/csrc/quantization/smoothquant/int8gemm/cuda_utils.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cuda_utils.h" - -cudaError_t getSetDevice(int i_device, int *o_device) { - int current_dev_id = 0; - cudaError_t err = cudaSuccess; - - if (o_device != NULL) { - err = cudaGetDevice(¤t_dev_id); - if (err != cudaSuccess) { - return err; - } - if (current_dev_id == i_device) { - *o_device = i_device; - } else { - err = cudaSetDevice(i_device); - if (err != cudaSuccess) { - return err; - } - *o_device = current_dev_id; - } - } else { - err = cudaSetDevice(i_device); - if (err != cudaSuccess) { - return err; - } - } - - return cudaSuccess; -} diff --git a/csrc/quantization/smoothquant/int8gemm/cuda_utils.h b/csrc/quantization/smoothquant/int8gemm/cuda_utils.h deleted file mode 100644 index f1d9bba4ab06..000000000000 --- a/csrc/quantization/smoothquant/int8gemm/cuda_utils.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - - -enum CublasDataType { - FLOAT_DATATYPE = 0, - HALF_DATATYPE = 1, - BFLOAT16_DATATYPE = 2, - INT8_DATATYPE = 3, - FP8_DATATYPE = 4 -}; - -static const char *_cudaGetErrorEnum(cudaError_t error) { - return cudaGetErrorString(error); -} - -static const char *_cudaGetErrorEnum(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - return ""; -} - -template -void check(T result, char const *const func, const char *const file, - int const line) { - if (result) { - throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + - (_cudaGetErrorEnum(result)) + " " + file + ":" + - std::to_string(line) + " \n"); - } -} - -#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) -#define check_cuda_error_2(val, file, line) check((val), #val, file, line) - -inline void syncAndCheck(const char *const file, int const line) { - // When FT_DEBUG_LEVEL=DEBUG, must check error - static char *level_name = std::getenv("FT_DEBUG_LEVEL"); - if (level_name != nullptr) { - static std::string level = std::string(level_name); - if (level == "DEBUG") { - cudaDeviceSynchronize(); - cudaError_t result = cudaGetLastError(); - if (result) { - throw std::runtime_error( - std::string("[FT][ERROR] CUDA runtime error: ") + - (_cudaGetErrorEnum(result)) + " " + file + ":" + - std::to_string(line) + " \n"); - } - // FT_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line)); - } - } - -#ifndef NDEBUG - cudaDeviceSynchronize(); - cudaError_t result = cudaGetLastError(); - if (result) { - throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + - (_cudaGetErrorEnum(result)) + " " + file + ":" + - std::to_string(line) + " \n"); - } -#endif -} - -#define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__) - - -[[noreturn]] inline void throwRuntimeError(const char *const file, - int const line, - std::string const &info = "") { - throw std::runtime_error(std::string("[FT][ERROR] ") + info + - " Assertion fail: " + file + ":" + - std::to_string(line) + " \n"); -} - -inline void myAssert(bool result, const char *const file, int const line, - std::string const &info = "") { - if (!result) { - throwRuntimeError(file, line, info); - } -} - -#define FT_CHECK(val) myAssert(val, __FILE__, __LINE__) -#define FT_CHECK_WITH_INFO(val, info) \ - do { \ - bool is_valid_val = (val); \ - if (!is_valid_val) { \ - fastertransformer::myAssert(is_valid_val, __FILE__, __LINE__, (info)); \ - } \ - } while (0) - -#define FT_THROW(info) throwRuntimeError(__FILE__, __LINE__, info) - -cudaError_t getSetDevice(int i_device, int *o_device = NULL); - -inline int getDevice() { - int current_dev_id = 0; - check_cuda_error(cudaGetDevice(¤t_dev_id)); - return current_dev_id; -} - -inline int getDeviceCount() { - int count = 0; - check_cuda_error(cudaGetDeviceCount(&count)); - return count; -} \ No newline at end of file diff --git a/csrc/quantization/smoothquant/int8gemm/int8_gemm.h b/csrc/quantization/smoothquant/int8gemm/int8_gemm.h deleted file mode 100644 index 2e80d4efe22a..000000000000 --- a/csrc/quantization/smoothquant/int8gemm/int8_gemm.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - gemm methods are adapted from ft -*/ -#include -#include "cublasAlgoMap.h" -#include "cublasINT8MMWrapper.h" - -class I8CUGEMM { -private: - cublasINT8MMWrapper *int8_gemm_wrapper = nullptr; - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - -public: - I8CUGEMM(); - ~I8CUGEMM(); - - void linear_a8_w8_o32( - torch::Tensor& input, - torch::Tensor& weight, - torch::Tensor& output); - void linear_a8_w8_o32_( - torch::Tensor& input, - torch::Tensor& weight, - torch::Tensor& output); - void linear_a8_w8_o8( - torch::Tensor& input, - torch::Tensor& weight, - torch::Tensor& output, - float alpha); - void linear_a8_w8_o8_( - torch::Tensor& input, - torch::Tensor& weight, - torch::Tensor& output, - float alpha); -}; -I8CUGEMM::I8CUGEMM() { - // cublasAlgoMap *cublas_algo_map = new cublasAlgoMap("igemm_config.in"); - cublasAlgoMap *cublas_algo_map = new cublasAlgoMap(); - std::mutex *cublas_wrapper_mutex = new std::mutex(); - bool use_ORDER_COL32_2R_4R4 = true; - - cublasLtHandle_t cublaslt_handle; - cublasLtCreate(&cublaslt_handle); - - int8_gemm_wrapper = new cublasINT8MMWrapper( - cublaslt_handle, - this->stream, - cublas_algo_map, - cublas_wrapper_mutex, - use_ORDER_COL32_2R_4R4); -} - -I8CUGEMM::~I8CUGEMM() {} - -void I8CUGEMM::linear_a8_w8_o32( - torch::Tensor& input, // INT8 - torch::Tensor& weight, // INT8 - torch::Tensor& out // INT32 -) { - int m = input.size(0); - int n = weight.size(0); - int k = input.size(1); - - // Set data types - int8_t* input_ptr = input.data_ptr(); - int8_t* weight_ptr = weight.data_ptr(); - int32_t* output_ptr = out.data_ptr(); - - int8_gemm_wrapper->Gemm(output_ptr, 1, m, n, k, 0, 0, 0, input_ptr, - weight_ptr); -} - -void I8CUGEMM::linear_a8_w8_o32_( - torch::Tensor& input, // INT8 - torch::Tensor& weight, // INT8 - torch::Tensor& out // INT32 -) { - int m = input.size(0); - int n = weight.size(0); - int k = input.size(1); - - // Set data types - int8_t* input_ptr = input.data_ptr(); - int8_t* weight_ptr = weight.data_ptr(); - int32_t* output_ptr = out.data_ptr(); - - int8_gemm_wrapper->Gemm_(output_ptr, 1, m, n, k, 0, 0, 0, input_ptr, - weight_ptr); -} - -void I8CUGEMM::linear_a8_w8_o8( - torch::Tensor& input, // INT8 - torch::Tensor& weight, // INT8 - torch::Tensor& out, // INT8 - float alpha // FP32 -) { - int m = input.size(0); - int n = weight.size(0); - int k = input.size(1); - - // Set data types - int8_t* input_ptr = input.data_ptr(); - int8_t* weight_ptr = weight.data_ptr(); - int8_t* output_ptr = out.data_ptr(); - - int8_gemm_wrapper->Gemm(output_ptr, 1, m, n, k, 0, 0, 0, alpha, input_ptr, - weight_ptr); -} - -void I8CUGEMM::linear_a8_w8_o8_( - torch::Tensor& input, // INT8 - torch::Tensor& weight, // INT8 - torch::Tensor& out, // INT8 - float alpha // FP32 -) { - int m = input.size(0); - int n = weight.size(0); - int k = input.size(1); - - // Set data types - int8_t* input_ptr = input.data_ptr(); - int8_t* weight_ptr = weight.data_ptr(); - int8_t* output_ptr = out.data_ptr(); - - int8_gemm_wrapper->Gemm_(output_ptr, 1, m, n, k, 0, 0, 0, alpha, input_ptr, - weight_ptr); -} diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 6ee75e8139c0..f3adcb519ed9 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -8,3 +8,4 @@ vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library torch == 2.2.1 xformers == 0.0.25 # Requires PyTorch 2.2.1 triton >= 2.1.0 +nvidia-cutlass == 3.5.0 diff --git a/tests/kernels/test_fusion.py b/tests/kernels/test_fusion.py index 07d9ce60a403..7cebc76b248b 100644 --- a/tests/kernels/test_fusion.py +++ b/tests/kernels/test_fusion.py @@ -9,54 +9,6 @@ SEEDS = [0] SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("scale", SCALE) -@torch.inference_mode() -def test_dequant(num_tokens: int, hidden_size: int, dtype: torch.dtype, - seed: int, scale: float) -> None: - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - x = torch.randint( - torch.iinfo(torch.int32).min, - torch.iinfo(torch.int32).max, - (num_tokens, hidden_size), - dtype=torch.int32, - device="cuda", - ) - - out1 = (x * scale).to(dtype) - out2 = torch.empty_like(x, dtype=dtype) - ops.dequant(out2, x, scale) - assert torch.allclose(out1, out2, atol=0.001) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@torch.inference_mode() -def test_per_token_dequant(num_tokens: int, hidden_size: int, - dtype: torch.dtype, seed: int) -> None: - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - x = torch.randint( - torch.iinfo(torch.int32).min, - torch.iinfo(torch.int32).max, - (num_tokens, hidden_size), - dtype=torch.int32, - device="cuda", - ) - scale = torch.rand(num_tokens, 1, dtype=torch.float32, device="cuda") - out1 = (x * scale).to(dtype) - out2 = torch.empty_like(x, dtype=dtype) - scale = torch.squeeze(scale) - ops.dequant(out2, x, scale) - assert torch.allclose(out1, out2, atol=0.001) - - @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("dtype", DTYPES) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 2598156bbed3..95b36b782b88 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -166,9 +166,6 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. - logical_widths: Optional list of widths for logical weight matrices. - E.g. for QKVParallelLinear, this parameter defines - the width """ def __init__( @@ -308,6 +305,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) param_shard_splitter = getattr(param, "shard_splitter", None) + if output_dim is not None and param_shard_splitter is not None: raise NotImplementedError( "We do not currently support output_dim != None and " @@ -373,8 +371,11 @@ def weight_loader(self, shard_size) # If a param_shard_splitter is defined by the LinearMethod, use it. elif param_shard_splitter is not None: + logical_widths = getattr(param, "logical_widths") param_data, loaded_weight = param_shard_splitter( - param_data, loaded_weight, loaded_shard_id) + param_data, loaded_weight, loaded_shard_id, logical_widths + ) + else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -466,7 +467,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) param_shard_splitter = getattr(param, "shard_splitter", None) - + if output_dim is not None and param_shard_splitter is not None: raise NotImplementedError( "We do not currently support output_dim != None and " @@ -548,8 +549,10 @@ def weight_loader(self, shard_size) # If a param_shard_splitter is defined by the LinearMethod, use it. elif param_shard_splitter is not None: + logical_widths = getattr(param, "logical_widths") param_data, loaded_weight = param_shard_splitter( - param_data, loaded_weight, loaded_shard_id) + param_data, loaded_weight, loaded_shard_id, logical_widths) + else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: diff --git a/vllm/model_executor/layers/quantization/smoothquant/config.py b/vllm/model_executor/layers/quantization/smoothquant/config.py index 885ffce3e36d..788481a51ae1 100644 --- a/vllm/model_executor/layers/quantization/smoothquant/config.py +++ b/vllm/model_executor/layers/quantization/smoothquant/config.py @@ -1,10 +1,8 @@ from typing import Any, Dict, List, Tuple, Type, Optional, Union -import threading import torch from torch.nn.parameter import Parameter -from vllm._C import ops from vllm.model_executor.layers.linear import ( LinearMethodBase, set_weight_attrs) @@ -15,6 +13,9 @@ SmoothQuantDynamicPerToken, SmoothQuantStaticPerTensor, ) +from vllm.model_executor.layers.quantization.smoothquant.cutlass_gemm import ( + cutlass_gemm_dq +) LAYER_KEYS = ["qkv", "out", "fc1", "fc2"] FORMAT_REGISTRY = { @@ -85,31 +86,10 @@ def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig": def get_linear_method(self) -> "SmoothQuantLinearMethod": return SmoothQuantLinearMethod(self) - -# TODO: why is this needed? -class Int8GEMM(object): - _instance_lock = threading.Lock() - - def __init__(self): - if not hasattr(self, "i8cugemm"): - self.i8cugemm = ops.I8CUGEMM() - - def __new__(cls, *args, **kwargs): - if not hasattr(Int8GEMM, "_instance"): - with Int8GEMM._instance_lock: - if not hasattr(Int8GEMM, "_instance"): - Int8GEMM._instance = object.__new__(cls) - return Int8GEMM._instance - - def get_i8cugemm(self): - return self.i8cugemm - - class SmoothQuantLinearMethod(LinearMethodBase): def __init__(self, sq_config: SmoothQuantConfig) -> None: self.sq_config = sq_config self.sq_type = None - self.i8cugemm = Int8GEMM().get_i8cugemm() def maybe_update_loaded_weight_name(self, name: str) -> str: @@ -123,24 +103,26 @@ def maybe_update_loaded_weight_name(self, name.replace(suffix, "dequant_scale") return name - def scales_shard_splitter(self, - param: torch.Tensor, - loaded_weight: torch.Tensor, - shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]: - """Index into param for for loading. - - This function is called by QKVColumnLinear and MergedColumnParallelLinear - during weight loading to put the scales from disk in the right spot. - """ - if type(shard_id) == str: - qkv_idxs = { "q": 0, "k": 1, "v": 2 } - if shard_id not in qkv_idxs: - raise ValueError(f"Invalid shard_id {shard_id}") - shard_id = qkv_idxs[shard_id] - elif type(shard_id) != int: - raise ValueError(f"Invalid shard id {shard_id}") + def shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id - return param[shard_id], loaded_weight + assert isinstance(shard_id, str) + qkv_idxs = { "q": 0, "k": 1, "v": 2 } + assert shard_id in qkv_idxs + return qkv_idxs[shard_id] + + def scales_shard_splitter(self, + param: torch.Tensor, + loaded_weight: torch.Tensor, + shard_id: Union[str, int], + logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shard_id = self.shard_id_as_int(shard_id) + offset = sum(logical_widths[:shard_id]) + size = logical_widths[shard_id] + # update loaded weight with copies for broadcast. + loaded_weight = loaded_weight.repeat(size) + return param[offset : offset + size], loaded_weight def get_layer_format(self, layer_name: str) -> SmoothQuantFormat: """ @@ -213,16 +195,22 @@ def create_weights(self, "output_dim": 0, }) - # Static scale for each logical weight (e.g. 3 for QKV). - dequant_scale = Parameter( - torch.empty( - len(output_sizes_per_partition), - device='cuda', dtype=params_dtype, - ), requires_grad=False - ) - set_weight_attrs(dequant_scale, { - "shard_splitter": self.scales_shard_splitter, - }) + if len(output_sizes_per_partition) == 1: + # Single static scale for the entire tensor. + dequant_scale = Parameter( + torch.empty((1),device='cuda', dtype=params_dtype), + requires_grad=False + ) + else: + # Static scale for each logical weight (e.g. 3 for QKV). + dequant_scale = Parameter( + torch.empty((sum(output_sizes_per_partition)), + device='cuda', dtype=params_dtype), + requires_grad=False + ) + set_weight_attrs(dequant_scale, + {"shard_splitter": self.scales_shard_splitter, + "logical_widths" : output_sizes_per_partition}) return { "weight": weight, @@ -242,37 +230,10 @@ def _quantize(self, x_q: Quantized activation at INT8 activation_scales: Optional dynamic scales for each token. """ - x_q = torch.empty_like(x, dtype=torch.int8) + x_q = torch.empty_like(x, dtype=torch.int8, device="cuda") x_q, activation_scales = sq_format.quantize_op(x, x_q) return x_q, activation_scales - def _dequantize(self, - x_q: torch.Tensor, - dynamic_scales: Optional[torch.Tensor], - static_scales: torch.Tensor, - logical_widths: List[int], - dtype: torch.dtype, - sq_format: SmoothQuantFormat) -> torch.Tensor: - """Dequantize activations. - - Args: - x_q: quantized activations. - dynamic_scales: Optional dynamic scales. - static_scales: Static dequantization scales. - logical_widths: Width of each logical activation (for QKV case). - dtype: Datatype to dequantize to. - Returns: - x_dq: dequantized activation at output_dtype precision - """ - # Split X_q and X_dq buffer into logical activations (for QKV case). - x_q_split = x_q.split(logical_widths, dim=-1) - x_dq = torch.empty_like(x_q, dtype=dtype) - x_dq_split = x_dq.split(logical_widths, dim=-1) - # Dequantize in place and return. - sq_format.dequantize_op(x_q_split, x_dq_split, dynamic_scales, static_scales) - return x_dq - - def apply_weights(self, weights: Dict[str, torch.Tensor], x: torch.Tensor, @@ -290,17 +251,11 @@ def apply_weights(self, raise NotImplementedError weight_q = weights["weight"] static_scales = weights["dequant_scale"] - logical_widths = weights["logical_widths"] sq_format = weights["sq_format"] # Q x_q, activation_scales = self._quantize(x, sq_format) - # GEMM - x_q = x_q.view(-1, x_q.shape[-1]) - a_q = torch.empty((x_q.shape[0], weight_q.shape[0]), dtype=torch.int32, device="cuda") - self.i8cugemm.linear_a8_w8_o32_(x_q, weight_q, a_q) - a_q = a_q.view(*x_q.shape[:-1], -1) + # GEMM and DQ + return cutlass_gemm_dq(x_q, weight_q, x.dtype, static_scales, activation_scales) - # DQ - return self._dequantize(a_q, activation_scales, static_scales, logical_widths, x.dtype, sq_format) diff --git a/vllm/model_executor/layers/quantization/smoothquant/cutlass_gemm.py b/vllm/model_executor/layers/quantization/smoothquant/cutlass_gemm.py new file mode 100644 index 000000000000..05ae38c3343e --- /dev/null +++ b/vllm/model_executor/layers/quantization/smoothquant/cutlass_gemm.py @@ -0,0 +1,75 @@ + +import cutlass +from cutlass import Tensor as FakeTensor +import cutlass.epilogue + +import torch +from typing import Optional, Tuple, Dict + +from vllm.logger import init_logger + +logger = init_logger("cutlass_gemm") + +def setup_dequant_epilogue(plan : cutlass.op.Gemm, + dq: torch.Tensor, + static_scales: Optional[torch.Tensor], + activation_scales: Optional[torch.Tensor]) \ + -> Tuple[cutlass.op.Gemm, Dict]: + + if all([static_scales is None, activation_scales is None]): + return plan, None + assert static_scales is not None + + def epilog_with_scales_and_act_scales(accum, scales, act_scales): + D = accum * scales * act_scales + return D + + def epilog_with_scales(accum, scales): + D = accum * scales + return D + + epilog_tensors = { + 'scales' : static_scales, + 'D' : dq + } + epilogue_trace_tensors = { + "accum": FakeTensor(element=torch.int32, shape=dq.shape, + layout_tag=cutlass.LayoutType.RowMajor), + 'scales' : static_scales, + 'D' : dq, + } + epilog_fn = epilog_with_scales + + if activation_scales is not None: + epilog_tensors['act_scales'] = activation_scales + epilogue_trace_tensors['act_scales'] = activation_scales + epilog_fn = epilog_with_scales_and_act_scales + + plan.epilogue_visitor = cutlass.epilogue.trace(epilog_fn, epilogue_trace_tensors) + return plan, epilog_tensors + +def cutlass_gemm_dq(x_q : torch.Tensor, + w_q : torch.Tensor, + dtype: torch.dtype, + static_scales: torch.Tensor, + activation_scales: Optional[torch.Tensor] = None) -> torch.Tensor: + + dq = torch.empty((x_q.shape[0], w_q.shape[0]), + dtype=dtype, device="cuda") + + plan = cutlass.op.Gemm(element_A=x_q.dtype, element_B=w_q.dtype, + element_C=dq.dtype, element_D=dq.dtype, + layout_A=cutlass.LayoutType.RowMajor, + layout_B=cutlass.LayoutType.ColumnMajor, + layout_C=cutlass.LayoutType.RowMajor, + element_accumulator=torch.int32, + # TODO (varun) : lets not have kernel cc here please. + kernel_cc=80) + + plan, visitor_args = setup_dequant_epilogue(plan, dq, static_scales, activation_scales) + + plan.run(x_q, w_q.t(), dq, dq, alpha=1, beta=0, + visitor_args=visitor_args, print_module=False) + + dq = dq.view(*x_q.shape[:-1], -1) + return dq diff --git a/vllm/model_executor/layers/quantization/smoothquant/formats.py b/vllm/model_executor/layers/quantization/smoothquant/formats.py index b8ddd642c888..4155ef64ffe3 100644 --- a/vllm/model_executor/layers/quantization/smoothquant/formats.py +++ b/vllm/model_executor/layers/quantization/smoothquant/formats.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import List, Optional, Tuple, Type +from typing import Optional, Tuple import torch @@ -7,23 +7,6 @@ class SmoothQuantFormat(ABC): - @abstractmethod - def dequantize_op(self, - x_qs: List[torch.Tensor], - x_dqs: List[torch.Tensor], - dynamic_scales: Optional[torch.Tensor], - static_scales: torch.Tensor) -> None: - """Dequantize the activations. x_dq is updated in place. - - Args: - x_qs: List of N quantized activations. - x_dqs: List of N buffers to fill with dequantized values. - dynamic_scales: Optional dynamic scales for dequantization. - static_scales: Static scales for dequantization. N values. - """ - raise NotImplementedError - - @abstractmethod def quantize_op(self, x: torch.Tensor, @@ -41,55 +24,18 @@ def quantize_op(self, class SmoothQuantDynamicPerToken(SmoothQuantFormat): - def dequantize_op(self, - x_qs: List[torch.Tensor], - x_dqs: List[torch.Tensor], - dynamic_scales: Optional[torch.Tensor], - static_scales: torch.Tensor) -> None: - """Notes: - dynamic_scales: N scales for N tokens in the activation. - static_scales: K scales for K logical activations (equals just w_scale). - """ - if dynamic_scales is None: - raise ValueError - - # Dequantize each logical activation. - # TODO: test this for case when logical_widths > 1 (may need to reshape) - for x_dq, x_q, dynamic_scale, static_scale in zip( - x_dqs, x_qs, dynamic_scales, static_scales): - - # Dequantize (updates x_dq in place). - ops.dequant(x_dq, x_q, dynamic_scale, static_scale) - - def quantize_op(self, x: torch.Tensor, x_q: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """Notes: Returns quantized activaiton and dynamic activation scales. """ - activation_scales = torch.empty(x.numel() // x.shape[-1], dtype=x.dtype, device=x.device) + activation_scales = torch.empty((x.numel() // x.shape[-1], 1), dtype=x.dtype, device=x.device) ops.quant(x_q, x, activation_scales) return x_q, activation_scales class SmoothQuantStaticPerTensor(SmoothQuantFormat): - def dequantize_op(self, - x_qs: List[torch.Tensor], - x_dqs: List[torch.Tensor], - dynamic_scales: Optional[torch.Tensor], - static_scales: torch.Tensor) -> None: - """Notes: - dynamic_scales: None - static_scales: K scales for K logical activations (equals w_scale * a_scale). - """ - if dynamic_scales is not None: - raise ValueError - - # Dequantize each logical activation. - for xdq, xq, static_scale in zip(x_dqs, x_qs, static_scales): - ops.dequant(xdq, xq, static_scale) - def quantize_op(self, x: torch.Tensor, x_q: torch.Tensor) -> Tuple[torch.Tensor, None]: